Diffstat (limited to 'arch/ia64/kernel')
-rw-r--r--  arch/ia64/kernel/Makefile                  52
-rw-r--r--  arch/ia64/kernel/acpi-ext.c               100
-rw-r--r--  arch/ia64/kernel/acpi.c                   841
-rw-r--r--  arch/ia64/kernel/asm-offsets.c            239
-rw-r--r--  arch/ia64/kernel/brl_emu.c                234
-rw-r--r--  arch/ia64/kernel/cyclone.c                109
-rw-r--r--  arch/ia64/kernel/domain.c                 382
-rw-r--r--  arch/ia64/kernel/efi.c                    832
-rw-r--r--  arch/ia64/kernel/efi_stub.S                86
-rw-r--r--  arch/ia64/kernel/entry.S                 1587
-rw-r--r--  arch/ia64/kernel/entry.h                   82
-rw-r--r--  arch/ia64/kernel/fsys.S                   884
-rw-r--r--  arch/ia64/kernel/gate-data.S                3
-rw-r--r--  arch/ia64/kernel/gate.S                   372
-rw-r--r--  arch/ia64/kernel/gate.lds.S                95
-rw-r--r--  arch/ia64/kernel/head.S                   996
-rw-r--r--  arch/ia64/kernel/ia64_ksyms.c             127
-rw-r--r--  arch/ia64/kernel/init_task.c               46
-rw-r--r--  arch/ia64/kernel/iosapic.c                827
-rw-r--r--  arch/ia64/kernel/irq.c                    238
-rw-r--r--  arch/ia64/kernel/irq_ia64.c               278
-rw-r--r--  arch/ia64/kernel/irq_lsapic.c              37
-rw-r--r--  arch/ia64/kernel/ivt.S                   1619
-rw-r--r--  arch/ia64/kernel/machvec.c                 70
-rw-r--r--  arch/ia64/kernel/mca.c                   1470
-rw-r--r--  arch/ia64/kernel/mca_asm.S                928
-rw-r--r--  arch/ia64/kernel/mca_drv.c                639
-rw-r--r--  arch/ia64/kernel/mca_drv.h                113
-rw-r--r--  arch/ia64/kernel/mca_drv_asm.S             45
-rw-r--r--  arch/ia64/kernel/minstate.h               251
-rw-r--r--  arch/ia64/kernel/module.c                 952
-rw-r--r--  arch/ia64/kernel/pal.S                    302
-rw-r--r--  arch/ia64/kernel/palinfo.c               1023
-rw-r--r--  arch/ia64/kernel/patch.c                  189
-rw-r--r--  arch/ia64/kernel/perfmon.c               6676
-rw-r--r--  arch/ia64/kernel/perfmon_default_smpl.c   306
-rw-r--r--  arch/ia64/kernel/perfmon_generic.h         45
-rw-r--r--  arch/ia64/kernel/perfmon_itanium.h        115
-rw-r--r--  arch/ia64/kernel/perfmon_mckinley.h       187
-rw-r--r--  arch/ia64/kernel/process.c                800
-rw-r--r--  arch/ia64/kernel/ptrace.c                1627
-rw-r--r--  arch/ia64/kernel/sal.c                    302
-rw-r--r--  arch/ia64/kernel/salinfo.c                629
-rw-r--r--  arch/ia64/kernel/semaphore.c              165
-rw-r--r--  arch/ia64/kernel/setup.c                  723
-rw-r--r--  arch/ia64/kernel/sigframe.h                25
-rw-r--r--  arch/ia64/kernel/signal.c                 691
-rw-r--r--  arch/ia64/kernel/smp.c                    376
-rw-r--r--  arch/ia64/kernel/smpboot.c                692
-rw-r--r--  arch/ia64/kernel/sys_ia64.c               298
-rw-r--r--  arch/ia64/kernel/time.c                   255
-rw-r--r--  arch/ia64/kernel/topology.c                92
-rw-r--r--  arch/ia64/kernel/traps.c                  609
-rw-r--r--  arch/ia64/kernel/unaligned.c             1521
-rw-r--r--  arch/ia64/kernel/unwind.c                2306
-rw-r--r--  arch/ia64/kernel/unwind_decoder.c         459
-rw-r--r--  arch/ia64/kernel/unwind_i.h               164
-rw-r--r--  arch/ia64/kernel/vmlinux.lds.S            251
58 files changed, 35362 insertions, 0 deletions
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
new file mode 100644
index 000000000000..c1a02bbc252c
--- /dev/null
+++ b/arch/ia64/kernel/Makefile
@@ -0,0 +1,52 @@
1#
2# Makefile for the linux kernel.
3#
4
5extra-y := head.o init_task.o vmlinux.lds
6
7obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \
8 irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \
9 salinfo.o semaphore.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \
10 unwind.o mca.o mca_asm.o topology.o
11
12obj-$(CONFIG_IA64_BRL_EMU) += brl_emu.o
13obj-$(CONFIG_IA64_GENERIC) += acpi-ext.o
14obj-$(CONFIG_IA64_HP_ZX1) += acpi-ext.o
15obj-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += acpi-ext.o
16obj-$(CONFIG_IA64_PALINFO) += palinfo.o
17obj-$(CONFIG_IOSAPIC) += iosapic.o
18obj-$(CONFIG_MODULES) += module.o
19obj-$(CONFIG_SMP) += smp.o smpboot.o domain.o
20obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o
21obj-$(CONFIG_IA64_CYCLONE) += cyclone.o
22obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o
23mca_recovery-y += mca_drv.o mca_drv_asm.o
24
25# The gate DSO image is built using a special linker script.
26targets += gate.so gate-syms.o
27
28extra-y += gate.so gate-syms.o gate.lds gate.o
29
30# fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
31CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31
32
33CPPFLAGS_gate.lds := -P -C -U$(ARCH)
34
35quiet_cmd_gate = GATE $@
36 cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@
37
38GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1
39$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE
40 $(call if_changed,gate)
41
42$(obj)/built-in.o: $(obj)/gate-syms.o
43$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o
44
45GATECFLAGS_gate-syms.o = -r
46$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE
47 $(call if_changed,gate)
48
49# gate-data.o contains the gate DSO image as data in section .data.gate.
50# We must build gate.so before we can assemble it.
51# Note: kbuild does not track this dependency due to usage of .incbin
52$(obj)/gate-data.o: $(obj)/gate.so
diff --git a/arch/ia64/kernel/acpi-ext.c b/arch/ia64/kernel/acpi-ext.c
new file mode 100644
index 000000000000..2623df5e2633
--- /dev/null
+++ b/arch/ia64/kernel/acpi-ext.c
@@ -0,0 +1,100 @@
1/*
2 * arch/ia64/kernel/acpi-ext.c
3 *
4 * Copyright (C) 2003 Hewlett-Packard
5 * Copyright (C) Alex Williamson
6 * Copyright (C) Bjorn Helgaas
7 *
8 * Vendor specific extensions to ACPI.
9 */
10
11#include <linux/config.h>
12#include <linux/module.h>
13#include <linux/types.h>
14#include <linux/acpi.h>
15#include <linux/efi.h>
16
17#include <asm/acpi-ext.h>
18
19struct acpi_vendor_descriptor {
20 u8 guid_id;
21 efi_guid_t guid;
22};
23
24struct acpi_vendor_info {
25 struct acpi_vendor_descriptor *descriptor;
26 u8 *data;
27 u32 length;
28};
29
30acpi_status
31acpi_vendor_resource_match(struct acpi_resource *resource, void *context)
32{
33 struct acpi_vendor_info *info = (struct acpi_vendor_info *) context;
34 struct acpi_resource_vendor *vendor;
35 struct acpi_vendor_descriptor *descriptor;
36 u32 length;
37
38 if (resource->id != ACPI_RSTYPE_VENDOR)
39 return AE_OK;
40
41 vendor = (struct acpi_resource_vendor *) &resource->data;
42 descriptor = (struct acpi_vendor_descriptor *) vendor->reserved;
43 if (vendor->length <= sizeof(*info->descriptor) ||
44 descriptor->guid_id != info->descriptor->guid_id ||
45 efi_guidcmp(descriptor->guid, info->descriptor->guid))
46 return AE_OK;
47
48 length = vendor->length - sizeof(struct acpi_vendor_descriptor);
49 info->data = acpi_os_allocate(length);
50 if (!info->data)
51 return AE_NO_MEMORY;
52
53 memcpy(info->data, vendor->reserved + sizeof(struct acpi_vendor_descriptor), length);
54 info->length = length;
55 return AE_CTRL_TERMINATE;
56}
57
58acpi_status
59acpi_find_vendor_resource(acpi_handle obj, struct acpi_vendor_descriptor *id,
60 u8 **data, u32 *length)
61{
62 struct acpi_vendor_info info;
63
64 info.descriptor = id;
65 info.data = NULL;
66
67 acpi_walk_resources(obj, METHOD_NAME__CRS, acpi_vendor_resource_match, &info);
68 if (!info.data)
69 return AE_NOT_FOUND;
70
71 *data = info.data;
72 *length = info.length;
73 return AE_OK;
74}
75
76struct acpi_vendor_descriptor hp_ccsr_descriptor = {
77 .guid_id = 2,
78 .guid = EFI_GUID(0x69e9adf9, 0x924f, 0xab5f, 0xf6, 0x4a, 0x24, 0xd2, 0x01, 0x37, 0x0e, 0xad)
79};
80
81acpi_status
82hp_acpi_csr_space(acpi_handle obj, u64 *csr_base, u64 *csr_length)
83{
84 acpi_status status;
85 u8 *data;
86 u32 length;
87
88 status = acpi_find_vendor_resource(obj, &hp_ccsr_descriptor, &data, &length);
89
90 if (ACPI_FAILURE(status) || length != 16)
91 return AE_NOT_FOUND;
92
93 memcpy(csr_base, data, sizeof(*csr_base));
94 memcpy(csr_length, data + 8, sizeof(*csr_length));
95 acpi_os_free(data);
96
97 return AE_OK;
98}
99
100EXPORT_SYMBOL(hp_acpi_csr_space);
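
A minimal usage sketch, not part of this commit: how a zx1-style device driver's add() hook might consume the exported hp_acpi_csr_space() helper above. The driver hook name and message text are hypothetical; only hp_acpi_csr_space() and its signature come from the file.

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <acpi/acpi_bus.h>
#include <asm/acpi-ext.h>

static int example_add(struct acpi_device *device)	/* hypothetical hook */
{
	u64 csr_base, csr_length;

	/* look up the 16-byte HP CCSR vendor resource for this device */
	if (ACPI_FAILURE(hp_acpi_csr_space(device->handle, &csr_base, &csr_length)))
		return -ENODEV;

	printk(KERN_INFO "example: CSR space at 0x%llx (0x%llx bytes)\n",
	       (unsigned long long) csr_base, (unsigned long long) csr_length);
	return 0;
}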
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
new file mode 100644
index 000000000000..a8e99c56a768
--- /dev/null
+++ b/arch/ia64/kernel/acpi.c
@@ -0,0 +1,841 @@
1/*
2 * acpi.c - Architecture-Specific Low-Level ACPI Support
3 *
4 * Copyright (C) 1999 VA Linux Systems
5 * Copyright (C) 1999,2000 Walt Drummond <drummond@valinux.com>
6 * Copyright (C) 2000, 2002-2003 Hewlett-Packard Co.
7 * David Mosberger-Tang <davidm@hpl.hp.com>
8 * Copyright (C) 2000 Intel Corp.
9 * Copyright (C) 2000,2001 J.I. Lee <jung-ik.lee@intel.com>
10 * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
11 * Copyright (C) 2001 Jenna Hall <jenna.s.hall@intel.com>
12 * Copyright (C) 2001 Takayoshi Kochi <t-kochi@bq.jp.nec.com>
13 * Copyright (C) 2002 Erich Focht <efocht@ess.nec.de>
14 *
15 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
16 *
17 * This program is free software; you can redistribute it and/or modify
18 * it under the terms of the GNU General Public License as published by
19 * the Free Software Foundation; either version 2 of the License, or
20 * (at your option) any later version.
21 *
22 * This program is distributed in the hope that it will be useful,
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 * GNU General Public License for more details.
26 *
27 * You should have received a copy of the GNU General Public License
28 * along with this program; if not, write to the Free Software
29 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30 *
31 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
32 */
33
34#include <linux/config.h>
35#include <linux/module.h>
36#include <linux/init.h>
37#include <linux/kernel.h>
38#include <linux/sched.h>
39#include <linux/smp.h>
40#include <linux/string.h>
41#include <linux/types.h>
42#include <linux/irq.h>
43#include <linux/acpi.h>
44#include <linux/efi.h>
45#include <linux/mmzone.h>
46#include <linux/nodemask.h>
47#include <asm/io.h>
48#include <asm/iosapic.h>
49#include <asm/machvec.h>
50#include <asm/page.h>
51#include <asm/system.h>
52#include <asm/numa.h>
53#include <asm/sal.h>
54#include <asm/cyclone.h>
55
56#define BAD_MADT_ENTRY(entry, end) ( \
57 (!entry) || (unsigned long)entry + sizeof(*entry) > end || \
58 ((acpi_table_entry_header *)entry)->length != sizeof(*entry))
59
60#define PREFIX "ACPI: "
61
62void (*pm_idle) (void);
63EXPORT_SYMBOL(pm_idle);
64void (*pm_power_off) (void);
65EXPORT_SYMBOL(pm_power_off);
66
67unsigned char acpi_kbd_controller_present = 1;
68unsigned char acpi_legacy_devices;
69
70#define MAX_SAPICS 256
71u16 ia64_acpiid_to_sapicid[MAX_SAPICS] =
72 { [0 ... MAX_SAPICS - 1] = -1 };
73EXPORT_SYMBOL(ia64_acpiid_to_sapicid);
74
75const char *
76acpi_get_sysname (void)
77{
78#ifdef CONFIG_IA64_GENERIC
79 unsigned long rsdp_phys;
80 struct acpi20_table_rsdp *rsdp;
81 struct acpi_table_xsdt *xsdt;
82 struct acpi_table_header *hdr;
83
84 rsdp_phys = acpi_find_rsdp();
85 if (!rsdp_phys) {
86 printk(KERN_ERR "ACPI 2.0 RSDP not found, default to \"dig\"\n");
87 return "dig";
88 }
89
90 rsdp = (struct acpi20_table_rsdp *) __va(rsdp_phys);
91 if (strncmp(rsdp->signature, RSDP_SIG, sizeof(RSDP_SIG) - 1)) {
92 printk(KERN_ERR "ACPI 2.0 RSDP signature incorrect, default to \"dig\"\n");
93 return "dig";
94 }
95
96 xsdt = (struct acpi_table_xsdt *) __va(rsdp->xsdt_address);
97 hdr = &xsdt->header;
98 if (strncmp(hdr->signature, XSDT_SIG, sizeof(XSDT_SIG) - 1)) {
99 printk(KERN_ERR "ACPI 2.0 XSDT signature incorrect, default to \"dig\"\n");
100 return "dig";
101 }
102
103 if (!strcmp(hdr->oem_id, "HP")) {
104 return "hpzx1";
105 }
106 else if (!strcmp(hdr->oem_id, "SGI")) {
107 return "sn2";
108 }
109
110 return "dig";
111#else
112# if defined (CONFIG_IA64_HP_SIM)
113 return "hpsim";
114# elif defined (CONFIG_IA64_HP_ZX1)
115 return "hpzx1";
116# elif defined (CONFIG_IA64_HP_ZX1_SWIOTLB)
117 return "hpzx1_swiotlb";
118# elif defined (CONFIG_IA64_SGI_SN2)
119 return "sn2";
120# elif defined (CONFIG_IA64_DIG)
121 return "dig";
122# else
123# error Unknown platform. Fix acpi.c.
124# endif
125#endif
126}
127
128#ifdef CONFIG_ACPI_BOOT
129
130#define ACPI_MAX_PLATFORM_INTERRUPTS 256
131
132/* Array to record platform interrupt vectors for generic interrupt routing. */
133int platform_intr_list[ACPI_MAX_PLATFORM_INTERRUPTS] = {
134 [0 ... ACPI_MAX_PLATFORM_INTERRUPTS - 1] = -1
135};
136
137enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_IOSAPIC;
138
139/*
140 * Interrupt routing API for device drivers. Provides interrupt vector for
141 * a generic platform event. Currently only CPEI is implemented.
142 */
143int
144acpi_request_vector (u32 int_type)
145{
146 int vector = -1;
147
148 if (int_type < ACPI_MAX_PLATFORM_INTERRUPTS) {
149 /* corrected platform error interrupt */
150 vector = platform_intr_list[int_type];
151 } else
152 printk(KERN_ERR "acpi_request_vector(): invalid interrupt type\n");
153 return vector;
154}
155
156char *
157__acpi_map_table (unsigned long phys_addr, unsigned long size)
158{
159 return __va(phys_addr);
160}
161
162/* --------------------------------------------------------------------------
163 Boot-time Table Parsing
164 -------------------------------------------------------------------------- */
165
166static int total_cpus __initdata;
167static int available_cpus __initdata;
168struct acpi_table_madt * acpi_madt __initdata;
169static u8 has_8259;
170
171
172static int __init
173acpi_parse_lapic_addr_ovr (
174 acpi_table_entry_header *header, const unsigned long end)
175{
176 struct acpi_table_lapic_addr_ovr *lapic;
177
178 lapic = (struct acpi_table_lapic_addr_ovr *) header;
179
180 if (BAD_MADT_ENTRY(lapic, end))
181 return -EINVAL;
182
183 if (lapic->address) {
184 iounmap(ipi_base_addr);
185 ipi_base_addr = ioremap(lapic->address, 0);
186 }
187 return 0;
188}
189
190
191static int __init
192acpi_parse_lsapic (acpi_table_entry_header *header, const unsigned long end)
193{
194 struct acpi_table_lsapic *lsapic;
195
196 lsapic = (struct acpi_table_lsapic *) header;
197
198 if (BAD_MADT_ENTRY(lsapic, end))
199 return -EINVAL;
200
201 if (lsapic->flags.enabled) {
202#ifdef CONFIG_SMP
203 smp_boot_data.cpu_phys_id[available_cpus] = (lsapic->id << 8) | lsapic->eid;
204#endif
205 ia64_acpiid_to_sapicid[lsapic->acpi_id] = (lsapic->id << 8) | lsapic->eid;
206 ++available_cpus;
207 }
208
209 total_cpus++;
210 return 0;
211}
212
213
214static int __init
215acpi_parse_lapic_nmi (acpi_table_entry_header *header, const unsigned long end)
216{
217 struct acpi_table_lapic_nmi *lacpi_nmi;
218
219 lacpi_nmi = (struct acpi_table_lapic_nmi*) header;
220
221 if (BAD_MADT_ENTRY(lacpi_nmi, end))
222 return -EINVAL;
223
224 /* TBD: Support lapic_nmi entries */
225 return 0;
226}
227
228
229static int __init
230acpi_parse_iosapic (acpi_table_entry_header *header, const unsigned long end)
231{
232 struct acpi_table_iosapic *iosapic;
233
234 iosapic = (struct acpi_table_iosapic *) header;
235
236 if (BAD_MADT_ENTRY(iosapic, end))
237 return -EINVAL;
238
239 iosapic_init(iosapic->address, iosapic->global_irq_base);
240
241 return 0;
242}
243
244
245static int __init
246acpi_parse_plat_int_src (
247 acpi_table_entry_header *header, const unsigned long end)
248{
249 struct acpi_table_plat_int_src *plintsrc;
250 int vector;
251
252 plintsrc = (struct acpi_table_plat_int_src *) header;
253
254 if (BAD_MADT_ENTRY(plintsrc, end))
255 return -EINVAL;
256
257 /*
258 * Get vector assignment for this interrupt, set attributes,
259 * and program the IOSAPIC routing table.
260 */
261 vector = iosapic_register_platform_intr(plintsrc->type,
262 plintsrc->global_irq,
263 plintsrc->iosapic_vector,
264 plintsrc->eid,
265 plintsrc->id,
266 (plintsrc->flags.polarity == 1) ? IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW,
267 (plintsrc->flags.trigger == 1) ? IOSAPIC_EDGE : IOSAPIC_LEVEL);
268
269 platform_intr_list[plintsrc->type] = vector;
270 return 0;
271}
272
273
274static int __init
275acpi_parse_int_src_ovr (
276 acpi_table_entry_header *header, const unsigned long end)
277{
278 struct acpi_table_int_src_ovr *p;
279
280 p = (struct acpi_table_int_src_ovr *) header;
281
282 if (BAD_MADT_ENTRY(p, end))
283 return -EINVAL;
284
285 iosapic_override_isa_irq(p->bus_irq, p->global_irq,
286 (p->flags.polarity == 1) ? IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW,
287 (p->flags.trigger == 1) ? IOSAPIC_EDGE : IOSAPIC_LEVEL);
288 return 0;
289}
290
291
292static int __init
293acpi_parse_nmi_src (acpi_table_entry_header *header, const unsigned long end)
294{
295 struct acpi_table_nmi_src *nmi_src;
296
297 nmi_src = (struct acpi_table_nmi_src*) header;
298
299 if (BAD_MADT_ENTRY(nmi_src, end))
300 return -EINVAL;
301
302	/* TBD: Support nmi_src entries */
303 return 0;
304}
305
306static void __init
307acpi_madt_oem_check (char *oem_id, char *oem_table_id)
308{
309 if (!strncmp(oem_id, "IBM", 3) &&
310 (!strncmp(oem_table_id, "SERMOW", 6))) {
311
312 /*
313 * Unfortunately ITC_DRIFT is not yet part of the
314 * official SAL spec, so the ITC_DRIFT bit is not
315 * set by the BIOS on this hardware.
316 */
317 sal_platform_features |= IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT;
318
319 cyclone_setup();
320 }
321}
322
323static int __init
324acpi_parse_madt (unsigned long phys_addr, unsigned long size)
325{
326 if (!phys_addr || !size)
327 return -EINVAL;
328
329 acpi_madt = (struct acpi_table_madt *) __va(phys_addr);
330
331 /* remember the value for reference after free_initmem() */
332#ifdef CONFIG_ITANIUM
333 has_8259 = 1; /* Firmware on old Itanium systems is broken */
334#else
335 has_8259 = acpi_madt->flags.pcat_compat;
336#endif
337 iosapic_system_init(has_8259);
338
339 /* Get base address of IPI Message Block */
340
341 if (acpi_madt->lapic_address)
342 ipi_base_addr = ioremap(acpi_madt->lapic_address, 0);
343
344 printk(KERN_INFO PREFIX "Local APIC address %p\n", ipi_base_addr);
345
346 acpi_madt_oem_check(acpi_madt->header.oem_id,
347 acpi_madt->header.oem_table_id);
348
349 return 0;
350}
351
352
353#ifdef CONFIG_ACPI_NUMA
354
355#undef SLIT_DEBUG
356
357#define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32)
358
359static int __initdata srat_num_cpus; /* number of cpus */
360static u32 __devinitdata pxm_flag[PXM_FLAG_LEN];
361#define pxm_bit_set(bit) (set_bit(bit,(void *)pxm_flag))
362#define pxm_bit_test(bit) (test_bit(bit,(void *)pxm_flag))
363/* maps to convert between proximity domain and logical node ID */
364int __devinitdata pxm_to_nid_map[MAX_PXM_DOMAINS];
365int __initdata nid_to_pxm_map[MAX_NUMNODES];
366static struct acpi_table_slit __initdata *slit_table;
367
368/*
369 * ACPI 2.0 SLIT (System Locality Information Table)
370 * http://devresource.hp.com/devresource/Docs/TechPapers/IA64/slit.pdf
371 */
372void __init
373acpi_numa_slit_init (struct acpi_table_slit *slit)
374{
375 u32 len;
376
377 len = sizeof(struct acpi_table_header) + 8
378 + slit->localities * slit->localities;
379 if (slit->header.length != len) {
380 printk(KERN_ERR "ACPI 2.0 SLIT: size mismatch: %d expected, %d actual\n",
381 len, slit->header.length);
382 memset(numa_slit, 10, sizeof(numa_slit));
383 return;
384 }
385 slit_table = slit;
386}
387
388void __init
389acpi_numa_processor_affinity_init (struct acpi_table_processor_affinity *pa)
390{
391 /* record this node in proximity bitmap */
392 pxm_bit_set(pa->proximity_domain);
393
394 node_cpuid[srat_num_cpus].phys_id = (pa->apic_id << 8) | (pa->lsapic_eid);
395 /* nid should be overridden as logical node id later */
396 node_cpuid[srat_num_cpus].nid = pa->proximity_domain;
397 srat_num_cpus++;
398}
399
400void __init
401acpi_numa_memory_affinity_init (struct acpi_table_memory_affinity *ma)
402{
403 unsigned long paddr, size;
404 u8 pxm;
405 struct node_memblk_s *p, *q, *pend;
406
407 pxm = ma->proximity_domain;
408
409 /* fill node memory chunk structure */
410 paddr = ma->base_addr_hi;
411 paddr = (paddr << 32) | ma->base_addr_lo;
412 size = ma->length_hi;
413 size = (size << 32) | ma->length_lo;
414
415 /* Ignore disabled entries */
416 if (!ma->flags.enabled)
417 return;
418
419 /* record this node in proximity bitmap */
420 pxm_bit_set(pxm);
421
422 /* Insertion sort based on base address */
423 pend = &node_memblk[num_node_memblks];
424 for (p = &node_memblk[0]; p < pend; p++) {
425 if (paddr < p->start_paddr)
426 break;
427 }
428 if (p < pend) {
429 for (q = pend - 1; q >= p; q--)
430 *(q + 1) = *q;
431 }
432 p->start_paddr = paddr;
433 p->size = size;
434 p->nid = pxm;
435 num_node_memblks++;
436}
437
438void __init
439acpi_numa_arch_fixup (void)
440{
441 int i, j, node_from, node_to;
442
443 /* If there's no SRAT, fix the phys_id and mark node 0 online */
444 if (srat_num_cpus == 0) {
445 node_set_online(0);
446 node_cpuid[0].phys_id = hard_smp_processor_id();
447 return;
448 }
449
450 /*
451 * MCD - This can probably be dropped now. No need for pxm ID to node ID
452 * mapping with sparse node numbering iff MAX_PXM_DOMAINS <= MAX_NUMNODES.
453 */
454 /* calculate total number of nodes in system from PXM bitmap */
455 memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map));
456 memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map));
457 nodes_clear(node_online_map);
458 for (i = 0; i < MAX_PXM_DOMAINS; i++) {
459 if (pxm_bit_test(i)) {
460 int nid = num_online_nodes();
461 pxm_to_nid_map[i] = nid;
462 nid_to_pxm_map[nid] = i;
463 node_set_online(nid);
464 }
465 }
466
467 /* set logical node id in memory chunk structure */
468 for (i = 0; i < num_node_memblks; i++)
469 node_memblk[i].nid = pxm_to_nid_map[node_memblk[i].nid];
470
471 /* assign memory bank numbers for each chunk on each node */
472 for_each_online_node(i) {
473 int bank;
474
475 bank = 0;
476 for (j = 0; j < num_node_memblks; j++)
477 if (node_memblk[j].nid == i)
478 node_memblk[j].bank = bank++;
479 }
480
481 /* set logical node id in cpu structure */
482 for (i = 0; i < srat_num_cpus; i++)
483 node_cpuid[i].nid = pxm_to_nid_map[node_cpuid[i].nid];
484
485 printk(KERN_INFO "Number of logical nodes in system = %d\n", num_online_nodes());
486 printk(KERN_INFO "Number of memory chunks in system = %d\n", num_node_memblks);
487
488 if (!slit_table) return;
489 memset(numa_slit, -1, sizeof(numa_slit));
490 for (i=0; i<slit_table->localities; i++) {
491 if (!pxm_bit_test(i))
492 continue;
493 node_from = pxm_to_nid_map[i];
494 for (j=0; j<slit_table->localities; j++) {
495 if (!pxm_bit_test(j))
496 continue;
497 node_to = pxm_to_nid_map[j];
498 node_distance(node_from, node_to) =
499 slit_table->entry[i*slit_table->localities + j];
500 }
501 }
502
503#ifdef SLIT_DEBUG
504 printk("ACPI 2.0 SLIT locality table:\n");
505 for_each_online_node(i) {
506 for_each_online_node(j)
507 printk("%03d ", node_distance(i,j));
508 printk("\n");
509 }
510#endif
511}
512#endif /* CONFIG_ACPI_NUMA */
513
514unsigned int
515acpi_register_gsi (u32 gsi, int edge_level, int active_high_low)
516{
517 if (has_8259 && gsi < 16)
518 return isa_irq_to_vector(gsi);
519
520 return iosapic_register_intr(gsi,
521 (active_high_low == ACPI_ACTIVE_HIGH) ? IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW,
522 (edge_level == ACPI_EDGE_SENSITIVE) ? IOSAPIC_EDGE : IOSAPIC_LEVEL);
523}
524EXPORT_SYMBOL(acpi_register_gsi);
525
526#ifdef CONFIG_ACPI_DEALLOCATE_IRQ
527void
528acpi_unregister_gsi (u32 gsi)
529{
530 iosapic_unregister_intr(gsi);
531}
532EXPORT_SYMBOL(acpi_unregister_gsi);
533#endif /* CONFIG_ACPI_DEALLOCATE_IRQ */
534
535static int __init
536acpi_parse_fadt (unsigned long phys_addr, unsigned long size)
537{
538 struct acpi_table_header *fadt_header;
539 struct fadt_descriptor_rev2 *fadt;
540
541 if (!phys_addr || !size)
542 return -EINVAL;
543
544 fadt_header = (struct acpi_table_header *) __va(phys_addr);
545 if (fadt_header->revision != 3)
546 return -ENODEV; /* Only deal with ACPI 2.0 FADT */
547
548 fadt = (struct fadt_descriptor_rev2 *) fadt_header;
549
550 if (!(fadt->iapc_boot_arch & BAF_8042_KEYBOARD_CONTROLLER))
551 acpi_kbd_controller_present = 0;
552
553 if (fadt->iapc_boot_arch & BAF_LEGACY_DEVICES)
554 acpi_legacy_devices = 1;
555
556 acpi_register_gsi(fadt->sci_int, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW);
557 return 0;
558}
559
560
561unsigned long __init
562acpi_find_rsdp (void)
563{
564 unsigned long rsdp_phys = 0;
565
566 if (efi.acpi20)
567 rsdp_phys = __pa(efi.acpi20);
568 else if (efi.acpi)
569 printk(KERN_WARNING PREFIX "v1.0/r0.71 tables no longer supported\n");
570 return rsdp_phys;
571}
572
573
574int __init
575acpi_boot_init (void)
576{
577
578 /*
579 * MADT
580 * ----
581	 * Parse the Multiple APIC Description Table (MADT), if it exists.
582 * Note that this table provides platform SMP configuration
583 * information -- the successor to MPS tables.
584 */
585
586 if (acpi_table_parse(ACPI_APIC, acpi_parse_madt) < 1) {
587 printk(KERN_ERR PREFIX "Can't find MADT\n");
588 goto skip_madt;
589 }
590
591 /* Local APIC */
592
593 if (acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR, acpi_parse_lapic_addr_ovr, 0) < 0)
594 printk(KERN_ERR PREFIX "Error parsing LAPIC address override entry\n");
595
596 if (acpi_table_parse_madt(ACPI_MADT_LSAPIC, acpi_parse_lsapic, NR_CPUS) < 1)
597 printk(KERN_ERR PREFIX "Error parsing MADT - no LAPIC entries\n");
598
599 if (acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0) < 0)
600 printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
601
602 /* I/O APIC */
603
604 if (acpi_table_parse_madt(ACPI_MADT_IOSAPIC, acpi_parse_iosapic, NR_IOSAPICS) < 1)
605 printk(KERN_ERR PREFIX "Error parsing MADT - no IOSAPIC entries\n");
606
607 /* System-Level Interrupt Routing */
608
609 if (acpi_table_parse_madt(ACPI_MADT_PLAT_INT_SRC, acpi_parse_plat_int_src, ACPI_MAX_PLATFORM_INTERRUPTS) < 0)
610 printk(KERN_ERR PREFIX "Error parsing platform interrupt source entry\n");
611
612 if (acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr, 0) < 0)
613 printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n");
614
615 if (acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src, 0) < 0)
616 printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
617 skip_madt:
618
619 /*
620 * FADT says whether a legacy keyboard controller is present.
621 * The FADT also contains an SCI_INT line, by which the system
622 * gets interrupts such as power and sleep buttons. If it's not
623	 * on a Legacy interrupt, it needs to be set up.
624 */
625 if (acpi_table_parse(ACPI_FADT, acpi_parse_fadt) < 1)
626 printk(KERN_ERR PREFIX "Can't find FADT\n");
627
628#ifdef CONFIG_SMP
629 if (available_cpus == 0) {
630 printk(KERN_INFO "ACPI: Found 0 CPUS; assuming 1\n");
631 printk(KERN_INFO "CPU 0 (0x%04x)", hard_smp_processor_id());
632 smp_boot_data.cpu_phys_id[available_cpus] = hard_smp_processor_id();
633 available_cpus = 1; /* We've got at least one of these, no? */
634 }
635 smp_boot_data.cpu_count = available_cpus;
636
637 smp_build_cpu_map();
638# ifdef CONFIG_ACPI_NUMA
639 if (srat_num_cpus == 0) {
640 int cpu, i = 1;
641 for (cpu = 0; cpu < smp_boot_data.cpu_count; cpu++)
642 if (smp_boot_data.cpu_phys_id[cpu] != hard_smp_processor_id())
643 node_cpuid[i++].phys_id = smp_boot_data.cpu_phys_id[cpu];
644 }
645 build_cpu_to_node_map();
646# endif
647#endif
648 /* Make boot-up look pretty */
649 printk(KERN_INFO "%d CPUs available, %d CPUs total\n", available_cpus, total_cpus);
650 return 0;
651}
652
653int
654acpi_gsi_to_irq (u32 gsi, unsigned int *irq)
655{
656 int vector;
657
658 if (has_8259 && gsi < 16)
659 *irq = isa_irq_to_vector(gsi);
660 else {
661 vector = gsi_to_vector(gsi);
662 if (vector == -1)
663 return -1;
664
665 *irq = vector;
666 }
667 return 0;
668}
669
670/*
671 * ACPI based hotplug CPU support
672 */
673#ifdef CONFIG_ACPI_HOTPLUG_CPU
674static
675int
676acpi_map_cpu2node(acpi_handle handle, int cpu, long physid)
677{
678#ifdef CONFIG_ACPI_NUMA
679 int pxm_id;
680
681 pxm_id = acpi_get_pxm(handle);
682
683 /*
684 * Assuming that the container driver would have set the proximity
685 * domain and would have initialized pxm_to_nid_map[pxm_id] && pxm_flag
686 */
687 node_cpuid[cpu].nid = (pxm_id < 0) ? 0:
688 pxm_to_nid_map[pxm_id];
689
690 node_cpuid[cpu].phys_id = physid;
691#endif
692 return(0);
693}
694
695
696int
697acpi_map_lsapic(acpi_handle handle, int *pcpu)
698{
699 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
700 union acpi_object *obj;
701 struct acpi_table_lsapic *lsapic;
702 cpumask_t tmp_map;
703 long physid;
704 int cpu;
705
706 if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
707 return -EINVAL;
708
709 if (!buffer.length || !buffer.pointer)
710 return -EINVAL;
711
712 obj = buffer.pointer;
713 if (obj->type != ACPI_TYPE_BUFFER ||
714 obj->buffer.length < sizeof(*lsapic)) {
715 acpi_os_free(buffer.pointer);
716 return -EINVAL;
717 }
718
719 lsapic = (struct acpi_table_lsapic *)obj->buffer.pointer;
720
721 if ((lsapic->header.type != ACPI_MADT_LSAPIC) ||
722 (!lsapic->flags.enabled)) {
723 acpi_os_free(buffer.pointer);
724 return -EINVAL;
725 }
726
727 physid = ((lsapic->id <<8) | (lsapic->eid));
728
729 acpi_os_free(buffer.pointer);
730 buffer.length = ACPI_ALLOCATE_BUFFER;
731 buffer.pointer = NULL;
732
733 cpus_complement(tmp_map, cpu_present_map);
734 cpu = first_cpu(tmp_map);
735 if(cpu >= NR_CPUS)
736 return -EINVAL;
737
738 acpi_map_cpu2node(handle, cpu, physid);
739
740 cpu_set(cpu, cpu_present_map);
741 ia64_cpu_to_sapicid[cpu] = physid;
742 ia64_acpiid_to_sapicid[lsapic->acpi_id] = ia64_cpu_to_sapicid[cpu];
743
744 *pcpu = cpu;
745 return(0);
746}
747EXPORT_SYMBOL(acpi_map_lsapic);
748
749
750int
751acpi_unmap_lsapic(int cpu)
752{
753 int i;
754
755 for (i=0; i<MAX_SAPICS; i++) {
756 if (ia64_acpiid_to_sapicid[i] == ia64_cpu_to_sapicid[cpu]) {
757 ia64_acpiid_to_sapicid[i] = -1;
758 break;
759 }
760 }
761 ia64_cpu_to_sapicid[cpu] = -1;
762 cpu_clear(cpu,cpu_present_map);
763
764#ifdef CONFIG_ACPI_NUMA
765 /* NUMA specific cleanup's */
766#endif
767
768 return(0);
769}
770EXPORT_SYMBOL(acpi_unmap_lsapic);
771#endif /* CONFIG_ACPI_HOTPLUG_CPU */
772
773
774#ifdef CONFIG_ACPI_NUMA
775acpi_status __init
776acpi_map_iosapic (acpi_handle handle, u32 depth, void *context, void **ret)
777{
778 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
779 union acpi_object *obj;
780 struct acpi_table_iosapic *iosapic;
781 unsigned int gsi_base;
782 int node;
783
784 /* Only care about objects w/ a method that returns the MADT */
785 if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
786 return AE_OK;
787
788 if (!buffer.length || !buffer.pointer)
789 return AE_OK;
790
791 obj = buffer.pointer;
792 if (obj->type != ACPI_TYPE_BUFFER ||
793 obj->buffer.length < sizeof(*iosapic)) {
794 acpi_os_free(buffer.pointer);
795 return AE_OK;
796 }
797
798 iosapic = (struct acpi_table_iosapic *)obj->buffer.pointer;
799
800 if (iosapic->header.type != ACPI_MADT_IOSAPIC) {
801 acpi_os_free(buffer.pointer);
802 return AE_OK;
803 }
804
805 gsi_base = iosapic->global_irq_base;
806
807 acpi_os_free(buffer.pointer);
808 buffer.length = ACPI_ALLOCATE_BUFFER;
809 buffer.pointer = NULL;
810
811 /*
812 * OK, it's an IOSAPIC MADT entry, look for a _PXM method to tell
813 * us which node to associate this with.
814 */
815 if (ACPI_FAILURE(acpi_evaluate_object(handle, "_PXM", NULL, &buffer)))
816 return AE_OK;
817
818 if (!buffer.length || !buffer.pointer)
819 return AE_OK;
820
821 obj = buffer.pointer;
822
823 if (obj->type != ACPI_TYPE_INTEGER ||
824 obj->integer.value >= MAX_PXM_DOMAINS) {
825 acpi_os_free(buffer.pointer);
826 return AE_OK;
827 }
828
829 node = pxm_to_nid_map[obj->integer.value];
830 acpi_os_free(buffer.pointer);
831
832 if (node >= MAX_NUMNODES || !node_online(node) ||
833 cpus_empty(node_to_cpumask(node)))
834 return AE_OK;
835
836 /* We know a gsi to node mapping! */
837 map_iosapic_to_node(gsi_base, node);
838 return AE_OK;
839}
840#endif /* CONFIG_ACPI_NUMA */
841#endif /* CONFIG_ACPI_BOOT */
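
A short hedged sketch of a caller of the platform-interrupt API above: acpi_parse_plat_int_src() records vectors in platform_intr_list[], and acpi_request_vector() hands them back, e.g. for the corrected platform error interrupt (CPEI). The helper name is illustrative, and ACPI_INTERRUPT_CPEI is assumed to be the platform-interrupt type constant used by callers such as the MCA code.

#include <linux/kernel.h>
#include <linux/acpi.h>

static int example_get_cpei_vector(void)	/* hypothetical caller */
{
	/* returns -1 if no matching platform interrupt source entry was parsed */
	int vector = acpi_request_vector(ACPI_INTERRUPT_CPEI);

	if (vector < 0)
		printk(KERN_WARNING "example: no CPEI vector recorded from the MADT\n");
	return vector;
}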
diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c
new file mode 100644
index 000000000000..7d1ae2982c53
--- /dev/null
+++ b/arch/ia64/kernel/asm-offsets.c
@@ -0,0 +1,239 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed
4 * to extract and format the required data.
5 */
6
7#include <linux/config.h>
8
9#include <linux/sched.h>
10
11#include <asm-ia64/processor.h>
12#include <asm-ia64/ptrace.h>
13#include <asm-ia64/siginfo.h>
14#include <asm-ia64/sigcontext.h>
15#include <asm-ia64/mca.h>
16
17#include "../kernel/sigframe.h"
18
19#define DEFINE(sym, val) \
20 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
21
22#define BLANK() asm volatile("\n->" : : )
23
24void foo(void)
25{
26 DEFINE(IA64_TASK_SIZE, sizeof (struct task_struct));
27 DEFINE(IA64_THREAD_INFO_SIZE, sizeof (struct thread_info));
28 DEFINE(IA64_PT_REGS_SIZE, sizeof (struct pt_regs));
29 DEFINE(IA64_SWITCH_STACK_SIZE, sizeof (struct switch_stack));
30 DEFINE(IA64_SIGINFO_SIZE, sizeof (struct siginfo));
31 DEFINE(IA64_CPU_SIZE, sizeof (struct cpuinfo_ia64));
32 DEFINE(SIGFRAME_SIZE, sizeof (struct sigframe));
33 DEFINE(UNW_FRAME_INFO_SIZE, sizeof (struct unw_frame_info));
34
35 BLANK();
36
37 DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
38 DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count));
39
40 BLANK();
41
42 DEFINE(IA64_TASK_BLOCKED_OFFSET,offsetof (struct task_struct, blocked));
43 DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid));
44 DEFINE(IA64_TASK_GROUP_LEADER_OFFSET, offsetof (struct task_struct, group_leader));
45 DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending));
46 DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, pid));
47 DEFINE(IA64_TASK_REAL_PARENT_OFFSET, offsetof (struct task_struct, real_parent));
48 DEFINE(IA64_TASK_SIGHAND_OFFSET,offsetof (struct task_struct, sighand));
49 DEFINE(IA64_TASK_SIGNAL_OFFSET,offsetof (struct task_struct, signal));
50 DEFINE(IA64_TASK_TGID_OFFSET, offsetof (struct task_struct, tgid));
51 DEFINE(IA64_TASK_THREAD_KSP_OFFSET, offsetof (struct task_struct, thread.ksp));
52 DEFINE(IA64_TASK_THREAD_ON_USTACK_OFFSET, offsetof (struct task_struct, thread.on_ustack));
53
54 BLANK();
55
56 DEFINE(IA64_SIGHAND_SIGLOCK_OFFSET,offsetof (struct sighand_struct, siglock));
57
58 BLANK();
59
60 DEFINE(IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,offsetof (struct signal_struct,
61 group_stop_count));
62 DEFINE(IA64_SIGNAL_SHARED_PENDING_OFFSET,offsetof (struct signal_struct, shared_pending));
63
64 BLANK();
65
66 DEFINE(IA64_PT_REGS_B6_OFFSET, offsetof (struct pt_regs, b6));
67 DEFINE(IA64_PT_REGS_B7_OFFSET, offsetof (struct pt_regs, b7));
68 DEFINE(IA64_PT_REGS_AR_CSD_OFFSET, offsetof (struct pt_regs, ar_csd));
69 DEFINE(IA64_PT_REGS_AR_SSD_OFFSET, offsetof (struct pt_regs, ar_ssd));
70 DEFINE(IA64_PT_REGS_R8_OFFSET, offsetof (struct pt_regs, r8));
71 DEFINE(IA64_PT_REGS_R9_OFFSET, offsetof (struct pt_regs, r9));
72 DEFINE(IA64_PT_REGS_R10_OFFSET, offsetof (struct pt_regs, r10));
73 DEFINE(IA64_PT_REGS_R11_OFFSET, offsetof (struct pt_regs, r11));
74 DEFINE(IA64_PT_REGS_CR_IPSR_OFFSET, offsetof (struct pt_regs, cr_ipsr));
75 DEFINE(IA64_PT_REGS_CR_IIP_OFFSET, offsetof (struct pt_regs, cr_iip));
76 DEFINE(IA64_PT_REGS_CR_IFS_OFFSET, offsetof (struct pt_regs, cr_ifs));
77 DEFINE(IA64_PT_REGS_AR_UNAT_OFFSET, offsetof (struct pt_regs, ar_unat));
78 DEFINE(IA64_PT_REGS_AR_PFS_OFFSET, offsetof (struct pt_regs, ar_pfs));
79 DEFINE(IA64_PT_REGS_AR_RSC_OFFSET, offsetof (struct pt_regs, ar_rsc));
80 DEFINE(IA64_PT_REGS_AR_RNAT_OFFSET, offsetof (struct pt_regs, ar_rnat));
81
82 DEFINE(IA64_PT_REGS_AR_BSPSTORE_OFFSET, offsetof (struct pt_regs, ar_bspstore));
83 DEFINE(IA64_PT_REGS_PR_OFFSET, offsetof (struct pt_regs, pr));
84 DEFINE(IA64_PT_REGS_B0_OFFSET, offsetof (struct pt_regs, b0));
85 DEFINE(IA64_PT_REGS_LOADRS_OFFSET, offsetof (struct pt_regs, loadrs));
86 DEFINE(IA64_PT_REGS_R1_OFFSET, offsetof (struct pt_regs, r1));
87 DEFINE(IA64_PT_REGS_R12_OFFSET, offsetof (struct pt_regs, r12));
88 DEFINE(IA64_PT_REGS_R13_OFFSET, offsetof (struct pt_regs, r13));
89 DEFINE(IA64_PT_REGS_AR_FPSR_OFFSET, offsetof (struct pt_regs, ar_fpsr));
90 DEFINE(IA64_PT_REGS_R15_OFFSET, offsetof (struct pt_regs, r15));
91 DEFINE(IA64_PT_REGS_R14_OFFSET, offsetof (struct pt_regs, r14));
92 DEFINE(IA64_PT_REGS_R2_OFFSET, offsetof (struct pt_regs, r2));
93 DEFINE(IA64_PT_REGS_R3_OFFSET, offsetof (struct pt_regs, r3));
94 DEFINE(IA64_PT_REGS_R16_OFFSET, offsetof (struct pt_regs, r16));
95 DEFINE(IA64_PT_REGS_R17_OFFSET, offsetof (struct pt_regs, r17));
96 DEFINE(IA64_PT_REGS_R18_OFFSET, offsetof (struct pt_regs, r18));
97 DEFINE(IA64_PT_REGS_R19_OFFSET, offsetof (struct pt_regs, r19));
98 DEFINE(IA64_PT_REGS_R20_OFFSET, offsetof (struct pt_regs, r20));
99 DEFINE(IA64_PT_REGS_R21_OFFSET, offsetof (struct pt_regs, r21));
100 DEFINE(IA64_PT_REGS_R22_OFFSET, offsetof (struct pt_regs, r22));
101 DEFINE(IA64_PT_REGS_R23_OFFSET, offsetof (struct pt_regs, r23));
102 DEFINE(IA64_PT_REGS_R24_OFFSET, offsetof (struct pt_regs, r24));
103 DEFINE(IA64_PT_REGS_R25_OFFSET, offsetof (struct pt_regs, r25));
104 DEFINE(IA64_PT_REGS_R26_OFFSET, offsetof (struct pt_regs, r26));
105 DEFINE(IA64_PT_REGS_R27_OFFSET, offsetof (struct pt_regs, r27));
106 DEFINE(IA64_PT_REGS_R28_OFFSET, offsetof (struct pt_regs, r28));
107 DEFINE(IA64_PT_REGS_R29_OFFSET, offsetof (struct pt_regs, r29));
108 DEFINE(IA64_PT_REGS_R30_OFFSET, offsetof (struct pt_regs, r30));
109 DEFINE(IA64_PT_REGS_R31_OFFSET, offsetof (struct pt_regs, r31));
110 DEFINE(IA64_PT_REGS_AR_CCV_OFFSET, offsetof (struct pt_regs, ar_ccv));
111 DEFINE(IA64_PT_REGS_F6_OFFSET, offsetof (struct pt_regs, f6));
112 DEFINE(IA64_PT_REGS_F7_OFFSET, offsetof (struct pt_regs, f7));
113 DEFINE(IA64_PT_REGS_F8_OFFSET, offsetof (struct pt_regs, f8));
114 DEFINE(IA64_PT_REGS_F9_OFFSET, offsetof (struct pt_regs, f9));
115 DEFINE(IA64_PT_REGS_F10_OFFSET, offsetof (struct pt_regs, f10));
116 DEFINE(IA64_PT_REGS_F11_OFFSET, offsetof (struct pt_regs, f11));
117
118 BLANK();
119
120 DEFINE(IA64_SWITCH_STACK_CALLER_UNAT_OFFSET, offsetof (struct switch_stack, caller_unat));
121 DEFINE(IA64_SWITCH_STACK_AR_FPSR_OFFSET, offsetof (struct switch_stack, ar_fpsr));
122 DEFINE(IA64_SWITCH_STACK_F2_OFFSET, offsetof (struct switch_stack, f2));
123 DEFINE(IA64_SWITCH_STACK_F3_OFFSET, offsetof (struct switch_stack, f3));
124 DEFINE(IA64_SWITCH_STACK_F4_OFFSET, offsetof (struct switch_stack, f4));
125 DEFINE(IA64_SWITCH_STACK_F5_OFFSET, offsetof (struct switch_stack, f5));
126 DEFINE(IA64_SWITCH_STACK_F12_OFFSET, offsetof (struct switch_stack, f12));
127 DEFINE(IA64_SWITCH_STACK_F13_OFFSET, offsetof (struct switch_stack, f13));
128 DEFINE(IA64_SWITCH_STACK_F14_OFFSET, offsetof (struct switch_stack, f14));
129 DEFINE(IA64_SWITCH_STACK_F15_OFFSET, offsetof (struct switch_stack, f15));
130 DEFINE(IA64_SWITCH_STACK_F16_OFFSET, offsetof (struct switch_stack, f16));
131 DEFINE(IA64_SWITCH_STACK_F17_OFFSET, offsetof (struct switch_stack, f17));
132 DEFINE(IA64_SWITCH_STACK_F18_OFFSET, offsetof (struct switch_stack, f18));
133 DEFINE(IA64_SWITCH_STACK_F19_OFFSET, offsetof (struct switch_stack, f19));
134 DEFINE(IA64_SWITCH_STACK_F20_OFFSET, offsetof (struct switch_stack, f20));
135 DEFINE(IA64_SWITCH_STACK_F21_OFFSET, offsetof (struct switch_stack, f21));
136 DEFINE(IA64_SWITCH_STACK_F22_OFFSET, offsetof (struct switch_stack, f22));
137 DEFINE(IA64_SWITCH_STACK_F23_OFFSET, offsetof (struct switch_stack, f23));
138 DEFINE(IA64_SWITCH_STACK_F24_OFFSET, offsetof (struct switch_stack, f24));
139 DEFINE(IA64_SWITCH_STACK_F25_OFFSET, offsetof (struct switch_stack, f25));
140 DEFINE(IA64_SWITCH_STACK_F26_OFFSET, offsetof (struct switch_stack, f26));
141 DEFINE(IA64_SWITCH_STACK_F27_OFFSET, offsetof (struct switch_stack, f27));
142 DEFINE(IA64_SWITCH_STACK_F28_OFFSET, offsetof (struct switch_stack, f28));
143 DEFINE(IA64_SWITCH_STACK_F29_OFFSET, offsetof (struct switch_stack, f29));
144 DEFINE(IA64_SWITCH_STACK_F30_OFFSET, offsetof (struct switch_stack, f30));
145 DEFINE(IA64_SWITCH_STACK_F31_OFFSET, offsetof (struct switch_stack, f31));
146 DEFINE(IA64_SWITCH_STACK_R4_OFFSET, offsetof (struct switch_stack, r4));
147 DEFINE(IA64_SWITCH_STACK_R5_OFFSET, offsetof (struct switch_stack, r5));
148 DEFINE(IA64_SWITCH_STACK_R6_OFFSET, offsetof (struct switch_stack, r6));
149 DEFINE(IA64_SWITCH_STACK_R7_OFFSET, offsetof (struct switch_stack, r7));
150 DEFINE(IA64_SWITCH_STACK_B0_OFFSET, offsetof (struct switch_stack, b0));
151 DEFINE(IA64_SWITCH_STACK_B1_OFFSET, offsetof (struct switch_stack, b1));
152 DEFINE(IA64_SWITCH_STACK_B2_OFFSET, offsetof (struct switch_stack, b2));
153 DEFINE(IA64_SWITCH_STACK_B3_OFFSET, offsetof (struct switch_stack, b3));
154 DEFINE(IA64_SWITCH_STACK_B4_OFFSET, offsetof (struct switch_stack, b4));
155 DEFINE(IA64_SWITCH_STACK_B5_OFFSET, offsetof (struct switch_stack, b5));
156 DEFINE(IA64_SWITCH_STACK_AR_PFS_OFFSET, offsetof (struct switch_stack, ar_pfs));
157 DEFINE(IA64_SWITCH_STACK_AR_LC_OFFSET, offsetof (struct switch_stack, ar_lc));
158 DEFINE(IA64_SWITCH_STACK_AR_UNAT_OFFSET, offsetof (struct switch_stack, ar_unat));
159 DEFINE(IA64_SWITCH_STACK_AR_RNAT_OFFSET, offsetof (struct switch_stack, ar_rnat));
160 DEFINE(IA64_SWITCH_STACK_AR_BSPSTORE_OFFSET, offsetof (struct switch_stack, ar_bspstore));
161 DEFINE(IA64_SWITCH_STACK_PR_OFFSET, offsetof (struct switch_stack, pr));
162
163 BLANK();
164
165 DEFINE(IA64_SIGCONTEXT_IP_OFFSET, offsetof (struct sigcontext, sc_ip));
166 DEFINE(IA64_SIGCONTEXT_AR_BSP_OFFSET, offsetof (struct sigcontext, sc_ar_bsp));
167 DEFINE(IA64_SIGCONTEXT_AR_FPSR_OFFSET, offsetof (struct sigcontext, sc_ar_fpsr));
168 DEFINE(IA64_SIGCONTEXT_AR_RNAT_OFFSET, offsetof (struct sigcontext, sc_ar_rnat));
169 DEFINE(IA64_SIGCONTEXT_AR_UNAT_OFFSET, offsetof (struct sigcontext, sc_ar_unat));
170 DEFINE(IA64_SIGCONTEXT_B0_OFFSET, offsetof (struct sigcontext, sc_br[0]));
171 DEFINE(IA64_SIGCONTEXT_CFM_OFFSET, offsetof (struct sigcontext, sc_cfm));
172 DEFINE(IA64_SIGCONTEXT_FLAGS_OFFSET, offsetof (struct sigcontext, sc_flags));
173 DEFINE(IA64_SIGCONTEXT_FR6_OFFSET, offsetof (struct sigcontext, sc_fr[6]));
174 DEFINE(IA64_SIGCONTEXT_PR_OFFSET, offsetof (struct sigcontext, sc_pr));
175 DEFINE(IA64_SIGCONTEXT_R12_OFFSET, offsetof (struct sigcontext, sc_gr[12]));
176 DEFINE(IA64_SIGCONTEXT_RBS_BASE_OFFSET,offsetof (struct sigcontext, sc_rbs_base));
177 DEFINE(IA64_SIGCONTEXT_LOADRS_OFFSET, offsetof (struct sigcontext, sc_loadrs));
178
179 BLANK();
180
181 DEFINE(IA64_SIGPENDING_SIGNAL_OFFSET, offsetof (struct sigpending, signal));
182
183 BLANK();
184
185 DEFINE(IA64_SIGFRAME_ARG0_OFFSET, offsetof (struct sigframe, arg0));
186 DEFINE(IA64_SIGFRAME_ARG1_OFFSET, offsetof (struct sigframe, arg1));
187 DEFINE(IA64_SIGFRAME_ARG2_OFFSET, offsetof (struct sigframe, arg2));
188 DEFINE(IA64_SIGFRAME_HANDLER_OFFSET, offsetof (struct sigframe, handler));
189 DEFINE(IA64_SIGFRAME_SIGCONTEXT_OFFSET, offsetof (struct sigframe, sc));
190 BLANK();
191 /* for assembly files which can't include sched.h: */
192 DEFINE(IA64_CLONE_VFORK, CLONE_VFORK);
193 DEFINE(IA64_CLONE_VM, CLONE_VM);
194
195 BLANK();
196 DEFINE(IA64_CPUINFO_NSEC_PER_CYC_OFFSET,
197 offsetof (struct cpuinfo_ia64, nsec_per_cyc));
198 DEFINE(IA64_CPUINFO_PTCE_BASE_OFFSET,
199 offsetof (struct cpuinfo_ia64, ptce_base));
200 DEFINE(IA64_CPUINFO_PTCE_COUNT_OFFSET,
201 offsetof (struct cpuinfo_ia64, ptce_count));
202 DEFINE(IA64_CPUINFO_PTCE_STRIDE_OFFSET,
203 offsetof (struct cpuinfo_ia64, ptce_stride));
204 BLANK();
205 DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET,
206 offsetof (struct timespec, tv_nsec));
207
208 DEFINE(CLONE_SETTLS_BIT, 19);
209#if CLONE_SETTLS != (1<<19)
210# error "CLONE_SETTLS_BIT incorrect, please fix"
211#endif
212
213 BLANK();
214 DEFINE(IA64_MCA_CPU_PROC_STATE_DUMP_OFFSET,
215 offsetof (struct ia64_mca_cpu, proc_state_dump));
216 DEFINE(IA64_MCA_CPU_STACK_OFFSET,
217 offsetof (struct ia64_mca_cpu, stack));
218 DEFINE(IA64_MCA_CPU_STACKFRAME_OFFSET,
219 offsetof (struct ia64_mca_cpu, stackframe));
220 DEFINE(IA64_MCA_CPU_RBSTORE_OFFSET,
221 offsetof (struct ia64_mca_cpu, rbstore));
222 DEFINE(IA64_MCA_CPU_INIT_STACK_OFFSET,
223 offsetof (struct ia64_mca_cpu, init_stack));
224 BLANK();
225 /* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */
226 DEFINE(IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET, offsetof (struct time_interpolator, addr));
227 DEFINE(IA64_TIME_INTERPOLATOR_SOURCE_OFFSET, offsetof (struct time_interpolator, source));
228 DEFINE(IA64_TIME_INTERPOLATOR_SHIFT_OFFSET, offsetof (struct time_interpolator, shift));
229 DEFINE(IA64_TIME_INTERPOLATOR_NSEC_OFFSET, offsetof (struct time_interpolator, nsec_per_cyc));
230 DEFINE(IA64_TIME_INTERPOLATOR_OFFSET_OFFSET, offsetof (struct time_interpolator, offset));
231 DEFINE(IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET, offsetof (struct time_interpolator, last_cycle));
232 DEFINE(IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET, offsetof (struct time_interpolator, last_counter));
233 DEFINE(IA64_TIME_INTERPOLATOR_JITTER_OFFSET, offsetof (struct time_interpolator, jitter));
234 DEFINE(IA64_TIME_INTERPOLATOR_MASK_OFFSET, offsetof (struct time_interpolator, mask));
235 DEFINE(IA64_TIME_SOURCE_CPU, TIME_SOURCE_CPU);
236 DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64);
237 DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32);
238 DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec));
239}
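
The "->" markers that DEFINE() emits above are scraped out of the compiler's assembly output by the build and turned into a header of #defines for the .S files. A minimal standalone illustration of that trick, with a hypothetical struct and symbol names; compile with "gcc -S" and grep the output for "->":

#include <stddef.h>

#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

struct example {		/* hypothetical layout */
	long counter;
	char name[16];
};

void foo(void)
{
	/* each line below becomes a "->SYM value expr" marker in the .s output */
	DEFINE(EXAMPLE_SIZE, sizeof (struct example));
	DEFINE(EXAMPLE_NAME_OFFSET, offsetof(struct example, name));
}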
diff --git a/arch/ia64/kernel/brl_emu.c b/arch/ia64/kernel/brl_emu.c
new file mode 100644
index 000000000000..0b286ca164f9
--- /dev/null
+++ b/arch/ia64/kernel/brl_emu.c
@@ -0,0 +1,234 @@
1/*
2 * Emulation of the "brl" instruction for IA64 processors that
3 * don't support it in hardware.
4 * Author: Stephan Zeisset, Intel Corp. <Stephan.Zeisset@intel.com>
5 *
6 * 02/22/02 D. Mosberger Clear si_flgs, si_isr, and si_imm to avoid
7 * leaking kernel bits.
8 */
9
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <asm/uaccess.h>
13#include <asm/processor.h>
14
15extern char ia64_set_b1, ia64_set_b2, ia64_set_b3, ia64_set_b4, ia64_set_b5;
16
17struct illegal_op_return {
18 unsigned long fkt, arg1, arg2, arg3;
19};
20
21/*
22 * The unimplemented bits of a virtual address must be set
23 * to the value of the most significant implemented bit.
24 * unimpl_va_mask includes all unimplemented bits and
25 * the most significant implemented bit, so the result
26 * of an and operation with the mask must be all 0's
27 * or all 1's for the address to be valid.
28 */
29#define unimplemented_virtual_address(va) ( \
30 ((va) & local_cpu_data->unimpl_va_mask) != 0 && \
31 ((va) & local_cpu_data->unimpl_va_mask) != local_cpu_data->unimpl_va_mask \
32)
33
34/*
35 * The unimplemented bits of a physical address must be 0.
36 * unimpl_pa_mask includes all unimplemented bits, so the result
37 * of an and operation with the mask must be all 0's for the
38 * address to be valid.
39 */
40#define unimplemented_physical_address(pa) ( \
41 ((pa) & local_cpu_data->unimpl_pa_mask) != 0 \
42)
43
44/*
45 * Handle an illegal operation fault that was caused by an
46 * unimplemented "brl" instruction.
47 * If we are not successful (e.g because the illegal operation
48 * wasn't caused by a "brl" after all), we return -1.
49 * If we are successful, we return either 0 or the address
50 * of a "fixup" function for manipulating preserved register
51 * state.
52 */
53
54struct illegal_op_return
55ia64_emulate_brl (struct pt_regs *regs, unsigned long ar_ec)
56{
57 unsigned long bundle[2];
58 unsigned long opcode, btype, qp, offset, cpl;
59 unsigned long next_ip;
60 struct siginfo siginfo;
61 struct illegal_op_return rv;
62 long tmp_taken, unimplemented_address;
63
64 rv.fkt = (unsigned long) -1;
65
66 /*
67 * Decode the instruction bundle.
68 */
69
70 if (copy_from_user(bundle, (void *) (regs->cr_iip), sizeof(bundle)))
71 return rv;
72
73 next_ip = (unsigned long) regs->cr_iip + 16;
74
75 /* "brl" must be in slot 2. */
76 if (ia64_psr(regs)->ri != 1) return rv;
77
78 /* Must be "mlx" template */
79 if ((bundle[0] & 0x1e) != 0x4) return rv;
80
81 opcode = (bundle[1] >> 60);
82 btype = ((bundle[1] >> 29) & 0x7);
83 qp = ((bundle[1] >> 23) & 0x3f);
84 offset = ((bundle[1] & 0x0800000000000000L) << 4)
85 | ((bundle[1] & 0x00fffff000000000L) >> 32)
86 | ((bundle[1] & 0x00000000007fffffL) << 40)
87 | ((bundle[0] & 0xffff000000000000L) >> 24);
88
89 tmp_taken = regs->pr & (1L << qp);
90
91 switch(opcode) {
92
93 case 0xC:
94 /*
95 * Long Branch.
96 */
97 if (btype != 0) return rv;
98 rv.fkt = 0;
99 if (!(tmp_taken)) {
100 /*
101 * Qualifying predicate is 0.
102 * Skip instruction.
103 */
104 regs->cr_iip = next_ip;
105 ia64_psr(regs)->ri = 0;
106 return rv;
107 }
108 break;
109
110 case 0xD:
111 /*
112 * Long Call.
113 */
114 rv.fkt = 0;
115 if (!(tmp_taken)) {
116 /*
117 * Qualifying predicate is 0.
118 * Skip instruction.
119 */
120 regs->cr_iip = next_ip;
121 ia64_psr(regs)->ri = 0;
122 return rv;
123 }
124
125 /*
126 * BR[btype] = IP+16
127 */
128 switch(btype) {
129 case 0:
130 regs->b0 = next_ip;
131 break;
132 case 1:
133 rv.fkt = (unsigned long) &ia64_set_b1;
134 break;
135 case 2:
136 rv.fkt = (unsigned long) &ia64_set_b2;
137 break;
138 case 3:
139 rv.fkt = (unsigned long) &ia64_set_b3;
140 break;
141 case 4:
142 rv.fkt = (unsigned long) &ia64_set_b4;
143 break;
144 case 5:
145 rv.fkt = (unsigned long) &ia64_set_b5;
146 break;
147 case 6:
148 regs->b6 = next_ip;
149 break;
150 case 7:
151 regs->b7 = next_ip;
152 break;
153 }
154 rv.arg1 = next_ip;
155
156 /*
157 * AR[PFS].pfm = CFM
158 * AR[PFS].pec = AR[EC]
159 * AR[PFS].ppl = PSR.cpl
160 */
161 cpl = ia64_psr(regs)->cpl;
162 regs->ar_pfs = ((regs->cr_ifs & 0x3fffffffff)
163 | (ar_ec << 52) | (cpl << 62));
164
165 /*
166 * CFM.sof -= CFM.sol
167 * CFM.sol = 0
168 * CFM.sor = 0
169 * CFM.rrb.gr = 0
170 * CFM.rrb.fr = 0
171 * CFM.rrb.pr = 0
172 */
173 regs->cr_ifs = ((regs->cr_ifs & 0xffffffc00000007f)
174 - ((regs->cr_ifs >> 7) & 0x7f));
175
176 break;
177
178 default:
179 /*
180 * Unknown opcode.
181 */
182 return rv;
183
184 }
185
186 regs->cr_iip += offset;
187 ia64_psr(regs)->ri = 0;
188
189 if (ia64_psr(regs)->it == 0)
190 unimplemented_address = unimplemented_physical_address(regs->cr_iip);
191 else
192 unimplemented_address = unimplemented_virtual_address(regs->cr_iip);
193
194 if (unimplemented_address) {
195 /*
196 * The target address contains unimplemented bits.
197 */
198 printk(KERN_DEBUG "Woah! Unimplemented Instruction Address Trap!\n");
199 siginfo.si_signo = SIGILL;
200 siginfo.si_errno = 0;
201 siginfo.si_flags = 0;
202 siginfo.si_isr = 0;
203 siginfo.si_imm = 0;
204 siginfo.si_code = ILL_BADIADDR;
205 force_sig_info(SIGILL, &siginfo, current);
206 } else if (ia64_psr(regs)->tb) {
207 /*
208 * Branch Tracing is enabled.
209 * Force a taken branch signal.
210 */
211 siginfo.si_signo = SIGTRAP;
212 siginfo.si_errno = 0;
213 siginfo.si_code = TRAP_BRANCH;
214 siginfo.si_flags = 0;
215 siginfo.si_isr = 0;
216 siginfo.si_addr = 0;
217 siginfo.si_imm = 0;
218 force_sig_info(SIGTRAP, &siginfo, current);
219 } else if (ia64_psr(regs)->ss) {
220 /*
221 * Single Step is enabled.
222 * Force a trace signal.
223 */
224 siginfo.si_signo = SIGTRAP;
225 siginfo.si_errno = 0;
226 siginfo.si_code = TRAP_TRACE;
227 siginfo.si_flags = 0;
228 siginfo.si_isr = 0;
229 siginfo.si_addr = 0;
230 siginfo.si_imm = 0;
231 force_sig_info(SIGTRAP, &siginfo, current);
232 }
233 return rv;
234}
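
The 60-bit branch displacement of a brl is scattered across both halves of the MLX bundle. The standalone sketch below (not kernel code; the bundle words are purely illustrative) reproduces the same bit reassembly as ia64_emulate_brl() so the extraction can be checked in isolation.

#include <stdio.h>

/* reassemble the brl displacement exactly as ia64_emulate_brl() does;
 * the low 4 bits are always zero, so the target stays bundle-aligned */
static unsigned long long brl_offset(unsigned long long bundle0,
				     unsigned long long bundle1)
{
	return ((bundle1 & 0x0800000000000000ULL) << 4)
	     | ((bundle1 & 0x00fffff000000000ULL) >> 32)
	     | ((bundle1 & 0x00000000007fffffULL) << 40)
	     | ((bundle0 & 0xffff000000000000ULL) >> 24);
}

int main(void)
{
	/* hypothetical bundle words, chosen only to exercise the masks */
	unsigned long long b0 = 0x0123000000000004ULL;
	unsigned long long b1 = 0xc000001230000000ULL;

	printf("cr_iip += 0x%llx\n", brl_offset(b0, b1));
	return 0;
}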
diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c
new file mode 100644
index 000000000000..768c7e46957c
--- /dev/null
+++ b/arch/ia64/kernel/cyclone.c
@@ -0,0 +1,109 @@
1#include <linux/module.h>
2#include <linux/smp.h>
3#include <linux/time.h>
4#include <linux/errno.h>
5#include <asm/io.h>
6
7/* IBM Summit (EXA) Cyclone counter code*/
8#define CYCLONE_CBAR_ADDR 0xFEB00CD0
9#define CYCLONE_PMCC_OFFSET 0x51A0
10#define CYCLONE_MPMC_OFFSET 0x51D0
11#define CYCLONE_MPCS_OFFSET 0x51A8
12#define CYCLONE_TIMER_FREQ 100000000
13
14int use_cyclone;
15void __init cyclone_setup(void)
16{
17 use_cyclone = 1;
18}
19
20
21struct time_interpolator cyclone_interpolator = {
22 .source = TIME_SOURCE_MMIO64,
23 .shift = 16,
24 .frequency = CYCLONE_TIMER_FREQ,
25 .drift = -100,
26 .mask = (1LL << 40) - 1
27};
28
29int __init init_cyclone_clock(void)
30{
31 u64* reg;
32 u64 base; /* saved cyclone base address */
33 u64 offset; /* offset from pageaddr to cyclone_timer register */
34 int i;
35 u32* volatile cyclone_timer; /* Cyclone MPMC0 register */
36
37 if (!use_cyclone)
38 return -ENODEV;
39
40 printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n");
41
42 /* find base address */
43 offset = (CYCLONE_CBAR_ADDR);
44 reg = (u64*)ioremap_nocache(offset, sizeof(u64));
45 if(!reg){
46 printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n");
47 use_cyclone = 0;
48 return -ENODEV;
49 }
50 base = readq(reg);
51 if(!base){
52 printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n");
53 use_cyclone = 0;
54 return -ENODEV;
55 }
56 iounmap(reg);
57
58 /* setup PMCC */
59 offset = (base + CYCLONE_PMCC_OFFSET);
60 reg = (u64*)ioremap_nocache(offset, sizeof(u64));
61 if(!reg){
62 printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n");
63 use_cyclone = 0;
64 return -ENODEV;
65 }
66 writel(0x00000001,reg);
67 iounmap(reg);
68
69 /* setup MPCS */
70 offset = (base + CYCLONE_MPCS_OFFSET);
71 reg = (u64*)ioremap_nocache(offset, sizeof(u64));
72 if(!reg){
73 printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n");
74 use_cyclone = 0;
75 return -ENODEV;
76 }
77 writel(0x00000001,reg);
78 iounmap(reg);
79
80 /* map in cyclone_timer */
81 offset = (base + CYCLONE_MPMC_OFFSET);
82 cyclone_timer = (u32*)ioremap_nocache(offset, sizeof(u32));
83 if(!cyclone_timer){
84 printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n");
85 use_cyclone = 0;
86 return -ENODEV;
87 }
88
89	/* quick test to make sure it's ticking */
90 for(i=0; i<3; i++){
91 u32 old = readl(cyclone_timer);
92 int stall = 100;
93 while(stall--) barrier();
94 if(readl(cyclone_timer) == old){
95 printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n");
96 iounmap(cyclone_timer);
97 cyclone_timer = 0;
98 use_cyclone = 0;
99 return -ENODEV;
100 }
101 }
102 /* initialize last tick */
103 cyclone_interpolator.addr = cyclone_timer;
104 register_time_interpolator(&cyclone_interpolator);
105
106 return 0;
107}
108
109__initcall(init_cyclone_clock);
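
The interpolator above advertises a 40-bit free-running counter via .mask. A quick illustration in ordinary user-space C (sample values are hypothetical) of why masking the subtraction keeps deltas correct across a counter wrap:

#include <stdio.h>

int main(void)
{
	const unsigned long long mask = (1ULL << 40) - 1;	/* same value as cyclone_interpolator.mask */
	unsigned long long last = mask - 5;			/* reading just before the wrap */
	unsigned long long now  = 10;				/* reading just after the wrap */

	/* masking the unsigned subtraction yields the true elapsed tick count */
	printf("elapsed ticks = %llu\n", (now - last) & mask);	/* prints 16 */
	return 0;
}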
diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c
new file mode 100644
index 000000000000..fe532c970438
--- /dev/null
+++ b/arch/ia64/kernel/domain.c
@@ -0,0 +1,382 @@
1/*
2 * arch/ia64/kernel/domain.c
3 * Architecture specific sched-domains builder.
4 *
5 * Copyright (C) 2004 Jesse Barnes
6 * Copyright (C) 2004 Silicon Graphics, Inc.
7 */
8
9#include <linux/sched.h>
10#include <linux/percpu.h>
11#include <linux/slab.h>
12#include <linux/cpumask.h>
13#include <linux/init.h>
14#include <linux/topology.h>
15#include <linux/nodemask.h>
16
17#define SD_NODES_PER_DOMAIN 6
18
19#ifdef CONFIG_NUMA
20/**
21 * find_next_best_node - find the next node to include in a sched_domain
22 * @node: node whose sched_domain we're building
23 * @used_nodes: nodes already in the sched_domain
24 *
25 * Find the next node to include in a given scheduling domain. Simply
26 * finds the closest node not already in the @used_nodes map.
27 *
28 * Should use nodemask_t.
29 */
30static int __devinit find_next_best_node(int node, unsigned long *used_nodes)
31{
32 int i, n, val, min_val, best_node = 0;
33
34 min_val = INT_MAX;
35
36 for (i = 0; i < MAX_NUMNODES; i++) {
37 /* Start at @node */
38 n = (node + i) % MAX_NUMNODES;
39
40 if (!nr_cpus_node(n))
41 continue;
42
43 /* Skip already used nodes */
44 if (test_bit(n, used_nodes))
45 continue;
46
47 /* Simple min distance search */
48 val = node_distance(node, n);
49
50 if (val < min_val) {
51 min_val = val;
52 best_node = n;
53 }
54 }
55
56 set_bit(best_node, used_nodes);
57 return best_node;
58}
59
60/**
61 * sched_domain_node_span - get a cpumask for a node's sched_domain
62 * @node: node whose cpumask we're constructing
63 * @size: number of nodes to include in this span
64 *
65 * Given a node, construct a good cpumask for its sched_domain to span. It
66 * should be one that prevents unnecessary balancing, but also spreads tasks
67 * out optimally.
68 */
69static cpumask_t __devinit sched_domain_node_span(int node)
70{
71 int i;
72 cpumask_t span, nodemask;
73 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
74
75 cpus_clear(span);
76 bitmap_zero(used_nodes, MAX_NUMNODES);
77
78 nodemask = node_to_cpumask(node);
79 cpus_or(span, span, nodemask);
80 set_bit(node, used_nodes);
81
82 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
83 int next_node = find_next_best_node(node, used_nodes);
84 nodemask = node_to_cpumask(next_node);
85 cpus_or(span, span, nodemask);
86 }
87
88 return span;
89}
90#endif
91
92/*
93 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
94 * can switch it on easily if needed.
95 */
96#ifdef CONFIG_SCHED_SMT
97static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
98static struct sched_group sched_group_cpus[NR_CPUS];
99static int __devinit cpu_to_cpu_group(int cpu)
100{
101 return cpu;
102}
103#endif
104
105static DEFINE_PER_CPU(struct sched_domain, phys_domains);
106static struct sched_group sched_group_phys[NR_CPUS];
107static int __devinit cpu_to_phys_group(int cpu)
108{
109#ifdef CONFIG_SCHED_SMT
110 return first_cpu(cpu_sibling_map[cpu]);
111#else
112 return cpu;
113#endif
114}
115
116#ifdef CONFIG_NUMA
117/*
118 * The init_sched_build_groups can't handle what we want to do with node
119 * groups, so roll our own. Now each node has its own list of groups which
120 * gets dynamically allocated.
121 */
122static DEFINE_PER_CPU(struct sched_domain, node_domains);
123static struct sched_group *sched_group_nodes[MAX_NUMNODES];
124
125static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
126static struct sched_group sched_group_allnodes[MAX_NUMNODES];
127
128static int __devinit cpu_to_allnodes_group(int cpu)
129{
130 return cpu_to_node(cpu);
131}
132#endif
133
134/*
135 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
136 */
137void __devinit arch_init_sched_domains(void)
138{
139 int i;
140 cpumask_t cpu_default_map;
141
142 /*
143 * Setup mask for cpus without special case scheduling requirements.
144 * For now this just excludes isolated cpus, but could be used to
145 * exclude other special cases in the future.
146 */
147 cpus_complement(cpu_default_map, cpu_isolated_map);
148 cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
149
150 /*
151 * Set up domains. Isolated domains just stay on the dummy domain.
152 */
153 for_each_cpu_mask(i, cpu_default_map) {
154 int group;
155 struct sched_domain *sd = NULL, *p;
156 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
157
158 cpus_and(nodemask, nodemask, cpu_default_map);
159
160#ifdef CONFIG_NUMA
161 if (num_online_cpus()
162 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
163 sd = &per_cpu(allnodes_domains, i);
164 *sd = SD_ALLNODES_INIT;
165 sd->span = cpu_default_map;
166 group = cpu_to_allnodes_group(i);
167 sd->groups = &sched_group_allnodes[group];
168 p = sd;
169 } else
170 p = NULL;
171
172 sd = &per_cpu(node_domains, i);
173 *sd = SD_NODE_INIT;
174 sd->span = sched_domain_node_span(cpu_to_node(i));
175 sd->parent = p;
176 cpus_and(sd->span, sd->span, cpu_default_map);
177#endif
178
179 p = sd;
180 sd = &per_cpu(phys_domains, i);
181 group = cpu_to_phys_group(i);
182 *sd = SD_CPU_INIT;
183 sd->span = nodemask;
184 sd->parent = p;
185 sd->groups = &sched_group_phys[group];
186
187#ifdef CONFIG_SCHED_SMT
188 p = sd;
189 sd = &per_cpu(cpu_domains, i);
190 group = cpu_to_cpu_group(i);
191 *sd = SD_SIBLING_INIT;
192 sd->span = cpu_sibling_map[i];
193 cpus_and(sd->span, sd->span, cpu_default_map);
194 sd->parent = p;
195 sd->groups = &sched_group_cpus[group];
196#endif
197 }
198
199#ifdef CONFIG_SCHED_SMT
200 /* Set up CPU (sibling) groups */
201 for_each_cpu_mask(i, cpu_default_map) {
202 cpumask_t this_sibling_map = cpu_sibling_map[i];
203 cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
204 if (i != first_cpu(this_sibling_map))
205 continue;
206
207 init_sched_build_groups(sched_group_cpus, this_sibling_map,
208 &cpu_to_cpu_group);
209 }
210#endif
211
212 /* Set up physical groups */
213 for (i = 0; i < MAX_NUMNODES; i++) {
214 cpumask_t nodemask = node_to_cpumask(i);
215
216 cpus_and(nodemask, nodemask, cpu_default_map);
217 if (cpus_empty(nodemask))
218 continue;
219
220 init_sched_build_groups(sched_group_phys, nodemask,
221 &cpu_to_phys_group);
222 }
223
224#ifdef CONFIG_NUMA
225 init_sched_build_groups(sched_group_allnodes, cpu_default_map,
226 &cpu_to_allnodes_group);
227
228 for (i = 0; i < MAX_NUMNODES; i++) {
229 /* Set up node groups */
230 struct sched_group *sg, *prev;
231 cpumask_t nodemask = node_to_cpumask(i);
232 cpumask_t domainspan;
233 cpumask_t covered = CPU_MASK_NONE;
234 int j;
235
236 cpus_and(nodemask, nodemask, cpu_default_map);
237 if (cpus_empty(nodemask))
238 continue;
239
240 domainspan = sched_domain_node_span(i);
241 cpus_and(domainspan, domainspan, cpu_default_map);
242
243 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
244 sched_group_nodes[i] = sg;
245 for_each_cpu_mask(j, nodemask) {
246 struct sched_domain *sd;
247 sd = &per_cpu(node_domains, j);
248 sd->groups = sg;
249 if (sd->groups == NULL) {
250 /* Turn off balancing if we have no groups */
251 sd->flags = 0;
252 }
253 }
254 if (!sg) {
255 printk(KERN_WARNING
256 "Can not alloc domain group for node %d\n", i);
257 continue;
258 }
259 sg->cpu_power = 0;
260 sg->cpumask = nodemask;
261 cpus_or(covered, covered, nodemask);
262 prev = sg;
263
264 for (j = 0; j < MAX_NUMNODES; j++) {
265 cpumask_t tmp, notcovered;
266 int n = (i + j) % MAX_NUMNODES;
267
268 cpus_complement(notcovered, covered);
269 cpus_and(tmp, notcovered, cpu_default_map);
270 cpus_and(tmp, tmp, domainspan);
271 if (cpus_empty(tmp))
272 break;
273
274 nodemask = node_to_cpumask(n);
275 cpus_and(tmp, tmp, nodemask);
276 if (cpus_empty(tmp))
277 continue;
278
279 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
280 if (!sg) {
281 printk(KERN_WARNING
282 "Can not alloc domain group for node %d\n", j);
283 break;
284 }
285 sg->cpu_power = 0;
286 sg->cpumask = tmp;
287 cpus_or(covered, covered, tmp);
288 prev->next = sg;
289 prev = sg;
290 }
291 prev->next = sched_group_nodes[i];
292 }
293#endif
294
295 /* Calculate CPU power for physical packages and nodes */
296 for_each_cpu_mask(i, cpu_default_map) {
297 int power;
298 struct sched_domain *sd;
299#ifdef CONFIG_SCHED_SMT
300 sd = &per_cpu(cpu_domains, i);
301 power = SCHED_LOAD_SCALE;
302 sd->groups->cpu_power = power;
303#endif
304
305 sd = &per_cpu(phys_domains, i);
306 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
307 (cpus_weight(sd->groups->cpumask)-1) / 10;
308 sd->groups->cpu_power = power;
309
310#ifdef CONFIG_NUMA
311 sd = &per_cpu(allnodes_domains, i);
312 if (sd->groups) {
313 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
314 (cpus_weight(sd->groups->cpumask)-1) / 10;
315 sd->groups->cpu_power = power;
316 }
317#endif
318 }
319
320#ifdef CONFIG_NUMA
321 for (i = 0; i < MAX_NUMNODES; i++) {
322 struct sched_group *sg = sched_group_nodes[i];
323 int j;
324
325 if (sg == NULL)
326 continue;
327next_sg:
328 for_each_cpu_mask(j, sg->cpumask) {
329 struct sched_domain *sd;
330 int power;
331
332 sd = &per_cpu(phys_domains, j);
333 if (j != first_cpu(sd->groups->cpumask)) {
334 /*
335 * Only add "power" once for each
336 * physical package.
337 */
338 continue;
339 }
340 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
341 (cpus_weight(sd->groups->cpumask)-1) / 10;
342
343 sg->cpu_power += power;
344 }
345 sg = sg->next;
346 if (sg != sched_group_nodes[i])
347 goto next_sg;
348 }
349#endif
350
351 /* Attach the domains */
352 for_each_online_cpu(i) {
353 struct sched_domain *sd;
354#ifdef CONFIG_SCHED_SMT
355 sd = &per_cpu(cpu_domains, i);
356#else
357 sd = &per_cpu(phys_domains, i);
358#endif
359 cpu_attach_domain(sd, i);
360 }
361}
362
363void __devinit arch_destroy_sched_domains(void)
364{
365#ifdef CONFIG_NUMA
366 int i;
367 for (i = 0; i < MAX_NUMNODES; i++) {
368 struct sched_group *oldsg, *sg = sched_group_nodes[i];
369 if (sg == NULL)
370 continue;
371 sg = sg->next;
372next_sg:
373 oldsg = sg;
374 sg = sg->next;
375 kfree(oldsg);
376 if (oldsg != sched_group_nodes[i])
377 goto next_sg;
378 sched_group_nodes[i] = NULL;
379 }
380#endif
381}
382
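
find_next_best_node() and sched_domain_node_span() above build a node span greedily: starting from the home node, they repeatedly add the closest node that is not yet in the span, up to SD_NODES_PER_DOMAIN nodes. The self-contained sketch below reproduces that selection loop over a plain distance matrix; the node count, the distance table, and the helper names are invented for the example and are not the kernel's node_distance()/nodemask API (the skip of CPU-less nodes is also omitted).

/*
 * Illustrative sketch of the greedy node-span construction used above.
 * NODES, dist[][] and the helper names are made up for the example.
 */
#include <limits.h>
#include <stdio.h>

#define NODES             4
#define NODES_PER_DOMAIN  3	/* plays the role of SD_NODES_PER_DOMAIN */

static const int dist[NODES][NODES] = {	/* symmetric "node distance" table */
	{ 10, 20, 40, 60 },
	{ 20, 10, 20, 40 },
	{ 40, 20, 10, 20 },
	{ 60, 40, 20, 10 },
};

/* Pick the unused node closest to @node, mirroring find_next_best_node(). */
static int next_best_node(int node, int used[NODES])
{
	int i, best = 0, best_dist = INT_MAX;

	for (i = 0; i < NODES; i++) {
		int n = (node + i) % NODES;	/* start the scan at @node */

		if (used[n])
			continue;
		if (dist[node][n] < best_dist) {
			best_dist = dist[node][n];
			best = n;
		}
	}
	used[best] = 1;
	return best;
}

int main(void)
{
	int used[NODES] = { 0 };
	int node = 0, i;

	used[node] = 1;
	printf("span of node %d: %d", node, node);
	for (i = 1; i < NODES_PER_DOMAIN; i++)
		printf(" %d", next_best_node(node, used));
	printf("\n");
	return 0;
}

Compiled and run, this prints "span of node 0: 0 1 2", i.e. the two nearest neighbours of node 0 under the example distance table.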
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
new file mode 100644
index 000000000000..4a3b1aac43e7
--- /dev/null
+++ b/arch/ia64/kernel/efi.c
@@ -0,0 +1,832 @@
1/*
2 * Extensible Firmware Interface
3 *
4 * Based on Extensible Firmware Interface Specification version 0.9 April 30, 1999
5 *
6 * Copyright (C) 1999 VA Linux Systems
7 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
8 * Copyright (C) 1999-2003 Hewlett-Packard Co.
9 * David Mosberger-Tang <davidm@hpl.hp.com>
10 * Stephane Eranian <eranian@hpl.hp.com>
11 *
12 * Not all EFI Runtime Services are implemented yet, as EFI only
13 * supports physical mode addressing on SoftSDV. This is to be fixed
14 * in a future version. --drummond 1999-07-20
15 *
16 * Implemented EFI runtime services and virtual mode calls. --davidm
17 *
18 * Goutham Rao: <goutham.rao@intel.com>
19 * Skip non-WB memory and ignore empty memory ranges.
20 */
21#include <linux/config.h>
22#include <linux/module.h>
23#include <linux/kernel.h>
24#include <linux/init.h>
25#include <linux/types.h>
26#include <linux/time.h>
27#include <linux/efi.h>
28
29#include <asm/io.h>
30#include <asm/kregs.h>
31#include <asm/meminit.h>
32#include <asm/pgtable.h>
33#include <asm/processor.h>
34#include <asm/mca.h>
35
36#define EFI_DEBUG 0
37
38extern efi_status_t efi_call_phys (void *, ...);
39
40struct efi efi;
41EXPORT_SYMBOL(efi);
42static efi_runtime_services_t *runtime;
43static unsigned long mem_limit = ~0UL, max_addr = ~0UL;
44
45#define efi_call_virt(f, args...) (*(f))(args)
46
47#define STUB_GET_TIME(prefix, adjust_arg) \
48static efi_status_t \
49prefix##_get_time (efi_time_t *tm, efi_time_cap_t *tc) \
50{ \
51 struct ia64_fpreg fr[6]; \
52 efi_time_cap_t *atc = NULL; \
53 efi_status_t ret; \
54 \
55 if (tc) \
56 atc = adjust_arg(tc); \
57 ia64_save_scratch_fpregs(fr); \
58 ret = efi_call_##prefix((efi_get_time_t *) __va(runtime->get_time), adjust_arg(tm), atc); \
59 ia64_load_scratch_fpregs(fr); \
60 return ret; \
61}
62
63#define STUB_SET_TIME(prefix, adjust_arg) \
64static efi_status_t \
65prefix##_set_time (efi_time_t *tm) \
66{ \
67 struct ia64_fpreg fr[6]; \
68 efi_status_t ret; \
69 \
70 ia64_save_scratch_fpregs(fr); \
71 ret = efi_call_##prefix((efi_set_time_t *) __va(runtime->set_time), adjust_arg(tm)); \
72 ia64_load_scratch_fpregs(fr); \
73 return ret; \
74}
75
76#define STUB_GET_WAKEUP_TIME(prefix, adjust_arg) \
77static efi_status_t \
78prefix##_get_wakeup_time (efi_bool_t *enabled, efi_bool_t *pending, efi_time_t *tm) \
79{ \
80 struct ia64_fpreg fr[6]; \
81 efi_status_t ret; \
82 \
83 ia64_save_scratch_fpregs(fr); \
84 ret = efi_call_##prefix((efi_get_wakeup_time_t *) __va(runtime->get_wakeup_time), \
85 adjust_arg(enabled), adjust_arg(pending), adjust_arg(tm)); \
86 ia64_load_scratch_fpregs(fr); \
87 return ret; \
88}
89
90#define STUB_SET_WAKEUP_TIME(prefix, adjust_arg) \
91static efi_status_t \
92prefix##_set_wakeup_time (efi_bool_t enabled, efi_time_t *tm) \
93{ \
94 struct ia64_fpreg fr[6]; \
95 efi_time_t *atm = NULL; \
96 efi_status_t ret; \
97 \
98 if (tm) \
99 atm = adjust_arg(tm); \
100 ia64_save_scratch_fpregs(fr); \
101 ret = efi_call_##prefix((efi_set_wakeup_time_t *) __va(runtime->set_wakeup_time), \
102 enabled, atm); \
103 ia64_load_scratch_fpregs(fr); \
104 return ret; \
105}
106
107#define STUB_GET_VARIABLE(prefix, adjust_arg) \
108static efi_status_t \
109prefix##_get_variable (efi_char16_t *name, efi_guid_t *vendor, u32 *attr, \
110 unsigned long *data_size, void *data) \
111{ \
112 struct ia64_fpreg fr[6]; \
113 u32 *aattr = NULL; \
114 efi_status_t ret; \
115 \
116 if (attr) \
117 aattr = adjust_arg(attr); \
118 ia64_save_scratch_fpregs(fr); \
119 ret = efi_call_##prefix((efi_get_variable_t *) __va(runtime->get_variable), \
120 adjust_arg(name), adjust_arg(vendor), aattr, \
121 adjust_arg(data_size), adjust_arg(data)); \
122 ia64_load_scratch_fpregs(fr); \
123 return ret; \
124}
125
126#define STUB_GET_NEXT_VARIABLE(prefix, adjust_arg) \
127static efi_status_t \
128prefix##_get_next_variable (unsigned long *name_size, efi_char16_t *name, efi_guid_t *vendor) \
129{ \
130 struct ia64_fpreg fr[6]; \
131 efi_status_t ret; \
132 \
133 ia64_save_scratch_fpregs(fr); \
134 ret = efi_call_##prefix((efi_get_next_variable_t *) __va(runtime->get_next_variable), \
135 adjust_arg(name_size), adjust_arg(name), adjust_arg(vendor)); \
136 ia64_load_scratch_fpregs(fr); \
137 return ret; \
138}
139
140#define STUB_SET_VARIABLE(prefix, adjust_arg) \
141static efi_status_t \
142prefix##_set_variable (efi_char16_t *name, efi_guid_t *vendor, unsigned long attr, \
143 unsigned long data_size, void *data) \
144{ \
145 struct ia64_fpreg fr[6]; \
146 efi_status_t ret; \
147 \
148 ia64_save_scratch_fpregs(fr); \
149 ret = efi_call_##prefix((efi_set_variable_t *) __va(runtime->set_variable), \
150 adjust_arg(name), adjust_arg(vendor), attr, data_size, \
151 adjust_arg(data)); \
152 ia64_load_scratch_fpregs(fr); \
153 return ret; \
154}
155
156#define STUB_GET_NEXT_HIGH_MONO_COUNT(prefix, adjust_arg) \
157static efi_status_t \
158prefix##_get_next_high_mono_count (u32 *count) \
159{ \
160 struct ia64_fpreg fr[6]; \
161 efi_status_t ret; \
162 \
163 ia64_save_scratch_fpregs(fr); \
164 ret = efi_call_##prefix((efi_get_next_high_mono_count_t *) \
165 __va(runtime->get_next_high_mono_count), adjust_arg(count)); \
166 ia64_load_scratch_fpregs(fr); \
167 return ret; \
168}
169
170#define STUB_RESET_SYSTEM(prefix, adjust_arg) \
171static void \
172prefix##_reset_system (int reset_type, efi_status_t status, \
173 unsigned long data_size, efi_char16_t *data) \
174{ \
175 struct ia64_fpreg fr[6]; \
176 efi_char16_t *adata = NULL; \
177 \
178 if (data) \
179 adata = adjust_arg(data); \
180 \
181 ia64_save_scratch_fpregs(fr); \
182 efi_call_##prefix((efi_reset_system_t *) __va(runtime->reset_system), \
183 reset_type, status, data_size, adata); \
184 /* should not return, but just in case... */ \
185 ia64_load_scratch_fpregs(fr); \
186}
187
188#define phys_ptr(arg) ((__typeof__(arg)) ia64_tpa(arg))
189
190STUB_GET_TIME(phys, phys_ptr)
191STUB_SET_TIME(phys, phys_ptr)
192STUB_GET_WAKEUP_TIME(phys, phys_ptr)
193STUB_SET_WAKEUP_TIME(phys, phys_ptr)
194STUB_GET_VARIABLE(phys, phys_ptr)
195STUB_GET_NEXT_VARIABLE(phys, phys_ptr)
196STUB_SET_VARIABLE(phys, phys_ptr)
197STUB_GET_NEXT_HIGH_MONO_COUNT(phys, phys_ptr)
198STUB_RESET_SYSTEM(phys, phys_ptr)
199
200#define id(arg) arg
201
202STUB_GET_TIME(virt, id)
203STUB_SET_TIME(virt, id)
204STUB_GET_WAKEUP_TIME(virt, id)
205STUB_SET_WAKEUP_TIME(virt, id)
206STUB_GET_VARIABLE(virt, id)
207STUB_GET_NEXT_VARIABLE(virt, id)
208STUB_SET_VARIABLE(virt, id)
209STUB_GET_NEXT_HIGH_MONO_COUNT(virt, id)
210STUB_RESET_SYSTEM(virt, id)
211
212void
213efi_gettimeofday (struct timespec *ts)
214{
215 efi_time_t tm;
216
217	memset(ts, 0, sizeof(*ts));
218 if ((*efi.get_time)(&tm, NULL) != EFI_SUCCESS)
219 return;
220
221 ts->tv_sec = mktime(tm.year, tm.month, tm.day, tm.hour, tm.minute, tm.second);
222 ts->tv_nsec = tm.nanosecond;
223}
224
225static int
226is_available_memory (efi_memory_desc_t *md)
227{
228 if (!(md->attribute & EFI_MEMORY_WB))
229 return 0;
230
231 switch (md->type) {
232 case EFI_LOADER_CODE:
233 case EFI_LOADER_DATA:
234 case EFI_BOOT_SERVICES_CODE:
235 case EFI_BOOT_SERVICES_DATA:
236 case EFI_CONVENTIONAL_MEMORY:
237 return 1;
238 }
239 return 0;
240}
241
242/*
243 * Trim descriptor MD so it starts at address START_ADDR. If the descriptor covers
244 * memory that is normally available to the kernel, issue a warning that some memory
245 * is being ignored.
246 */
247static void
248trim_bottom (efi_memory_desc_t *md, u64 start_addr)
249{
250 u64 num_skipped_pages;
251
252 if (md->phys_addr >= start_addr || !md->num_pages)
253 return;
254
255 num_skipped_pages = (start_addr - md->phys_addr) >> EFI_PAGE_SHIFT;
256 if (num_skipped_pages > md->num_pages)
257 num_skipped_pages = md->num_pages;
258
259 if (is_available_memory(md))
260 printk(KERN_NOTICE "efi.%s: ignoring %luKB of memory at 0x%lx due to granule hole "
261 "at 0x%lx\n", __FUNCTION__,
262 (num_skipped_pages << EFI_PAGE_SHIFT) >> 10,
263 md->phys_addr, start_addr - IA64_GRANULE_SIZE);
264 /*
265 * NOTE: Don't set md->phys_addr to START_ADDR because that could cause the memory
266 * descriptor list to become unsorted. In such a case, md->num_pages will be
267 * zero, so the Right Thing will happen.
268 */
269 md->phys_addr += num_skipped_pages << EFI_PAGE_SHIFT;
270 md->num_pages -= num_skipped_pages;
271}
272
273static void
274trim_top (efi_memory_desc_t *md, u64 end_addr)
275{
276 u64 num_dropped_pages, md_end_addr;
277
278 md_end_addr = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
279
280 if (md_end_addr <= end_addr || !md->num_pages)
281 return;
282
283 num_dropped_pages = (md_end_addr - end_addr) >> EFI_PAGE_SHIFT;
284 if (num_dropped_pages > md->num_pages)
285 num_dropped_pages = md->num_pages;
286
287 if (is_available_memory(md))
288 printk(KERN_NOTICE "efi.%s: ignoring %luKB of memory at 0x%lx due to granule hole "
289 "at 0x%lx\n", __FUNCTION__,
290 (num_dropped_pages << EFI_PAGE_SHIFT) >> 10,
291 md->phys_addr, end_addr);
292 md->num_pages -= num_dropped_pages;
293}
294
295/*
296 * Walks the EFI memory map and calls CALLBACK once for each EFI memory descriptor that
297 * has memory that is available for OS use.
298 */
299void
300efi_memmap_walk (efi_freemem_callback_t callback, void *arg)
301{
302 int prev_valid = 0;
303 struct range {
304 u64 start;
305 u64 end;
306 } prev, curr;
307 void *efi_map_start, *efi_map_end, *p, *q;
308 efi_memory_desc_t *md, *check_md;
309 u64 efi_desc_size, start, end, granule_addr, last_granule_addr, first_non_wb_addr = 0;
310 unsigned long total_mem = 0;
311
312 efi_map_start = __va(ia64_boot_param->efi_memmap);
313 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
314 efi_desc_size = ia64_boot_param->efi_memdesc_size;
315
316 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
317 md = p;
318
319 /* skip over non-WB memory descriptors; that's all we're interested in... */
320 if (!(md->attribute & EFI_MEMORY_WB))
321 continue;
322
323 /*
324 * granule_addr is the base of md's first granule.
325 * [granule_addr - first_non_wb_addr) is guaranteed to
326 * be contiguous WB memory.
327 */
328 granule_addr = GRANULEROUNDDOWN(md->phys_addr);
329 first_non_wb_addr = max(first_non_wb_addr, granule_addr);
330
331 if (first_non_wb_addr < md->phys_addr) {
332 trim_bottom(md, granule_addr + IA64_GRANULE_SIZE);
333 granule_addr = GRANULEROUNDDOWN(md->phys_addr);
334 first_non_wb_addr = max(first_non_wb_addr, granule_addr);
335 }
336
337 for (q = p; q < efi_map_end; q += efi_desc_size) {
338 check_md = q;
339
340 if ((check_md->attribute & EFI_MEMORY_WB) &&
341 (check_md->phys_addr == first_non_wb_addr))
342 first_non_wb_addr += check_md->num_pages << EFI_PAGE_SHIFT;
343 else
344 break; /* non-WB or hole */
345 }
346
347 last_granule_addr = GRANULEROUNDDOWN(first_non_wb_addr);
348 if (last_granule_addr < md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT))
349 trim_top(md, last_granule_addr);
350
351 if (is_available_memory(md)) {
352 if (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) >= max_addr) {
353 if (md->phys_addr >= max_addr)
354 continue;
355 md->num_pages = (max_addr - md->phys_addr) >> EFI_PAGE_SHIFT;
356 first_non_wb_addr = max_addr;
357 }
358
359 if (total_mem >= mem_limit)
360 continue;
361
362 if (total_mem + (md->num_pages << EFI_PAGE_SHIFT) > mem_limit) {
363 unsigned long limit_addr = md->phys_addr;
364
365 limit_addr += mem_limit - total_mem;
366 limit_addr = GRANULEROUNDDOWN(limit_addr);
367
368 if (md->phys_addr > limit_addr)
369 continue;
370
371 md->num_pages = (limit_addr - md->phys_addr) >>
372 EFI_PAGE_SHIFT;
373 first_non_wb_addr = max_addr = md->phys_addr +
374 (md->num_pages << EFI_PAGE_SHIFT);
375 }
376 total_mem += (md->num_pages << EFI_PAGE_SHIFT);
377
378 if (md->num_pages == 0)
379 continue;
380
381 curr.start = PAGE_OFFSET + md->phys_addr;
382 curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT);
383
384 if (!prev_valid) {
385 prev = curr;
386 prev_valid = 1;
387 } else {
388 if (curr.start < prev.start)
389 printk(KERN_ERR "Oops: EFI memory table not ordered!\n");
390
391 if (prev.end == curr.start) {
392 /* merge two consecutive memory ranges */
393 prev.end = curr.end;
394 } else {
395 start = PAGE_ALIGN(prev.start);
396 end = prev.end & PAGE_MASK;
397 if ((end > start) && (*callback)(start, end, arg) < 0)
398 return;
399 prev = curr;
400 }
401 }
402 }
403 }
404 if (prev_valid) {
405 start = PAGE_ALIGN(prev.start);
406 end = prev.end & PAGE_MASK;
407 if (end > start)
408 (*callback)(start, end, arg);
409 }
410}
411
412/*
413 * Look for the PAL_CODE region reported by EFI and map it using an
414 * ITR to enable safe PAL calls in virtual mode. See IA-64 Processor
415 * Abstraction Layer chapter 11 in ADAG
416 */
417
418void *
419efi_get_pal_addr (void)
420{
421 void *efi_map_start, *efi_map_end, *p;
422 efi_memory_desc_t *md;
423 u64 efi_desc_size;
424 int pal_code_count = 0;
425 u64 vaddr, mask;
426
427 efi_map_start = __va(ia64_boot_param->efi_memmap);
428 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
429 efi_desc_size = ia64_boot_param->efi_memdesc_size;
430
431 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
432 md = p;
433 if (md->type != EFI_PAL_CODE)
434 continue;
435
436 if (++pal_code_count > 1) {
437 printk(KERN_ERR "Too many EFI Pal Code memory ranges, dropped @ %lx\n",
438 md->phys_addr);
439 continue;
440 }
441 /*
442 * The only ITLB entry in region 7 that is used is the one installed by
443 * __start(). That entry covers a 64MB range.
444 */
445 mask = ~((1 << KERNEL_TR_PAGE_SHIFT) - 1);
446 vaddr = PAGE_OFFSET + md->phys_addr;
447
448 /*
449 * We must check that the PAL mapping won't overlap with the kernel
450 * mapping.
451 *
452 * PAL code is guaranteed to be aligned on a power of 2 between 4k and
453 * 256KB and that only one ITR is needed to map it. This implies that the
454 * PAL code is always aligned on its size, i.e., the closest matching page
455 * size supported by the TLB. Therefore PAL code is guaranteed never to
456 * cross a 64MB boundary unless it is bigger than 64MB (very unlikely!). So for
457 * now the following test is enough to determine whether or not we need a
458 * dedicated ITR for the PAL code.
459 */
460 if ((vaddr & mask) == (KERNEL_START & mask)) {
461 printk(KERN_INFO "%s: no need to install ITR for PAL code\n",
462 __FUNCTION__);
463 continue;
464 }
465
466 if (md->num_pages << EFI_PAGE_SHIFT > IA64_GRANULE_SIZE)
467 panic("Woah! PAL code size bigger than a granule!");
468
469#if EFI_DEBUG
470 mask = ~((1 << IA64_GRANULE_SHIFT) - 1);
471
472 printk(KERN_INFO "CPU %d: mapping PAL code [0x%lx-0x%lx) into [0x%lx-0x%lx)\n",
473 smp_processor_id(), md->phys_addr,
474 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
475 vaddr & mask, (vaddr & mask) + IA64_GRANULE_SIZE);
476#endif
477 return __va(md->phys_addr);
478 }
479	printk(KERN_WARNING "%s: no PAL-code memory-descriptor found\n",
480 __FUNCTION__);
481 return NULL;
482}
483
484void
485efi_map_pal_code (void)
486{
487 void *pal_vaddr = efi_get_pal_addr ();
488 u64 psr;
489
490 if (!pal_vaddr)
491 return;
492
493 /*
494 * Cannot write to CRx with PSR.ic=1
495 */
496 psr = ia64_clear_ic();
497 ia64_itr(0x1, IA64_TR_PALCODE, GRANULEROUNDDOWN((unsigned long) pal_vaddr),
498 pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)),
499 IA64_GRANULE_SHIFT);
500 ia64_set_psr(psr); /* restore psr */
501 ia64_srlz_i();
502}
503
504void __init
505efi_init (void)
506{
507 void *efi_map_start, *efi_map_end;
508 efi_config_table_t *config_tables;
509 efi_char16_t *c16;
510 u64 efi_desc_size;
511 char *cp, *end, vendor[100] = "unknown";
512 extern char saved_command_line[];
513 int i;
514
515 /* it's too early to be able to use the standard kernel command line support... */
516 for (cp = saved_command_line; *cp; ) {
517 if (memcmp(cp, "mem=", 4) == 0) {
518 cp += 4;
519 mem_limit = memparse(cp, &end);
520 if (end != cp)
521 break;
522 cp = end;
523 } else if (memcmp(cp, "max_addr=", 9) == 0) {
524 cp += 9;
525 max_addr = GRANULEROUNDDOWN(memparse(cp, &end));
526 if (end != cp)
527 break;
528 cp = end;
529 } else {
530 while (*cp != ' ' && *cp)
531 ++cp;
532 while (*cp == ' ')
533 ++cp;
534 }
535 }
536 if (max_addr != ~0UL)
537 printk(KERN_INFO "Ignoring memory above %luMB\n", max_addr >> 20);
538
539 efi.systab = __va(ia64_boot_param->efi_systab);
540
541 /*
542 * Verify the EFI Table
543 */
544 if (efi.systab == NULL)
545 panic("Woah! Can't find EFI system table.\n");
546 if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
547 panic("Woah! EFI system table signature incorrect\n");
548 if ((efi.systab->hdr.revision ^ EFI_SYSTEM_TABLE_REVISION) >> 16 != 0)
549 printk(KERN_WARNING "Warning: EFI system table major version mismatch: "
550 "got %d.%02d, expected %d.%02d\n",
551 efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff,
552 EFI_SYSTEM_TABLE_REVISION >> 16, EFI_SYSTEM_TABLE_REVISION & 0xffff);
553
554 config_tables = __va(efi.systab->tables);
555
556 /* Show what we know for posterity */
557 c16 = __va(efi.systab->fw_vendor);
558 if (c16) {
559		for (i = 0; i < (int) sizeof(vendor) - 1 && *c16; ++i)
560 vendor[i] = *c16++;
561 vendor[i] = '\0';
562 }
563
564 printk(KERN_INFO "EFI v%u.%.02u by %s:",
565 efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor);
566
567 for (i = 0; i < (int) efi.systab->nr_tables; i++) {
568 if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
569 efi.mps = __va(config_tables[i].table);
570 printk(" MPS=0x%lx", config_tables[i].table);
571 } else if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
572 efi.acpi20 = __va(config_tables[i].table);
573 printk(" ACPI 2.0=0x%lx", config_tables[i].table);
574 } else if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
575 efi.acpi = __va(config_tables[i].table);
576 printk(" ACPI=0x%lx", config_tables[i].table);
577 } else if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
578 efi.smbios = __va(config_tables[i].table);
579 printk(" SMBIOS=0x%lx", config_tables[i].table);
580 } else if (efi_guidcmp(config_tables[i].guid, SAL_SYSTEM_TABLE_GUID) == 0) {
581 efi.sal_systab = __va(config_tables[i].table);
582 printk(" SALsystab=0x%lx", config_tables[i].table);
583 } else if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
584 efi.hcdp = __va(config_tables[i].table);
585 printk(" HCDP=0x%lx", config_tables[i].table);
586 }
587 }
588 printk("\n");
589
590 runtime = __va(efi.systab->runtime);
591 efi.get_time = phys_get_time;
592 efi.set_time = phys_set_time;
593 efi.get_wakeup_time = phys_get_wakeup_time;
594 efi.set_wakeup_time = phys_set_wakeup_time;
595 efi.get_variable = phys_get_variable;
596 efi.get_next_variable = phys_get_next_variable;
597 efi.set_variable = phys_set_variable;
598 efi.get_next_high_mono_count = phys_get_next_high_mono_count;
599 efi.reset_system = phys_reset_system;
600
601 efi_map_start = __va(ia64_boot_param->efi_memmap);
602 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
603 efi_desc_size = ia64_boot_param->efi_memdesc_size;
604
605#if EFI_DEBUG
606 /* print EFI memory map: */
607 {
608 efi_memory_desc_t *md;
609 void *p;
610
611 for (i = 0, p = efi_map_start; p < efi_map_end; ++i, p += efi_desc_size) {
612 md = p;
613 printk("mem%02u: type=%u, attr=0x%lx, range=[0x%016lx-0x%016lx) (%luMB)\n",
614 i, md->type, md->attribute, md->phys_addr,
615 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
616 md->num_pages >> (20 - EFI_PAGE_SHIFT));
617 }
618 }
619#endif
620
621 efi_map_pal_code();
622 efi_enter_virtual_mode();
623}
624
625void
626efi_enter_virtual_mode (void)
627{
628 void *efi_map_start, *efi_map_end, *p;
629 efi_memory_desc_t *md;
630 efi_status_t status;
631 u64 efi_desc_size;
632
633 efi_map_start = __va(ia64_boot_param->efi_memmap);
634 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
635 efi_desc_size = ia64_boot_param->efi_memdesc_size;
636
637 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
638 md = p;
639 if (md->attribute & EFI_MEMORY_RUNTIME) {
640 /*
641 * Some descriptors have multiple bits set, so the order of
642 * the tests is relevant.
643 */
644 if (md->attribute & EFI_MEMORY_WB) {
645 md->virt_addr = (u64) __va(md->phys_addr);
646 } else if (md->attribute & EFI_MEMORY_UC) {
647 md->virt_addr = (u64) ioremap(md->phys_addr, 0);
648 } else if (md->attribute & EFI_MEMORY_WC) {
649#if 0
650 md->virt_addr = ia64_remap(md->phys_addr, (_PAGE_A | _PAGE_P
651 | _PAGE_D
652 | _PAGE_MA_WC
653 | _PAGE_PL_0
654 | _PAGE_AR_RW));
655#else
656 printk(KERN_INFO "EFI_MEMORY_WC mapping\n");
657 md->virt_addr = (u64) ioremap(md->phys_addr, 0);
658#endif
659 } else if (md->attribute & EFI_MEMORY_WT) {
660#if 0
661 md->virt_addr = ia64_remap(md->phys_addr, (_PAGE_A | _PAGE_P
662 | _PAGE_D | _PAGE_MA_WT
663 | _PAGE_PL_0
664 | _PAGE_AR_RW));
665#else
666 printk(KERN_INFO "EFI_MEMORY_WT mapping\n");
667 md->virt_addr = (u64) ioremap(md->phys_addr, 0);
668#endif
669 }
670 }
671 }
672
673 status = efi_call_phys(__va(runtime->set_virtual_address_map),
674 ia64_boot_param->efi_memmap_size,
675 efi_desc_size, ia64_boot_param->efi_memdesc_version,
676 ia64_boot_param->efi_memmap);
677 if (status != EFI_SUCCESS) {
678 printk(KERN_WARNING "warning: unable to switch EFI into virtual mode "
679 "(status=%lu)\n", status);
680 return;
681 }
682
683 /*
684 * Now that EFI is in virtual mode, we call the EFI functions more efficiently:
685 */
686 efi.get_time = virt_get_time;
687 efi.set_time = virt_set_time;
688 efi.get_wakeup_time = virt_get_wakeup_time;
689 efi.set_wakeup_time = virt_set_wakeup_time;
690 efi.get_variable = virt_get_variable;
691 efi.get_next_variable = virt_get_next_variable;
692 efi.set_variable = virt_set_variable;
693 efi.get_next_high_mono_count = virt_get_next_high_mono_count;
694 efi.reset_system = virt_reset_system;
695}
696
697/*
698 * Walk the EFI memory map looking for the I/O port range. There can only be one entry of
699 * this type; other I/O port ranges should be described via ACPI.
700 */
701u64
702efi_get_iobase (void)
703{
704 void *efi_map_start, *efi_map_end, *p;
705 efi_memory_desc_t *md;
706 u64 efi_desc_size;
707
708 efi_map_start = __va(ia64_boot_param->efi_memmap);
709 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
710 efi_desc_size = ia64_boot_param->efi_memdesc_size;
711
712 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
713 md = p;
714 if (md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
715 if (md->attribute & EFI_MEMORY_UC)
716 return md->phys_addr;
717 }
718 }
719 return 0;
720}
721
722u32
723efi_mem_type (unsigned long phys_addr)
724{
725 void *efi_map_start, *efi_map_end, *p;
726 efi_memory_desc_t *md;
727 u64 efi_desc_size;
728
729 efi_map_start = __va(ia64_boot_param->efi_memmap);
730 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
731 efi_desc_size = ia64_boot_param->efi_memdesc_size;
732
733 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
734 md = p;
735
736 if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT))
737 return md->type;
738 }
739 return 0;
740}
741
742u64
743efi_mem_attributes (unsigned long phys_addr)
744{
745 void *efi_map_start, *efi_map_end, *p;
746 efi_memory_desc_t *md;
747 u64 efi_desc_size;
748
749 efi_map_start = __va(ia64_boot_param->efi_memmap);
750 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
751 efi_desc_size = ia64_boot_param->efi_memdesc_size;
752
753 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
754 md = p;
755
756 if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT))
757 return md->attribute;
758 }
759 return 0;
760}
761EXPORT_SYMBOL(efi_mem_attributes);
762
763int
764valid_phys_addr_range (unsigned long phys_addr, unsigned long *size)
765{
766 void *efi_map_start, *efi_map_end, *p;
767 efi_memory_desc_t *md;
768 u64 efi_desc_size;
769
770 efi_map_start = __va(ia64_boot_param->efi_memmap);
771 efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
772 efi_desc_size = ia64_boot_param->efi_memdesc_size;
773
774 for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
775 md = p;
776
777 if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) {
778 if (!(md->attribute & EFI_MEMORY_WB))
779 return 0;
780
781 if (*size > md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - phys_addr)
782 *size = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - phys_addr;
783 return 1;
784 }
785 }
786 return 0;
787}
788
789int __init
790efi_uart_console_only(void)
791{
792 efi_status_t status;
793 char *s, name[] = "ConOut";
794 efi_guid_t guid = EFI_GLOBAL_VARIABLE_GUID;
795 efi_char16_t *utf16, name_utf16[32];
796 unsigned char data[1024];
797 unsigned long size = sizeof(data);
798 struct efi_generic_dev_path *hdr, *end_addr;
799 int uart = 0;
800
801 /* Convert to UTF-16 */
802 utf16 = name_utf16;
803 s = name;
804 while (*s)
805 *utf16++ = *s++ & 0x7f;
806 *utf16 = 0;
807
808 status = efi.get_variable(name_utf16, &guid, NULL, &size, data);
809 if (status != EFI_SUCCESS) {
810 printk(KERN_ERR "No EFI %s variable?\n", name);
811 return 0;
812 }
813
814 hdr = (struct efi_generic_dev_path *) data;
815 end_addr = (struct efi_generic_dev_path *) ((u8 *) data + size);
816 while (hdr < end_addr) {
817 if (hdr->type == EFI_DEV_MSG &&
818 hdr->sub_type == EFI_DEV_MSG_UART)
819 uart = 1;
820 else if (hdr->type == EFI_DEV_END_PATH ||
821 hdr->type == EFI_DEV_END_PATH2) {
822 if (!uart)
823 return 0;
824 if (hdr->sub_type == EFI_DEV_END_ENTIRE)
825 return 1;
826 uart = 0;
827 }
828 hdr = (struct efi_generic_dev_path *) ((u8 *) hdr + hdr->length);
829 }
830 printk(KERN_ERR "Malformed %s value\n", name);
831 return 0;
832}
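
A pattern repeated throughout the file above (efi_memmap_walk(), efi_get_pal_addr(), efi_mem_type(), efi_mem_attributes(), ...) is that the memory map is walked with a byte stride of ia64_boot_param->efi_memdesc_size rather than sizeof(efi_memory_desc_t), since the firmware's descriptors may be larger than the structure the kernel declares. The sketch below shows just that stride idiom over a caller-supplied buffer; struct mem_desc and count_pages_of_type are trimmed, hypothetical stand-ins, not the real <linux/efi.h> definitions.

/*
 * Illustrative sketch of walking an EFI memory map whose descriptor size is
 * supplied by the firmware and may exceed the struct declared here.
 */
#include <stddef.h>
#include <stdint.h>

struct mem_desc {		/* trimmed stand-in for efi_memory_desc_t */
	uint32_t type;
	uint64_t phys_addr;
	uint64_t num_pages;
	uint64_t attribute;
};

/*
 * Sum the pages of descriptors matching @want_type. @desc_size comes from
 * the boot parameters (efi_memdesc_size), not from sizeof(struct mem_desc).
 */
uint64_t count_pages_of_type(const void *map, size_t map_size,
			     size_t desc_size, uint32_t want_type)
{
	const char *p = map, *end = (const char *)map + map_size;
	uint64_t pages = 0;

	for (; p + desc_size <= end; p += desc_size) {
		const struct mem_desc *md = (const struct mem_desc *)p;

		if (md->type == want_type)
			pages += md->num_pages;
	}
	return pages;
}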
diff --git a/arch/ia64/kernel/efi_stub.S b/arch/ia64/kernel/efi_stub.S
new file mode 100644
index 000000000000..5a7fe70212a9
--- /dev/null
+++ b/arch/ia64/kernel/efi_stub.S
@@ -0,0 +1,86 @@
1/*
2 * EFI call stub.
3 *
4 * Copyright (C) 1999-2001 Hewlett-Packard Co
5 * David Mosberger <davidm@hpl.hp.com>
6 *
7 * This stub allows us to make EFI calls in physical mode with interrupts
8 * turned off. We need this because we can't call SetVirtualMap() until
9 * the kernel has booted far enough to allow allocation of struct vma_struct
10 * entries (which we would need to map stuff with memory attributes other
11 * than uncached or writeback...). Since the GetTime() service gets called
12 * earlier than that, we need to be able to make physical mode EFI calls from
13 * the kernel.
14 */
15
16/*
17 * PSR settings as per SAL spec (Chapter 8 in the "IA-64 System
18 * Abstraction Layer Specification", revision 2.6e). Note that
19 * psr.dfl and psr.dfh MUST be cleared, despite what this manual says.
20 * Otherwise, SAL dies whenever it's trying to do an IA-32 BIOS call
21 * (the br.ia instruction fails unless psr.dfl and psr.dfh are
22 * cleared). Fortunately, SAL promises not to touch the floating
23 * point regs, so at least we don't have to save f2-f127.
24 */
25#define PSR_BITS_TO_CLEAR \
26 (IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_RT | \
27 IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED | \
28 IA64_PSR_DFL | IA64_PSR_DFH)
29
30#define PSR_BITS_TO_SET \
31 (IA64_PSR_BN)
32
33#include <asm/processor.h>
34#include <asm/asmmacro.h>
35
36/*
37 * Inputs:
38 * in0 = address of function descriptor of EFI routine to call
39 * in1..in7 = arguments to routine
40 *
41 * Outputs:
42 * r8 = EFI_STATUS returned by called function
43 */
44
45GLOBAL_ENTRY(efi_call_phys)
46 .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
47 alloc loc1=ar.pfs,8,7,7,0
48 ld8 r2=[in0],8 // load EFI function's entry point
49 mov loc0=rp
50 .body
51 ;;
52 mov loc2=gp // save global pointer
53 mov loc4=ar.rsc // save RSE configuration
54 mov ar.rsc=0 // put RSE in enforced lazy, LE mode
55 ;;
56 ld8 gp=[in0] // load EFI function's global pointer
57 movl r16=PSR_BITS_TO_CLEAR
58 mov loc3=psr // save processor status word
59 movl r17=PSR_BITS_TO_SET
60 ;;
61 or loc3=loc3,r17
62 mov b6=r2
63 ;;
64 andcm r16=loc3,r16 // get psr with IT, DT, and RT bits cleared
65 br.call.sptk.many rp=ia64_switch_mode_phys
66.ret0: mov out4=in5
67 mov out0=in1
68 mov out1=in2
69 mov out2=in3
70 mov out3=in4
71 mov out5=in6
72 mov out6=in7
73 mov loc5=r19
74 mov loc6=r20
75 br.call.sptk.many rp=b6 // call the EFI function
76.ret1: mov ar.rsc=0 // put RSE in enforced lazy, LE mode
77 mov r16=loc3
78 mov r19=loc5
79 mov r20=loc6
80 br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
81.ret2: mov ar.rsc=loc4 // restore RSE configuration
82 mov ar.pfs=loc1
83 mov rp=loc0
84 mov gp=loc2
85 br.ret.sptk.many rp
86END(efi_call_phys)
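
As the header comment notes, in0 holds the address of the EFI routine's function descriptor: efi_call_phys loads the entry point from its first 8 bytes (ld8 r2=[in0],8) and the routine's gp from the next 8 (ld8 gp=[in0]) before branching. The tiny C sketch below spells out that two-word layout; the struct name and the demo values are illustrative only.

/*
 * Illustrative sketch: the two-word IA-64 function descriptor that
 * efi_call_phys dereferences before branching. Names and values are made up.
 */
#include <stdint.h>
#include <stdio.h>

struct ia64_fdesc {
	uint64_t ip;	/* entry point of the target routine */
	uint64_t gp;	/* global pointer the routine expects on entry */
};

int main(void)
{
	struct ia64_fdesc fd = { 0xe000000000001000ull, 0xe000000000200000ull };

	/* corresponds to "ld8 r2=[in0],8" followed by "ld8 gp=[in0]" above */
	printf("entry=0x%llx gp=0x%llx\n",
	       (unsigned long long)fd.ip, (unsigned long long)fd.gp);
	return 0;
}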
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
new file mode 100644
index 000000000000..0272c010a3ba
--- /dev/null
+++ b/arch/ia64/kernel/entry.S
@@ -0,0 +1,1587 @@
1/*
2 * ia64/kernel/entry.S
3 *
4 * Kernel entry points.
5 *
6 * Copyright (C) 1998-2003, 2005 Hewlett-Packard Co
7 * David Mosberger-Tang <davidm@hpl.hp.com>
8 * Copyright (C) 1999, 2002-2003
9 * Asit Mallick <Asit.K.Mallick@intel.com>
10 * Don Dugger <Don.Dugger@intel.com>
11 * Suresh Siddha <suresh.b.siddha@intel.com>
12 * Fenghua Yu <fenghua.yu@intel.com>
13 * Copyright (C) 1999 VA Linux Systems
14 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
15 */
16/*
17 * ia64_switch_to now places the correct virtual mapping in TR2 for the
18 * kernel stack. This allows us to handle interrupts without changing
19 * to physical mode.
20 *
21 * Jonathan Nicklin <nicklin@missioncriticallinux.com>
22 * Patrick O'Rourke <orourke@missioncriticallinux.com>
23 * 11/07/2000
24 */
25/*
26 * Global (preserved) predicate usage on syscall entry/exit path:
27 *
28 * pKStk: See entry.h.
29 * pUStk: See entry.h.
30 * pSys: See entry.h.
31 * pNonSys: !pSys
32 */
33
34#include <linux/config.h>
35
36#include <asm/asmmacro.h>
37#include <asm/cache.h>
38#include <asm/errno.h>
39#include <asm/kregs.h>
40#include <asm/offsets.h>
41#include <asm/pgtable.h>
42#include <asm/percpu.h>
43#include <asm/processor.h>
44#include <asm/thread_info.h>
45#include <asm/unistd.h>
46
47#include "minstate.h"
48
49 /*
50 * execve() is special because in case of success, we need to
51 * setup a null register window frame.
52 */
53ENTRY(ia64_execve)
54 /*
55 * Allocate 8 input registers since ptrace() may clobber them
56 */
57 .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
58 alloc loc1=ar.pfs,8,2,4,0
59 mov loc0=rp
60 .body
61 mov out0=in0 // filename
62 ;; // stop bit between alloc and call
63 mov out1=in1 // argv
64 mov out2=in2 // envp
65 add out3=16,sp // regs
66 br.call.sptk.many rp=sys_execve
67.ret0:
68#ifdef CONFIG_IA32_SUPPORT
69 /*
70 * Check if we're returning to ia32 mode. If so, we need to restore ia32 registers
71 * from pt_regs.
72 */
73 adds r16=PT(CR_IPSR)+16,sp
74 ;;
75 ld8 r16=[r16]
76#endif
77 cmp4.ge p6,p7=r8,r0
78 mov ar.pfs=loc1 // restore ar.pfs
79 sxt4 r8=r8 // return 64-bit result
80 ;;
81 stf.spill [sp]=f0
82(p6) cmp.ne pKStk,pUStk=r0,r0 // a successful execve() lands us in user-mode...
83 mov rp=loc0
84(p6) mov ar.pfs=r0 // clear ar.pfs on success
85(p7) br.ret.sptk.many rp
86
87 /*
88 * In theory, we'd have to zap this state only to prevent leaking of
89 * security sensitive state (e.g., if current->mm->dumpable is zero). However,
90 * this executes in less than 20 cycles even on Itanium, so it's not worth
91 * optimizing for...
92 */
93 mov ar.unat=0; mov ar.lc=0
94 mov r4=0; mov f2=f0; mov b1=r0
95 mov r5=0; mov f3=f0; mov b2=r0
96 mov r6=0; mov f4=f0; mov b3=r0
97 mov r7=0; mov f5=f0; mov b4=r0
98 ldf.fill f12=[sp]; mov f13=f0; mov b5=r0
99 ldf.fill f14=[sp]; ldf.fill f15=[sp]; mov f16=f0
100 ldf.fill f17=[sp]; ldf.fill f18=[sp]; mov f19=f0
101 ldf.fill f20=[sp]; ldf.fill f21=[sp]; mov f22=f0
102 ldf.fill f23=[sp]; ldf.fill f24=[sp]; mov f25=f0
103 ldf.fill f26=[sp]; ldf.fill f27=[sp]; mov f28=f0
104 ldf.fill f29=[sp]; ldf.fill f30=[sp]; mov f31=f0
105#ifdef CONFIG_IA32_SUPPORT
106 tbit.nz p6,p0=r16, IA64_PSR_IS_BIT
107 movl loc0=ia64_ret_from_ia32_execve
108 ;;
109(p6) mov rp=loc0
110#endif
111 br.ret.sptk.many rp
112END(ia64_execve)
113
114/*
115 * sys_clone2(u64 flags, u64 ustack_base, u64 ustack_size, u64 parent_tidptr, u64 child_tidptr,
116 * u64 tls)
117 */
118GLOBAL_ENTRY(sys_clone2)
119 /*
120 * Allocate 8 input registers since ptrace() may clobber them
121 */
122 .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
123 alloc r16=ar.pfs,8,2,6,0
124 DO_SAVE_SWITCH_STACK
125 adds r2=PT(R16)+IA64_SWITCH_STACK_SIZE+16,sp
126 mov loc0=rp
127 mov loc1=r16 // save ar.pfs across do_fork
128 .body
129 mov out1=in1
130 mov out3=in2
131 tbit.nz p6,p0=in0,CLONE_SETTLS_BIT
132 mov out4=in3 // parent_tidptr: valid only w/CLONE_PARENT_SETTID
133 ;;
134(p6) st8 [r2]=in5 // store TLS in r16 for copy_thread()
135 mov out5=in4 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID
136 adds out2=IA64_SWITCH_STACK_SIZE+16,sp // out2 = &regs
137 mov out0=in0 // out0 = clone_flags
138 br.call.sptk.many rp=do_fork
139.ret1: .restore sp
140 adds sp=IA64_SWITCH_STACK_SIZE,sp // pop the switch stack
141 mov ar.pfs=loc1
142 mov rp=loc0
143 br.ret.sptk.many rp
144END(sys_clone2)
145
146/*
147 * sys_clone(u64 flags, u64 ustack_base, u64 parent_tidptr, u64 child_tidptr, u64 tls)
148 * Deprecated. Use sys_clone2() instead.
149 */
150GLOBAL_ENTRY(sys_clone)
151 /*
152 * Allocate 8 input registers since ptrace() may clobber them
153 */
154 .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
155 alloc r16=ar.pfs,8,2,6,0
156 DO_SAVE_SWITCH_STACK
157 adds r2=PT(R16)+IA64_SWITCH_STACK_SIZE+16,sp
158 mov loc0=rp
159 mov loc1=r16 // save ar.pfs across do_fork
160 .body
161 mov out1=in1
162 mov out3=16 // stacksize (compensates for 16-byte scratch area)
163 tbit.nz p6,p0=in0,CLONE_SETTLS_BIT
164 mov out4=in2 // parent_tidptr: valid only w/CLONE_PARENT_SETTID
165 ;;
166(p6) st8 [r2]=in4 // store TLS in r13 (tp)
167 mov out5=in3 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID
168 adds out2=IA64_SWITCH_STACK_SIZE+16,sp // out2 = &regs
169 mov out0=in0 // out0 = clone_flags
170 br.call.sptk.many rp=do_fork
171.ret2: .restore sp
172 adds sp=IA64_SWITCH_STACK_SIZE,sp // pop the switch stack
173 mov ar.pfs=loc1
174 mov rp=loc0
175 br.ret.sptk.many rp
176END(sys_clone)
177
178/*
179 * prev_task <- ia64_switch_to(struct task_struct *next)
180 * With Ingo's new scheduler, interrupts are disabled when this routine gets
181 * called. The code starting at .map relies on this. The rest of the code
182 * doesn't care about the interrupt masking status.
183 */
184GLOBAL_ENTRY(ia64_switch_to)
185 .prologue
186 alloc r16=ar.pfs,1,0,0,0
187 DO_SAVE_SWITCH_STACK
188 .body
189
190 adds r22=IA64_TASK_THREAD_KSP_OFFSET,r13
191 movl r25=init_task
192 mov r27=IA64_KR(CURRENT_STACK)
193 adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0
194 dep r20=0,in0,61,3 // physical address of "next"
195 ;;
196 st8 [r22]=sp // save kernel stack pointer of old task
197 shr.u r26=r20,IA64_GRANULE_SHIFT
198 cmp.eq p7,p6=r25,in0
199 ;;
200 /*
201 * If we've already mapped this task's page, we can skip doing it again.
202 */
203(p6) cmp.eq p7,p6=r26,r27
204(p6) br.cond.dpnt .map
205 ;;
206.done:
207(p6) ssm psr.ic // if we had to map, reenable the psr.ic bit FIRST!!!
208 ;;
209(p6) srlz.d
210 ld8 sp=[r21] // load kernel stack pointer of new task
211 mov IA64_KR(CURRENT)=in0 // update "current" application register
212 mov r8=r13 // return pointer to previously running task
213 mov r13=in0 // set "current" pointer
214 ;;
215 DO_LOAD_SWITCH_STACK
216
217#ifdef CONFIG_SMP
218 sync.i // ensure "fc"s done by this CPU are visible on other CPUs
219#endif
220 br.ret.sptk.many rp // boogie on out in new context
221
222.map:
223 rsm psr.ic // interrupts (psr.i) are already disabled here
224 movl r25=PAGE_KERNEL
225 ;;
226 srlz.d
227 or r23=r25,r20 // construct PA | page properties
228 mov r25=IA64_GRANULE_SHIFT<<2
229 ;;
230 mov cr.itir=r25
231 mov cr.ifa=in0 // VA of next task...
232 ;;
233 mov r25=IA64_TR_CURRENT_STACK
234 mov IA64_KR(CURRENT_STACK)=r26 // remember last page we mapped...
235 ;;
236 itr.d dtr[r25]=r23 // wire in new mapping...
237 br.cond.sptk .done
238END(ia64_switch_to)
239
240/*
241 * Note that interrupts are enabled during save_switch_stack and load_switch_stack. This
242 * means that we may get an interrupt with "sp" pointing to the new kernel stack while
243 * ar.bspstore is still pointing to the old kernel backing store area. Since ar.rsc,
244 * ar.rnat, ar.bsp, and ar.bspstore are all preserved by interrupts, this is not a
245 * problem. Also, we don't need to specify unwind information for preserved registers
246 * that are not modified in save_switch_stack as the right unwind information is already
247 * specified at the call-site of save_switch_stack.
248 */
249
250/*
251 * save_switch_stack:
252 * - r16 holds ar.pfs
253 * - b7 holds address to return to
254 * - rp (b0) holds return address to save
255 */
256GLOBAL_ENTRY(save_switch_stack)
257 .prologue
258 .altrp b7
259 flushrs // flush dirty regs to backing store (must be first in insn group)
260 .save @priunat,r17
261 mov r17=ar.unat // preserve caller's
262 .body
263#ifdef CONFIG_ITANIUM
264 adds r2=16+128,sp
265 adds r3=16+64,sp
266 adds r14=SW(R4)+16,sp
267 ;;
268 st8.spill [r14]=r4,16 // spill r4
269 lfetch.fault.excl.nt1 [r3],128
270 ;;
271 lfetch.fault.excl.nt1 [r2],128
272 lfetch.fault.excl.nt1 [r3],128
273 ;;
274 lfetch.fault.excl [r2]
275 lfetch.fault.excl [r3]
276 adds r15=SW(R5)+16,sp
277#else
278 add r2=16+3*128,sp
279 add r3=16,sp
280 add r14=SW(R4)+16,sp
281 ;;
282 st8.spill [r14]=r4,SW(R6)-SW(R4) // spill r4 and prefetch offset 0x1c0
283 lfetch.fault.excl.nt1 [r3],128 // prefetch offset 0x010
284 ;;
285 lfetch.fault.excl.nt1 [r3],128 // prefetch offset 0x090
286 lfetch.fault.excl.nt1 [r2],128 // prefetch offset 0x190
287 ;;
288 lfetch.fault.excl.nt1 [r3] // prefetch offset 0x110
289 lfetch.fault.excl.nt1 [r2] // prefetch offset 0x210
290 adds r15=SW(R5)+16,sp
291#endif
292 ;;
293 st8.spill [r15]=r5,SW(R7)-SW(R5) // spill r5
294 mov.m ar.rsc=0 // put RSE in mode: enforced lazy, little endian, pl 0
295 add r2=SW(F2)+16,sp // r2 = &sw->f2
296 ;;
297 st8.spill [r14]=r6,SW(B0)-SW(R6) // spill r6
298 mov.m r18=ar.fpsr // preserve fpsr
299 add r3=SW(F3)+16,sp // r3 = &sw->f3
300 ;;
301 stf.spill [r2]=f2,32
302 mov.m r19=ar.rnat
303 mov r21=b0
304
305 stf.spill [r3]=f3,32
306 st8.spill [r15]=r7,SW(B2)-SW(R7) // spill r7
307 mov r22=b1
308 ;;
309 // since we're done with the spills, read and save ar.unat:
310 mov.m r29=ar.unat
311 mov.m r20=ar.bspstore
312 mov r23=b2
313 stf.spill [r2]=f4,32
314 stf.spill [r3]=f5,32
315 mov r24=b3
316 ;;
317 st8 [r14]=r21,SW(B1)-SW(B0) // save b0
318 st8 [r15]=r23,SW(B3)-SW(B2) // save b2
319 mov r25=b4
320 mov r26=b5
321 ;;
322 st8 [r14]=r22,SW(B4)-SW(B1) // save b1
323 st8 [r15]=r24,SW(AR_PFS)-SW(B3) // save b3
324 mov r21=ar.lc // I-unit
325 stf.spill [r2]=f12,32
326 stf.spill [r3]=f13,32
327 ;;
328 st8 [r14]=r25,SW(B5)-SW(B4) // save b4
329 st8 [r15]=r16,SW(AR_LC)-SW(AR_PFS) // save ar.pfs
330 stf.spill [r2]=f14,32
331 stf.spill [r3]=f15,32
332 ;;
333 st8 [r14]=r26 // save b5
334 st8 [r15]=r21 // save ar.lc
335 stf.spill [r2]=f16,32
336 stf.spill [r3]=f17,32
337 ;;
338 stf.spill [r2]=f18,32
339 stf.spill [r3]=f19,32
340 ;;
341 stf.spill [r2]=f20,32
342 stf.spill [r3]=f21,32
343 ;;
344 stf.spill [r2]=f22,32
345 stf.spill [r3]=f23,32
346 ;;
347 stf.spill [r2]=f24,32
348 stf.spill [r3]=f25,32
349 ;;
350 stf.spill [r2]=f26,32
351 stf.spill [r3]=f27,32
352 ;;
353 stf.spill [r2]=f28,32
354 stf.spill [r3]=f29,32
355 ;;
356 stf.spill [r2]=f30,SW(AR_UNAT)-SW(F30)
357 stf.spill [r3]=f31,SW(PR)-SW(F31)
358 add r14=SW(CALLER_UNAT)+16,sp
359 ;;
360 st8 [r2]=r29,SW(AR_RNAT)-SW(AR_UNAT) // save ar.unat
361 st8 [r14]=r17,SW(AR_FPSR)-SW(CALLER_UNAT) // save caller_unat
362 mov r21=pr
363 ;;
364 st8 [r2]=r19,SW(AR_BSPSTORE)-SW(AR_RNAT) // save ar.rnat
365 st8 [r3]=r21 // save predicate registers
366 ;;
367 st8 [r2]=r20 // save ar.bspstore
368 st8 [r14]=r18 // save fpsr
369 mov ar.rsc=3 // put RSE back into eager mode, pl 0
370 br.cond.sptk.many b7
371END(save_switch_stack)
372
373/*
374 * load_switch_stack:
375 * - "invala" MUST be done at call site (normally in DO_LOAD_SWITCH_STACK)
376 * - b7 holds address to return to
377 * - must not touch r8-r11
378 */
379ENTRY(load_switch_stack)
380 .prologue
381 .altrp b7
382
383 .body
384 lfetch.fault.nt1 [sp]
385 adds r2=SW(AR_BSPSTORE)+16,sp
386 adds r3=SW(AR_UNAT)+16,sp
387 mov ar.rsc=0 // put RSE into enforced lazy mode
388 adds r14=SW(CALLER_UNAT)+16,sp
389 adds r15=SW(AR_FPSR)+16,sp
390 ;;
391 ld8 r27=[r2],(SW(B0)-SW(AR_BSPSTORE)) // bspstore
392 ld8 r29=[r3],(SW(B1)-SW(AR_UNAT)) // unat
393 ;;
394 ld8 r21=[r2],16 // restore b0
395 ld8 r22=[r3],16 // restore b1
396 ;;
397 ld8 r23=[r2],16 // restore b2
398 ld8 r24=[r3],16 // restore b3
399 ;;
400 ld8 r25=[r2],16 // restore b4
401 ld8 r26=[r3],16 // restore b5
402 ;;
403 ld8 r16=[r2],(SW(PR)-SW(AR_PFS)) // ar.pfs
404 ld8 r17=[r3],(SW(AR_RNAT)-SW(AR_LC)) // ar.lc
405 ;;
406 ld8 r28=[r2] // restore pr
407 ld8 r30=[r3] // restore rnat
408 ;;
409 ld8 r18=[r14],16 // restore caller's unat
410 ld8 r19=[r15],24 // restore fpsr
411 ;;
412 ldf.fill f2=[r14],32
413 ldf.fill f3=[r15],32
414 ;;
415 ldf.fill f4=[r14],32
416 ldf.fill f5=[r15],32
417 ;;
418 ldf.fill f12=[r14],32
419 ldf.fill f13=[r15],32
420 ;;
421 ldf.fill f14=[r14],32
422 ldf.fill f15=[r15],32
423 ;;
424 ldf.fill f16=[r14],32
425 ldf.fill f17=[r15],32
426 ;;
427 ldf.fill f18=[r14],32
428 ldf.fill f19=[r15],32
429 mov b0=r21
430 ;;
431 ldf.fill f20=[r14],32
432 ldf.fill f21=[r15],32
433 mov b1=r22
434 ;;
435 ldf.fill f22=[r14],32
436 ldf.fill f23=[r15],32
437 mov b2=r23
438 ;;
439 mov ar.bspstore=r27
440 mov ar.unat=r29 // establish unat holding the NaT bits for r4-r7
441 mov b3=r24
442 ;;
443 ldf.fill f24=[r14],32
444 ldf.fill f25=[r15],32
445 mov b4=r25
446 ;;
447 ldf.fill f26=[r14],32
448 ldf.fill f27=[r15],32
449 mov b5=r26
450 ;;
451 ldf.fill f28=[r14],32
452 ldf.fill f29=[r15],32
453 mov ar.pfs=r16
454 ;;
455 ldf.fill f30=[r14],32
456 ldf.fill f31=[r15],24
457 mov ar.lc=r17
458 ;;
459 ld8.fill r4=[r14],16
460 ld8.fill r5=[r15],16
461 mov pr=r28,-1
462 ;;
463 ld8.fill r6=[r14],16
464 ld8.fill r7=[r15],16
465
466 mov ar.unat=r18 // restore caller's unat
467 mov ar.rnat=r30 // must restore after bspstore but before rsc!
468 mov ar.fpsr=r19 // restore fpsr
469 mov ar.rsc=3 // put RSE back into eager mode, pl 0
470 br.cond.sptk.many b7
471END(load_switch_stack)
472
473GLOBAL_ENTRY(__ia64_syscall)
474 .regstk 6,0,0,0
475 mov r15=in5 // put syscall number in place
476 break __BREAK_SYSCALL
477 movl r2=errno
478 cmp.eq p6,p7=-1,r10
479 ;;
480(p6) st4 [r2]=r8
481(p6) mov r8=-1
482 br.ret.sptk.many rp
483END(__ia64_syscall)
484
485GLOBAL_ENTRY(execve)
486 mov r15=__NR_execve // put syscall number in place
487 break __BREAK_SYSCALL
488 br.ret.sptk.many rp
489END(execve)
490
491GLOBAL_ENTRY(clone)
492 mov r15=__NR_clone // put syscall number in place
493 break __BREAK_SYSCALL
494 br.ret.sptk.many rp
495END(clone)
496
497 /*
498 * Invoke a system call, but do some tracing before and after the call.
499 * We MUST preserve the current register frame throughout this routine
500 * because some system calls (such as ia64_execve) directly
501 * manipulate ar.pfs.
502 */
503GLOBAL_ENTRY(ia64_trace_syscall)
504 PT_REGS_UNWIND_INFO(0)
505 /*
506 * We need to preserve the scratch registers f6-f11 in case the system
507 * call is sigreturn.
508 */
509 adds r16=PT(F6)+16,sp
510 adds r17=PT(F7)+16,sp
511 ;;
512 stf.spill [r16]=f6,32
513 stf.spill [r17]=f7,32
514 ;;
515 stf.spill [r16]=f8,32
516 stf.spill [r17]=f9,32
517 ;;
518 stf.spill [r16]=f10
519 stf.spill [r17]=f11
520 br.call.sptk.many rp=syscall_trace_enter // give parent a chance to catch syscall args
521 adds r16=PT(F6)+16,sp
522 adds r17=PT(F7)+16,sp
523 ;;
524 ldf.fill f6=[r16],32
525 ldf.fill f7=[r17],32
526 ;;
527 ldf.fill f8=[r16],32
528 ldf.fill f9=[r17],32
529 ;;
530 ldf.fill f10=[r16]
531 ldf.fill f11=[r17]
532 // the syscall number may have changed, so re-load it and re-calculate the
533 // syscall entry-point:
534 adds r15=PT(R15)+16,sp // r15 = &pt_regs.r15 (syscall #)
535 ;;
536 ld8 r15=[r15]
537 mov r3=NR_syscalls - 1
538 ;;
539 adds r15=-1024,r15
540 movl r16=sys_call_table
541 ;;
542 shladd r20=r15,3,r16 // r20 = sys_call_table + 8*(syscall-1024)
543 cmp.leu p6,p7=r15,r3
544 ;;
545(p6) ld8 r20=[r20] // load address of syscall entry point
546(p7) movl r20=sys_ni_syscall
547 ;;
548 mov b6=r20
549 br.call.sptk.many rp=b6 // do the syscall
550.strace_check_retval:
551 cmp.lt p6,p0=r8,r0 // syscall failed?
552 adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8
553 adds r3=PT(R10)+16,sp // r3 = &pt_regs.r10
554 mov r10=0
555(p6) br.cond.sptk strace_error // syscall failed ->
556 ;; // avoid RAW on r10
557.strace_save_retval:
558.mem.offset 0,0; st8.spill [r2]=r8 // store return value in slot for r8
559.mem.offset 8,0; st8.spill [r3]=r10 // clear error indication in slot for r10
560 br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
561.ret3: br.cond.sptk .work_pending_syscall_end
562
563strace_error:
564 ld8 r3=[r2] // load pt_regs.r8
565 sub r9=0,r8 // negate return value to get errno value
566 ;;
567 cmp.ne p6,p0=r3,r0 // is pt_regs.r8!=0?
568 adds r3=16,r2 // r3=&pt_regs.r10
569 ;;
570(p6) mov r10=-1
571(p6) mov r8=r9
572 br.cond.sptk .strace_save_retval
573END(ia64_trace_syscall)
574
575 /*
576 * When traced and returning from sigreturn, we invoke syscall_trace but then
577 * go straight to ia64_leave_kernel rather than ia64_leave_syscall.
578 */
579GLOBAL_ENTRY(ia64_strace_leave_kernel)
580 PT_REGS_UNWIND_INFO(0)
581{ /*
582 * Some versions of gas generate bad unwind info if the first instruction of a
583 * procedure doesn't go into the first slot of a bundle. This is a workaround.
584 */
585 nop.m 0
586 nop.i 0
587 br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
588}
589.ret4: br.cond.sptk ia64_leave_kernel
590END(ia64_strace_leave_kernel)
591
592GLOBAL_ENTRY(ia64_ret_from_clone)
593 PT_REGS_UNWIND_INFO(0)
594{ /*
595 * Some versions of gas generate bad unwind info if the first instruction of a
596 * procedure doesn't go into the first slot of a bundle. This is a workaround.
597 */
598 nop.m 0
599 nop.i 0
600 /*
601 * We need to call schedule_tail() to complete the scheduling process.
602 * Called by ia64_switch_to() after do_fork()->copy_thread(). r8 contains the
603 * address of the previously executing task.
604 */
605 br.call.sptk.many rp=ia64_invoke_schedule_tail
606}
607.ret8:
608 adds r2=TI_FLAGS+IA64_TASK_SIZE,r13
609 ;;
610 ld4 r2=[r2]
611 ;;
612 mov r8=0
613 and r2=_TIF_SYSCALL_TRACEAUDIT,r2
614 ;;
615 cmp.ne p6,p0=r2,r0
616(p6) br.cond.spnt .strace_check_retval
617 ;; // added stop bits to prevent r8 dependency
618END(ia64_ret_from_clone)
619 // fall through
620GLOBAL_ENTRY(ia64_ret_from_syscall)
621 PT_REGS_UNWIND_INFO(0)
622 cmp.ge p6,p7=r8,r0 // syscall executed successfully?
623 adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8
624 mov r10=r0 // clear error indication in r10
625(p7) br.cond.spnt handle_syscall_error // handle potential syscall failure
626END(ia64_ret_from_syscall)
627 // fall through
628/*
629 * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
630 * need to switch to bank 0 and doesn't restore the scratch registers.
631 * To avoid leaking kernel bits, the scratch registers are set to
632 * the following known-to-be-safe values:
633 *
634 * r1: restored (global pointer)
635 * r2: cleared
636 * r3: 1 (when returning to user-level)
637 * r8-r11: restored (syscall return value(s))
638 * r12: restored (user-level stack pointer)
639 * r13: restored (user-level thread pointer)
640 * r14: cleared
641 * r15: restored (syscall #)
642 * r16-r17: cleared
643 * r18: user-level b6
644 * r19: cleared
645 * r20: user-level ar.fpsr
646 * r21: user-level b0
647 * r22: cleared
648 * r23: user-level ar.bspstore
649 * r24: user-level ar.rnat
650 * r25: user-level ar.unat
651 * r26: user-level ar.pfs
652 * r27: user-level ar.rsc
653 * r28: user-level ip
654 * r29: user-level psr
655 * r30: user-level cfm
656 * r31: user-level pr
657 * f6-f11: cleared
658 * pr: restored (user-level pr)
659 * b0: restored (user-level rp)
660 * b6: restored
661 * b7: cleared
662 * ar.unat: restored (user-level ar.unat)
663 * ar.pfs: restored (user-level ar.pfs)
664 * ar.rsc: restored (user-level ar.rsc)
665 * ar.rnat: restored (user-level ar.rnat)
666 * ar.bspstore: restored (user-level ar.bspstore)
667 * ar.fpsr: restored (user-level ar.fpsr)
668 * ar.ccv: cleared
669 * ar.csd: cleared
670 * ar.ssd: cleared
671 */
672ENTRY(ia64_leave_syscall)
673 PT_REGS_UNWIND_INFO(0)
674 /*
675 * work.need_resched etc. mustn't get changed by this CPU before it returns to
676 * user- or fsys-mode, hence we disable interrupts early on.
677 *
678 * p6 controls whether current_thread_info()->flags needs to be checked for
679 * extra work. We always check for extra work when returning to user-level.
680 * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
681 * is 0. After extra work processing has been completed, execution
682 * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check
683 * needs to be redone.
684 */
685#ifdef CONFIG_PREEMPT
686 rsm psr.i // disable interrupts
687 cmp.eq pLvSys,p0=r0,r0 // pLvSys=1: leave from syscall
688(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
689 ;;
690 .pred.rel.mutex pUStk,pKStk
691(pKStk) ld4 r21=[r20] // r21 <- preempt_count
692(pUStk) mov r21=0 // r21 <- 0
693 ;;
694 cmp.eq p6,p0=r21,r0 // p6 <- pUStk || (preempt_count == 0)
695#else /* !CONFIG_PREEMPT */
696(pUStk) rsm psr.i
697 cmp.eq pLvSys,p0=r0,r0 // pLvSys=1: leave from syscall
698(pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk
699#endif
700.work_processed_syscall:
701 adds r2=PT(LOADRS)+16,r12
702 adds r3=PT(AR_BSPSTORE)+16,r12
703 adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
704 ;;
705(p6) ld4 r31=[r18] // load current_thread_info()->flags
706 ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs"
707 mov b7=r0 // clear b7
708 ;;
709 ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE) // load ar.bspstore (may be garbage)
710 ld8 r18=[r2],PT(R9)-PT(B6) // load b6
711(p6) and r15=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE?
712 ;;
713 mov r16=ar.bsp // M2 get existing backing store pointer
714(p6) cmp4.ne.unc p6,p0=r15, r0 // any special work pending?
715(p6) br.cond.spnt .work_pending_syscall
716 ;;
717 // start restoring the state saved on the kernel stack (struct pt_regs):
718 ld8 r9=[r2],PT(CR_IPSR)-PT(R9)
719 ld8 r11=[r3],PT(CR_IIP)-PT(R11)
720 mov f6=f0 // clear f6
721 ;;
722 invala // M0|1 invalidate ALAT
723 rsm psr.i | psr.ic // M2 initiate turning off of interrupt and interruption collection
724 mov f9=f0 // clear f9
725
726 ld8 r29=[r2],16 // load cr.ipsr
727 ld8 r28=[r3],16 // load cr.iip
728 mov f8=f0 // clear f8
729 ;;
730 ld8 r30=[r2],16 // M0|1 load cr.ifs
731 mov.m ar.ssd=r0 // M2 clear ar.ssd
732 cmp.eq p9,p0=r0,r0 // set p9 to indicate that we should restore cr.ifs
733 ;;
734 ld8 r25=[r3],16 // M0|1 load ar.unat
735 mov.m ar.csd=r0 // M2 clear ar.csd
736 mov r22=r0 // clear r22
737 ;;
738 ld8 r26=[r2],PT(B0)-PT(AR_PFS) // M0|1 load ar.pfs
739(pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled
740 mov f10=f0 // clear f10
741 ;;
742 ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // load b0
743 ld8 r27=[r3],PT(PR)-PT(AR_RSC) // load ar.rsc
744 mov f11=f0 // clear f11
745 ;;
746 ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT) // load ar.rnat (may be garbage)
747 ld8 r31=[r3],PT(R1)-PT(PR) // load predicates
748(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
749 ;;
750 ld8 r20=[r2],PT(R12)-PT(AR_FPSR) // load ar.fpsr
751 ld8.fill r1=[r3],16 // load r1
752(pUStk) mov r17=1
753 ;;
754 srlz.d // M0 ensure interruption collection is off
755 ld8.fill r13=[r3],16
756 mov f7=f0 // clear f7
757 ;;
758 ld8.fill r12=[r2] // restore r12 (sp)
759 ld8.fill r15=[r3] // restore r15
760 addl r3=THIS_CPU(ia64_phys_stacked_size_p8),r0
761 ;;
762(pUStk) ld4 r3=[r3] // r3 = cpu_data->phys_stacked_size_p8
763(pUStk) st1 [r14]=r17
764 mov b6=r18 // I0 restore b6
765 ;;
766 mov r14=r0 // clear r14
767 shr.u r18=r19,16 // I0|1 get byte size of existing "dirty" partition
768(pKStk) br.cond.dpnt.many skip_rbs_switch
769
770 mov.m ar.ccv=r0 // clear ar.ccv
771(pNonSys) br.cond.dpnt.many dont_preserve_current_frame
772 br.cond.sptk.many rbs_switch
773END(ia64_leave_syscall)
774
775#ifdef CONFIG_IA32_SUPPORT
776GLOBAL_ENTRY(ia64_ret_from_ia32_execve)
777 PT_REGS_UNWIND_INFO(0)
778 adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8
779 adds r3=PT(R10)+16,sp // r3 = &pt_regs.r10
780 ;;
781 .mem.offset 0,0
782 st8.spill [r2]=r8 // store return value in slot for r8 and set unat bit
783 .mem.offset 8,0
784 st8.spill [r3]=r0 // clear error indication in slot for r10 and set unat bit
785END(ia64_ret_from_ia32_execve)
786 // fall through
787#endif /* CONFIG_IA32_SUPPORT */
788GLOBAL_ENTRY(ia64_leave_kernel)
789 PT_REGS_UNWIND_INFO(0)
790 /*
791 * work.need_resched etc. mustn't get changed by this CPU before it returns to
792 * user- or fsys-mode, hence we disable interrupts early on.
793 *
794	 * p6 controls whether current_thread_info()->flags needs to be checked for
795 * extra work. We always check for extra work when returning to user-level.
796 * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
797 * is 0. After extra work processing has been completed, execution
798	 * resumes at .work_processed_kernel with p6 set to 1 if the extra-work-check
799 * needs to be redone.
800 */
801#ifdef CONFIG_PREEMPT
802 rsm psr.i // disable interrupts
803 cmp.eq p0,pLvSys=r0,r0 // pLvSys=0: leave from kernel
804(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
805 ;;
806 .pred.rel.mutex pUStk,pKStk
807(pKStk) ld4 r21=[r20] // r21 <- preempt_count
808(pUStk) mov r21=0 // r21 <- 0
809 ;;
810 cmp.eq p6,p0=r21,r0 // p6 <- pUStk || (preempt_count == 0)
811#else
812(pUStk) rsm psr.i
813 cmp.eq p0,pLvSys=r0,r0 // pLvSys=0: leave from kernel
814(pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk
815#endif
816.work_processed_kernel:
817 adds r17=TI_FLAGS+IA64_TASK_SIZE,r13
818 ;;
819(p6) ld4 r31=[r17] // load current_thread_info()->flags
820 adds r21=PT(PR)+16,r12
821 ;;
822
823 lfetch [r21],PT(CR_IPSR)-PT(PR)
824 adds r2=PT(B6)+16,r12
825 adds r3=PT(R16)+16,r12
826 ;;
827 lfetch [r21]
828 ld8 r28=[r2],8 // load b6
829 adds r29=PT(R24)+16,r12
830
831 ld8.fill r16=[r3],PT(AR_CSD)-PT(R16)
832 adds r30=PT(AR_CCV)+16,r12
833(p6) and r19=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE?
834 ;;
835 ld8.fill r24=[r29]
836 ld8 r15=[r30] // load ar.ccv
837(p6) cmp4.ne.unc p6,p0=r19, r0 // any special work pending?
838 ;;
839 ld8 r29=[r2],16 // load b7
840 ld8 r30=[r3],16 // load ar.csd
841(p6) br.cond.spnt .work_pending
842 ;;
843 ld8 r31=[r2],16 // load ar.ssd
844 ld8.fill r8=[r3],16
845 ;;
846 ld8.fill r9=[r2],16
847 ld8.fill r10=[r3],PT(R17)-PT(R10)
848 ;;
849 ld8.fill r11=[r2],PT(R18)-PT(R11)
850 ld8.fill r17=[r3],16
851 ;;
852 ld8.fill r18=[r2],16
853 ld8.fill r19=[r3],16
854 ;;
855 ld8.fill r20=[r2],16
856 ld8.fill r21=[r3],16
857 mov ar.csd=r30
858 mov ar.ssd=r31
859 ;;
860 rsm psr.i | psr.ic // initiate turning off of interrupt and interruption collection
861 invala // invalidate ALAT
862 ;;
863 ld8.fill r22=[r2],24
864 ld8.fill r23=[r3],24
865 mov b6=r28
866 ;;
867 ld8.fill r25=[r2],16
868 ld8.fill r26=[r3],16
869 mov b7=r29
870 ;;
871 ld8.fill r27=[r2],16
872 ld8.fill r28=[r3],16
873 ;;
874 ld8.fill r29=[r2],16
875 ld8.fill r30=[r3],24
876 ;;
877 ld8.fill r31=[r2],PT(F9)-PT(R31)
878 adds r3=PT(F10)-PT(F6),r3
879 ;;
880 ldf.fill f9=[r2],PT(F6)-PT(F9)
881 ldf.fill f10=[r3],PT(F8)-PT(F10)
882 ;;
883 ldf.fill f6=[r2],PT(F7)-PT(F6)
884 ;;
885 ldf.fill f7=[r2],PT(F11)-PT(F7)
886 ldf.fill f8=[r3],32
887 ;;
888 srlz.i // ensure interruption collection is off
889 mov ar.ccv=r15
890 ;;
891 ldf.fill f11=[r2]
892 bsw.0 // switch back to bank 0 (no stop bit required beforehand...)
893 ;;
894(pUStk) mov r18=IA64_KR(CURRENT)// M2 (12 cycle read latency)
895 adds r16=PT(CR_IPSR)+16,r12
896 adds r17=PT(CR_IIP)+16,r12
897
898(pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled
899 nop.i 0
900 nop.i 0
901 ;;
902 ld8 r29=[r16],16 // load cr.ipsr
903 ld8 r28=[r17],16 // load cr.iip
904 ;;
905 ld8 r30=[r16],16 // load cr.ifs
906 ld8 r25=[r17],16 // load ar.unat
907 ;;
908 ld8 r26=[r16],16 // load ar.pfs
909 ld8 r27=[r17],16 // load ar.rsc
910 cmp.eq p9,p0=r0,r0 // set p9 to indicate that we should restore cr.ifs
911 ;;
912 ld8 r24=[r16],16 // load ar.rnat (may be garbage)
913 ld8 r23=[r17],16 // load ar.bspstore (may be garbage)
914 ;;
915 ld8 r31=[r16],16 // load predicates
916 ld8 r21=[r17],16 // load b0
917 ;;
918 ld8 r19=[r16],16 // load ar.rsc value for "loadrs"
919 ld8.fill r1=[r17],16 // load r1
920 ;;
921 ld8.fill r12=[r16],16
922 ld8.fill r13=[r17],16
923(pUStk) adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18
924 ;;
925 ld8 r20=[r16],16 // ar.fpsr
926 ld8.fill r15=[r17],16
927 ;;
928 ld8.fill r14=[r16],16
929 ld8.fill r2=[r17]
930(pUStk) mov r17=1
931 ;;
932 ld8.fill r3=[r16]
933(pUStk) st1 [r18]=r17 // restore current->thread.on_ustack
934 shr.u r18=r19,16 // get byte size of existing "dirty" partition
935 ;;
936 mov r16=ar.bsp // get existing backing store pointer
937 addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0
938 ;;
939 ld4 r17=[r17] // r17 = cpu_data->phys_stacked_size_p8
940(pKStk) br.cond.dpnt skip_rbs_switch
941
942 /*
943 * Restore user backing store.
944 *
945 * NOTE: alloc, loadrs, and cover can't be predicated.
946 */
947(pNonSys) br.cond.dpnt dont_preserve_current_frame
948
949rbs_switch:
950 cover // add current frame into dirty partition and set cr.ifs
951 ;;
952 mov r19=ar.bsp // get new backing store pointer
953 sub r16=r16,r18 // krbs = old bsp - size of dirty partition
954 cmp.ne p9,p0=r0,r0 // clear p9 to skip restore of cr.ifs
955 ;;
956 sub r19=r19,r16 // calculate total byte size of dirty partition
957 add r18=64,r18 // don't force in0-in7 into memory...
958 ;;
959 shl r19=r19,16 // shift size of dirty partition into loadrs position
960 ;;
961dont_preserve_current_frame:
962 /*
963 * To prevent leaking bits between the kernel and user-space,
964 * we must clear the stacked registers in the "invalid" partition here.
965 * Not pretty, but at least it's fast (3.34 registers/cycle on Itanium,
966 * 5 registers/cycle on McKinley).
967 */
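	// For readability: the frame-clearing code below is a recursion over register-stack
	// frames.  A rough C-level sketch (hypothetical names, illustrative only; the real
	// code zeroes stacked registers by allocating RSE frames, not by writing memory):
	//
	//	static void clear_invalid(long bytes_left, long depth)
	//	{
	//		const long nregs = 14;			// 10 on Itanium, 14 on McKinley
	//
	//		if (bytes_left > nregs * 8)		// more than one frame left to clear?
	//			clear_invalid(bytes_left - nregs * 8, depth + 1);
	//		// ... zero this frame's nregs stacked locals ...
	//		if (depth != 0)
	//			return;				// unwind back up the call chain
	//	}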
968# define pRecurse p6
969# define pReturn p7
970#ifdef CONFIG_ITANIUM
971# define Nregs 10
972#else
973# define Nregs 14
974#endif
975 alloc loc0=ar.pfs,2,Nregs-2,2,0
976 shr.u loc1=r18,9 // RNaTslots <= floor(dirtySize / (64*8))
977 sub r17=r17,r18 // r17 = (physStackedSize + 8) - dirtySize
978 ;;
979 mov ar.rsc=r19 // load ar.rsc to be used for "loadrs"
980 shladd in0=loc1,3,r17
981 mov in1=0
982 ;;
983 TEXT_ALIGN(32)
984rse_clear_invalid:
985#ifdef CONFIG_ITANIUM
986 // cycle 0
987 { .mii
988 alloc loc0=ar.pfs,2,Nregs-2,2,0
989 cmp.lt pRecurse,p0=Nregs*8,in0 // if more than Nregs regs left to clear, (re)curse
990 add out0=-Nregs*8,in0
991}{ .mfb
992 add out1=1,in1 // increment recursion count
993 nop.f 0
994 nop.b 0 // can't do br.call here because of alloc (WAW on CFM)
995 ;;
996}{ .mfi // cycle 1
997 mov loc1=0
998 nop.f 0
999 mov loc2=0
1000}{ .mib
1001 mov loc3=0
1002 mov loc4=0
1003(pRecurse) br.call.sptk.many b0=rse_clear_invalid
1004
1005}{ .mfi // cycle 2
1006 mov loc5=0
1007 nop.f 0
1008 cmp.ne pReturn,p0=r0,in1 // if recursion count != 0, we need to do a br.ret
1009}{ .mib
1010 mov loc6=0
1011 mov loc7=0
1012(pReturn) br.ret.sptk.many b0
1013}
1014#else /* !CONFIG_ITANIUM */
1015 alloc loc0=ar.pfs,2,Nregs-2,2,0
1016 cmp.lt pRecurse,p0=Nregs*8,in0 // if more than Nregs regs left to clear, (re)curse
1017 add out0=-Nregs*8,in0
1018 add out1=1,in1 // increment recursion count
1019 mov loc1=0
1020 mov loc2=0
1021 ;;
1022 mov loc3=0
1023 mov loc4=0
1024 mov loc5=0
1025 mov loc6=0
1026 mov loc7=0
1027(pRecurse) br.call.sptk.few b0=rse_clear_invalid
1028 ;;
1029 mov loc8=0
1030 mov loc9=0
1031 cmp.ne pReturn,p0=r0,in1 // if recursion count != 0, we need to do a br.ret
1032 mov loc10=0
1033 mov loc11=0
1034(pReturn) br.ret.sptk.many b0
1035#endif /* !CONFIG_ITANIUM */
1036# undef pRecurse
1037# undef pReturn
1038 ;;
1039 alloc r17=ar.pfs,0,0,0,0 // drop current register frame
1040 ;;
1041 loadrs
1042 ;;
1043skip_rbs_switch:
1044 mov ar.unat=r25 // M2
1045(pKStk) extr.u r22=r22,21,1 // I0 extract current value of psr.pp from r22
1046(pLvSys)mov r19=r0 // A clear r19 for leave_syscall, no-op otherwise
1047 ;;
1048(pUStk) mov ar.bspstore=r23 // M2
1049(pKStk) dep r29=r22,r29,21,1 // I0 update ipsr.pp with psr.pp
1050(pLvSys)mov r16=r0 // A clear r16 for leave_syscall, no-op otherwise
1051 ;;
1052 mov cr.ipsr=r29 // M2
1053 mov ar.pfs=r26 // I0
1054(pLvSys)mov r17=r0 // A clear r17 for leave_syscall, no-op otherwise
1055
1056(p9) mov cr.ifs=r30 // M2
1057 mov b0=r21 // I0
1058(pLvSys)mov r18=r0 // A clear r18 for leave_syscall, no-op otherwise
1059
1060 mov ar.fpsr=r20 // M2
1061 mov cr.iip=r28 // M2
1062 nop 0
1063 ;;
1064(pUStk) mov ar.rnat=r24 // M2 must happen with RSE in lazy mode
1065 nop 0
1066(pLvSys)mov r2=r0
1067
1068 mov ar.rsc=r27 // M2
1069 mov pr=r31,-1 // I0
1070 rfi // B
1071
1072 /*
1073 * On entry:
1074 * r20 = &current->thread_info->pre_count (if CONFIG_PREEMPT)
1075 * r31 = current->thread_info->flags
1076 * On exit:
1077 * p6 = TRUE if work-pending-check needs to be redone
1078 */
1079.work_pending_syscall:
1080 add r2=-8,r2
1081 add r3=-8,r3
1082 ;;
1083 st8 [r2]=r8
1084 st8 [r3]=r10
1085.work_pending:
1086 tbit.nz p6,p0=r31,TIF_SIGDELAYED // signal delayed from MCA/INIT/NMI/PMI context?
1087(p6) br.cond.sptk.few .sigdelayed
1088 ;;
1089 tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0?
1090(p6) br.cond.sptk.few .notify
1091#ifdef CONFIG_PREEMPT
1092(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1
1093 ;;
1094(pKStk) st4 [r20]=r21
1095 ssm psr.i // enable interrupts
1096#endif
1097 br.call.spnt.many rp=schedule
1098.ret9: cmp.eq p6,p0=r0,r0 // p6 <- 1
1099 rsm psr.i // disable interrupts
1100 ;;
1101#ifdef CONFIG_PREEMPT
1102(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
1103 ;;
1104(pKStk) st4 [r20]=r0 // preempt_count() <- 0
1105#endif
1106(pLvSys)br.cond.sptk.few .work_pending_syscall_end
1107 br.cond.sptk.many .work_processed_kernel // re-check
1108
1109.notify:
1110(pUStk) br.call.spnt.many rp=notify_resume_user
1111.ret10: cmp.ne p6,p0=r0,r0 // p6 <- 0
1112(pLvSys)br.cond.sptk.few .work_pending_syscall_end
1113 br.cond.sptk.many .work_processed_kernel // don't re-check
1114
1115// There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where
1116// it could not be delivered. Deliver it now. The signal might be for us and
1117// may set TIF_SIGPENDING, so redrive ia64_leave_* after processing the delayed
1118// signal.
1119
1120.sigdelayed:
1121 br.call.sptk.many rp=do_sigdelayed
1122 cmp.eq p6,p0=r0,r0 // p6 <- 1, always re-check
1123(pLvSys)br.cond.sptk.few .work_pending_syscall_end
1124 br.cond.sptk.many .work_processed_kernel // re-check
1125
1126.work_pending_syscall_end:
1127 adds r2=PT(R8)+16,r12
1128 adds r3=PT(R10)+16,r12
1129 ;;
1130 ld8 r8=[r2]
1131 ld8 r10=[r3]
1132 br.cond.sptk.many .work_processed_syscall // re-check
1133
1134END(ia64_leave_kernel)
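	// For orientation only: the .work_pending re-check logic above corresponds roughly
	// to the C loop below (predicate bookkeeping and the pLvSys syscall/kernel split
	// are omitted; this is a sketch, not the actual C implementation):
	//
	//	for (;;) {
	//		unsigned int flags = current_thread_info()->flags;
	//
	//		if (!(flags & TIF_WORK_MASK))
	//			break;				// nothing to do, go restore state
	//		if (flags & _TIF_SIGDELAYED) {
	//			do_sigdelayed();		// signal deferred from MCA/INIT/NMI/PMI
	//			continue;			// always re-check
	//		}
	//		if (flags & _TIF_NEED_RESCHED) {
	//			schedule();			// may switch tasks; re-check afterwards
	//			continue;
	//		}
	//		notify_resume_user();			// signal delivery etc. (user return only;
	//		break;					//  arguments omitted in this sketch)
	//	}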
1135
1136ENTRY(handle_syscall_error)
1137 /*
1138 * Some system calls (e.g., ptrace, mmap) can return arbitrary values which could
1139	 * lead us to mistake a negative return value for a failed syscall. Those syscalls
1140 * must deposit a non-zero value in pt_regs.r8 to indicate an error. If
1141 * pt_regs.r8 is zero, we assume that the call completed successfully.
1142 */
1143 PT_REGS_UNWIND_INFO(0)
1144 ld8 r3=[r2] // load pt_regs.r8
1145 ;;
1146 cmp.eq p6,p7=r3,r0 // is pt_regs.r8==0?
1147 ;;
1148(p7) mov r10=-1
1149(p7) sub r8=0,r8 // negate return value to get errno
1150 br.cond.sptk ia64_leave_syscall
1151END(handle_syscall_error)
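	// The userland error convention handled above, as a rough C sketch (r8/r10 stand
	// for the live scratch registers, "regs" for the current struct pt_regs; note that
	// handle_syscall_error is entered only when the live r8 is negative):
	//
	//	if (regs->r8 != 0) {		// the syscall really failed
	//		r10 = -1;		// tell userland: error
	//		r8  = -r8;		// turn -errno into a positive errno
	//	}
	//	// else: the negative r8 is a legitimate return value (e.g. an mmap address)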
1152
1153 /*
1154 * Invoke schedule_tail(task) while preserving in0-in7, which may be needed
1155 * in case a system call gets restarted.
1156 */
1157GLOBAL_ENTRY(ia64_invoke_schedule_tail)
1158 .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
1159 alloc loc1=ar.pfs,8,2,1,0
1160 mov loc0=rp
1161 mov out0=r8 // Address of previous task
1162 ;;
1163 br.call.sptk.many rp=schedule_tail
1164.ret11: mov ar.pfs=loc1
1165 mov rp=loc0
1166 br.ret.sptk.many rp
1167END(ia64_invoke_schedule_tail)
1168
1169 /*
1170	 * Set up the stack and call do_notify_resume_user(). Note that pSys and pNonSys need to
1171 * be set up by the caller. We declare 8 input registers so the system call
1172 * args get preserved, in case we need to restart a system call.
1173 */
1174ENTRY(notify_resume_user)
1175 .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
1176 alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart!
1177 mov r9=ar.unat
1178 mov loc0=rp // save return address
1179 mov out0=0 // there is no "oldset"
1180 adds out1=8,sp // out1=&sigscratch->ar_pfs
1181(pSys) mov out2=1 // out2==1 => we're in a syscall
1182 ;;
1183(pNonSys) mov out2=0 // out2==0 => not a syscall
1184 .fframe 16
1185 .spillpsp ar.unat, 16 // (note that offset is relative to psp+0x10!)
1186 st8 [sp]=r9,-16 // allocate space for ar.unat and save it
1187 st8 [out1]=loc1,-8 // save ar.pfs, out1=&sigscratch
1188 .body
1189 br.call.sptk.many rp=do_notify_resume_user
1190.ret15: .restore sp
1191 adds sp=16,sp // pop scratch stack space
1192 ;;
1193 ld8 r9=[sp] // load new unat from sigscratch->scratch_unat
1194 mov rp=loc0
1195 ;;
1196 mov ar.unat=r9
1197 mov ar.pfs=loc1
1198 br.ret.sptk.many rp
1199END(notify_resume_user)
1200
1201GLOBAL_ENTRY(sys_rt_sigsuspend)
1202 .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
1203 alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart!
1204 mov r9=ar.unat
1205 mov loc0=rp // save return address
1206 mov out0=in0 // mask
1207 mov out1=in1 // sigsetsize
1208 adds out2=8,sp // out2=&sigscratch->ar_pfs
1209 ;;
1210 .fframe 16
1211 .spillpsp ar.unat, 16 // (note that offset is relative to psp+0x10!)
1212 st8 [sp]=r9,-16 // allocate space for ar.unat and save it
1213 st8 [out2]=loc1,-8 // save ar.pfs, out2=&sigscratch
1214 .body
1215 br.call.sptk.many rp=ia64_rt_sigsuspend
1216.ret17: .restore sp
1217 adds sp=16,sp // pop scratch stack space
1218 ;;
1219 ld8 r9=[sp] // load new unat from sw->caller_unat
1220 mov rp=loc0
1221 ;;
1222 mov ar.unat=r9
1223 mov ar.pfs=loc1
1224 br.ret.sptk.many rp
1225END(sys_rt_sigsuspend)
1226
1227ENTRY(sys_rt_sigreturn)
1228 PT_REGS_UNWIND_INFO(0)
1229 /*
1230 * Allocate 8 input registers since ptrace() may clobber them
1231 */
1232 alloc r2=ar.pfs,8,0,1,0
1233 .prologue
1234 PT_REGS_SAVES(16)
1235 adds sp=-16,sp
1236 .body
1237 cmp.eq pNonSys,pSys=r0,r0 // sigreturn isn't a normal syscall...
1238 ;;
1239 /*
1240 * leave_kernel() restores f6-f11 from pt_regs, but since the streamlined
1241 * syscall-entry path does not save them we save them here instead. Note: we
1242	 * don't need to save any other registers that are not saved by the streamlined
1243 * syscall path, because restore_sigcontext() restores them.
1244 */
1245 adds r16=PT(F6)+32,sp
1246 adds r17=PT(F7)+32,sp
1247 ;;
1248 stf.spill [r16]=f6,32
1249 stf.spill [r17]=f7,32
1250 ;;
1251 stf.spill [r16]=f8,32
1252 stf.spill [r17]=f9,32
1253 ;;
1254 stf.spill [r16]=f10
1255 stf.spill [r17]=f11
1256 adds out0=16,sp // out0 = &sigscratch
1257 br.call.sptk.many rp=ia64_rt_sigreturn
1258.ret19: .restore sp 0
1259 adds sp=16,sp
1260 ;;
1261 ld8 r9=[sp] // load new ar.unat
1262 mov.sptk b7=r8,ia64_leave_kernel
1263 ;;
1264 mov ar.unat=r9
1265 br.many b7
1266END(sys_rt_sigreturn)
1267
1268GLOBAL_ENTRY(ia64_prepare_handle_unaligned)
1269 .prologue
1270 /*
1271 * r16 = fake ar.pfs, we simply need to make sure privilege is still 0
1272 */
1273 mov r16=r0
1274 DO_SAVE_SWITCH_STACK
1275 br.call.sptk.many rp=ia64_handle_unaligned // stack frame setup in ivt
1276.ret21: .body
1277 DO_LOAD_SWITCH_STACK
1278 br.cond.sptk.many rp // goes to ia64_leave_kernel
1279END(ia64_prepare_handle_unaligned)
1280
1281 //
1282 // unw_init_running(void (*callback)(info, arg), void *arg)
1283 //
1284# define EXTRA_FRAME_SIZE ((UNW_FRAME_INFO_SIZE+15)&~15)
1285
1286GLOBAL_ENTRY(unw_init_running)
1287 .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2)
1288 alloc loc1=ar.pfs,2,3,3,0
1289 ;;
1290 ld8 loc2=[in0],8
1291 mov loc0=rp
1292 mov r16=loc1
1293 DO_SAVE_SWITCH_STACK
1294 .body
1295
1296 .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2)
1297 .fframe IA64_SWITCH_STACK_SIZE+EXTRA_FRAME_SIZE
1298 SWITCH_STACK_SAVES(EXTRA_FRAME_SIZE)
1299 adds sp=-EXTRA_FRAME_SIZE,sp
1300 .body
1301 ;;
1302 adds out0=16,sp // &info
1303 mov out1=r13 // current
1304 adds out2=16+EXTRA_FRAME_SIZE,sp // &switch_stack
1305 br.call.sptk.many rp=unw_init_frame_info
13061: adds out0=16,sp // &info
1307 mov b6=loc2
1308 mov loc2=gp // save gp across indirect function call
1309 ;;
1310 ld8 gp=[in0]
1311 mov out1=in1 // arg
1312 br.call.sptk.many rp=b6 // invoke the callback function
13131: mov gp=loc2 // restore gp
1314
1315 // For now, we don't allow changing registers from within
1316 // unw_init_running; if we ever want to allow that, we'd
1317 // have to do a load_switch_stack here:
1318 .restore sp
1319 adds sp=IA64_SWITCH_STACK_SIZE+EXTRA_FRAME_SIZE,sp
1320
1321 mov ar.pfs=loc1
1322 mov rp=loc0
1323 br.ret.sptk.many rp
1324END(unw_init_running)
1325
1326 .rodata
1327 .align 8
1328 .globl sys_call_table
1329sys_call_table:
1330 data8 sys_ni_syscall // This must be sys_ni_syscall! See ivt.S.
1331 data8 sys_exit // 1025
1332 data8 sys_read
1333 data8 sys_write
1334 data8 sys_open
1335 data8 sys_close
1336 data8 sys_creat // 1030
1337 data8 sys_link
1338 data8 sys_unlink
1339 data8 ia64_execve
1340 data8 sys_chdir
1341 data8 sys_fchdir // 1035
1342 data8 sys_utimes
1343 data8 sys_mknod
1344 data8 sys_chmod
1345 data8 sys_chown
1346 data8 sys_lseek // 1040
1347 data8 sys_getpid
1348 data8 sys_getppid
1349 data8 sys_mount
1350 data8 sys_umount
1351 data8 sys_setuid // 1045
1352 data8 sys_getuid
1353 data8 sys_geteuid
1354 data8 sys_ptrace
1355 data8 sys_access
1356 data8 sys_sync // 1050
1357 data8 sys_fsync
1358 data8 sys_fdatasync
1359 data8 sys_kill
1360 data8 sys_rename
1361 data8 sys_mkdir // 1055
1362 data8 sys_rmdir
1363 data8 sys_dup
1364 data8 sys_pipe
1365 data8 sys_times
1366 data8 ia64_brk // 1060
1367 data8 sys_setgid
1368 data8 sys_getgid
1369 data8 sys_getegid
1370 data8 sys_acct
1371 data8 sys_ioctl // 1065
1372 data8 sys_fcntl
1373 data8 sys_umask
1374 data8 sys_chroot
1375 data8 sys_ustat
1376 data8 sys_dup2 // 1070
1377 data8 sys_setreuid
1378 data8 sys_setregid
1379 data8 sys_getresuid
1380 data8 sys_setresuid
1381 data8 sys_getresgid // 1075
1382 data8 sys_setresgid
1383 data8 sys_getgroups
1384 data8 sys_setgroups
1385 data8 sys_getpgid
1386 data8 sys_setpgid // 1080
1387 data8 sys_setsid
1388 data8 sys_getsid
1389 data8 sys_sethostname
1390 data8 sys_setrlimit
1391 data8 sys_getrlimit // 1085
1392 data8 sys_getrusage
1393 data8 sys_gettimeofday
1394 data8 sys_settimeofday
1395 data8 sys_select
1396 data8 sys_poll // 1090
1397 data8 sys_symlink
1398 data8 sys_readlink
1399 data8 sys_uselib
1400 data8 sys_swapon
1401 data8 sys_swapoff // 1095
1402 data8 sys_reboot
1403 data8 sys_truncate
1404 data8 sys_ftruncate
1405 data8 sys_fchmod
1406 data8 sys_fchown // 1100
1407 data8 ia64_getpriority
1408 data8 sys_setpriority
1409 data8 sys_statfs
1410 data8 sys_fstatfs
1411 data8 sys_gettid // 1105
1412 data8 sys_semget
1413 data8 sys_semop
1414 data8 sys_semctl
1415 data8 sys_msgget
1416 data8 sys_msgsnd // 1110
1417 data8 sys_msgrcv
1418 data8 sys_msgctl
1419 data8 sys_shmget
1420 data8 ia64_shmat
1421 data8 sys_shmdt // 1115
1422 data8 sys_shmctl
1423 data8 sys_syslog
1424 data8 sys_setitimer
1425 data8 sys_getitimer
1426 data8 sys_ni_syscall // 1120 /* was: ia64_oldstat */
1427 data8 sys_ni_syscall /* was: ia64_oldlstat */
1428 data8 sys_ni_syscall /* was: ia64_oldfstat */
1429 data8 sys_vhangup
1430 data8 sys_lchown
1431 data8 sys_remap_file_pages // 1125
1432 data8 sys_wait4
1433 data8 sys_sysinfo
1434 data8 sys_clone
1435 data8 sys_setdomainname
1436 data8 sys_newuname // 1130
1437 data8 sys_adjtimex
1438 data8 sys_ni_syscall /* was: ia64_create_module */
1439 data8 sys_init_module
1440 data8 sys_delete_module
1441 data8 sys_ni_syscall // 1135 /* was: sys_get_kernel_syms */
1442 data8 sys_ni_syscall /* was: sys_query_module */
1443 data8 sys_quotactl
1444 data8 sys_bdflush
1445 data8 sys_sysfs
1446 data8 sys_personality // 1140
1447 data8 sys_ni_syscall // sys_afs_syscall
1448 data8 sys_setfsuid
1449 data8 sys_setfsgid
1450 data8 sys_getdents
1451 data8 sys_flock // 1145
1452 data8 sys_readv
1453 data8 sys_writev
1454 data8 sys_pread64
1455 data8 sys_pwrite64
1456 data8 sys_sysctl // 1150
1457 data8 sys_mmap
1458 data8 sys_munmap
1459 data8 sys_mlock
1460 data8 sys_mlockall
1461 data8 sys_mprotect // 1155
1462 data8 ia64_mremap
1463 data8 sys_msync
1464 data8 sys_munlock
1465 data8 sys_munlockall
1466 data8 sys_sched_getparam // 1160
1467 data8 sys_sched_setparam
1468 data8 sys_sched_getscheduler
1469 data8 sys_sched_setscheduler
1470 data8 sys_sched_yield
1471 data8 sys_sched_get_priority_max // 1165
1472 data8 sys_sched_get_priority_min
1473 data8 sys_sched_rr_get_interval
1474 data8 sys_nanosleep
1475 data8 sys_nfsservctl
1476 data8 sys_prctl // 1170
1477 data8 sys_getpagesize
1478 data8 sys_mmap2
1479 data8 sys_pciconfig_read
1480 data8 sys_pciconfig_write
1481 data8 sys_perfmonctl // 1175
1482 data8 sys_sigaltstack
1483 data8 sys_rt_sigaction
1484 data8 sys_rt_sigpending
1485 data8 sys_rt_sigprocmask
1486 data8 sys_rt_sigqueueinfo // 1180
1487 data8 sys_rt_sigreturn
1488 data8 sys_rt_sigsuspend
1489 data8 sys_rt_sigtimedwait
1490 data8 sys_getcwd
1491 data8 sys_capget // 1185
1492 data8 sys_capset
1493 data8 sys_sendfile64
1494 data8 sys_ni_syscall // sys_getpmsg (STREAMS)
1495 data8 sys_ni_syscall // sys_putpmsg (STREAMS)
1496 data8 sys_socket // 1190
1497 data8 sys_bind
1498 data8 sys_connect
1499 data8 sys_listen
1500 data8 sys_accept
1501 data8 sys_getsockname // 1195
1502 data8 sys_getpeername
1503 data8 sys_socketpair
1504 data8 sys_send
1505 data8 sys_sendto
1506 data8 sys_recv // 1200
1507 data8 sys_recvfrom
1508 data8 sys_shutdown
1509 data8 sys_setsockopt
1510 data8 sys_getsockopt
1511 data8 sys_sendmsg // 1205
1512 data8 sys_recvmsg
1513 data8 sys_pivot_root
1514 data8 sys_mincore
1515 data8 sys_madvise
1516 data8 sys_newstat // 1210
1517 data8 sys_newlstat
1518 data8 sys_newfstat
1519 data8 sys_clone2
1520 data8 sys_getdents64
1521 data8 sys_getunwind // 1215
1522 data8 sys_readahead
1523 data8 sys_setxattr
1524 data8 sys_lsetxattr
1525 data8 sys_fsetxattr
1526 data8 sys_getxattr // 1220
1527 data8 sys_lgetxattr
1528 data8 sys_fgetxattr
1529 data8 sys_listxattr
1530 data8 sys_llistxattr
1531 data8 sys_flistxattr // 1225
1532 data8 sys_removexattr
1533 data8 sys_lremovexattr
1534 data8 sys_fremovexattr
1535 data8 sys_tkill
1536 data8 sys_futex // 1230
1537 data8 sys_sched_setaffinity
1538 data8 sys_sched_getaffinity
1539 data8 sys_set_tid_address
1540 data8 sys_fadvise64_64
1541 data8 sys_tgkill // 1235
1542 data8 sys_exit_group
1543 data8 sys_lookup_dcookie
1544 data8 sys_io_setup
1545 data8 sys_io_destroy
1546 data8 sys_io_getevents // 1240
1547 data8 sys_io_submit
1548 data8 sys_io_cancel
1549 data8 sys_epoll_create
1550 data8 sys_epoll_ctl
1551 data8 sys_epoll_wait // 1245
1552 data8 sys_restart_syscall
1553 data8 sys_semtimedop
1554 data8 sys_timer_create
1555 data8 sys_timer_settime
1556 data8 sys_timer_gettime // 1250
1557 data8 sys_timer_getoverrun
1558 data8 sys_timer_delete
1559 data8 sys_clock_settime
1560 data8 sys_clock_gettime
1561 data8 sys_clock_getres // 1255
1562 data8 sys_clock_nanosleep
1563 data8 sys_fstatfs64
1564 data8 sys_statfs64
1565 data8 sys_mbind
1566 data8 sys_get_mempolicy // 1260
1567 data8 sys_set_mempolicy
1568 data8 sys_mq_open
1569 data8 sys_mq_unlink
1570 data8 sys_mq_timedsend
1571 data8 sys_mq_timedreceive // 1265
1572 data8 sys_mq_notify
1573 data8 sys_mq_getsetattr
1574 data8 sys_ni_syscall // reserved for kexec_load
1575 data8 sys_ni_syscall // reserved for vserver
1576 data8 sys_waitid // 1270
1577 data8 sys_add_key
1578 data8 sys_request_key
1579 data8 sys_keyctl
1580 data8 sys_ni_syscall
1581 data8 sys_ni_syscall // 1275
1582 data8 sys_ni_syscall
1583 data8 sys_ni_syscall
1584 data8 sys_ni_syscall
1585 data8 sys_ni_syscall
1586
1587 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls
diff --git a/arch/ia64/kernel/entry.h b/arch/ia64/kernel/entry.h
new file mode 100644
index 000000000000..6d4ecec989b5
--- /dev/null
+++ b/arch/ia64/kernel/entry.h
@@ -0,0 +1,82 @@
1#include <linux/config.h>
2
3/*
4 * Preserved registers that are shared between code in ivt.S and
5 * entry.S. Be careful not to step on these!
6 */
7#define PRED_LEAVE_SYSCALL 1 /* TRUE iff leave from syscall */
8#define PRED_KERNEL_STACK 2 /* returning to kernel-stacks? */
9#define PRED_USER_STACK 3 /* returning to user-stacks? */
10#define PRED_SYSCALL 4 /* inside a system call? */
11#define PRED_NON_SYSCALL 5 /* complement of PRED_SYSCALL */
12
13#ifdef __ASSEMBLY__
14# define PASTE2(x,y) x##y
15# define PASTE(x,y) PASTE2(x,y)
16
17# define pLvSys PASTE(p,PRED_LEAVE_SYSCALL)
18# define pKStk PASTE(p,PRED_KERNEL_STACK)
19# define pUStk PASTE(p,PRED_USER_STACK)
20# define pSys PASTE(p,PRED_SYSCALL)
21# define pNonSys PASTE(p,PRED_NON_SYSCALL)
22#endif
23
24#define PT(f) (IA64_PT_REGS_##f##_OFFSET)
25#define SW(f) (IA64_SWITCH_STACK_##f##_OFFSET)
26
27#define PT_REGS_SAVES(off) \
28 .unwabi 3, 'i'; \
29 .fframe IA64_PT_REGS_SIZE+16+(off); \
30 .spillsp rp, PT(CR_IIP)+16+(off); \
31 .spillsp ar.pfs, PT(CR_IFS)+16+(off); \
32 .spillsp ar.unat, PT(AR_UNAT)+16+(off); \
33 .spillsp ar.fpsr, PT(AR_FPSR)+16+(off); \
34 .spillsp pr, PT(PR)+16+(off);
35
36#define PT_REGS_UNWIND_INFO(off) \
37 .prologue; \
38 PT_REGS_SAVES(off); \
39 .body
40
41#define SWITCH_STACK_SAVES(off) \
42 .savesp ar.unat,SW(CALLER_UNAT)+16+(off); \
43 .savesp ar.fpsr,SW(AR_FPSR)+16+(off); \
44 .spillsp f2,SW(F2)+16+(off); .spillsp f3,SW(F3)+16+(off); \
45 .spillsp f4,SW(F4)+16+(off); .spillsp f5,SW(F5)+16+(off); \
46 .spillsp f16,SW(F16)+16+(off); .spillsp f17,SW(F17)+16+(off); \
47 .spillsp f18,SW(F18)+16+(off); .spillsp f19,SW(F19)+16+(off); \
48 .spillsp f20,SW(F20)+16+(off); .spillsp f21,SW(F21)+16+(off); \
49 .spillsp f22,SW(F22)+16+(off); .spillsp f23,SW(F23)+16+(off); \
50 .spillsp f24,SW(F24)+16+(off); .spillsp f25,SW(F25)+16+(off); \
51 .spillsp f26,SW(F26)+16+(off); .spillsp f27,SW(F27)+16+(off); \
52 .spillsp f28,SW(F28)+16+(off); .spillsp f29,SW(F29)+16+(off); \
53 .spillsp f30,SW(F30)+16+(off); .spillsp f31,SW(F31)+16+(off); \
54 .spillsp r4,SW(R4)+16+(off); .spillsp r5,SW(R5)+16+(off); \
55 .spillsp r6,SW(R6)+16+(off); .spillsp r7,SW(R7)+16+(off); \
56 .spillsp b0,SW(B0)+16+(off); .spillsp b1,SW(B1)+16+(off); \
57 .spillsp b2,SW(B2)+16+(off); .spillsp b3,SW(B3)+16+(off); \
58 .spillsp b4,SW(B4)+16+(off); .spillsp b5,SW(B5)+16+(off); \
59 .spillsp ar.pfs,SW(AR_PFS)+16+(off); .spillsp ar.lc,SW(AR_LC)+16+(off); \
60 .spillsp @priunat,SW(AR_UNAT)+16+(off); \
61 .spillsp ar.rnat,SW(AR_RNAT)+16+(off); \
62 .spillsp ar.bspstore,SW(AR_BSPSTORE)+16+(off); \
63	.spillsp pr,SW(PR)+16+(off)
64
65#define DO_SAVE_SWITCH_STACK \
66 movl r28=1f; \
67 ;; \
68 .fframe IA64_SWITCH_STACK_SIZE; \
69 adds sp=-IA64_SWITCH_STACK_SIZE,sp; \
70 mov.ret.sptk b7=r28,1f; \
71 SWITCH_STACK_SAVES(0); \
72 br.cond.sptk.many save_switch_stack; \
731:
74
75#define DO_LOAD_SWITCH_STACK \
76 movl r28=1f; \
77 ;; \
78 invala; \
79 mov.ret.sptk b7=r28,1f; \
80 br.cond.sptk.many load_switch_stack; \
811: .restore sp; \
82 adds sp=IA64_SWITCH_STACK_SIZE,sp
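
/*
 * For reference, the PASTE() helpers above simply map the symbolic predicate names
 * onto fixed predicate registers: pLvSys expands to p1, pKStk to p2, pUStk to p3,
 * pSys to p4 and pNonSys to p5, so ivt.S and entry.S agree on which predicates
 * carry this state across the two files.
 */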
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
new file mode 100644
index 000000000000..0d8650f7fce7
--- /dev/null
+++ b/arch/ia64/kernel/fsys.S
@@ -0,0 +1,884 @@
1/*
2 * This file contains the light-weight system call handlers (fsyscall-handlers).
3 *
4 * Copyright (C) 2003 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com>
6 *
7 * 25-Sep-03 davidm Implement fsys_rt_sigprocmask().
8 * 18-Feb-03 louisk Implement fsys_gettimeofday().
9 * 28-Feb-03 davidm Fixed several bugs in fsys_gettimeofday(). Tuned it some more,
10 * probably broke it along the way... ;-)
11 * 13-Jul-04 clameter Implement fsys_clock_gettime and revise fsys_gettimeofday to make
12 * it capable of using memory based clocks without falling back to C code.
13 */
14
15#include <asm/asmmacro.h>
16#include <asm/errno.h>
17#include <asm/offsets.h>
18#include <asm/percpu.h>
19#include <asm/thread_info.h>
20#include <asm/sal.h>
21#include <asm/signal.h>
22#include <asm/system.h>
23#include <asm/unistd.h>
24
25#include "entry.h"
26
27/*
28 * See Documentation/ia64/fsys.txt for details on fsyscalls.
29 *
30 * On entry to an fsyscall handler:
31 * r10 = 0 (i.e., defaults to "successful syscall return")
32 * r11 = saved ar.pfs (a user-level value)
33 * r15 = system call number
34 * r16 = "current" task pointer (in normal kernel-mode, this is in r13)
35 * r32-r39 = system call arguments
36 * b6 = return address (a user-level value)
37 * ar.pfs = previous frame-state (a user-level value)
38 * PSR.be = cleared to zero (i.e., little-endian byte order is in effect)
39 * all other registers may contain values passed in from user-mode
40 *
41 * On return from an fsyscall handler:
42 * r11 = saved ar.pfs (as passed into the fsyscall handler)
43 * r15 = system call number (as passed into the fsyscall handler)
44 * r32-r39 = system call arguments (as passed into the fsyscall handler)
45 * b6 = return address (as passed into the fsyscall handler)
46 * ar.pfs = previous frame-state (as passed into the fsyscall handler)
47 */
48
49ENTRY(fsys_ni_syscall)
50 .prologue
51 .altrp b6
52 .body
53 mov r8=ENOSYS
54 mov r10=-1
55 FSYS_RETURN
56END(fsys_ni_syscall)
57
58ENTRY(fsys_getpid)
59 .prologue
60 .altrp b6
61 .body
62 add r9=TI_FLAGS+IA64_TASK_SIZE,r16
63 ;;
64 ld4 r9=[r9]
65 add r8=IA64_TASK_TGID_OFFSET,r16
66 ;;
67 and r9=TIF_ALLWORK_MASK,r9
68 ld4 r8=[r8] // r8 = current->tgid
69 ;;
70 cmp.ne p8,p0=0,r9
71(p8) br.spnt.many fsys_fallback_syscall
72 FSYS_RETURN
73END(fsys_getpid)
74
75ENTRY(fsys_getppid)
76 .prologue
77 .altrp b6
78 .body
79 add r17=IA64_TASK_GROUP_LEADER_OFFSET,r16
80 ;;
81 ld8 r17=[r17] // r17 = current->group_leader
82 add r9=TI_FLAGS+IA64_TASK_SIZE,r16
83 ;;
84
85 ld4 r9=[r9]
86 add r17=IA64_TASK_REAL_PARENT_OFFSET,r17 // r17 = &current->group_leader->real_parent
87 ;;
88 and r9=TIF_ALLWORK_MASK,r9
89
901: ld8 r18=[r17] // r18 = current->group_leader->real_parent
91 ;;
92 cmp.ne p8,p0=0,r9
93 add r8=IA64_TASK_TGID_OFFSET,r18 // r8 = &current->group_leader->real_parent->tgid
94 ;;
95
96 /*
97 * The .acq is needed to ensure that the read of tgid has returned its data before
98 * we re-check "real_parent".
99 */
100 ld4.acq r8=[r8] // r8 = current->group_leader->real_parent->tgid
101#ifdef CONFIG_SMP
102 /*
103 * Re-read current->group_leader->real_parent.
104 */
105 ld8 r19=[r17] // r19 = current->group_leader->real_parent
106(p8) br.spnt.many fsys_fallback_syscall
107 ;;
108 cmp.ne p6,p0=r18,r19 // did real_parent change?
109 mov r19=0 // i must not leak kernel bits...
110(p6) br.cond.spnt.few 1b // yes -> redo the read of tgid and the check
111 ;;
112 mov r17=0 // i must not leak kernel bits...
113 mov r18=0 // i must not leak kernel bits...
114#else
115 mov r17=0 // i must not leak kernel bits...
116 mov r18=0 // i must not leak kernel bits...
117 mov r19=0 // i must not leak kernel bits...
118#endif
119 FSYS_RETURN
120END(fsys_getppid)
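	// The SMP dance above is a lock-free read; roughly, in C (the ld4.acq provides the
	// acquire ordering between reading tgid and re-checking real_parent):
	//
	//	do {
	//		parent = current->group_leader->real_parent;
	//		tgid   = parent->tgid;			// acquire load
	//	} while (parent != current->group_leader->real_parent);
	//	return tgid;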
121
122ENTRY(fsys_set_tid_address)
123 .prologue
124 .altrp b6
125 .body
126 add r9=TI_FLAGS+IA64_TASK_SIZE,r16
127 ;;
128 ld4 r9=[r9]
129 tnat.z p6,p7=r32 // check argument register for being NaT
130 ;;
131 and r9=TIF_ALLWORK_MASK,r9
132 add r8=IA64_TASK_PID_OFFSET,r16
133 add r18=IA64_TASK_CLEAR_CHILD_TID_OFFSET,r16
134 ;;
135 ld4 r8=[r8]
136 cmp.ne p8,p0=0,r9
137 mov r17=-1
138 ;;
139(p6) st8 [r18]=r32
140(p7) st8 [r18]=r17
141(p8) br.spnt.many fsys_fallback_syscall
142 ;;
143 mov r17=0 // i must not leak kernel bits...
144 mov r18=0 // i must not leak kernel bits...
145 FSYS_RETURN
146END(fsys_set_tid_address)
147
148/*
149 * Ensure that the time interpolator structure is compatible with the asm code
150 */
151#if IA64_TIME_INTERPOLATOR_SOURCE_OFFSET !=0 || IA64_TIME_INTERPOLATOR_SHIFT_OFFSET != 2 \
152 || IA64_TIME_INTERPOLATOR_JITTER_OFFSET != 3 || IA64_TIME_INTERPOLATOR_NSEC_OFFSET != 4
153#error fsys_gettimeofday incompatible with changes to struct time_interpolator
154#endif
155#define CLOCK_REALTIME 0
156#define CLOCK_MONOTONIC 1
157#define CLOCK_DIVIDE_BY_1000 0x4000
158#define CLOCK_ADD_MONOTONIC 0x8000
159
160ENTRY(fsys_gettimeofday)
161 .prologue
162 .altrp b6
163 .body
164 mov r31 = r32
165 tnat.nz p6,p0 = r33 // guard against NaT argument
166(p6) br.cond.spnt.few .fail_einval
167 mov r30 = CLOCK_DIVIDE_BY_1000
168 ;;
169.gettime:
170 // Register map
171 // Incoming r31 = pointer to address where to place result
172 // r30 = flags determining how time is processed
173 // r2,r3 = temp r4-r7 preserved
174 // r8 = result nanoseconds
175 // r9 = result seconds
176 // r10 = temporary storage for clock difference
177 // r11 = preserved: saved ar.pfs
178 // r12 = preserved: memory stack
179 // r13 = preserved: thread pointer
180 // r14 = address of mask / mask
181 // r15 = preserved: system call number
182 // r16 = preserved: current task pointer
183 // r17 = wall to monotonic use
184 // r18 = time_interpolator->offset
185 // r19 = address of wall_to_monotonic
186 // r20 = pointer to struct time_interpolator / pointer to time_interpolator->address
187 // r21 = shift factor
188 // r22 = address of time interpolator->last_counter
189 // r23 = address of time_interpolator->last_cycle
190	// r24 = address of time_interpolator->offset
191 // r25 = last_cycle value
192 // r26 = last_counter value
193 // r27 = pointer to xtime
194	// r28 = sequence number at the beginning of critical section
195 // r29 = address of seqlock
196 // r30 = time processing flags / memory address
197 // r31 = pointer to result
198 // Predicates
199 // p6,p7 short term use
200 // p8 = timesource ar.itc
201 // p9 = timesource mmio64
202 // p10 = timesource mmio32
203 // p11 = timesource not to be handled by asm code
204 // p12 = memory time source ( = p9 | p10)
205 // p13 = do cmpxchg with time_interpolator_last_cycle
206 // p14 = Divide by 1000
207 // p15 = Add monotonic
208 //
209 // Note that instructions are optimized for McKinley. McKinley can process two
210 // bundles simultaneously and therefore we continuously try to feed the CPU
211 // two bundles and then a stop.
212 tnat.nz p6,p0 = r31 // branch deferred since it does not fit into bundle structure
213 mov pr = r30,0xc000 // Set predicates according to function
214 add r2 = TI_FLAGS+IA64_TASK_SIZE,r16
215 movl r20 = time_interpolator
216 ;;
217 ld8 r20 = [r20] // get pointer to time_interpolator structure
218 movl r29 = xtime_lock
219 ld4 r2 = [r2] // process work pending flags
220 movl r27 = xtime
221 ;; // only one bundle here
222 ld8 r21 = [r20] // first quad with control information
223 and r2 = TIF_ALLWORK_MASK,r2
224(p6) br.cond.spnt.few .fail_einval // deferred branch
225 ;;
226 add r10 = IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET,r20
227 extr r3 = r21,32,32 // time_interpolator->nsec_per_cyc
228 extr r8 = r21,0,16 // time_interpolator->source
229 cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled
230(p6) br.cond.spnt.many fsys_fallback_syscall
231 ;;
232 cmp.eq p8,p12 = 0,r8 // Check for cpu timer
233 cmp.eq p9,p0 = 1,r8 // MMIO64 ?
234 extr r2 = r21,24,8 // time_interpolator->jitter
235 cmp.eq p10,p0 = 2,r8 // MMIO32 ?
236 cmp.ltu p11,p0 = 2,r8 // function or other clock
237(p11) br.cond.spnt.many fsys_fallback_syscall
238 ;;
239 setf.sig f7 = r3 // Setup for scaling of counter
240(p15) movl r19 = wall_to_monotonic
241(p12) ld8 r30 = [r10]
242 cmp.ne p13,p0 = r2,r0 // need jitter compensation?
243 extr r21 = r21,16,8 // shift factor
244 ;;
245.time_redo:
246 .pred.rel.mutex p8,p9,p10
247 ld4.acq r28 = [r29] // xtime_lock.sequence. Must come first for locking purposes
248(p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!!
249 add r22 = IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET,r20
250(p9) ld8 r2 = [r30] // readq(ti->address). Could also have latency issues..
251(p10) ld4 r2 = [r30]		// readl(ti->address) (32-bit MMIO read)
252(p13) add r23 = IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET,r20
253 ;; // could be removed by moving the last add upward
254 ld8 r26 = [r22] // time_interpolator->last_counter
255(p13) ld8 r25 = [r23] // time interpolator->last_cycle
256 add r24 = IA64_TIME_INTERPOLATOR_OFFSET_OFFSET,r20
257(p15) ld8 r17 = [r19],IA64_TIMESPEC_TV_NSEC_OFFSET
258 ld8 r9 = [r27],IA64_TIMESPEC_TV_NSEC_OFFSET
259 add r14 = IA64_TIME_INTERPOLATOR_MASK_OFFSET, r20
260 ;;
261 ld8 r18 = [r24] // time_interpolator->offset
262 ld8 r8 = [r27],-IA64_TIMESPEC_TV_NSEC_OFFSET // xtime.tv_nsec
263(p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm)
264 ;;
265 ld8 r14 = [r14] // time_interpolator->mask
266(p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared
267 sub r10 = r2,r26 // current_counter - last_counter
268 ;;
269(p6) sub r10 = r25,r26 // time we got was less than last_cycle
270(p7) mov ar.ccv = r25 // more than last_cycle. Prep for cmpxchg
271 ;;
272 and r10 = r10,r14 // Apply mask
273 ;;
274 setf.sig f8 = r10
275 nop.i 123
276 ;;
277(p7) cmpxchg8.rel r3 = [r23],r2,ar.ccv
278EX(.fail_efault, probe.w.fault r31, 3) // This takes 5 cycles and we have spare time
279 xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter)
280(p15) add r9 = r9,r17 // Add wall to monotonic.secs to result secs
281 ;;
282(p15) ld8 r17 = [r19],-IA64_TIMESPEC_TV_NSEC_OFFSET
283(p7) cmp.ne p7,p0 = r25,r3 // if cmpxchg not successful redo
284 // simulate tbit.nz.or p7,p0 = r28,0
285 and r28 = ~1,r28 // Make sequence even to force retry if odd
286 getf.sig r2 = f8
287 mf
288 add r8 = r8,r18 // Add time interpolator offset
289 ;;
290 ld4 r10 = [r29] // xtime_lock.sequence
291(p15) add r8 = r8, r17 // Add monotonic.nsecs to nsecs
292 shr.u r2 = r2,r21
293 ;; // overloaded 3 bundles!
294 // End critical section.
295 add r8 = r8,r2 // Add xtime.nsecs
296 cmp4.ne.or p7,p0 = r28,r10
297(p7) br.cond.dpnt.few .time_redo // sequence number changed ?
298 // Now r8=tv->tv_nsec and r9=tv->tv_sec
299 mov r10 = r0
300 movl r2 = 1000000000
301 add r23 = IA64_TIMESPEC_TV_NSEC_OFFSET, r31
302(p14) movl r3 = 2361183241434822607 // Prep for / 1000 hack
303 ;;
304.time_normalize:
305 mov r21 = r8
306 cmp.ge p6,p0 = r8,r2
307(p14) shr.u r20 = r8, 3 // We can repeat this if necessary just wasting some time
308 ;;
309(p14) setf.sig f8 = r20
310(p6) sub r8 = r8,r2
311(p6) add r9 = 1,r9 // two nops before the branch.
312(p14) setf.sig f7 = r3 // Chances for repeats are 1 in 10000 for gettod
313(p6) br.cond.dpnt.few .time_normalize
314 ;;
315	// Divided by 8 through a shift. Now divide by 125.
316	// The compiler was able to do that with a multiply
317	// and a shift, and we do the same.
318EX(.fail_efault, probe.w.fault r23, 3) // This also costs 5 cycles
319(p14) xmpy.hu f8 = f8, f7 // xmpy has 5 cycles latency so use it...
320 ;;
321 mov r8 = r0
322(p14) getf.sig r2 = f8
323 ;;
324(p14) shr.u r21 = r2, 4
325 ;;
326EX(.fail_efault, st8 [r31] = r9)
327EX(.fail_efault, st8 [r23] = r21)
328 FSYS_RETURN
329.fail_einval:
330 mov r8 = EINVAL
331 mov r10 = -1
332 FSYS_RETURN
333.fail_efault:
334 mov r8 = EFAULT
335 mov r10 = -1
336 FSYS_RETURN
337END(fsys_gettimeofday)
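	// Two notes on the code above, for readers who do not live in fsys_gettimeofday:
	//
	// 1. The .time_redo loop is the read side of the xtime_lock seqlock; in C it is
	//    simply (sketch):
	//
	//	do {
	//		seq = read_seqbegin(&xtime_lock);
	//		// ... sample xtime, the time interpolator and the clock source ...
	//	} while (read_seqretry(&xtime_lock, seq));
	//
	//    The "and r28 = ~1,r28" trick forces a retry when the sequence was odd, i.e.
	//    when a writer was in progress.
	//
	// 2. The "/ 1000 hack": 2361183241434822607 is ceil(2^71 / 1000), so for the
	//    nanosecond values seen here nsec / 1000 == ((nsec >> 3) * M) >> 64 >> 4,
	//    where the >> 64 is taking the high half of the unsigned 64x64 multiply that
	//    xmpy.hu performs.  A C sketch (unsigned __int128 only stands in for that
	//    widening multiply):
	//
	//	static unsigned long nsec_to_usec(unsigned long nsec)
	//	{
	//		const unsigned long M = 2361183241434822607UL;	// ceil(2^71 / 1000)
	//		unsigned long hi = (unsigned long)(((unsigned __int128)(nsec >> 3) * M) >> 64);
	//		return hi >> 4;					// == nsec / 1000
	//	}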
338
339ENTRY(fsys_clock_gettime)
340 .prologue
341 .altrp b6
342 .body
343 cmp4.ltu p6, p0 = CLOCK_MONOTONIC, r32
344 // Fallback if this is not CLOCK_REALTIME or CLOCK_MONOTONIC
345(p6) br.spnt.few fsys_fallback_syscall
346 mov r31 = r33
347 shl r30 = r32,15
348 br.many .gettime
349END(fsys_clock_gettime)
350
351/*
352 * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize).
353 */
354#if _NSIG_WORDS != 1
355# error Sorry, fsys_rt_sigprocmask() needs to be updated for _NSIG_WORDS != 1.
356#endif
357ENTRY(fsys_rt_sigprocmask)
358 .prologue
359 .altrp b6
360 .body
361
362 add r2=IA64_TASK_BLOCKED_OFFSET,r16
363 add r9=TI_FLAGS+IA64_TASK_SIZE,r16
364 cmp4.ltu p6,p0=SIG_SETMASK,r32
365
366 cmp.ne p15,p0=r0,r34 // oset != NULL?
367 tnat.nz p8,p0=r34
368 add r31=IA64_TASK_SIGHAND_OFFSET,r16
369 ;;
370 ld8 r3=[r2] // read/prefetch current->blocked
371 ld4 r9=[r9]
372 tnat.nz.or p6,p0=r35
373
374 cmp.ne.or p6,p0=_NSIG_WORDS*8,r35
375 tnat.nz.or p6,p0=r32
376(p6) br.spnt.few .fail_einval // fail with EINVAL
377 ;;
378#ifdef CONFIG_SMP
379 ld8 r31=[r31] // r31 <- current->sighand
380#endif
381 and r9=TIF_ALLWORK_MASK,r9
382 tnat.nz.or p8,p0=r33
383 ;;
384 cmp.ne p7,p0=0,r9
385 cmp.eq p6,p0=r0,r33 // set == NULL?
386 add r31=IA64_SIGHAND_SIGLOCK_OFFSET,r31 // r31 <- current->sighand->siglock
387(p8) br.spnt.few .fail_efault // fail with EFAULT
388(p7) br.spnt.many fsys_fallback_syscall // got pending kernel work...
389(p6) br.dpnt.many .store_mask // -> short-circuit to just reading the signal mask
390
391 /* Argh, we actually have to do some work and _update_ the signal mask: */
392
393EX(.fail_efault, probe.r.fault r33, 3) // verify user has read-access to *set
394EX(.fail_efault, ld8 r14=[r33]) // r14 <- *set
395 mov r17=(1 << (SIGKILL - 1)) | (1 << (SIGSTOP - 1))
396 ;;
397
398 rsm psr.i // mask interrupt delivery
399 mov ar.ccv=0
400 andcm r14=r14,r17 // filter out SIGKILL & SIGSTOP
401
402#ifdef CONFIG_SMP
403 mov r17=1
404 ;;
405 cmpxchg4.acq r18=[r31],r17,ar.ccv // try to acquire the lock
406 mov r8=EINVAL // default to EINVAL
407 ;;
408 ld8 r3=[r2] // re-read current->blocked now that we hold the lock
409 cmp4.ne p6,p0=r18,r0
410(p6) br.cond.spnt.many .lock_contention
411 ;;
412#else
413 ld8 r3=[r2] // re-read current->blocked now that we hold the lock
414 mov r8=EINVAL // default to EINVAL
415#endif
416 add r18=IA64_TASK_PENDING_OFFSET+IA64_SIGPENDING_SIGNAL_OFFSET,r16
417 add r19=IA64_TASK_SIGNAL_OFFSET,r16
418 cmp4.eq p6,p0=SIG_BLOCK,r32
419 ;;
420 ld8 r19=[r19] // r19 <- current->signal
421 cmp4.eq p7,p0=SIG_UNBLOCK,r32
422 cmp4.eq p8,p0=SIG_SETMASK,r32
423 ;;
424 ld8 r18=[r18] // r18 <- current->pending.signal
425 .pred.rel.mutex p6,p7,p8
426(p6) or r14=r3,r14 // SIG_BLOCK
427(p7) andcm r14=r3,r14 // SIG_UNBLOCK
428
429(p8) mov r14=r14 // SIG_SETMASK
430(p6) mov r8=0 // clear error code
431 // recalc_sigpending()
432 add r17=IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,r19
433
434 add r19=IA64_SIGNAL_SHARED_PENDING_OFFSET+IA64_SIGPENDING_SIGNAL_OFFSET,r19
435 ;;
436 ld4 r17=[r17] // r17 <- current->signal->group_stop_count
437(p7) mov r8=0 // clear error code
438
439 ld8 r19=[r19] // r19 <- current->signal->shared_pending
440 ;;
441 cmp4.gt p6,p7=r17,r0 // p6/p7 <- (current->signal->group_stop_count > 0)?
442(p8) mov r8=0 // clear error code
443
444 or r18=r18,r19 // r18 <- current->pending | current->signal->shared_pending
445 ;;
446 // r18 <- (current->pending | current->signal->shared_pending) & ~current->blocked:
447 andcm r18=r18,r14
448 add r9=TI_FLAGS+IA64_TASK_SIZE,r16
449 ;;
450
451(p7) cmp.ne.or.andcm p6,p7=r18,r0 // p6/p7 <- signal pending
452 mov r19=0 // i must not leak kernel bits...
453(p6) br.cond.dpnt.many .sig_pending
454 ;;
455
4561: ld4 r17=[r9] // r17 <- current->thread_info->flags
457 ;;
458 mov ar.ccv=r17
459 and r18=~_TIF_SIGPENDING,r17 // r18 <- r17 & ~(1 << TIF_SIGPENDING)
460 ;;
461
462 st8 [r2]=r14 // update current->blocked with new mask
463 cmpxchg4.acq r14=[r9],r18,ar.ccv // current->thread_info->flags <- r18
464 ;;
465 cmp.ne p6,p0=r17,r14 // update failed?
466(p6) br.cond.spnt.few 1b // yes -> retry
467
468#ifdef CONFIG_SMP
469 st4.rel [r31]=r0 // release the lock
470#endif
471 ssm psr.i
472 ;;
473
474 srlz.d // ensure psr.i is set again
475 mov r18=0 // i must not leak kernel bits...
476
477.store_mask:
478EX(.fail_efault, (p15) probe.w.fault r34, 3) // verify user has write-access to *oset
479EX(.fail_efault, (p15) st8 [r34]=r3)
480 mov r2=0 // i must not leak kernel bits...
481 mov r3=0 // i must not leak kernel bits...
482 mov r8=0 // return 0
483 mov r9=0 // i must not leak kernel bits...
484 mov r14=0 // i must not leak kernel bits...
485 mov r17=0 // i must not leak kernel bits...
486 mov r31=0 // i must not leak kernel bits...
487 FSYS_RETURN
488
489.sig_pending:
490#ifdef CONFIG_SMP
491 st4.rel [r31]=r0 // release the lock
492#endif
493 ssm psr.i
494 ;;
495 srlz.d
496 br.sptk.many fsys_fallback_syscall // with signal pending, do the heavy-weight syscall
497
498#ifdef CONFIG_SMP
499.lock_contention:
500 /* Rather than spinning here, fall back on doing a heavy-weight syscall. */
501 ssm psr.i
502 ;;
503 srlz.d
504 br.sptk.many fsys_fallback_syscall
505#endif
506END(fsys_rt_sigprocmask)
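	// The lock-free flag update in the fast path above (clearing TIF_SIGPENDING when no
	// pending, unblocked signal is left) is the usual cmpxchg retry loop; a C sketch,
	// with thread_info->flags as the 32-bit field this kernel uses:
	//
	//	struct thread_info *ti = current_thread_info();
	//	unsigned int old, new;
	//
	//	do {
	//		old = ti->flags;
	//		new = old & ~_TIF_SIGPENDING;
	//	} while (cmpxchg(&ti->flags, old, new) != old);
	//
	// current->blocked itself is updated while holding current->sighand->siglock, just
	// as in the heavyweight sys_rt_sigprocmask() path.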
507
508ENTRY(fsys_fallback_syscall)
509 .prologue
510 .altrp b6
511 .body
512 /*
513 * We only get here from light-weight syscall handlers. Thus, we already
514 * know that r15 contains a valid syscall number. No need to re-check.
515 */
516 adds r17=-1024,r15
517 movl r14=sys_call_table
518 ;;
519 rsm psr.i
520 shladd r18=r17,3,r14
521 ;;
522 ld8 r18=[r18] // load normal (heavy-weight) syscall entry-point
523 mov r29=psr // read psr (12 cyc load latency)
524 mov r27=ar.rsc
525 mov r21=ar.fpsr
526 mov r26=ar.pfs
527END(fsys_fallback_syscall)
528 /* FALL THROUGH */
529GLOBAL_ENTRY(fsys_bubble_down)
530 .prologue
531 .altrp b6
532 .body
533 /*
534 * We get here for syscalls that don't have a lightweight handler. For those, we
535 * need to bubble down into the kernel and that requires setting up a minimal
536 * pt_regs structure, and initializing the CPU state more or less as if an
537	 * interruption had occurred. To make syscall restarts work, we set up pt_regs
538	 * such that cr_iip points to the second instruction in syscall_via_break.
539	 * Hence, decrementing the IP will restart the syscall via break, while leaving
540	 * it unchanged returns us to the caller, as usual. Note that we preserve
541 * the value of psr.pp rather than initializing it from dcr.pp. This makes it
542 * possible to distinguish fsyscall execution from other privileged execution.
543 *
544 * On entry:
545 * - normal fsyscall handler register usage, except that we also have:
546 * - r18: address of syscall entry point
547 * - r21: ar.fpsr
548 * - r26: ar.pfs
549 * - r27: ar.rsc
550 * - r29: psr
551 */
552# define PSR_PRESERVED_BITS (IA64_PSR_UP | IA64_PSR_MFL | IA64_PSR_MFH | IA64_PSR_PK \
553 | IA64_PSR_DT | IA64_PSR_PP | IA64_PSR_SP | IA64_PSR_RT \
554 | IA64_PSR_IC)
555 /*
556 * Reading psr.l gives us only bits 0-31, psr.it, and psr.mc. The rest we have
557 * to synthesize.
558 */
559# define PSR_ONE_BITS ((3 << IA64_PSR_CPL0_BIT) | (0x1 << IA64_PSR_RI_BIT) \
560 | IA64_PSR_BN | IA64_PSR_I)
561
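	// In C terms, the two constants above are used to derive two values from the psr
	// that was read in fsys_fallback_syscall (sketch):
	//
	//	new_psr_l  = psr & PSR_PRESERVED_BITS;	// what the CPU runs with from here on
	//	saved_ipsr = psr | PSR_ONE_BITS;	// the cr.ipsr image restored on return
	//
	// i.e. psr.pp is carried over from the caller rather than taken from dcr.pp, which
	// is what lets us tell fsyscall execution apart from other privileged execution.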
562 invala
563 movl r8=PSR_ONE_BITS
564
565 mov r25=ar.unat // save ar.unat (5 cyc)
566 movl r9=PSR_PRESERVED_BITS
567
568 mov ar.rsc=0 // set enforced lazy mode, pl 0, little-endian, loadrs=0
569 movl r28=__kernel_syscall_via_break
570 ;;
571 mov r23=ar.bspstore // save ar.bspstore (12 cyc)
572 mov r31=pr // save pr (2 cyc)
573 mov r20=r1 // save caller's gp in r20
574 ;;
575 mov r2=r16 // copy current task addr to addl-addressable register
576 and r9=r9,r29
577 mov r19=b6 // save b6 (2 cyc)
578 ;;
579 mov psr.l=r9 // slam the door (17 cyc to srlz.i)
580 or r29=r8,r29 // construct cr.ipsr value to save
581 addl r22=IA64_RBS_OFFSET,r2 // compute base of RBS
582 ;;
583 // GAS reports a spurious RAW hazard on the read of ar.rnat because it thinks
584 // we may be reading ar.itc after writing to psr.l. Avoid that message with
585 // this directive:
586 dv_serialize_data
587 mov.m r24=ar.rnat // read ar.rnat (5 cyc lat)
588 lfetch.fault.excl.nt1 [r22]
589 adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r2
590
591 // ensure previous insn group is issued before we stall for srlz.i:
592 ;;
593 srlz.i // ensure new psr.l has been established
594 /////////////////////////////////////////////////////////////////////////////
595 ////////// from this point on, execution is not interruptible anymore
596 /////////////////////////////////////////////////////////////////////////////
597 addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2 // compute base of memory stack
598 cmp.ne pKStk,pUStk=r0,r0 // set pKStk <- 0, pUStk <- 1
599 ;;
600 st1 [r16]=r0 // clear current->thread.on_ustack flag
601 mov ar.bspstore=r22 // switch to kernel RBS
602 mov b6=r18 // copy syscall entry-point to b6 (7 cyc)
603 add r3=TI_FLAGS+IA64_TASK_SIZE,r2
604 ;;
605	ld4 r3=[r3]				// r3 = current_thread_info()->flags
606 mov r18=ar.bsp // save (kernel) ar.bsp (12 cyc)
607 mov ar.rsc=0x3 // set eager mode, pl 0, little-endian, loadrs=0
608 br.call.sptk.many b7=ia64_syscall_setup
609 ;;
610 ssm psr.i
611 movl r2=ia64_ret_from_syscall
612 ;;
613 mov rp=r2 // set the real return addr
614 tbit.z p8,p0=r3,TIF_SYSCALL_TRACE
615 ;;
616(p10)	br.cond.spnt.many ia64_ret_from_syscall	// p10==true means more than 8 out registers
617(p8) br.call.sptk.many b6=b6 // ignore this return addr
618 br.cond.sptk ia64_trace_syscall
619END(fsys_bubble_down)
620
621 .rodata
622 .align 8
623 .globl fsyscall_table
624
625 data8 fsys_bubble_down
626fsyscall_table:
627 data8 fsys_ni_syscall
628 data8 0 // exit // 1025
629 data8 0 // read
630 data8 0 // write
631 data8 0 // open
632 data8 0 // close
633 data8 0 // creat // 1030
634 data8 0 // link
635 data8 0 // unlink
636 data8 0 // execve
637 data8 0 // chdir
638 data8 0 // fchdir // 1035
639 data8 0 // utimes
640 data8 0 // mknod
641 data8 0 // chmod
642 data8 0 // chown
643 data8 0 // lseek // 1040
644 data8 fsys_getpid // getpid
645 data8 fsys_getppid // getppid
646 data8 0 // mount
647 data8 0 // umount
648 data8 0 // setuid // 1045
649 data8 0 // getuid
650 data8 0 // geteuid
651 data8 0 // ptrace
652 data8 0 // access
653 data8 0 // sync // 1050
654 data8 0 // fsync
655 data8 0 // fdatasync
656 data8 0 // kill
657 data8 0 // rename
658 data8 0 // mkdir // 1055
659 data8 0 // rmdir
660 data8 0 // dup
661 data8 0 // pipe
662 data8 0 // times
663 data8 0 // brk // 1060
664 data8 0 // setgid
665 data8 0 // getgid
666 data8 0 // getegid
667 data8 0 // acct
668 data8 0 // ioctl // 1065
669 data8 0 // fcntl
670 data8 0 // umask
671 data8 0 // chroot
672 data8 0 // ustat
673 data8 0 // dup2 // 1070
674 data8 0 // setreuid
675 data8 0 // setregid
676 data8 0 // getresuid
677 data8 0 // setresuid
678 data8 0 // getresgid // 1075
679 data8 0 // setresgid
680 data8 0 // getgroups
681 data8 0 // setgroups
682 data8 0 // getpgid
683 data8 0 // setpgid // 1080
684 data8 0 // setsid
685 data8 0 // getsid
686 data8 0 // sethostname
687 data8 0 // setrlimit
688 data8 0 // getrlimit // 1085
689 data8 0 // getrusage
690 data8 fsys_gettimeofday // gettimeofday
691 data8 0 // settimeofday
692 data8 0 // select
693 data8 0 // poll // 1090
694 data8 0 // symlink
695 data8 0 // readlink
696 data8 0 // uselib
697 data8 0 // swapon
698 data8 0 // swapoff // 1095
699 data8 0 // reboot
700 data8 0 // truncate
701 data8 0 // ftruncate
702 data8 0 // fchmod
703 data8 0 // fchown // 1100
704 data8 0 // getpriority
705 data8 0 // setpriority
706 data8 0 // statfs
707 data8 0 // fstatfs
708 data8 0 // gettid // 1105
709 data8 0 // semget
710 data8 0 // semop
711 data8 0 // semctl
712 data8 0 // msgget
713 data8 0 // msgsnd // 1110
714 data8 0 // msgrcv
715 data8 0 // msgctl
716 data8 0 // shmget
717 data8 0 // shmat
718 data8 0 // shmdt // 1115
719 data8 0 // shmctl
720 data8 0 // syslog
721 data8 0 // setitimer
722 data8 0 // getitimer
723 data8 0 // 1120
724 data8 0
725 data8 0
726 data8 0 // vhangup
727 data8 0 // lchown
728 data8 0 // remap_file_pages // 1125
729 data8 0 // wait4
730 data8 0 // sysinfo
731 data8 0 // clone
732 data8 0 // setdomainname
733 data8 0 // newuname // 1130
734 data8 0 // adjtimex
735 data8 0
736 data8 0 // init_module
737 data8 0 // delete_module
738 data8 0 // 1135
739 data8 0
740 data8 0 // quotactl
741 data8 0 // bdflush
742 data8 0 // sysfs
743 data8 0 // personality // 1140
744 data8 0 // afs_syscall
745 data8 0 // setfsuid
746 data8 0 // setfsgid
747 data8 0 // getdents
748 data8 0 // flock // 1145
749 data8 0 // readv
750 data8 0 // writev
751 data8 0 // pread64
752 data8 0 // pwrite64
753 data8 0 // sysctl // 1150
754 data8 0 // mmap
755 data8 0 // munmap
756 data8 0 // mlock
757 data8 0 // mlockall
758 data8 0 // mprotect // 1155
759 data8 0 // mremap
760 data8 0 // msync
761 data8 0 // munlock
762 data8 0 // munlockall
763 data8 0 // sched_getparam // 1160
764 data8 0 // sched_setparam
765 data8 0 // sched_getscheduler
766 data8 0 // sched_setscheduler
767 data8 0 // sched_yield
768 data8 0 // sched_get_priority_max // 1165
769 data8 0 // sched_get_priority_min
770 data8 0 // sched_rr_get_interval
771 data8 0 // nanosleep
772 data8 0 // nfsservctl
773 data8 0 // prctl // 1170
774 data8 0 // getpagesize
775 data8 0 // mmap2
776 data8 0 // pciconfig_read
777 data8 0 // pciconfig_write
778 data8 0 // perfmonctl // 1175
779 data8 0 // sigaltstack
780 data8 0 // rt_sigaction
781 data8 0 // rt_sigpending
782 data8 fsys_rt_sigprocmask // rt_sigprocmask
783 data8 0 // rt_sigqueueinfo // 1180
784 data8 0 // rt_sigreturn
785 data8 0 // rt_sigsuspend
786 data8 0 // rt_sigtimedwait
787 data8 0 // getcwd
788 data8 0 // capget // 1185
789 data8 0 // capset
790 data8 0 // sendfile
791 data8 0
792 data8 0
793 data8 0 // socket // 1190
794 data8 0 // bind
795 data8 0 // connect
796 data8 0 // listen
797 data8 0 // accept
798 data8 0 // getsockname // 1195
799 data8 0 // getpeername
800 data8 0 // socketpair
801 data8 0 // send
802 data8 0 // sendto
803 data8 0 // recv // 1200
804 data8 0 // recvfrom
805 data8 0 // shutdown
806 data8 0 // setsockopt
807 data8 0 // getsockopt
808 data8 0 // sendmsg // 1205
809 data8 0 // recvmsg
810 data8 0 // pivot_root
811 data8 0 // mincore
812 data8 0 // madvise
813 data8 0 // newstat // 1210
814 data8 0 // newlstat
815 data8 0 // newfstat
816 data8 0 // clone2
817 data8 0 // getdents64
818 data8 0 // getunwind // 1215
819 data8 0 // readahead
820 data8 0 // setxattr
821 data8 0 // lsetxattr
822 data8 0 // fsetxattr
823 data8 0 // getxattr // 1220
824 data8 0 // lgetxattr
825 data8 0 // fgetxattr
826 data8 0 // listxattr
827 data8 0 // llistxattr
828 data8 0 // flistxattr // 1225
829 data8 0 // removexattr
830 data8 0 // lremovexattr
831 data8 0 // fremovexattr
832 data8 0 // tkill
833 data8 0 // futex // 1230
834 data8 0 // sched_setaffinity
835 data8 0 // sched_getaffinity
836 data8 fsys_set_tid_address // set_tid_address
837 data8 0 // fadvise64_64
838 data8 0 // tgkill // 1235
839 data8 0 // exit_group
840 data8 0 // lookup_dcookie
841 data8 0 // io_setup
842 data8 0 // io_destroy
843 data8 0 // io_getevents // 1240
844 data8 0 // io_submit
845 data8 0 // io_cancel
846 data8 0 // epoll_create
847 data8 0 // epoll_ctl
848 data8 0 // epoll_wait // 1245
849 data8 0 // restart_syscall
850 data8 0 // semtimedop
851 data8 0 // timer_create
852 data8 0 // timer_settime
853 data8 0 // timer_gettime // 1250
854 data8 0 // timer_getoverrun
855 data8 0 // timer_delete
856 data8 0 // clock_settime
857 data8 fsys_clock_gettime // clock_gettime
858 data8 0 // clock_getres // 1255
859 data8 0 // clock_nanosleep
860 data8 0 // fstatfs64
861 data8 0 // statfs64
862 data8 0
863 data8 0 // 1260
864 data8 0
865 data8 0 // mq_open
866 data8 0 // mq_unlink
867 data8 0 // mq_timedsend
868 data8 0 // mq_timedreceive // 1265
869 data8 0 // mq_notify
870 data8 0 // mq_getsetattr
871 data8 0 // kexec_load
872 data8 0
873 data8 0 // 1270
874 data8 0
875 data8 0
876 data8 0
877 data8 0
878 data8 0 // 1275
879 data8 0
880 data8 0
881 data8 0
882 data8 0
883
884 .org fsyscall_table + 8*NR_syscalls // guard against failures to increase NR_syscalls
diff --git a/arch/ia64/kernel/gate-data.S b/arch/ia64/kernel/gate-data.S
new file mode 100644
index 000000000000..258c0a3238fb
--- /dev/null
+++ b/arch/ia64/kernel/gate-data.S
@@ -0,0 +1,3 @@
1 .section .data.gate, "aw"
2
3 .incbin "arch/ia64/kernel/gate.so"
diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S
new file mode 100644
index 000000000000..facf75acdc85
--- /dev/null
+++ b/arch/ia64/kernel/gate.S
@@ -0,0 +1,372 @@
1/*
2 * This file contains the code that gets mapped at the upper end of each task's text
3 * region. For now, it contains the signal trampoline code only.
4 *
5 * Copyright (C) 1999-2003 Hewlett-Packard Co
6 * David Mosberger-Tang <davidm@hpl.hp.com>
7 */
8
9#include <linux/config.h>
10
11#include <asm/asmmacro.h>
12#include <asm/errno.h>
13#include <asm/offsets.h>
14#include <asm/sigcontext.h>
15#include <asm/system.h>
16#include <asm/unistd.h>
17
18/*
19 * We can't easily refer to symbols inside the kernel. To avoid full runtime relocation,
20 * complications with the linker (which likes to create PLT stubs for branches
21 * to targets outside the shared object) and to avoid multi-phase kernel builds, we
22 * simply create minimalistic "patch lists" in special ELF sections.
23 */
24 .section ".data.patch.fsyscall_table", "a"
25 .previous
26#define LOAD_FSYSCALL_TABLE(reg) \
27[1:] movl reg=0; \
28 .xdata4 ".data.patch.fsyscall_table", 1b-.
29
30 .section ".data.patch.brl_fsys_bubble_down", "a"
31 .previous
32#define BRL_COND_FSYS_BUBBLE_DOWN(pr) \
33[1:](pr)brl.cond.sptk 0; \
34 .xdata4 ".data.patch.brl_fsys_bubble_down", 1b-.
35
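
The two macros above only emit a 4-byte, self-relative entry into the named ".data.patch.*" section; the real addresses are filled in when the kernel boots. Below is a minimal C sketch of how such a patch list can be consumed (kernel-style s32/u64 typedefs assumed; apply_patchlist() and patch_movl_imm64() are hypothetical names for illustration, not the kernel's actual patch code):

	/* Each list entry is a 32-bit offset from the entry's own location back
	 * to the tagged instruction bundle (the "1b-." emitted above). */
	static void apply_patchlist(s32 *offp, s32 *end, u64 real_value)
	{
		while (offp < end) {
			u64 *bundle = (u64 *) ((char *) offp + *offp);	/* resolve self-relative entry */
			patch_movl_imm64(bundle, real_value);		/* assumed helper: rewrite the movl immediate */
			++offp;
		}
	}

At boot, the list bracketed by __start_gate_fsyscall_patchlist/__end_gate_fsyscall_patchlist (collected in gate.lds.S below) would be walked this way with the address of fsyscall_table as the value.
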
36GLOBAL_ENTRY(__kernel_syscall_via_break)
37 .prologue
38 .altrp b6
39 .body
40 /*
41 * Note: for (fast) syscall restart to work, the break instruction must be
42 * the first one in the bundle addressed by syscall_via_break.
43 */
44{ .mib
45 break 0x100000
46 nop.i 0
47 br.ret.sptk.many b6
48}
49END(__kernel_syscall_via_break)
50
51/*
52 * On entry:
53 * r11 = saved ar.pfs
54 * r15 = system call #
55 * b0 = saved return address
56 * b6 = return address
57 * On exit:
58 * r11 = saved ar.pfs
59 * r15 = system call #
60 * b0 = saved return address
61 * all other "scratch" registers: undefined
62 * all "preserved" registers: same as on entry
63 */
64
65GLOBAL_ENTRY(__kernel_syscall_via_epc)
66 .prologue
67 .altrp b6
68 .body
69{
70 /*
71 * Note: the kernel cannot assume that the first two instructions in this
72 * bundle get executed. The remaining code must be safe even if
73 * they do not get executed.
74 */
75 adds r17=-1024,r15
76 mov r10=0 // default to successful syscall execution
77 epc
78}
79 ;;
80 rsm psr.be // note: on McKinley "rsm psr.be/srlz.d" is slightly faster than "rum psr.be"
81 LOAD_FSYSCALL_TABLE(r14)
82
83 mov r16=IA64_KR(CURRENT) // 12 cycle read latency
84 tnat.nz p10,p9=r15
85 mov r19=NR_syscalls-1
86 ;;
87 shladd r18=r17,3,r14
88
89 srlz.d
90 cmp.ne p8,p0=r0,r0 // p8 <- FALSE
91 /* Note: if r17 is a NaT, p6 will be set to zero. */
92 cmp.geu p6,p7=r19,r17 // (syscall > 0 && syscall < 1024+NR_syscalls)?
93 ;;
94(p6) ld8 r18=[r18]
95 mov r21=ar.fpsr
96 add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry
97 ;;
98(p6) mov b7=r18
99(p6) tbit.z p8,p0=r18,0
100(p8) br.dptk.many b7
101
102(p6) rsm psr.i
103 mov r27=ar.rsc
104 mov r26=ar.pfs
105 ;;
106 mov r29=psr // read psr (12 cyc load latency)
107/*
108 * brl.cond doesn't work as intended because the linker would convert this branch
109 * into a branch to a PLT. Perhaps there will be a way to avoid this with some
110 * future version of the linker. In the meantime, we just use an indirect branch
111 * instead.
112 */
113#ifdef CONFIG_ITANIUM
114(p6) ld8 r14=[r14] // r14 <- fsys_bubble_down
115 ;;
116(p6) mov b7=r14
117(p6) br.sptk.many b7
118#else
119 BRL_COND_FSYS_BUBBLE_DOWN(p6)
120#endif
121
122 mov r10=-1
123(p10) mov r8=EINVAL
124(p9) mov r8=ENOSYS
125 FSYS_RETURN
126END(__kernel_syscall_via_epc)
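
The "cmp.geu p6,p7=r19,r17" above performs the whole "1024 <= syscall < 1024+NR_syscalls" test with a single unsigned compare, because a negative difference wraps around to a huge unsigned value. A C sketch of the same check (illustrative only):

	/* r17 = num - 1024, r19 = NR_syscalls - 1; p6 = (r19 >= r17), unsigned */
	static int fsyscall_in_range(long num)
	{
		unsigned long index = (unsigned long) (num - 1024);

		return index <= (unsigned long) (NR_syscalls - 1);	/* also false for num < 1024 */
	}

When the number is out of range (or a NaT), the stub returns ENOSYS (or EINVAL) in r8 with r10 = -1; when it is in range, the corresponding fsyscall_table entry is loaded and control either branches straight to a light-weight handler such as fsys_gettimeofday or continues toward fsys_bubble_down.
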
127
128# define ARG0_OFF (16 + IA64_SIGFRAME_ARG0_OFFSET)
129# define ARG1_OFF (16 + IA64_SIGFRAME_ARG1_OFFSET)
130# define ARG2_OFF (16 + IA64_SIGFRAME_ARG2_OFFSET)
131# define SIGHANDLER_OFF (16 + IA64_SIGFRAME_HANDLER_OFFSET)
132# define SIGCONTEXT_OFF (16 + IA64_SIGFRAME_SIGCONTEXT_OFFSET)
133
134# define FLAGS_OFF IA64_SIGCONTEXT_FLAGS_OFFSET
135# define CFM_OFF IA64_SIGCONTEXT_CFM_OFFSET
136# define FR6_OFF IA64_SIGCONTEXT_FR6_OFFSET
137# define BSP_OFF IA64_SIGCONTEXT_AR_BSP_OFFSET
138# define RNAT_OFF IA64_SIGCONTEXT_AR_RNAT_OFFSET
139# define UNAT_OFF IA64_SIGCONTEXT_AR_UNAT_OFFSET
140# define FPSR_OFF IA64_SIGCONTEXT_AR_FPSR_OFFSET
141# define PR_OFF IA64_SIGCONTEXT_PR_OFFSET
142# define RP_OFF IA64_SIGCONTEXT_IP_OFFSET
143# define SP_OFF IA64_SIGCONTEXT_R12_OFFSET
144# define RBS_BASE_OFF IA64_SIGCONTEXT_RBS_BASE_OFFSET
145# define LOADRS_OFF IA64_SIGCONTEXT_LOADRS_OFFSET
146# define base0 r2
147# define base1 r3
148 /*
149 * When we get here, the memory stack looks like this:
150 *
151 * +===============================+
152 * | |
153 * // struct sigframe //
154 * | |
155 * +-------------------------------+ <-- sp+16
156 * | 16 byte of scratch |
157 * | space |
158 * +-------------------------------+ <-- sp
159 *
160 * The register stack looks _exactly_ the way it looked at the time the signal
161 * occurred. In other words, we're treading on a potential mine-field: each
162 * incoming general register may be a NaT value (including sp, in which case the
163 * process ends up dying with a SIGSEGV).
164 *
165 * The first thing we need to do is a cover to get the registers onto the backing
166 * store. Once that is done, we invoke the signal handler which may modify some
167 * of the machine state. After returning from the signal handler, we return
168 * control to the previous context by executing a sigreturn system call. A signal
169 * handler may call the rt_sigreturn() function to directly return to a given
170 * sigcontext. However, the user-level sigreturn() needs to do much more than
171 * calling the rt_sigreturn() system call as it needs to unwind the stack to
172 * restore preserved registers that may have been saved on the signal handler's
173 * call stack.
174 */
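
A rough C-level sketch of the control flow the comment above describes (illustrative only; the real work of covering the frame and switching backing stores is done by the assembly below, and syscall()/siginfo_t are used here purely as user-space stand-ins):

	/* what __kernel_sigtramp effectively does, seen from C */
	static void sigtramp_flow(int signum, siginfo_t *info, struct sigcontext *sc,
				  void (*handler)(int, siginfo_t *, void *))
	{
		/* "cover" + optional switch to the alternate register backing store (setup_rbs) */
		handler(signum, info, sc);	/* arg0/arg1/arg2 loaded from the sigframe */
		/* optional switch back to the old backing store (restore_rbs) */
		syscall(__NR_rt_sigreturn);	/* return to the interrupted context */
	}
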
175
176#define SIGTRAMP_SAVES \
177 .unwabi 3, 's'; /* mark this as a sigtramp handler (saves scratch regs) */ \
178 .unwabi @svr4, 's'; /* backwards compatibility with old unwinders (remove in v2.7) */ \
179 .savesp ar.unat, UNAT_OFF+SIGCONTEXT_OFF; \
180 .savesp ar.fpsr, FPSR_OFF+SIGCONTEXT_OFF; \
181 .savesp pr, PR_OFF+SIGCONTEXT_OFF; \
182 .savesp rp, RP_OFF+SIGCONTEXT_OFF; \
183 .savesp ar.pfs, CFM_OFF+SIGCONTEXT_OFF; \
184 .vframesp SP_OFF+SIGCONTEXT_OFF
185
186GLOBAL_ENTRY(__kernel_sigtramp)
187 // describe the state that is active when we get here:
188 .prologue
189 SIGTRAMP_SAVES
190 .body
191
192 .label_state 1
193
194 adds base0=SIGHANDLER_OFF,sp
195 adds base1=RBS_BASE_OFF+SIGCONTEXT_OFF,sp
196 br.call.sptk.many rp=1f
1971:
198 ld8 r17=[base0],(ARG0_OFF-SIGHANDLER_OFF) // get pointer to signal handler's plabel
199 ld8 r15=[base1] // get address of new RBS base (or NULL)
200 cover // push args in interrupted frame onto backing store
201 ;;
202 cmp.ne p1,p0=r15,r0 // do we need to switch rbs? (note: pr is saved by kernel)
203 mov.m r9=ar.bsp // fetch ar.bsp
204 .spillsp.p p1, ar.rnat, RNAT_OFF+SIGCONTEXT_OFF
205(p1) br.cond.spnt setup_rbs // yup -> (clobbers p8, r14-r16, and r18-r20)
206back_from_setup_rbs:
207 alloc r8=ar.pfs,0,0,3,0
208 ld8 out0=[base0],16 // load arg0 (signum)
209 adds base1=(ARG1_OFF-(RBS_BASE_OFF+SIGCONTEXT_OFF)),base1
210 ;;
211 ld8 out1=[base1] // load arg1 (siginfop)
212 ld8 r10=[r17],8 // get signal handler entry point
213 ;;
214 ld8 out2=[base0] // load arg2 (sigcontextp)
215 ld8 gp=[r17] // get signal handler's global pointer
216 adds base0=(BSP_OFF+SIGCONTEXT_OFF),sp
217 ;;
218 .spillsp ar.bsp, BSP_OFF+SIGCONTEXT_OFF
219 st8 [base0]=r9 // save sc_ar_bsp
220 adds base0=(FR6_OFF+SIGCONTEXT_OFF),sp
221 adds base1=(FR6_OFF+16+SIGCONTEXT_OFF),sp
222 ;;
223 stf.spill [base0]=f6,32
224 stf.spill [base1]=f7,32
225 ;;
226 stf.spill [base0]=f8,32
227 stf.spill [base1]=f9,32
228 mov b6=r10
229 ;;
230 stf.spill [base0]=f10,32
231 stf.spill [base1]=f11,32
232 ;;
233 stf.spill [base0]=f12,32
234 stf.spill [base1]=f13,32
235 ;;
236 stf.spill [base0]=f14,32
237 stf.spill [base1]=f15,32
238 br.call.sptk.many rp=b6 // call the signal handler
239.ret0: adds base0=(BSP_OFF+SIGCONTEXT_OFF),sp
240 ;;
241 ld8 r15=[base0] // fetch sc_ar_bsp
242 mov r14=ar.bsp
243 ;;
244 cmp.ne p1,p0=r14,r15 // do we need to restore the rbs?
245(p1) br.cond.spnt restore_rbs // yup -> (clobbers r14-r18, f6 & f7)
246 ;;
247back_from_restore_rbs:
248 adds base0=(FR6_OFF+SIGCONTEXT_OFF),sp
249 adds base1=(FR6_OFF+16+SIGCONTEXT_OFF),sp
250 ;;
251 ldf.fill f6=[base0],32
252 ldf.fill f7=[base1],32
253 ;;
254 ldf.fill f8=[base0],32
255 ldf.fill f9=[base1],32
256 ;;
257 ldf.fill f10=[base0],32
258 ldf.fill f11=[base1],32
259 ;;
260 ldf.fill f12=[base0],32
261 ldf.fill f13=[base1],32
262 ;;
263 ldf.fill f14=[base0],32
264 ldf.fill f15=[base1],32
265 mov r15=__NR_rt_sigreturn
266 .restore sp // pop .prologue
267 break __BREAK_SYSCALL
268
269 .prologue
270 SIGTRAMP_SAVES
271setup_rbs:
272 mov ar.rsc=0 // put RSE into enforced lazy mode
273 ;;
274 .save ar.rnat, r19
275 mov r19=ar.rnat // save RNaT before switching backing store area
276 adds r14=(RNAT_OFF+SIGCONTEXT_OFF),sp
277
278 mov r18=ar.bspstore
279 mov ar.bspstore=r15 // switch over to new register backing store area
280 ;;
281
282 .spillsp ar.rnat, RNAT_OFF+SIGCONTEXT_OFF
283 st8 [r14]=r19 // save sc_ar_rnat
284 .body
285 mov.m r16=ar.bsp // sc_loadrs <- (new bsp - new bspstore) << 16
286 adds r14=(LOADRS_OFF+SIGCONTEXT_OFF),sp
287 ;;
288 invala
289 sub r15=r16,r15
290 extr.u r20=r18,3,6
291 ;;
292 mov ar.rsc=0xf // set RSE into eager mode, pl 3
293 cmp.eq p8,p0=63,r20
294 shl r15=r15,16
295 ;;
296 st8 [r14]=r15 // save sc_loadrs
297(p8) st8 [r18]=r19 // if bspstore points at RNaT slot, store RNaT there now
298 .restore sp // pop .prologue
299 br.cond.sptk back_from_setup_rbs
300
301 .prologue
302 SIGTRAMP_SAVES
303 .spillsp ar.rnat, RNAT_OFF+SIGCONTEXT_OFF
304 .body
305restore_rbs:
306 // On input:
307 // r14 = bsp1 (bsp at the time of return from signal handler)
308 // r15 = bsp0 (bsp at the time the signal occurred)
309 //
310 // Here, we need to calculate bspstore0, the value that ar.bspstore needs
311 // to be set to, based on bsp0 and the size of the dirty partition on
312 // the alternate stack (sc_loadrs >> 16). This can be done with the
313 // following algorithm:
314 //
315 // bspstore0 = rse_skip_regs(bsp0, -rse_num_regs(bsp1 - (loadrs >> 19), bsp1));
316 //
317 // This is what the code below does.
318 //
319 alloc r2=ar.pfs,0,0,0,0 // alloc null frame
320 adds r16=(LOADRS_OFF+SIGCONTEXT_OFF),sp
321 adds r18=(RNAT_OFF+SIGCONTEXT_OFF),sp
322 ;;
323 ld8 r17=[r16]
324 ld8 r16=[r18] // get new rnat
325 extr.u r18=r15,3,6 // r18 <- rse_slot_num(bsp0)
326 ;;
327 mov ar.rsc=r17 // put RSE into enforced lazy mode
328 shr.u r17=r17,16
329 ;;
330 sub r14=r14,r17 // r14 (bspstore1) <- bsp1 - (sc_loadrs >> 16)
331 shr.u r17=r17,3 // r17 <- (sc_loadrs >> 19)
332 ;;
333 loadrs // restore dirty partition
334 extr.u r14=r14,3,6 // r14 <- rse_slot_num(bspstore1)
335 ;;
336 add r14=r14,r17 // r14 <- rse_slot_num(bspstore1) + (sc_loadrs >> 19)
337 ;;
338 shr.u r14=r14,6 // r14 <- (rse_slot_num(bspstore1) + (sc_loadrs >> 19))/0x40
339 ;;
340 sub r14=r14,r17 // r14 <- -rse_num_regs(bspstore1, bsp1)
341 movl r17=0x8208208208208209
342 ;;
343 add r18=r18,r14 // r18 (delta) <- rse_slot_num(bsp0) - rse_num_regs(bspstore1,bsp1)
344 setf.sig f7=r17
345 cmp.lt p7,p0=r14,r0 // p7 <- (r14 < 0)?
346 ;;
347(p7) adds r18=-62,r18 // delta -= 62
348 ;;
349 setf.sig f6=r18
350 ;;
351 xmpy.h f6=f6,f7
352 ;;
353 getf.sig r17=f6
354 ;;
355 add r17=r17,r18
356 shr r18=r18,63
357 ;;
358 shr r17=r17,5
359 ;;
360 sub r17=r17,r18 // r17 = delta/63
361 ;;
362 add r17=r14,r17 // r17 <- delta/63 - rse_num_regs(bspstore1, bsp1)
363 ;;
364 shladd r15=r17,3,r15 // r15 <- bsp0 + 8*(delta/63 - rse_num_regs(bspstore1, bsp1))
365 ;;
366 mov ar.bspstore=r15 // switch back to old register backing store area
367 ;;
368 mov ar.rnat=r16 // restore RNaT
369 mov ar.rsc=0xf // (will be restored later on from sc_ar_rsc)
370 // invala not necessary as that will happen when returning to user-mode
371 br.cond.sptk back_from_restore_rbs
372END(__kernel_sigtramp)
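
The restore_rbs arithmetic above is easier to follow in C. The sketch below restates it with helpers modeled on the rse_skip_regs()/rse_num_regs() names used in the comment (a sketch under those naming assumptions, not a drop-in replacement for the assembly):

	/* one NaT-collection slot is interleaved after every 63 register slots */
	static unsigned long *rse_skip_regs(unsigned long *addr, long num_regs)
	{
		long delta = (((unsigned long) addr >> 3) & 0x3f) + num_regs;

		if (num_regs < 0)
			delta -= 62;			/* the "(p7) adds r18=-62,r18" step */
		return addr + num_regs + delta/63;	/* the divide-by-63 done via xmpy.h above */
	}

	static long rse_num_regs(unsigned long *bspstore, unsigned long *bsp)
	{
		long slots = bsp - bspstore;

		return slots - ((((unsigned long) bspstore >> 3) & 0x3f) + slots)/64;
	}

	/* bspstore0 = rse_skip_regs(bsp0, -rse_num_regs(bsp1 - (loadrs >> 19), bsp1)) */
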
diff --git a/arch/ia64/kernel/gate.lds.S b/arch/ia64/kernel/gate.lds.S
new file mode 100644
index 000000000000..e1e4aba9ecd0
--- /dev/null
+++ b/arch/ia64/kernel/gate.lds.S
@@ -0,0 +1,95 @@
1/*
2 * Linker script for the gate DSO. The gate pages form an ELF shared object prelinked to its
3 * virtual address, with only one read-only segment and one execute-only segment (both fit
4 * in one page). This script controls its layout.
5 */
6
7#include <linux/config.h>
8
9#include <asm/system.h>
10
11SECTIONS
12{
13 . = GATE_ADDR + SIZEOF_HEADERS;
14
15 .hash : { *(.hash) } :readable
16 .dynsym : { *(.dynsym) }
17 .dynstr : { *(.dynstr) }
18 .gnu.version : { *(.gnu.version) }
19 .gnu.version_d : { *(.gnu.version_d) }
20 .gnu.version_r : { *(.gnu.version_r) }
21 .dynamic : { *(.dynamic) } :readable :dynamic
22
23 /*
24 * This linker script is used both with -r and with -shared. For the layouts to match,
25 * we need to skip more than enough space for the dynamic symbol table et al. If this
26 * amount is insufficient, ld -shared will barf. Just increase it here.
27 */
28 . = GATE_ADDR + 0x500;
29
30 .data.patch : {
31 __start_gate_mckinley_e9_patchlist = .;
32 *(.data.patch.mckinley_e9)
33 __end_gate_mckinley_e9_patchlist = .;
34
35 __start_gate_vtop_patchlist = .;
36 *(.data.patch.vtop)
37 __end_gate_vtop_patchlist = .;
38
39 __start_gate_fsyscall_patchlist = .;
40 *(.data.patch.fsyscall_table)
41 __end_gate_fsyscall_patchlist = .;
42
43 __start_gate_brl_fsys_bubble_down_patchlist = .;
44 *(.data.patch.brl_fsys_bubble_down)
45 __end_gate_brl_fsys_bubble_down_patchlist = .;
46 } :readable
47 .IA_64.unwind_info : { *(.IA_64.unwind_info*) }
48 .IA_64.unwind : { *(.IA_64.unwind*) } :readable :unwind
49#ifdef HAVE_BUGGY_SEGREL
50 .text (GATE_ADDR + PAGE_SIZE) : { *(.text) *(.text.*) } :readable
51#else
52 . = ALIGN (PERCPU_PAGE_SIZE) + (. & (PERCPU_PAGE_SIZE - 1));
53 .text : { *(.text) *(.text.*) } :epc
54#endif
55
56 /DISCARD/ : {
57 *(.got.plt) *(.got)
58 *(.data .data.* .gnu.linkonce.d.*)
59 *(.dynbss)
60 *(.bss .bss.* .gnu.linkonce.b.*)
61 *(__ex_table)
62 }
63}
64
65/*
66 * We must supply the ELF program headers explicitly to get just one
67 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
68 */
69PHDRS
70{
71 readable PT_LOAD FILEHDR PHDRS FLAGS(4); /* PF_R */
72#ifndef HAVE_BUGGY_SEGREL
73 epc PT_LOAD FILEHDR PHDRS FLAGS(1); /* PF_X */
74#endif
75 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
76 unwind 0x70000001; /* PT_IA_64_UNWIND, but ld doesn't match the name */
77}
78
79/*
80 * This controls what symbols we export from the DSO.
81 */
82VERSION
83{
84 LINUX_2.5 {
85 global:
86 __kernel_syscall_via_break;
87 __kernel_syscall_via_epc;
88 __kernel_sigtramp;
89
90 local: *;
91 };
92}
93
94/* The ELF entry point can be used to set the AT_SYSINFO value. */
95ENTRY(__kernel_syscall_via_epc)
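
As the closing comment notes, the DSO's entry point is what ends up advertised to user space. Purely as an illustration of the consumer side (getauxval() is a later glibc convenience and an assumption here, not something provided by this kernel), a process could locate the gate entry like this:

	#include <sys/auxv.h>

	/* address of __kernel_syscall_via_epc as published via the auxiliary vector */
	unsigned long gate_entry = getauxval(AT_SYSINFO);
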
diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S
new file mode 100644
index 000000000000..105c7fec8c6d
--- /dev/null
+++ b/arch/ia64/kernel/head.S
@@ -0,0 +1,996 @@
1/*
2 * Here is where the ball gets rolling as far as the kernel is concerned.
3 * When control is transferred to _start, the bootloader has already
4 * loaded us to the correct address. All that's left to do here is
5 * to set up the kernel's global pointer and jump to the kernel
6 * entry point.
7 *
8 * Copyright (C) 1998-2001, 2003, 2005 Hewlett-Packard Co
9 * David Mosberger-Tang <davidm@hpl.hp.com>
10 * Stephane Eranian <eranian@hpl.hp.com>
11 * Copyright (C) 1999 VA Linux Systems
12 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
13 * Copyright (C) 1999 Intel Corp.
14 * Copyright (C) 1999 Asit Mallick <Asit.K.Mallick@intel.com>
15 * Copyright (C) 1999 Don Dugger <Don.Dugger@intel.com>
16 * Copyright (C) 2002 Fenghua Yu <fenghua.yu@intel.com>
17 * -Optimize __ia64_save_fpu() and __ia64_load_fpu() for Itanium 2.
18 */
19
20#include <linux/config.h>
21
22#include <asm/asmmacro.h>
23#include <asm/fpu.h>
24#include <asm/kregs.h>
25#include <asm/mmu_context.h>
26#include <asm/offsets.h>
27#include <asm/pal.h>
28#include <asm/pgtable.h>
29#include <asm/processor.h>
30#include <asm/ptrace.h>
31#include <asm/system.h>
32
33 .section __special_page_section,"ax"
34
35 .global empty_zero_page
36empty_zero_page:
37 .skip PAGE_SIZE
38
39 .global swapper_pg_dir
40swapper_pg_dir:
41 .skip PAGE_SIZE
42
43 .rodata
44halt_msg:
45 stringz "Halting kernel\n"
46
47 .text
48
49 .global start_ap
50
51 /*
52 * Start the kernel. When the bootloader passes control to _start(), r28
53 * points to the address of the boot parameter area. Execution reaches
54 * here in physical mode.
55 */
56GLOBAL_ENTRY(_start)
57start_ap:
58 .prologue
59 .save rp, r0 // terminate unwind chain with a NULL rp
60 .body
61
62 rsm psr.i | psr.ic
63 ;;
64 srlz.i
65 ;;
66 /*
67 * Initialize kernel region registers:
68 * rr[0]: VHPT enabled, page size = PAGE_SHIFT
69 * rr[1]: VHPT enabled, page size = PAGE_SHIFT
70 * rr[2]: VHPT enabled, page size = PAGE_SHIFT
71 * rr[3]: VHPT enabled, page size = PAGE_SHIFT
72 * rr[4]: VHPT enabled, page size = PAGE_SHIFT
73 * rr[5]: VHPT enabled, page size = PAGE_SHIFT
74 * rr[6]: VHPT disabled, page size = IA64_GRANULE_SHIFT
75 * rr[7]: VHPT disabled, page size = IA64_GRANULE_SHIFT
76 * We initialize all of them to prevent inadvertently assuming
77 * something about the state of address translation early in boot.
78 */
79 mov r6=((ia64_rid(IA64_REGION_ID_KERNEL, (0<<61)) << 8) | (PAGE_SHIFT << 2) | 1)
80 movl r7=(0<<61)
81 mov r8=((ia64_rid(IA64_REGION_ID_KERNEL, (1<<61)) << 8) | (PAGE_SHIFT << 2) | 1)
82 movl r9=(1<<61)
83 mov r10=((ia64_rid(IA64_REGION_ID_KERNEL, (2<<61)) << 8) | (PAGE_SHIFT << 2) | 1)
84 movl r11=(2<<61)
85 mov r12=((ia64_rid(IA64_REGION_ID_KERNEL, (3<<61)) << 8) | (PAGE_SHIFT << 2) | 1)
86 movl r13=(3<<61)
87 mov r14=((ia64_rid(IA64_REGION_ID_KERNEL, (4<<61)) << 8) | (PAGE_SHIFT << 2) | 1)
88 movl r15=(4<<61)
89 mov r16=((ia64_rid(IA64_REGION_ID_KERNEL, (5<<61)) << 8) | (PAGE_SHIFT << 2) | 1)
90 movl r17=(5<<61)
91 mov r18=((ia64_rid(IA64_REGION_ID_KERNEL, (6<<61)) << 8) | (IA64_GRANULE_SHIFT << 2))
92 movl r19=(6<<61)
93 mov r20=((ia64_rid(IA64_REGION_ID_KERNEL, (7<<61)) << 8) | (IA64_GRANULE_SHIFT << 2))
94 movl r21=(7<<61)
95 ;;
96 mov rr[r7]=r6
97 mov rr[r9]=r8
98 mov rr[r11]=r10
99 mov rr[r13]=r12
100 mov rr[r15]=r14
101 mov rr[r17]=r16
102 mov rr[r19]=r18
103 mov rr[r21]=r20
104 ;;
105 /*
106 * Now pin mappings into the TLB for kernel text and data
107 */
108 mov r18=KERNEL_TR_PAGE_SHIFT<<2
109 movl r17=KERNEL_START
110 ;;
111 mov cr.itir=r18
112 mov cr.ifa=r17
113 mov r16=IA64_TR_KERNEL
114 mov r3=ip
115 movl r18=PAGE_KERNEL
116 ;;
117 dep r2=0,r3,0,KERNEL_TR_PAGE_SHIFT
118 ;;
119 or r18=r2,r18
120 ;;
121 srlz.i
122 ;;
123 itr.i itr[r16]=r18
124 ;;
125 itr.d dtr[r16]=r18
126 ;;
127 srlz.i
128
129 /*
130 * Switch into virtual mode:
131 */
132 movl r16=(IA64_PSR_IT|IA64_PSR_IC|IA64_PSR_DT|IA64_PSR_RT|IA64_PSR_DFH|IA64_PSR_BN \
133 |IA64_PSR_DI)
134 ;;
135 mov cr.ipsr=r16
136 movl r17=1f
137 ;;
138 mov cr.iip=r17
139 mov cr.ifs=r0
140 ;;
141 rfi
142 ;;
1431: // now we are in virtual mode
144
145 // set IVT entry point---can't access I/O ports without it
146 movl r3=ia64_ivt
147 ;;
148 mov cr.iva=r3
149 movl r2=FPSR_DEFAULT
150 ;;
151 srlz.i
152 movl gp=__gp
153
154 mov ar.fpsr=r2
155 ;;
156
157#define isAP p2 // are we an Application Processor?
158#define isBP p3 // are we the Bootstrap Processor?
159
160#ifdef CONFIG_SMP
161 /*
162 * Find the init_task for the currently booting CPU. At poweron, and in
163 * UP mode, task_for_booting_cpu is NULL.
164 */
165 movl r3=task_for_booting_cpu
166 ;;
167 ld8 r3=[r3]
168 movl r2=init_task
169 ;;
170 cmp.eq isBP,isAP=r3,r0
171 ;;
172(isAP) mov r2=r3
173#else
174 movl r2=init_task
175 cmp.eq isBP,isAP=r0,r0
176#endif
177 ;;
178 tpa r3=r2 // r3 == phys addr of task struct
179 mov r16=-1
180(isBP) br.cond.dpnt .load_current // BP stack is on region 5 --- no need to map it
181
182 // load mapping for stack (virtaddr in r2, physaddr in r3)
183 rsm psr.ic
184 movl r17=PAGE_KERNEL
185 ;;
186 srlz.d
187 dep r18=0,r3,0,12
188 ;;
189 or r18=r17,r18
190 dep r2=-1,r3,61,3 // IMVA of task
191 ;;
192 mov r17=rr[r2]
193 shr.u r16=r3,IA64_GRANULE_SHIFT
194 ;;
195 dep r17=0,r17,8,24
196 ;;
197 mov cr.itir=r17
198 mov cr.ifa=r2
199
200 mov r19=IA64_TR_CURRENT_STACK
201 ;;
202 itr.d dtr[r19]=r18
203 ;;
204 ssm psr.ic
205 srlz.d
206 ;;
207
208.load_current:
209 // load the "current" pointer (r13) and ar.k6 with the current task
210 mov IA64_KR(CURRENT)=r2 // virtual address
211 mov IA64_KR(CURRENT_STACK)=r16
212 mov r13=r2
213 /*
214 * Reserve space at the top of the stack for "struct pt_regs". Kernel threads
215 * don't store interesting values in that structure, but the space still needs
216 * to be there because time-critical stuff such as the context switching can
217 * be implemented more efficiently (for example, __switch_to()
218 * always sets the psr.dfh bit of the task it is switching to).
219 */
220 addl r12=IA64_STK_OFFSET-IA64_PT_REGS_SIZE-16,r2
221 addl r2=IA64_RBS_OFFSET,r2 // initialize the RSE
222 mov ar.rsc=0 // place RSE in enforced lazy mode
223 ;;
224 loadrs // clear the dirty partition
225 ;;
226 mov ar.bspstore=r2 // establish the new RSE stack
227 ;;
228 mov ar.rsc=0x3 // place RSE in eager mode
229
230(isBP) dep r28=-1,r28,61,3 // make address virtual
231(isBP) movl r2=ia64_boot_param
232 ;;
233(isBP) st8 [r2]=r28 // save the address of the boot param area passed by the bootloader
234
235#ifdef CONFIG_SMP
236(isAP) br.call.sptk.many rp=start_secondary
237.ret0:
238(isAP) br.cond.sptk self
239#endif
240
241 // This is executed by the bootstrap processor (bsp) only:
242
243#ifdef CONFIG_IA64_FW_EMU
244 // initialize PAL & SAL emulator:
245 br.call.sptk.many rp=sys_fw_init
246.ret1:
247#endif
248 br.call.sptk.many rp=start_kernel
249.ret2: addl r3=@ltoff(halt_msg),gp
250 ;;
251 alloc r2=ar.pfs,8,0,2,0
252 ;;
253 ld8 out0=[r3]
254 br.call.sptk.many b0=console_print
255
256self: hint @pause
257 br.sptk.many self // endless loop
258END(_start)
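
For reference, each region-register value written near the top of _start is composed from three fields (bit 0 enables the VHPT walker, bits 2-7 hold the preferred page size, bits 8 and up hold the region ID). A minimal C restatement of the encoding used above:

	static unsigned long make_rr(unsigned long rid, unsigned long page_shift, int vhpt_on)
	{
		return (rid << 8) | (page_shift << 2) | (vhpt_on ? 1 : 0);
	}

	/* regions 0-5: make_rr(ia64_rid(IA64_REGION_ID_KERNEL, n << 61), PAGE_SHIFT, 1)
	 * regions 6-7: make_rr(ia64_rid(IA64_REGION_ID_KERNEL, n << 61), IA64_GRANULE_SHIFT, 0) */
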
259
260GLOBAL_ENTRY(ia64_save_debug_regs)
261 alloc r16=ar.pfs,1,0,0,0
262 mov r20=ar.lc // preserve ar.lc
263 mov ar.lc=IA64_NUM_DBG_REGS-1
264 mov r18=0
265 add r19=IA64_NUM_DBG_REGS*8,in0
266 ;;
2671: mov r16=dbr[r18]
268#ifdef CONFIG_ITANIUM
269 ;;
270 srlz.d
271#endif
272 mov r17=ibr[r18]
273 add r18=1,r18
274 ;;
275 st8.nta [in0]=r16,8
276 st8.nta [r19]=r17,8
277 br.cloop.sptk.many 1b
278 ;;
279 mov ar.lc=r20 // restore ar.lc
280 br.ret.sptk.many rp
281END(ia64_save_debug_regs)
282
283GLOBAL_ENTRY(ia64_load_debug_regs)
284 alloc r16=ar.pfs,1,0,0,0
285 lfetch.nta [in0]
286 mov r20=ar.lc // preserve ar.lc
287 add r19=IA64_NUM_DBG_REGS*8,in0
288 mov ar.lc=IA64_NUM_DBG_REGS-1
289 mov r18=-1
290 ;;
2911: ld8.nta r16=[in0],8
292 ld8.nta r17=[r19],8
293 add r18=1,r18
294 ;;
295 mov dbr[r18]=r16
296#ifdef CONFIG_ITANIUM
297 ;;
298 srlz.d // Errata 132 (NoFix status)
299#endif
300 mov ibr[r18]=r17
301 br.cloop.sptk.many 1b
302 ;;
303 mov ar.lc=r20 // restore ar.lc
304 br.ret.sptk.many rp
305END(ia64_load_debug_regs)
306
307GLOBAL_ENTRY(__ia64_save_fpu)
308 alloc r2=ar.pfs,1,4,0,0
309 adds loc0=96*16-16,in0
310 adds loc1=96*16-16-128,in0
311 ;;
312 stf.spill.nta [loc0]=f127,-256
313 stf.spill.nta [loc1]=f119,-256
314 ;;
315 stf.spill.nta [loc0]=f111,-256
316 stf.spill.nta [loc1]=f103,-256
317 ;;
318 stf.spill.nta [loc0]=f95,-256
319 stf.spill.nta [loc1]=f87,-256
320 ;;
321 stf.spill.nta [loc0]=f79,-256
322 stf.spill.nta [loc1]=f71,-256
323 ;;
324 stf.spill.nta [loc0]=f63,-256
325 stf.spill.nta [loc1]=f55,-256
326 adds loc2=96*16-32,in0
327 ;;
328 stf.spill.nta [loc0]=f47,-256
329 stf.spill.nta [loc1]=f39,-256
330 adds loc3=96*16-32-128,in0
331 ;;
332 stf.spill.nta [loc2]=f126,-256
333 stf.spill.nta [loc3]=f118,-256
334 ;;
335 stf.spill.nta [loc2]=f110,-256
336 stf.spill.nta [loc3]=f102,-256
337 ;;
338 stf.spill.nta [loc2]=f94,-256
339 stf.spill.nta [loc3]=f86,-256
340 ;;
341 stf.spill.nta [loc2]=f78,-256
342 stf.spill.nta [loc3]=f70,-256
343 ;;
344 stf.spill.nta [loc2]=f62,-256
345 stf.spill.nta [loc3]=f54,-256
346 adds loc0=96*16-48,in0
347 ;;
348 stf.spill.nta [loc2]=f46,-256
349 stf.spill.nta [loc3]=f38,-256
350 adds loc1=96*16-48-128,in0
351 ;;
352 stf.spill.nta [loc0]=f125,-256
353 stf.spill.nta [loc1]=f117,-256
354 ;;
355 stf.spill.nta [loc0]=f109,-256
356 stf.spill.nta [loc1]=f101,-256
357 ;;
358 stf.spill.nta [loc0]=f93,-256
359 stf.spill.nta [loc1]=f85,-256
360 ;;
361 stf.spill.nta [loc0]=f77,-256
362 stf.spill.nta [loc1]=f69,-256
363 ;;
364 stf.spill.nta [loc0]=f61,-256
365 stf.spill.nta [loc1]=f53,-256
366 adds loc2=96*16-64,in0
367 ;;
368 stf.spill.nta [loc0]=f45,-256
369 stf.spill.nta [loc1]=f37,-256
370 adds loc3=96*16-64-128,in0
371 ;;
372 stf.spill.nta [loc2]=f124,-256
373 stf.spill.nta [loc3]=f116,-256
374 ;;
375 stf.spill.nta [loc2]=f108,-256
376 stf.spill.nta [loc3]=f100,-256
377 ;;
378 stf.spill.nta [loc2]=f92,-256
379 stf.spill.nta [loc3]=f84,-256
380 ;;
381 stf.spill.nta [loc2]=f76,-256
382 stf.spill.nta [loc3]=f68,-256
383 ;;
384 stf.spill.nta [loc2]=f60,-256
385 stf.spill.nta [loc3]=f52,-256
386 adds loc0=96*16-80,in0
387 ;;
388 stf.spill.nta [loc2]=f44,-256
389 stf.spill.nta [loc3]=f36,-256
390 adds loc1=96*16-80-128,in0
391 ;;
392 stf.spill.nta [loc0]=f123,-256
393 stf.spill.nta [loc1]=f115,-256
394 ;;
395 stf.spill.nta [loc0]=f107,-256
396 stf.spill.nta [loc1]=f99,-256
397 ;;
398 stf.spill.nta [loc0]=f91,-256
399 stf.spill.nta [loc1]=f83,-256
400 ;;
401 stf.spill.nta [loc0]=f75,-256
402 stf.spill.nta [loc1]=f67,-256
403 ;;
404 stf.spill.nta [loc0]=f59,-256
405 stf.spill.nta [loc1]=f51,-256
406 adds loc2=96*16-96,in0
407 ;;
408 stf.spill.nta [loc0]=f43,-256
409 stf.spill.nta [loc1]=f35,-256
410 adds loc3=96*16-96-128,in0
411 ;;
412 stf.spill.nta [loc2]=f122,-256
413 stf.spill.nta [loc3]=f114,-256
414 ;;
415 stf.spill.nta [loc2]=f106,-256
416 stf.spill.nta [loc3]=f98,-256
417 ;;
418 stf.spill.nta [loc2]=f90,-256
419 stf.spill.nta [loc3]=f82,-256
420 ;;
421 stf.spill.nta [loc2]=f74,-256
422 stf.spill.nta [loc3]=f66,-256
423 ;;
424 stf.spill.nta [loc2]=f58,-256
425 stf.spill.nta [loc3]=f50,-256
426 adds loc0=96*16-112,in0
427 ;;
428 stf.spill.nta [loc2]=f42,-256
429 stf.spill.nta [loc3]=f34,-256
430 adds loc1=96*16-112-128,in0
431 ;;
432 stf.spill.nta [loc0]=f121,-256
433 stf.spill.nta [loc1]=f113,-256
434 ;;
435 stf.spill.nta [loc0]=f105,-256
436 stf.spill.nta [loc1]=f97,-256
437 ;;
438 stf.spill.nta [loc0]=f89,-256
439 stf.spill.nta [loc1]=f81,-256
440 ;;
441 stf.spill.nta [loc0]=f73,-256
442 stf.spill.nta [loc1]=f65,-256
443 ;;
444 stf.spill.nta [loc0]=f57,-256
445 stf.spill.nta [loc1]=f49,-256
446 adds loc2=96*16-128,in0
447 ;;
448 stf.spill.nta [loc0]=f41,-256
449 stf.spill.nta [loc1]=f33,-256
450 adds loc3=96*16-128-128,in0
451 ;;
452 stf.spill.nta [loc2]=f120,-256
453 stf.spill.nta [loc3]=f112,-256
454 ;;
455 stf.spill.nta [loc2]=f104,-256
456 stf.spill.nta [loc3]=f96,-256
457 ;;
458 stf.spill.nta [loc2]=f88,-256
459 stf.spill.nta [loc3]=f80,-256
460 ;;
461 stf.spill.nta [loc2]=f72,-256
462 stf.spill.nta [loc3]=f64,-256
463 ;;
464 stf.spill.nta [loc2]=f56,-256
465 stf.spill.nta [loc3]=f48,-256
466 ;;
467 stf.spill.nta [loc2]=f40
468 stf.spill.nta [loc3]=f32
469 br.ret.sptk.many rp
470END(__ia64_save_fpu)
471
472GLOBAL_ENTRY(__ia64_load_fpu)
473 alloc r2=ar.pfs,1,2,0,0
474 adds r3=128,in0
475 adds r14=256,in0
476 adds r15=384,in0
477 mov loc0=512
478 mov loc1=-1024+16
479 ;;
480 ldf.fill.nta f32=[in0],loc0
481 ldf.fill.nta f40=[ r3],loc0
482 ldf.fill.nta f48=[r14],loc0
483 ldf.fill.nta f56=[r15],loc0
484 ;;
485 ldf.fill.nta f64=[in0],loc0
486 ldf.fill.nta f72=[ r3],loc0
487 ldf.fill.nta f80=[r14],loc0
488 ldf.fill.nta f88=[r15],loc0
489 ;;
490 ldf.fill.nta f96=[in0],loc1
491 ldf.fill.nta f104=[ r3],loc1
492 ldf.fill.nta f112=[r14],loc1
493 ldf.fill.nta f120=[r15],loc1
494 ;;
495 ldf.fill.nta f33=[in0],loc0
496 ldf.fill.nta f41=[ r3],loc0
497 ldf.fill.nta f49=[r14],loc0
498 ldf.fill.nta f57=[r15],loc0
499 ;;
500 ldf.fill.nta f65=[in0],loc0
501 ldf.fill.nta f73=[ r3],loc0
502 ldf.fill.nta f81=[r14],loc0
503 ldf.fill.nta f89=[r15],loc0
504 ;;
505 ldf.fill.nta f97=[in0],loc1
506 ldf.fill.nta f105=[ r3],loc1
507 ldf.fill.nta f113=[r14],loc1
508 ldf.fill.nta f121=[r15],loc1
509 ;;
510 ldf.fill.nta f34=[in0],loc0
511 ldf.fill.nta f42=[ r3],loc0
512 ldf.fill.nta f50=[r14],loc0
513 ldf.fill.nta f58=[r15],loc0
514 ;;
515 ldf.fill.nta f66=[in0],loc0
516 ldf.fill.nta f74=[ r3],loc0
517 ldf.fill.nta f82=[r14],loc0
518 ldf.fill.nta f90=[r15],loc0
519 ;;
520 ldf.fill.nta f98=[in0],loc1
521 ldf.fill.nta f106=[ r3],loc1
522 ldf.fill.nta f114=[r14],loc1
523 ldf.fill.nta f122=[r15],loc1
524 ;;
525 ldf.fill.nta f35=[in0],loc0
526 ldf.fill.nta f43=[ r3],loc0
527 ldf.fill.nta f51=[r14],loc0
528 ldf.fill.nta f59=[r15],loc0
529 ;;
530 ldf.fill.nta f67=[in0],loc0
531 ldf.fill.nta f75=[ r3],loc0
532 ldf.fill.nta f83=[r14],loc0
533 ldf.fill.nta f91=[r15],loc0
534 ;;
535 ldf.fill.nta f99=[in0],loc1
536 ldf.fill.nta f107=[ r3],loc1
537 ldf.fill.nta f115=[r14],loc1
538 ldf.fill.nta f123=[r15],loc1
539 ;;
540 ldf.fill.nta f36=[in0],loc0
541 ldf.fill.nta f44=[ r3],loc0
542 ldf.fill.nta f52=[r14],loc0
543 ldf.fill.nta f60=[r15],loc0
544 ;;
545 ldf.fill.nta f68=[in0],loc0
546 ldf.fill.nta f76=[ r3],loc0
547 ldf.fill.nta f84=[r14],loc0
548 ldf.fill.nta f92=[r15],loc0
549 ;;
550 ldf.fill.nta f100=[in0],loc1
551 ldf.fill.nta f108=[ r3],loc1
552 ldf.fill.nta f116=[r14],loc1
553 ldf.fill.nta f124=[r15],loc1
554 ;;
555 ldf.fill.nta f37=[in0],loc0
556 ldf.fill.nta f45=[ r3],loc0
557 ldf.fill.nta f53=[r14],loc0
558 ldf.fill.nta f61=[r15],loc0
559 ;;
560 ldf.fill.nta f69=[in0],loc0
561 ldf.fill.nta f77=[ r3],loc0
562 ldf.fill.nta f85=[r14],loc0
563 ldf.fill.nta f93=[r15],loc0
564 ;;
565 ldf.fill.nta f101=[in0],loc1
566 ldf.fill.nta f109=[ r3],loc1
567 ldf.fill.nta f117=[r14],loc1
568 ldf.fill.nta f125=[r15],loc1
569 ;;
570 ldf.fill.nta f38 =[in0],loc0
571 ldf.fill.nta f46 =[ r3],loc0
572 ldf.fill.nta f54 =[r14],loc0
573 ldf.fill.nta f62 =[r15],loc0
574 ;;
575 ldf.fill.nta f70 =[in0],loc0
576 ldf.fill.nta f78 =[ r3],loc0
577 ldf.fill.nta f86 =[r14],loc0
578 ldf.fill.nta f94 =[r15],loc0
579 ;;
580 ldf.fill.nta f102=[in0],loc1
581 ldf.fill.nta f110=[ r3],loc1
582 ldf.fill.nta f118=[r14],loc1
583 ldf.fill.nta f126=[r15],loc1
584 ;;
585 ldf.fill.nta f39 =[in0],loc0
586 ldf.fill.nta f47 =[ r3],loc0
587 ldf.fill.nta f55 =[r14],loc0
588 ldf.fill.nta f63 =[r15],loc0
589 ;;
590 ldf.fill.nta f71 =[in0],loc0
591 ldf.fill.nta f79 =[ r3],loc0
592 ldf.fill.nta f87 =[r14],loc0
593 ldf.fill.nta f95 =[r15],loc0
594 ;;
595 ldf.fill.nta f103=[in0]
596 ldf.fill.nta f111=[ r3]
597 ldf.fill.nta f119=[r14]
598 ldf.fill.nta f127=[r15]
599 br.ret.sptk.many rp
600END(__ia64_load_fpu)
601
602GLOBAL_ENTRY(__ia64_init_fpu)
603 stf.spill [sp]=f0 // M3
604 mov f32=f0 // F
605 nop.b 0
606
607 ldfps f33,f34=[sp] // M0
608 ldfps f35,f36=[sp] // M1
609 mov f37=f0 // F
610 ;;
611
612 setf.s f38=r0 // M2
613 setf.s f39=r0 // M3
614 mov f40=f0 // F
615
616 ldfps f41,f42=[sp] // M0
617 ldfps f43,f44=[sp] // M1
618 mov f45=f0 // F
619
620 setf.s f46=r0 // M2
621 setf.s f47=r0 // M3
622 mov f48=f0 // F
623
624 ldfps f49,f50=[sp] // M0
625 ldfps f51,f52=[sp] // M1
626 mov f53=f0 // F
627
628 setf.s f54=r0 // M2
629 setf.s f55=r0 // M3
630 mov f56=f0 // F
631
632 ldfps f57,f58=[sp] // M0
633 ldfps f59,f60=[sp] // M1
634 mov f61=f0 // F
635
636 setf.s f62=r0 // M2
637 setf.s f63=r0 // M3
638 mov f64=f0 // F
639
640 ldfps f65,f66=[sp] // M0
641 ldfps f67,f68=[sp] // M1
642 mov f69=f0 // F
643
644 setf.s f70=r0 // M2
645 setf.s f71=r0 // M3
646 mov f72=f0 // F
647
648 ldfps f73,f74=[sp] // M0
649 ldfps f75,f76=[sp] // M1
650 mov f77=f0 // F
651
652 setf.s f78=r0 // M2
653 setf.s f79=r0 // M3
654 mov f80=f0 // F
655
656 ldfps f81,f82=[sp] // M0
657 ldfps f83,f84=[sp] // M1
658 mov f85=f0 // F
659
660 setf.s f86=r0 // M2
661 setf.s f87=r0 // M3
662 mov f88=f0 // F
663
664 /*
665 * When the instructions are cached, it would be faster to initialize
666 * the remaining registers with simple mov instructions (F-unit).
667 * This gets the time down to ~29 cycles. However, this would use up
668 * 33 bundles, whereas continuing with the above pattern yields
669 * 10 bundles and ~30 cycles.
670 */
671
672 ldfps f89,f90=[sp] // M0
673 ldfps f91,f92=[sp] // M1
674 mov f93=f0 // F
675
676 setf.s f94=r0 // M2
677 setf.s f95=r0 // M3
678 mov f96=f0 // F
679
680 ldfps f97,f98=[sp] // M0
681 ldfps f99,f100=[sp] // M1
682 mov f101=f0 // F
683
684 setf.s f102=r0 // M2
685 setf.s f103=r0 // M3
686 mov f104=f0 // F
687
688 ldfps f105,f106=[sp] // M0
689 ldfps f107,f108=[sp] // M1
690 mov f109=f0 // F
691
692 setf.s f110=r0 // M2
693 setf.s f111=r0 // M3
694 mov f112=f0 // F
695
696 ldfps f113,f114=[sp] // M0
697 ldfps f115,f116=[sp] // M1
698 mov f117=f0 // F
699
700 setf.s f118=r0 // M2
701 setf.s f119=r0 // M3
702 mov f120=f0 // F
703
704 ldfps f121,f122=[sp] // M0
705 ldfps f123,f124=[sp] // M1
706 mov f125=f0 // F
707
708 setf.s f126=r0 // M2
709 setf.s f127=r0 // M3
710 br.ret.sptk.many rp // F
711END(__ia64_init_fpu)
712
713/*
714 * Switch execution mode from virtual to physical
715 *
716 * Inputs:
717 * r16 = new psr to establish
718 * Output:
719 * r19 = old virtual address of ar.bsp
720 * r20 = old virtual address of sp
721 *
722 * Note: RSE must already be in enforced lazy mode
723 */
724GLOBAL_ENTRY(ia64_switch_mode_phys)
725 {
726 alloc r2=ar.pfs,0,0,0,0
727 rsm psr.i | psr.ic // disable interrupts and interrupt collection
728 mov r15=ip
729 }
730 ;;
731 {
732 flushrs // must be first insn in group
733 srlz.i
734 }
735 ;;
736 mov cr.ipsr=r16 // set new PSR
737 add r3=1f-ia64_switch_mode_phys,r15
738
739 mov r19=ar.bsp
740 mov r20=sp
741 mov r14=rp // get return address into a general register
742 ;;
743
744 // going to physical mode, use tpa to translate virt->phys
745 tpa r17=r19
746 tpa r3=r3
747 tpa sp=sp
748 tpa r14=r14
749 ;;
750
751 mov r18=ar.rnat // save ar.rnat
752 mov ar.bspstore=r17 // this steps on ar.rnat
753 mov cr.iip=r3
754 mov cr.ifs=r0
755 ;;
756 mov ar.rnat=r18 // restore ar.rnat
757 rfi // must be last insn in group
758 ;;
7591: mov rp=r14
760 br.ret.sptk.many rp
761END(ia64_switch_mode_phys)
762
763/*
764 * Switch execution mode from physical to virtual
765 *
766 * Inputs:
767 * r16 = new psr to establish
768 * r19 = new bspstore to establish
769 * r20 = new sp to establish
770 *
771 * Note: RSE must already be in enforced lazy mode
772 */
773GLOBAL_ENTRY(ia64_switch_mode_virt)
774 {
775 alloc r2=ar.pfs,0,0,0,0
776 rsm psr.i | psr.ic // disable interrupts and interrupt collection
777 mov r15=ip
778 }
779 ;;
780 {
781 flushrs // must be first insn in group
782 srlz.i
783 }
784 ;;
785 mov cr.ipsr=r16 // set new PSR
786 add r3=1f-ia64_switch_mode_virt,r15
787
788 mov r14=rp // get return address into a general register
789 ;;
790
791 // going to virtual
792 // - for code addresses, set upper bits of addr to KERNEL_START
793 // - for stack addresses, copy from input argument
794 movl r18=KERNEL_START
795 dep r3=0,r3,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT
796 dep r14=0,r14,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT
797 mov sp=r20
798 ;;
799 or r3=r3,r18
800 or r14=r14,r18
801 ;;
802
803 mov r18=ar.rnat // save ar.rnat
804 mov ar.bspstore=r19 // this steps on ar.rnat
805 mov cr.iip=r3
806 mov cr.ifs=r0
807 ;;
808 mov ar.rnat=r18 // restore ar.rnat
809 rfi // must be last insn in group
810 ;;
8111: mov rp=r14
812 br.ret.sptk.many rp
813END(ia64_switch_mode_virt)
814
815GLOBAL_ENTRY(ia64_delay_loop)
816 .prologue
817{ nop 0 // work around GAS unwind info generation bug...
818 .save ar.lc,r2
819 mov r2=ar.lc
820 .body
821 ;;
822 mov ar.lc=r32
823}
824 ;;
825 // force loop to be 32-byte aligned (GAS bug means we cannot use .align
826 // inside function body without corrupting unwind info).
827{ nop 0 }
8281: br.cloop.sptk.few 1b
829 ;;
830 mov ar.lc=r2
831 br.ret.sptk.many rp
832END(ia64_delay_loop)
833
834/*
835 * Return a CPU-local timestamp in nano-seconds. This timestamp is
836 * NOT synchronized across CPUs, so its return value must never be
837 * compared against the values returned on another CPU. The usage in
838 * kernel/sched.c ensures that.
839 *
840 * The return value of sched_clock() is NOT supposed to wrap around.
841 * If it did, it would cause some scheduling hiccups (at the worst).
842 * Fortunately, with a 64-bit cycle-counter ticking at 100GHz, even
843 * that would happen only once every 5+ years.
844 *
845 * The code below basically calculates:
846 *
847 * (ia64_get_itc() * local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT
848 *
849 * except that the multiplication and the shift are done with 128-bit
850 * intermediate precision so that we can produce a full 64-bit result.
851 */
852GLOBAL_ENTRY(sched_clock)
853 addl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0
854 mov.m r9=ar.itc // fetch cycle-counter (35 cyc)
855 ;;
856 ldf8 f8=[r8]
857 ;;
858 setf.sig f9=r9 // certain to stall, so issue it _after_ ldf8...
859 ;;
860 xmpy.lu f10=f9,f8 // calculate low 64 bits of 128-bit product (4 cyc)
861 xmpy.hu f11=f9,f8 // calculate high 64 bits of 128-bit product
862 ;;
863 getf.sig r8=f10 // (5 cyc)
864 getf.sig r9=f11
865 ;;
866 shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT
867 br.ret.sptk.many rp
868END(sched_clock)
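
The same computation in C, using a 128-bit intermediate just as the xmpy.lu/xmpy.hu pair does above (a sketch only; compiler support for unsigned __int128 is assumed):

	static unsigned long sched_clock_sketch(unsigned long itc, unsigned long nsec_per_cyc)
	{
		unsigned __int128 prod = (unsigned __int128) itc * nsec_per_cyc;

		return (unsigned long) (prod >> IA64_NSEC_PER_CYC_SHIFT);	/* the final shrp */
	}
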
869
870GLOBAL_ENTRY(start_kernel_thread)
871 .prologue
872 .save rp, r0 // this is the end of the call-chain
873 .body
874 alloc r2 = ar.pfs, 0, 0, 2, 0
875 mov out0 = r9
876 mov out1 = r11;;
877 br.call.sptk.many rp = kernel_thread_helper;;
878 mov out0 = r8
879 br.call.sptk.many rp = sys_exit;;
8801: br.sptk.few 1b // not reached
881END(start_kernel_thread)
882
883#ifdef CONFIG_IA64_BRL_EMU
884
885/*
886 * Assembly routines used by brl_emu.c to set preserved register state.
887 */
888
889#define SET_REG(reg) \
890 GLOBAL_ENTRY(ia64_set_##reg); \
891 alloc r16=ar.pfs,1,0,0,0; \
892 mov reg=r32; \
893 ;; \
894 br.ret.sptk.many rp; \
895 END(ia64_set_##reg)
896
897SET_REG(b1);
898SET_REG(b2);
899SET_REG(b3);
900SET_REG(b4);
901SET_REG(b5);
902
903#endif /* CONFIG_IA64_BRL_EMU */
904
905#ifdef CONFIG_SMP
906 /*
907 * This routine handles spinlock contention. It uses a non-standard calling
908 * convention to avoid converting leaf routines into interior routines. Because
909 * of this special convention, there are several restrictions:
910 *
911 * - do not use gp relative variables, this code is called from the kernel
912 * and from modules, r1 is undefined.
913 * - do not use stacked registers, the caller owns them.
914 * - do not use the scratch stack space, the caller owns it.
915 * - do not use any registers other than the ones listed below
916 *
917 * Inputs:
918 * ar.pfs - saved CFM of caller
919 * ar.ccv - 0 (and available for use)
920 * r27 - flags from spin_lock_irqsave or 0. Must be preserved.
921 * r28 - available for use.
922 * r29 - available for use.
923 * r30 - available for use.
924 * r31 - address of lock, available for use.
925 * b6 - return address
926 * p14 - available for use.
927 * p15 - used to track flag status.
928 *
929 * If you patch this code to use more registers, do not forget to update
930 * the clobber lists for spin_lock() in include/asm-ia64/spinlock.h.
931 */
932
933#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
934
935GLOBAL_ENTRY(ia64_spinlock_contention_pre3_4)
936 .prologue
937 .save ar.pfs, r0 // this code effectively has a zero frame size
938 .save rp, r28
939 .body
940 nop 0
941 tbit.nz p15,p0=r27,IA64_PSR_I_BIT
942 .restore sp // pop existing prologue after next insn
943 mov b6 = r28
944 .prologue
945 .save ar.pfs, r0
946 .altrp b6
947 .body
948 ;;
949(p15) ssm psr.i // reenable interrupts if they were on
950 // DavidM says that srlz.d is slow and is not required in this case
951.wait:
952 // exponential backoff, kdb, lockmeter etc. go in here
953 hint @pause
954 ld4 r30=[r31] // don't use ld4.bias; if it's contended, we won't write the word
955 nop 0
956 ;;
957 cmp4.ne p14,p0=r30,r0
958(p14) br.cond.sptk.few .wait
959(p15) rsm psr.i // disable interrupts if we reenabled them
960 br.cond.sptk.few b6 // lock is now free, try to acquire
961 .global ia64_spinlock_contention_pre3_4_end // for kernprof
962ia64_spinlock_contention_pre3_4_end:
963END(ia64_spinlock_contention_pre3_4)
964
965#else
966
967GLOBAL_ENTRY(ia64_spinlock_contention)
968 .prologue
969 .altrp b6
970 .body
971 tbit.nz p15,p0=r27,IA64_PSR_I_BIT
972 ;;
973.wait:
974(p15) ssm psr.i // reenable interrupts if they were on
975 // DavidM says that srlz.d is slow and is not required in this case
976.wait2:
977 // exponential backoff, kdb, lockmeter etc. go in here
978 hint @pause
979 ld4 r30=[r31] // don't use ld4.bias; if it's contended, we won't write the word
980 ;;
981 cmp4.ne p14,p0=r30,r0
982 mov r30 = 1
983(p14) br.cond.sptk.few .wait2
984(p15) rsm psr.i // disable interrupts if we reenabled them
985 ;;
986 cmpxchg4.acq r30=[r31], r30, ar.ccv
987 ;;
988 cmp4.ne p14,p0=r0,r30
989(p14) br.cond.sptk.few .wait
990
991 br.ret.sptk.many b6 // lock is now taken
992END(ia64_spinlock_contention)
993
994#endif
995
996#endif /* CONFIG_SMP */
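
A rough C sketch of what the newer (gcc >= 3.3) spinlock-contention routine above does. The primitives below are generic stand-ins, not the kernel's spinlock implementation (__sync_val_compare_and_swap() in particular is a later GCC builtin used only for illustration), and the older pre-3.4 variant instead returns to the caller to retry the acquire once the lock word reads free:

	static void spinlock_contention_sketch(volatile unsigned int *lock, int irqs_were_on)
	{
		do {
			if (irqs_were_on)
				local_irq_enable();	/* "(p15) ssm psr.i" */
			while (*lock != 0)
				cpu_relax();		/* "hint @pause" busy-wait */
			if (irqs_were_on)
				local_irq_disable();	/* "(p15) rsm psr.i" */
		} while (__sync_val_compare_and_swap(lock, 0, 1) != 0);	/* cmpxchg4.acq */
	}
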
diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c
new file mode 100644
index 000000000000..7bbf019c9867
--- /dev/null
+++ b/arch/ia64/kernel/ia64_ksyms.c
@@ -0,0 +1,127 @@
1/*
2 * Architecture-specific kernel symbols
3 *
4 * Don't put any exports here unless it's defined in an assembler file.
5 * All other exports should be put directly after the definition.
6 */
7
8#include <linux/config.h>
9#include <linux/module.h>
10
11#include <linux/string.h>
12EXPORT_SYMBOL(memset);
13EXPORT_SYMBOL(memchr);
14EXPORT_SYMBOL(memcmp);
15EXPORT_SYMBOL(memcpy);
16EXPORT_SYMBOL(memmove);
17EXPORT_SYMBOL(memscan);
18EXPORT_SYMBOL(strcat);
19EXPORT_SYMBOL(strchr);
20EXPORT_SYMBOL(strcmp);
21EXPORT_SYMBOL(strcpy);
22EXPORT_SYMBOL(strlen);
23EXPORT_SYMBOL(strncat);
24EXPORT_SYMBOL(strncmp);
25EXPORT_SYMBOL(strncpy);
26EXPORT_SYMBOL(strnlen);
27EXPORT_SYMBOL(strrchr);
28EXPORT_SYMBOL(strstr);
29EXPORT_SYMBOL(strpbrk);
30
31#include <asm/checksum.h>
32EXPORT_SYMBOL(ip_fast_csum); /* hand-coded assembly */
33
34#include <asm/semaphore.h>
35EXPORT_SYMBOL(__down);
36EXPORT_SYMBOL(__down_interruptible);
37EXPORT_SYMBOL(__down_trylock);
38EXPORT_SYMBOL(__up);
39
40#include <asm/page.h>
41EXPORT_SYMBOL(clear_page);
42
43#ifdef CONFIG_VIRTUAL_MEM_MAP
44#include <linux/bootmem.h>
45EXPORT_SYMBOL(max_low_pfn); /* defined by bootmem.c, but not exported by generic code */
46#endif
47
48#include <asm/processor.h>
49EXPORT_SYMBOL(per_cpu__cpu_info);
50#ifdef CONFIG_SMP
51EXPORT_SYMBOL(per_cpu__local_per_cpu_offset);
52#endif
53
54#include <asm/uaccess.h>
55EXPORT_SYMBOL(__copy_user);
56EXPORT_SYMBOL(__do_clear_user);
57EXPORT_SYMBOL(__strlen_user);
58EXPORT_SYMBOL(__strncpy_from_user);
59EXPORT_SYMBOL(__strnlen_user);
60
61#include <asm/unistd.h>
62EXPORT_SYMBOL(__ia64_syscall);
63
64/* from arch/ia64/lib */
65extern void __divsi3(void);
66extern void __udivsi3(void);
67extern void __modsi3(void);
68extern void __umodsi3(void);
69extern void __divdi3(void);
70extern void __udivdi3(void);
71extern void __moddi3(void);
72extern void __umoddi3(void);
73
74EXPORT_SYMBOL(__divsi3);
75EXPORT_SYMBOL(__udivsi3);
76EXPORT_SYMBOL(__modsi3);
77EXPORT_SYMBOL(__umodsi3);
78EXPORT_SYMBOL(__divdi3);
79EXPORT_SYMBOL(__udivdi3);
80EXPORT_SYMBOL(__moddi3);
81EXPORT_SYMBOL(__umoddi3);
82
83#if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE)
84extern void xor_ia64_2(void);
85extern void xor_ia64_3(void);
86extern void xor_ia64_4(void);
87extern void xor_ia64_5(void);
88
89EXPORT_SYMBOL(xor_ia64_2);
90EXPORT_SYMBOL(xor_ia64_3);
91EXPORT_SYMBOL(xor_ia64_4);
92EXPORT_SYMBOL(xor_ia64_5);
93#endif
94
95#include <asm/pal.h>
96EXPORT_SYMBOL(ia64_pal_call_phys_stacked);
97EXPORT_SYMBOL(ia64_pal_call_phys_static);
98EXPORT_SYMBOL(ia64_pal_call_stacked);
99EXPORT_SYMBOL(ia64_pal_call_static);
100EXPORT_SYMBOL(ia64_load_scratch_fpregs);
101EXPORT_SYMBOL(ia64_save_scratch_fpregs);
102
103#include <asm/unwind.h>
104EXPORT_SYMBOL(unw_init_running);
105
106#ifdef ASM_SUPPORTED
107# ifdef CONFIG_SMP
108# if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
109/*
110 * This is not a normal routine and we don't want a function descriptor for it, so we use
111 * a fake declaration here.
112 */
113extern char ia64_spinlock_contention_pre3_4;
114EXPORT_SYMBOL(ia64_spinlock_contention_pre3_4);
115# else
116/*
117 * This is not a normal routine and we don't want a function descriptor for it, so we use
118 * a fake declaration here.
119 */
120extern char ia64_spinlock_contention;
121EXPORT_SYMBOL(ia64_spinlock_contention);
122# endif
123# endif
124#endif
125
126extern char ia64_ivt[];
127EXPORT_SYMBOL(ia64_ivt);
diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c
new file mode 100644
index 000000000000..b69c397ed1bf
--- /dev/null
+++ b/arch/ia64/kernel/init_task.c
@@ -0,0 +1,46 @@
1/*
2 * This is where we statically allocate and initialize the initial
3 * task.
4 *
5 * Copyright (C) 1999, 2002-2003 Hewlett-Packard Co
6 * David Mosberger-Tang <davidm@hpl.hp.com>
7 */
8
9#include <linux/init.h>
10#include <linux/mm.h>
11#include <linux/module.h>
12#include <linux/sched.h>
13#include <linux/init_task.h>
14#include <linux/mqueue.h>
15
16#include <asm/uaccess.h>
17#include <asm/pgtable.h>
18
19static struct fs_struct init_fs = INIT_FS;
20static struct files_struct init_files = INIT_FILES;
21static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
22static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
23struct mm_struct init_mm = INIT_MM(init_mm);
24
25EXPORT_SYMBOL(init_mm);
26
27/*
28 * Initial task structure.
29 *
30 * We need to make sure that this is properly aligned due to the way process stacks are
31 * handled. This is done by having a special ".data.init_task" section...
32 */
33#define init_thread_info init_task_mem.s.thread_info
34
35union {
36 struct {
37 struct task_struct task;
38 struct thread_info thread_info;
39 } s;
40 unsigned long stack[KERNEL_STACK_SIZE/sizeof (unsigned long)];
41} init_task_mem asm ("init_task") __attribute__((section(".data.init_task"))) = {{
42 .task = INIT_TASK(init_task_mem.s.task),
43 .thread_info = INIT_THREAD_INFO(init_task_mem.s.task)
44}};
45
46EXPORT_SYMBOL(init_task);
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
new file mode 100644
index 000000000000..c15be5c38f56
--- /dev/null
+++ b/arch/ia64/kernel/iosapic.c
@@ -0,0 +1,827 @@
1/*
2 * I/O SAPIC support.
3 *
4 * Copyright (C) 1999 Intel Corp.
5 * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
6 * Copyright (C) 2000-2002 J.I. Lee <jung-ik.lee@intel.com>
7 * Copyright (C) 1999-2000, 2002-2003 Hewlett-Packard Co.
8 * David Mosberger-Tang <davidm@hpl.hp.com>
9 * Copyright (C) 1999 VA Linux Systems
10 * Copyright (C) 1999,2000 Walt Drummond <drummond@valinux.com>
11 *
12 * 00/04/19 D. Mosberger Rewritten to mirror more closely the x86 I/O APIC code.
13 * In particular, we now have separate handlers for edge
14 * and level triggered interrupts.
15 * 00/10/27 Asit Mallick, Goutham Rao <goutham.rao@intel.com> IRQ vector allocation
16 * PCI to vector mapping, shared PCI interrupts.
17 * 00/10/27 D. Mosberger Document things a bit more to make them more understandable.
18 * Clean up much of the old IOSAPIC cruft.
19 * 01/07/27 J.I. Lee PCI irq routing, Platform/Legacy interrupts and fixes for
20 * ACPI S5(SoftOff) support.
21 * 02/01/23 J.I. Lee iosapic pgm fixes for PCI irq routing from _PRT
22 * 02/01/07 E. Focht <efocht@ess.nec.de> Redirectable interrupt vectors in
23 * iosapic_set_affinity(), initializations for
24 * /proc/irq/#/smp_affinity
25 * 02/04/02 P. Diefenbaugh Cleaned up ACPI PCI IRQ routing.
26 * 02/04/18 J.I. Lee bug fix in iosapic_init_pci_irq
27 * 02/04/30 J.I. Lee bug fix in find_iosapic to fix ACPI PCI IRQ to IOSAPIC mapping
28 * error
29 * 02/07/29 T. Kochi Allocate interrupt vectors dynamically
30 * 02/08/04 T. Kochi Cleaned up terminology (irq, global system interrupt, vector, etc.)
31 * 02/09/20 D. Mosberger Simplified by taking advantage of ACPI's pci_irq code.
32 * 03/02/19 B. Helgaas Make pcat_compat system-wide, not per-IOSAPIC.
33 * Remove iosapic_address & gsi_base from external interfaces.
34 * Rationalize __init/__devinit attributes.
35 * 04/12/04 Ashok Raj <ashok.raj@intel.com> Intel Corporation 2004
36 * Updated to work with irq migration necessary for CPU Hotplug
37 */
38/*
39 * Here is what the interrupt logic between a PCI device and the kernel looks like:
40 *
41 * (1) A PCI device raises one of the four interrupt pins (INTA, INTB, INTC, INTD). The
42 * device is uniquely identified by its bus- and slot-number (the function
43 * number does not matter here because all functions share the same interrupt
44 * lines).
45 *
46 * (2) The motherboard routes the interrupt line to a pin on an IOSAPIC controller.
47 * Multiple interrupt lines may have to share the same IOSAPIC pin (if they're level
48 * triggered and use the same polarity). Each interrupt line has a unique Global
49 * System Interrupt (GSI) number which can be calculated as the sum of the controller's
50 * base GSI number and the IOSAPIC pin number to which the line connects.
51 *
52 * (3) The IOSAPIC uses its internal routing table entries (RTEs) to map the IOSAPIC pin
53 * into the IA-64 interrupt vector. This interrupt vector is then sent to the CPU.
54 *
55 * (4) The kernel recognizes an interrupt as an IRQ. The IRQ interface is used as the
56 * architecture-independent interrupt handling mechanism in Linux. As an
57 * IRQ is a number, we have to have an IA-64 interrupt vector number <-> IRQ number
58 * mapping. On smaller systems, we use a one-to-one mapping between IA-64 vector and
59 * IRQ. A platform can implement platform_irq_to_vector(irq) and
60 * platform_local_vector_to_irq(vector) APIs to differentiate the mapping.
61 * Please see also include/asm-ia64/hw_irq.h for those APIs.
62 *
63 * To sum up, there are three levels of mappings involved:
64 *
65 * PCI pin -> global system interrupt (GSI) -> IA-64 vector <-> IRQ
66 *
67 * Note: The term "IRQ" is loosely used everywhere in the Linux kernel to describe interrupts.
68 * Now we use "IRQ" only for Linux IRQ's. ISA IRQ (isa_irq) is the only exception in this
69 * source code.
70 */
71#include <linux/config.h>
72
73#include <linux/acpi.h>
74#include <linux/init.h>
75#include <linux/irq.h>
76#include <linux/kernel.h>
77#include <linux/list.h>
78#include <linux/pci.h>
79#include <linux/smp.h>
80#include <linux/smp_lock.h>
81#include <linux/string.h>
82
83#include <asm/delay.h>
84#include <asm/hw_irq.h>
85#include <asm/io.h>
86#include <asm/iosapic.h>
87#include <asm/machvec.h>
88#include <asm/processor.h>
89#include <asm/ptrace.h>
90#include <asm/system.h>
91
92
93#undef DEBUG_INTERRUPT_ROUTING
94
95#ifdef DEBUG_INTERRUPT_ROUTING
96#define DBG(fmt...) printk(fmt)
97#else
98#define DBG(fmt...)
99#endif
100
101static DEFINE_SPINLOCK(iosapic_lock);
102
103/* These tables map IA-64 vectors to the IOSAPIC pin that generates this vector. */
104
105static struct iosapic_intr_info {
106 char __iomem *addr; /* base address of IOSAPIC */
107 u32 low32; /* current value of low word of Redirection table entry */
108 unsigned int gsi_base; /* first GSI assigned to this IOSAPIC */
109 char rte_index; /* IOSAPIC RTE index (-1 => not an IOSAPIC interrupt) */
110 unsigned char dmode : 3; /* delivery mode (see iosapic.h) */
111 unsigned char polarity: 1; /* interrupt polarity (see iosapic.h) */
112 unsigned char trigger : 1; /* trigger mode (see iosapic.h) */
113 int refcnt; /* reference counter */
114} iosapic_intr_info[IA64_NUM_VECTORS];
115
116static struct iosapic {
117 char __iomem *addr; /* base address of IOSAPIC */
118 unsigned int gsi_base; /* first GSI assigned to this IOSAPIC */
119 unsigned short num_rte; /* number of RTE in this IOSAPIC */
120#ifdef CONFIG_NUMA
121 unsigned short node; /* numa node association via pxm */
122#endif
123} iosapic_lists[NR_IOSAPICS];
124
125static int num_iosapic;
126
127static unsigned char pcat_compat __initdata; /* 8259 compatibility flag */
128
129
130/*
131 * Find an IOSAPIC associated with a GSI
132 */
133static inline int
134find_iosapic (unsigned int gsi)
135{
136 int i;
137
138 for (i = 0; i < num_iosapic; i++) {
139 if ((unsigned) (gsi - iosapic_lists[i].gsi_base) < iosapic_lists[i].num_rte)
140 return i;
141 }
142
143 return -1;
144}
145
146static inline int
147_gsi_to_vector (unsigned int gsi)
148{
149 struct iosapic_intr_info *info;
150
151 for (info = iosapic_intr_info; info < iosapic_intr_info + IA64_NUM_VECTORS; ++info)
152 if (info->gsi_base + info->rte_index == gsi)
153 return info - iosapic_intr_info;
154 return -1;
155}
156
157/*
158 * Translate GSI number to the corresponding IA-64 interrupt vector. If no
159 * entry exists, return -1.
160 */
161inline int
162gsi_to_vector (unsigned int gsi)
163{
164 return _gsi_to_vector(gsi);
165}
166
167int
168gsi_to_irq (unsigned int gsi)
169{
170 /*
171 * XXX fix me: this assumes an identity mapping between IA-64 vector and Linux irq
172 * numbers...
173 */
174 return _gsi_to_vector(gsi);
175}
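
Tying this back to the three-level mapping described in the comment at the top of this file, a small illustrative helper (hypothetical, not part of this file; 'which' indexes iosapic_lists and 'pin' is the IOSAPIC pin the line is wired to):

	static int example_pin_to_irq(int which, unsigned int pin)
	{
		unsigned int gsi = iosapic_lists[which].gsi_base + pin;	/* interrupt line -> GSI */
		int vector = gsi_to_vector(gsi);			/* GSI -> IA-64 vector */

		if (vector < 0)
			return -1;					/* no RTE programmed for this GSI */
		return gsi_to_irq(gsi);					/* vector <-> Linux IRQ (identity here) */
	}
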
176
177static void
178set_rte (unsigned int vector, unsigned int dest, int mask)
179{
180 unsigned long pol, trigger, dmode;
181 u32 low32, high32;
182 char __iomem *addr;
183 int rte_index;
184 char redir;
185
186 DBG(KERN_DEBUG"IOSAPIC: routing vector %d to 0x%x\n", vector, dest);
187
188 rte_index = iosapic_intr_info[vector].rte_index;
189 if (rte_index < 0)
190 return; /* not an IOSAPIC interrupt */
191
192 addr = iosapic_intr_info[vector].addr;
193 pol = iosapic_intr_info[vector].polarity;
194 trigger = iosapic_intr_info[vector].trigger;
195 dmode = iosapic_intr_info[vector].dmode;
196 vector &= (~IA64_IRQ_REDIRECTED);
197
198 redir = (dmode == IOSAPIC_LOWEST_PRIORITY) ? 1 : 0;
199
200#ifdef CONFIG_SMP
201 {
202 unsigned int irq;
203
204 for (irq = 0; irq < NR_IRQS; ++irq)
205 if (irq_to_vector(irq) == vector) {
206 set_irq_affinity_info(irq, (int)(dest & 0xffff), redir);
207 break;
208 }
209 }
210#endif
211
212 low32 = ((pol << IOSAPIC_POLARITY_SHIFT) |
213 (trigger << IOSAPIC_TRIGGER_SHIFT) |
214 (dmode << IOSAPIC_DELIVERY_SHIFT) |
215 ((mask ? 1 : 0) << IOSAPIC_MASK_SHIFT) |
216 vector);
217
218 /* dest contains both id and eid */
219 high32 = (dest << IOSAPIC_DEST_SHIFT);
220
221 iosapic_write(addr, IOSAPIC_RTE_HIGH(rte_index), high32);
222 iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32);
223 iosapic_intr_info[vector].low32 = low32;
224}
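
For illustration, here is how the two RTE words assembled in set_rte() fit together. This is a sketch restating the code above and in iosapic_register_platform_intr() below, not part of the patch; the helper names are hypothetical, while the IOSAPIC_*_SHIFT macros are the same ones the file already uses from <asm/iosapic.h>.

static inline u32 rte_low_sketch(unsigned long pol, unsigned long trigger,
				 unsigned long dmode, int mask, unsigned char vector)
{
	/* polarity, trigger mode, delivery mode, mask bit and vector share the low word */
	return (pol << IOSAPIC_POLARITY_SHIFT) |
	       (trigger << IOSAPIC_TRIGGER_SHIFT) |
	       (dmode << IOSAPIC_DELIVERY_SHIFT) |
	       ((mask ? 1 : 0) << IOSAPIC_MASK_SHIFT) |
	       vector;
}

static inline u32 rte_high_sketch(unsigned int id, unsigned int eid)
{
	/* "dest" packs the LSAPIC id and eid ((id << 8) | eid) and occupies the high word */
	return (((id << 8) | eid) & 0xffff) << IOSAPIC_DEST_SHIFT;
}
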
225
226static void
227nop (unsigned int vector)
228{
229 /* do nothing... */
230}
231
232static void
233mask_irq (unsigned int irq)
234{
235 unsigned long flags;
236 char __iomem *addr;
237 u32 low32;
238 int rte_index;
239 ia64_vector vec = irq_to_vector(irq);
240
241 addr = iosapic_intr_info[vec].addr;
242 rte_index = iosapic_intr_info[vec].rte_index;
243
244 if (rte_index < 0)
245 return; /* not an IOSAPIC interrupt! */
246
247 spin_lock_irqsave(&iosapic_lock, flags);
248 {
249 /* set only the mask bit */
250 low32 = iosapic_intr_info[vec].low32 |= IOSAPIC_MASK;
251 iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32);
252 }
253 spin_unlock_irqrestore(&iosapic_lock, flags);
254}
255
256static void
257unmask_irq (unsigned int irq)
258{
259 unsigned long flags;
260 char __iomem *addr;
261 u32 low32;
262 int rte_index;
263 ia64_vector vec = irq_to_vector(irq);
264
265 addr = iosapic_intr_info[vec].addr;
266 rte_index = iosapic_intr_info[vec].rte_index;
267 if (rte_index < 0)
268 return; /* not an IOSAPIC interrupt! */
269
270 spin_lock_irqsave(&iosapic_lock, flags);
271 {
272 low32 = iosapic_intr_info[vec].low32 &= ~IOSAPIC_MASK;
273 iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32);
274 }
275 spin_unlock_irqrestore(&iosapic_lock, flags);
276}
277
278
279static void
280iosapic_set_affinity (unsigned int irq, cpumask_t mask)
281{
282#ifdef CONFIG_SMP
283 unsigned long flags;
284 u32 high32, low32;
285 int dest, rte_index;
286 char __iomem *addr;
287 int redir = (irq & IA64_IRQ_REDIRECTED) ? 1 : 0;
288 ia64_vector vec;
289
290 irq &= (~IA64_IRQ_REDIRECTED);
291 vec = irq_to_vector(irq);
292
293 if (cpus_empty(mask))
294 return;
295
296 dest = cpu_physical_id(first_cpu(mask));
297
298 rte_index = iosapic_intr_info[vec].rte_index;
299 addr = iosapic_intr_info[vec].addr;
300
301 if (rte_index < 0)
302 return; /* not an IOSAPIC interrupt */
303
304 set_irq_affinity_info(irq, dest, redir);
305
306 /* dest contains both id and eid */
307 high32 = dest << IOSAPIC_DEST_SHIFT;
308
309 spin_lock_irqsave(&iosapic_lock, flags);
310 {
311 low32 = iosapic_intr_info[vec].low32 & ~(7 << IOSAPIC_DELIVERY_SHIFT);
312
313 if (redir)
314 /* change delivery mode to lowest priority */
315 low32 |= (IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT);
316 else
317 /* change delivery mode to fixed */
318 low32 |= (IOSAPIC_FIXED << IOSAPIC_DELIVERY_SHIFT);
319
320 iosapic_intr_info[vec].low32 = low32;
321 iosapic_write(addr, IOSAPIC_RTE_HIGH(rte_index), high32);
322 iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32);
323 }
324 spin_unlock_irqrestore(&iosapic_lock, flags);
325#endif
326}
327
328/*
329 * Handlers for level-triggered interrupts.
330 */
331
332static unsigned int
333iosapic_startup_level_irq (unsigned int irq)
334{
335 unmask_irq(irq);
336 return 0;
337}
338
339static void
340iosapic_end_level_irq (unsigned int irq)
341{
342 ia64_vector vec = irq_to_vector(irq);
343
344 move_irq(irq);
345 iosapic_eoi(iosapic_intr_info[vec].addr, vec);
346}
347
348#define iosapic_shutdown_level_irq mask_irq
349#define iosapic_enable_level_irq unmask_irq
350#define iosapic_disable_level_irq mask_irq
351#define iosapic_ack_level_irq nop
352
353struct hw_interrupt_type irq_type_iosapic_level = {
354 .typename = "IO-SAPIC-level",
355 .startup = iosapic_startup_level_irq,
356 .shutdown = iosapic_shutdown_level_irq,
357 .enable = iosapic_enable_level_irq,
358 .disable = iosapic_disable_level_irq,
359 .ack = iosapic_ack_level_irq,
360 .end = iosapic_end_level_irq,
361 .set_affinity = iosapic_set_affinity
362};
363
364/*
365 * Handlers for edge-triggered interrupts.
366 */
367
368static unsigned int
369iosapic_startup_edge_irq (unsigned int irq)
370{
371 unmask_irq(irq);
372 /*
373 * IOSAPIC simply drops interrupts pended while the
374 * corresponding pin was masked, so we can't know if an
375 * interrupt is pending already. Let's hope not...
376 */
377 return 0;
378}
379
380static void
381iosapic_ack_edge_irq (unsigned int irq)
382{
383 irq_desc_t *idesc = irq_descp(irq);
384
385 move_irq(irq);
386 /*
387 * Once we have recorded IRQ_PENDING already, we can mask the
388 * interrupt for real. This prevents IRQ storms from unhandled
389 * devices.
390 */
391 if ((idesc->status & (IRQ_PENDING|IRQ_DISABLED)) == (IRQ_PENDING|IRQ_DISABLED))
392 mask_irq(irq);
393}
394
395#define iosapic_enable_edge_irq unmask_irq
396#define iosapic_disable_edge_irq nop
397#define iosapic_end_edge_irq nop
398
399struct hw_interrupt_type irq_type_iosapic_edge = {
400 .typename = "IO-SAPIC-edge",
401 .startup = iosapic_startup_edge_irq,
402 .shutdown = iosapic_disable_edge_irq,
403 .enable = iosapic_enable_edge_irq,
404 .disable = iosapic_disable_edge_irq,
405 .ack = iosapic_ack_edge_irq,
406 .end = iosapic_end_edge_irq,
407 .set_affinity = iosapic_set_affinity
408};
409
410unsigned int
411iosapic_version (char __iomem *addr)
412{
413 /*
414	 * The IOSAPIC Version Register returns a 32-bit structure like:
415 * {
416 * unsigned int version : 8;
417 * unsigned int reserved1 : 8;
418 * unsigned int max_redir : 8;
419 * unsigned int reserved2 : 8;
420 * }
421 */
422 return iosapic_read(addr, IOSAPIC_VERSION);
423}
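
As a small illustration of the register layout documented above (the helper names below are hypothetical, not part of the patch): the low byte is the IOSAPIC version and byte 2 is MAX_REDIR, the highest RTE index, which is exactly what iosapic_init() later turns into the RTE count.

static inline unsigned int iosapic_max_redir_sketch(char __iomem *addr)
{
	/* bits 16..23 of the version register hold the highest input pin number */
	return (iosapic_version(addr) >> 16) & 0xff;
}

static inline unsigned int iosapic_num_rte_sketch(char __iomem *addr)
{
	/* pins are numbered from 0, so the pin count is max_redir + 1 */
	return iosapic_max_redir_sketch(addr) + 1;
}
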
424
425/*
426 * If the given vector is already in use, move its current owner to a newly
427 * assigned vector so that the given vector becomes available.
428 */
429static void __init
430iosapic_reassign_vector (int vector)
431{
432 int new_vector;
433
434 if (iosapic_intr_info[vector].rte_index >= 0 || iosapic_intr_info[vector].addr
435 || iosapic_intr_info[vector].gsi_base || iosapic_intr_info[vector].dmode
436 || iosapic_intr_info[vector].polarity || iosapic_intr_info[vector].trigger)
437 {
438 new_vector = assign_irq_vector(AUTO_ASSIGN);
439 printk(KERN_INFO "Reassigning vector %d to %d\n", vector, new_vector);
440 memcpy(&iosapic_intr_info[new_vector], &iosapic_intr_info[vector],
441 sizeof(struct iosapic_intr_info));
442 memset(&iosapic_intr_info[vector], 0, sizeof(struct iosapic_intr_info));
443 iosapic_intr_info[vector].rte_index = -1;
444 }
445}
446
447static void
448register_intr (unsigned int gsi, int vector, unsigned char delivery,
449 unsigned long polarity, unsigned long trigger)
450{
451 irq_desc_t *idesc;
452 struct hw_interrupt_type *irq_type;
453 int rte_index;
454 int index;
455 unsigned long gsi_base;
456 void __iomem *iosapic_address;
457
458 index = find_iosapic(gsi);
459 if (index < 0) {
460 printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n", __FUNCTION__, gsi);
461 return;
462 }
463
464 iosapic_address = iosapic_lists[index].addr;
465 gsi_base = iosapic_lists[index].gsi_base;
466
467 rte_index = gsi - gsi_base;
468 iosapic_intr_info[vector].rte_index = rte_index;
469 iosapic_intr_info[vector].polarity = polarity;
470 iosapic_intr_info[vector].dmode = delivery;
471 iosapic_intr_info[vector].addr = iosapic_address;
472 iosapic_intr_info[vector].gsi_base = gsi_base;
473 iosapic_intr_info[vector].trigger = trigger;
474 iosapic_intr_info[vector].refcnt++;
475
476 if (trigger == IOSAPIC_EDGE)
477 irq_type = &irq_type_iosapic_edge;
478 else
479 irq_type = &irq_type_iosapic_level;
480
481 idesc = irq_descp(vector);
482 if (idesc->handler != irq_type) {
483 if (idesc->handler != &no_irq_type)
484 printk(KERN_WARNING "%s: changing vector %d from %s to %s\n",
485 __FUNCTION__, vector, idesc->handler->typename, irq_type->typename);
486 idesc->handler = irq_type;
487 }
488}
489
490static unsigned int
491get_target_cpu (unsigned int gsi, int vector)
492{
493#ifdef CONFIG_SMP
494 static int cpu = -1;
495
496 /*
497 * If the platform supports redirection via XTP, let it
498 * distribute interrupts.
499 */
500 if (smp_int_redirect & SMP_IRQ_REDIRECTION)
501 return cpu_physical_id(smp_processor_id());
502
503 /*
504 * Some interrupts (ACPI SCI, for instance) are registered
505 * before the BSP is marked as online.
506 */
507 if (!cpu_online(smp_processor_id()))
508 return cpu_physical_id(smp_processor_id());
509
510#ifdef CONFIG_NUMA
511 {
512 int num_cpus, cpu_index, iosapic_index, numa_cpu, i = 0;
513 cpumask_t cpu_mask;
514
515 iosapic_index = find_iosapic(gsi);
516 if (iosapic_index < 0 ||
517 iosapic_lists[iosapic_index].node == MAX_NUMNODES)
518 goto skip_numa_setup;
519
520 cpu_mask = node_to_cpumask(iosapic_lists[iosapic_index].node);
521
522 for_each_cpu_mask(numa_cpu, cpu_mask) {
523 if (!cpu_online(numa_cpu))
524 cpu_clear(numa_cpu, cpu_mask);
525 }
526
527 num_cpus = cpus_weight(cpu_mask);
528
529 if (!num_cpus)
530 goto skip_numa_setup;
531
532		/* Use vector assignment to distribute across cpus in node */
533 cpu_index = vector % num_cpus;
534
535 for (numa_cpu = first_cpu(cpu_mask) ; i < cpu_index ; i++)
536 numa_cpu = next_cpu(numa_cpu, cpu_mask);
537
538 if (numa_cpu != NR_CPUS)
539 return cpu_physical_id(numa_cpu);
540 }
541skip_numa_setup:
542#endif
543 /*
544 * Otherwise, round-robin interrupt vectors across all the
545 * processors. (It'd be nice if we could be smarter in the
546 * case of NUMA.)
547 */
548 do {
549 if (++cpu >= NR_CPUS)
550 cpu = 0;
551 } while (!cpu_online(cpu));
552
553 return cpu_physical_id(cpu);
554#else
555 return cpu_physical_id(smp_processor_id());
556#endif
557}
558
559/*
560 * ACPI can describe IOSAPIC interrupts via static tables and namespace
561 * methods. This provides an interface to register those interrupts and
562 * program the IOSAPIC RTE.
563 */
564int
565iosapic_register_intr (unsigned int gsi,
566 unsigned long polarity, unsigned long trigger)
567{
568 int vector;
569 unsigned int dest;
570 unsigned long flags;
571
572 /*
573 * If this GSI has already been registered (i.e., it's a
574 * shared interrupt, or we lost a race to register it),
575 * don't touch the RTE.
576 */
577 spin_lock_irqsave(&iosapic_lock, flags);
578 {
579 vector = gsi_to_vector(gsi);
580 if (vector > 0) {
581 iosapic_intr_info[vector].refcnt++;
582 spin_unlock_irqrestore(&iosapic_lock, flags);
583 return vector;
584 }
585
586 vector = assign_irq_vector(AUTO_ASSIGN);
587 dest = get_target_cpu(gsi, vector);
588 register_intr(gsi, vector, IOSAPIC_LOWEST_PRIORITY,
589 polarity, trigger);
590
591 set_rte(vector, dest, 1);
592 }
593 spin_unlock_irqrestore(&iosapic_lock, flags);
594
595 printk(KERN_INFO "GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d\n",
596 gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"),
597 (polarity == IOSAPIC_POL_HIGH ? "high" : "low"),
598 cpu_logical_id(dest), dest, vector);
599
600 return vector;
601}
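
A minimal usage sketch of the interface described above: the caller and the GSI value are hypothetical, and IOSAPIC_POL_LOW/IOSAPIC_LEVEL are assumed to come from <asm/iosapic.h> alongside the IOSAPIC_POL_HIGH/IOSAPIC_EDGE constants used in this file. The RTE starts out masked; it is unmasked when the irq is started up.

static int example_register_gsi(unsigned int gsi)
{
	int vector;

	/* program the RTE (masked at first) and pick a target CPU */
	vector = iosapic_register_intr(gsi, IOSAPIC_POL_LOW, IOSAPIC_LEVEL);

	/* the returned vector doubles as the irq number handed to request_irq() */
	return vector;
}
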
602
603#ifdef CONFIG_ACPI_DEALLOCATE_IRQ
604void
605iosapic_unregister_intr (unsigned int gsi)
606{
607 unsigned long flags;
608 int irq, vector;
609 irq_desc_t *idesc;
610 int rte_index;
611 unsigned long trigger, polarity;
612
613 /*
614 * If the irq associated with the gsi is not found,
615 * iosapic_unregister_intr() is unbalanced. We need to check
616 * this again after getting locks.
617 */
618 irq = gsi_to_irq(gsi);
619 if (irq < 0) {
620 printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n", gsi);
621 WARN_ON(1);
622 return;
623 }
624 vector = irq_to_vector(irq);
625
626 idesc = irq_descp(irq);
627 spin_lock_irqsave(&idesc->lock, flags);
628 spin_lock(&iosapic_lock);
629 {
630 rte_index = iosapic_intr_info[vector].rte_index;
631 if (rte_index < 0) {
632 spin_unlock(&iosapic_lock);
633 spin_unlock_irqrestore(&idesc->lock, flags);
634 printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n", gsi);
635 WARN_ON(1);
636 return;
637 }
638
639 if (--iosapic_intr_info[vector].refcnt > 0) {
640 spin_unlock(&iosapic_lock);
641 spin_unlock_irqrestore(&idesc->lock, flags);
642 return;
643 }
644
645 /*
646 * If interrupt handlers still exist on the irq
647 * associated with the gsi, don't unregister the
648 * interrupt.
649 */
650 if (idesc->action) {
651 iosapic_intr_info[vector].refcnt++;
652 spin_unlock(&iosapic_lock);
653 spin_unlock_irqrestore(&idesc->lock, flags);
654 printk(KERN_WARNING "Cannot unregister GSI. IRQ %u is still in use.\n", irq);
655 return;
656 }
657
658 /* Clear the interrupt controller descriptor. */
659 idesc->handler = &no_irq_type;
660
661 trigger = iosapic_intr_info[vector].trigger;
662 polarity = iosapic_intr_info[vector].polarity;
663
664 /* Clear the interrupt information. */
665 memset(&iosapic_intr_info[vector], 0, sizeof(struct iosapic_intr_info));
666 iosapic_intr_info[vector].rte_index = -1; /* mark as unused */
667 }
668 spin_unlock(&iosapic_lock);
669 spin_unlock_irqrestore(&idesc->lock, flags);
670
671 /* Free the interrupt vector */
672 free_irq_vector(vector);
673
674	printk(KERN_INFO "GSI %u (%s, %s) -> vector %d unregistered.\n",
675 gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"),
676 (polarity == IOSAPIC_POL_HIGH ? "high" : "low"),
677 vector);
678}
679#endif /* CONFIG_ACPI_DEALLOCATE_IRQ */
680
681/*
682 * ACPI calls this when it finds an entry for a platform interrupt.
683 * Note that the gsi_base and IOSAPIC address must be set in iosapic_init().
684 */
685int __init
686iosapic_register_platform_intr (u32 int_type, unsigned int gsi,
687 int iosapic_vector, u16 eid, u16 id,
688 unsigned long polarity, unsigned long trigger)
689{
690 static const char * const name[] = {"unknown", "PMI", "INIT", "CPEI"};
691 unsigned char delivery;
692 int vector, mask = 0;
693 unsigned int dest = ((id << 8) | eid) & 0xffff;
694
695 switch (int_type) {
696 case ACPI_INTERRUPT_PMI:
697 vector = iosapic_vector;
698 /*
699		 * Since the PMI vector is allocated by the firmware (ACPI), not by
700		 * the kernel, we need to make sure the vector is available.
701 */
702 iosapic_reassign_vector(vector);
703 delivery = IOSAPIC_PMI;
704 break;
705 case ACPI_INTERRUPT_INIT:
706 vector = assign_irq_vector(AUTO_ASSIGN);
707 delivery = IOSAPIC_INIT;
708 break;
709 case ACPI_INTERRUPT_CPEI:
710 vector = IA64_CPE_VECTOR;
711 delivery = IOSAPIC_LOWEST_PRIORITY;
712 mask = 1;
713 break;
714 default:
715		printk(KERN_ERR "iosapic_register_platform_intr(): invalid int type 0x%x\n", int_type);
716 return -1;
717 }
718
719 register_intr(gsi, vector, delivery, polarity, trigger);
720
721 printk(KERN_INFO "PLATFORM int %s (0x%x): GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d\n",
722 int_type < ARRAY_SIZE(name) ? name[int_type] : "unknown",
723 int_type, gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"),
724 (polarity == IOSAPIC_POL_HIGH ? "high" : "low"),
725 cpu_logical_id(dest), dest, vector);
726
727 set_rte(vector, dest, mask);
728 return vector;
729}
730
731
732/*
733 * ACPI calls this when it finds an entry for a legacy ISA IRQ override.
734 * Note that the gsi_base and IOSAPIC address must be set in iosapic_init().
735 */
736void __init
737iosapic_override_isa_irq (unsigned int isa_irq, unsigned int gsi,
738 unsigned long polarity,
739 unsigned long trigger)
740{
741 int vector;
742 unsigned int dest = cpu_physical_id(smp_processor_id());
743
744 vector = isa_irq_to_vector(isa_irq);
745
746 register_intr(gsi, vector, IOSAPIC_LOWEST_PRIORITY, polarity, trigger);
747
748 DBG("ISA: IRQ %u -> GSI %u (%s,%s) -> CPU %d (0x%04x) vector %d\n",
749 isa_irq, gsi, trigger == IOSAPIC_EDGE ? "edge" : "level",
750 polarity == IOSAPIC_POL_HIGH ? "high" : "low",
751 cpu_logical_id(dest), dest, vector);
752
753 set_rte(vector, dest, 1);
754}
755
756void __init
757iosapic_system_init (int system_pcat_compat)
758{
759 int vector;
760
761 for (vector = 0; vector < IA64_NUM_VECTORS; ++vector)
762 iosapic_intr_info[vector].rte_index = -1; /* mark as unused */
763
764 pcat_compat = system_pcat_compat;
765 if (pcat_compat) {
766 /*
767		 * Disable the compatibility mode interrupts (8259 style); this needs
768		 * IN/OUT support to be enabled.
769 */
770 printk(KERN_INFO "%s: Disabling PC-AT compatible 8259 interrupts\n", __FUNCTION__);
771 outb(0xff, 0xA1);
772 outb(0xff, 0x21);
773 }
774}
775
776void __init
777iosapic_init (unsigned long phys_addr, unsigned int gsi_base)
778{
779 int num_rte;
780 unsigned int isa_irq, ver;
781 char __iomem *addr;
782
783 addr = ioremap(phys_addr, 0);
784 ver = iosapic_version(addr);
785
786 /*
787 * The MAX_REDIR register holds the highest input pin
788 * number (starting from 0).
789 * We add 1 so that we can use it for number of pins (= RTEs)
790 */
791 num_rte = ((ver >> 16) & 0xff) + 1;
792
793 iosapic_lists[num_iosapic].addr = addr;
794 iosapic_lists[num_iosapic].gsi_base = gsi_base;
795 iosapic_lists[num_iosapic].num_rte = num_rte;
796#ifdef CONFIG_NUMA
797 iosapic_lists[num_iosapic].node = MAX_NUMNODES;
798#endif
799 num_iosapic++;
800
801 if ((gsi_base == 0) && pcat_compat) {
802 /*
803 * Map the legacy ISA devices into the IOSAPIC data. Some of these may
804 * get reprogrammed later on with data from the ACPI Interrupt Source
805 * Override table.
806 */
807 for (isa_irq = 0; isa_irq < 16; ++isa_irq)
808 iosapic_override_isa_irq(isa_irq, isa_irq, IOSAPIC_POL_HIGH, IOSAPIC_EDGE);
809 }
810}
811
812#ifdef CONFIG_NUMA
813void __init
814map_iosapic_to_node(unsigned int gsi_base, int node)
815{
816 int index;
817
818 index = find_iosapic(gsi_base);
819 if (index < 0) {
820 printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n",
821 __FUNCTION__, gsi_base);
822 return;
823 }
824 iosapic_lists[index].node = node;
825 return;
826}
827#endif
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
new file mode 100644
index 000000000000..28f2aadc38d0
--- /dev/null
+++ b/arch/ia64/kernel/irq.c
@@ -0,0 +1,238 @@
1/*
2 * linux/arch/ia64/kernel/irq.c
3 *
4 * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
5 *
6 * This file contains the code used by various IRQ handling routines:
7 * asking for different IRQ's should be done through these routines
8 * instead of just grabbing them. Thus setups with different IRQ numbers
9 * shouldn't result in any weird surprises, and installing new handlers
10 * should be easier.
11 *
12 * Copyright (C) Ashok Raj<ashok.raj@intel.com>, Intel Corporation 2004
13 *
14 * 4/14/2004: Added code to handle cpu migration and do safe irq
15 *			migration without losing interrupts for iosapic
16 * architecture.
17 */
18
19#include <asm/delay.h>
20#include <asm/uaccess.h>
21#include <linux/module.h>
22#include <linux/seq_file.h>
23#include <linux/interrupt.h>
24#include <linux/kernel_stat.h>
25
26/*
27 * 'What should we do if we get a hw irq event on an illegal vector?'
28 * Each architecture has to answer this for itself.
29 */
30void ack_bad_irq(unsigned int irq)
31{
32 printk(KERN_ERR "Unexpected irq vector 0x%x on CPU %u!\n", irq, smp_processor_id());
33}
34
35#ifdef CONFIG_IA64_GENERIC
36unsigned int __ia64_local_vector_to_irq (ia64_vector vec)
37{
38 return (unsigned int) vec;
39}
40#endif
41
42/*
43 * Interrupt statistics:
44 */
45
46atomic_t irq_err_count;
47
48/*
49 * /proc/interrupts printing:
50 */
51
52int show_interrupts(struct seq_file *p, void *v)
53{
54 int i = *(loff_t *) v, j;
55 struct irqaction * action;
56 unsigned long flags;
57
58 if (i == 0) {
59 seq_printf(p, " ");
60 for (j=0; j<NR_CPUS; j++)
61 if (cpu_online(j))
62 seq_printf(p, "CPU%d ",j);
63 seq_putc(p, '\n');
64 }
65
66 if (i < NR_IRQS) {
67 spin_lock_irqsave(&irq_desc[i].lock, flags);
68 action = irq_desc[i].action;
69 if (!action)
70 goto skip;
71 seq_printf(p, "%3d: ",i);
72#ifndef CONFIG_SMP
73 seq_printf(p, "%10u ", kstat_irqs(i));
74#else
75 for (j = 0; j < NR_CPUS; j++)
76 if (cpu_online(j))
77 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
78#endif
79 seq_printf(p, " %14s", irq_desc[i].handler->typename);
80 seq_printf(p, " %s", action->name);
81
82 for (action=action->next; action; action = action->next)
83 seq_printf(p, ", %s", action->name);
84
85 seq_putc(p, '\n');
86skip:
87 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
88 } else if (i == NR_IRQS)
89 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
90 return 0;
91}
92
93#ifdef CONFIG_SMP
94/*
95 * This is updated when the user sets irq affinity via /proc
96 */
97static cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
98static unsigned long pending_irq_redir[BITS_TO_LONGS(NR_IRQS)];
99
100static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 };
101
102/*
103 * Arch specific routine for deferred write to iosapic rte to reprogram
104 * intr destination.
105 */
106void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
107{
108 pending_irq_cpumask[irq] = mask_val;
109}
110
111void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
112{
113 cpumask_t mask = CPU_MASK_NONE;
114
115 cpu_set(cpu_logical_id(hwid), mask);
116
117 if (irq < NR_IRQS) {
118 irq_affinity[irq] = mask;
119 irq_redir[irq] = (char) (redir & 0xff);
120 }
121}
122
123
124void move_irq(int irq)
125{
126 /* note - we hold desc->lock */
127 cpumask_t tmp;
128 irq_desc_t *desc = irq_descp(irq);
129 int redir = test_bit(irq, pending_irq_redir);
130
131 if (unlikely(!desc->handler->set_affinity))
132 return;
133
134 if (!cpus_empty(pending_irq_cpumask[irq])) {
135 cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
136 if (unlikely(!cpus_empty(tmp))) {
137 desc->handler->set_affinity(irq | (redir ? IA64_IRQ_REDIRECTED : 0),
138 pending_irq_cpumask[irq]);
139 }
140 cpus_clear(pending_irq_cpumask[irq]);
141 }
142}
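
A minimal flow sketch of the deferred-write scheme above (the caller is hypothetical; the functions are the ones defined in this file): a /proc affinity write only records the request, and the RTE is actually rewritten later from the interrupt path, where move_irq() runs with desc->lock held (see iosapic_end_level_irq() and iosapic_ack_edge_irq() in iosapic.c).

static void example_deferred_affinity(unsigned int irq, cpumask_t new_mask)
{
	/* phase 1: remember where the user wants this irq to go */
	proc_set_irq_affinity(irq, new_mask);

	/*
	 * Phase 2 happens later: the next interrupt on this line calls
	 * move_irq(irq), which applies pending_irq_cpumask[irq] via
	 * desc->handler->set_affinity() and then clears the pending mask.
	 */
}
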
143
144
145#endif /* CONFIG_SMP */
146
147#ifdef CONFIG_HOTPLUG_CPU
148unsigned int vectors_in_migration[NR_IRQS];
149
150/*
151 * Since cpu_online_map is already updated, we just need to look for
152 * irqs whose affinity mask no longer contains any online CPU.
153 */
154static void migrate_irqs(void)
155{
156 cpumask_t mask;
157 irq_desc_t *desc;
158 int irq, new_cpu;
159
160 for (irq=0; irq < NR_IRQS; irq++) {
161 desc = irq_descp(irq);
162
163 /*
164		 * No handling for now.
165		 * TBD: Implement a disable function so we can
166		 * tell the CPU not to respond to these local intr sources,
167		 * such as ITV, CPEI, MCA, etc.
168 */
169 if (desc->status == IRQ_PER_CPU)
170 continue;
171
172 cpus_and(mask, irq_affinity[irq], cpu_online_map);
173 if (any_online_cpu(mask) == NR_CPUS) {
174 /*
175 * Save it for phase 2 processing
176 */
177 vectors_in_migration[irq] = irq;
178
179 new_cpu = any_online_cpu(cpu_online_map);
180 mask = cpumask_of_cpu(new_cpu);
181
182 /*
183			 * All three are essential, currently WARN_ON.. maybe panic?
184 */
185 if (desc->handler && desc->handler->disable &&
186 desc->handler->enable && desc->handler->set_affinity) {
187 desc->handler->disable(irq);
188 desc->handler->set_affinity(irq, mask);
189 desc->handler->enable(irq);
190 } else {
191 WARN_ON((!(desc->handler) || !(desc->handler->disable) ||
192 !(desc->handler->enable) ||
193 !(desc->handler->set_affinity)));
194 }
195 }
196 }
197}
198
199void fixup_irqs(void)
200{
201 unsigned int irq;
202 extern void ia64_process_pending_intr(void);
203
204 ia64_set_itv(1<<16);
205 /*
206 * Phase 1: Locate irq's bound to this cpu and
207 * relocate them for cpu removal.
208 */
209 migrate_irqs();
210
211 /*
212 * Phase 2: Perform interrupt processing for all entries reported in
213 * local APIC.
214 */
215 ia64_process_pending_intr();
216
217 /*
218 * Phase 3: Now handle any interrupts not captured in local APIC.
219	 * This accounts for cases where a device interrupted while its
220	 * RTE was being disabled and re-programmed.
221 */
222 for (irq=0; irq < NR_IRQS; irq++) {
223 if (vectors_in_migration[irq]) {
224 vectors_in_migration[irq]=0;
225 __do_IRQ(irq, NULL);
226 }
227 }
228
229 /*
230	 * Now let the processor die. We disable irqs and use max_xtp() to
231	 * ensure that no more interrupts are routed to this processor.
232	 * The local timer interrupt can still have one pending, which is
233	 * taken care of in timer_interrupt().
234 */
235 max_xtp();
236 local_irq_disable();
237}
238#endif
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
new file mode 100644
index 000000000000..5ba06ebe355b
--- /dev/null
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -0,0 +1,278 @@
1/*
2 * linux/arch/ia64/kernel/irq_ia64.c
3 *
4 * Copyright (C) 1998-2001 Hewlett-Packard Co
5 * Stephane Eranian <eranian@hpl.hp.com>
6 * David Mosberger-Tang <davidm@hpl.hp.com>
7 *
8 * 6/10/99: Updated to bring in sync with x86 version to facilitate
9 * support for SMP and different interrupt controllers.
10 *
11 * 09/15/00 Goutham Rao <goutham.rao@intel.com> Implemented pci_irq_to_vector
12 * PCI to vector allocation routine.
13 * 04/14/2004 Ashok Raj <ashok.raj@intel.com>
14 * Added CPU Hotplug handling for IPF.
15 */
16
17#include <linux/config.h>
18#include <linux/module.h>
19
20#include <linux/jiffies.h>
21#include <linux/errno.h>
22#include <linux/init.h>
23#include <linux/interrupt.h>
24#include <linux/ioport.h>
25#include <linux/kernel_stat.h>
26#include <linux/slab.h>
27#include <linux/ptrace.h>
28#include <linux/random.h> /* for rand_initialize_irq() */
29#include <linux/signal.h>
30#include <linux/smp.h>
31#include <linux/smp_lock.h>
32#include <linux/threads.h>
33#include <linux/bitops.h>
34
35#include <asm/delay.h>
36#include <asm/intrinsics.h>
37#include <asm/io.h>
38#include <asm/hw_irq.h>
39#include <asm/machvec.h>
40#include <asm/pgtable.h>
41#include <asm/system.h>
42
43#ifdef CONFIG_PERFMON
44# include <asm/perfmon.h>
45#endif
46
47#define IRQ_DEBUG 0
48
49/* default base addr of IPI table */
50void __iomem *ipi_base_addr = ((void __iomem *)
51 (__IA64_UNCACHED_OFFSET | IA64_IPI_DEFAULT_BASE_ADDR));
52
53/*
54 * Legacy IRQ to IA-64 vector translation table.
55 */
56__u8 isa_irq_to_vector_map[16] = {
57 /* 8259 IRQ translation, first 16 entries */
58 0x2f, 0x20, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29,
59 0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21
60};
61EXPORT_SYMBOL(isa_irq_to_vector_map);
62
63static unsigned long ia64_vector_mask[BITS_TO_LONGS(IA64_NUM_DEVICE_VECTORS)];
64
65int
66assign_irq_vector (int irq)
67{
68 int pos, vector;
69 again:
70 pos = find_first_zero_bit(ia64_vector_mask, IA64_NUM_DEVICE_VECTORS);
71 vector = IA64_FIRST_DEVICE_VECTOR + pos;
72 if (vector > IA64_LAST_DEVICE_VECTOR)
73 /* XXX could look for sharable vectors instead of panic'ing... */
74 panic("assign_irq_vector: out of interrupt vectors!");
75 if (test_and_set_bit(pos, ia64_vector_mask))
76 goto again;
77 return vector;
78}
79
80void
81free_irq_vector (int vector)
82{
83 int pos;
84
85 if (vector < IA64_FIRST_DEVICE_VECTOR || vector > IA64_LAST_DEVICE_VECTOR)
86 return;
87
88 pos = vector - IA64_FIRST_DEVICE_VECTOR;
89 if (!test_and_clear_bit(pos, ia64_vector_mask))
90 printk(KERN_WARNING "%s: double free!\n", __FUNCTION__);
91}
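
A minimal usage sketch of the bitmap allocator above (the caller is hypothetical; AUTO_ASSIGN is the same value the IOSAPIC code passes, since the irq argument is not otherwise used):

static int example_vector_lifetime(void)
{
	/* grab the first free device vector (panics if none are left) */
	int vector = assign_irq_vector(AUTO_ASSIGN);

	/* ... program an IOSAPIC RTE for it, request_irq(), use it ... */

	/* give the vector back so the bit can be reused */
	free_irq_vector(vector);
	return 0;
}
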
92
93#ifdef CONFIG_SMP
94# define IS_RESCHEDULE(vec) (vec == IA64_IPI_RESCHEDULE)
95#else
96# define IS_RESCHEDULE(vec) (0)
97#endif
98/*
99 * This is where the IVT branches when we get an external
100 * interrupt. It dispatches to the correct hardware IRQ handler via a
101 * function pointer.
102 */
103void
104ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
105{
106 unsigned long saved_tpr;
107
108#if IRQ_DEBUG
109 {
110 unsigned long bsp, sp;
111
112 /*
113 * Note: if the interrupt happened while executing in
114 * the context switch routine (ia64_switch_to), we may
115 * get a spurious stack overflow here. This is
116 * because the register and the memory stack are not
117 * switched atomically.
118 */
119 bsp = ia64_getreg(_IA64_REG_AR_BSP);
120 sp = ia64_getreg(_IA64_REG_SP);
121
122 if ((sp - bsp) < 1024) {
123 static unsigned char count;
124 static long last_time;
125
126 if (jiffies - last_time > 5*HZ)
127 count = 0;
128 if (++count < 5) {
129 last_time = jiffies;
130 printk("ia64_handle_irq: DANGER: less than "
131 "1KB of free stack space!!\n"
132 "(bsp=0x%lx, sp=%lx)\n", bsp, sp);
133 }
134 }
135 }
136#endif /* IRQ_DEBUG */
137
138 /*
139 * Always set TPR to limit maximum interrupt nesting depth to
140 * 16 (without this, it would be ~240, which could easily lead
141 * to kernel stack overflows).
142 */
143 irq_enter();
144 saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
145 ia64_srlz_d();
146 while (vector != IA64_SPURIOUS_INT_VECTOR) {
147 if (!IS_RESCHEDULE(vector)) {
148 ia64_setreg(_IA64_REG_CR_TPR, vector);
149 ia64_srlz_d();
150
151 __do_IRQ(local_vector_to_irq(vector), regs);
152
153 /*
154 * Disable interrupts and send EOI:
155 */
156 local_irq_disable();
157 ia64_setreg(_IA64_REG_CR_TPR, saved_tpr);
158 }
159 ia64_eoi();
160 vector = ia64_get_ivr();
161 }
162 /*
163 * This must be done *after* the ia64_eoi(). For example, the keyboard softirq
164 * handler needs to be able to wait for further keyboard interrupts, which can't
165 * come through until ia64_eoi() has been done.
166 */
167 irq_exit();
168}
169
170#ifdef CONFIG_HOTPLUG_CPU
171/*
172 * This function emulates interrupt processing when a cpu is about to be
173 * brought down.
174 */
175void ia64_process_pending_intr(void)
176{
177 ia64_vector vector;
178 unsigned long saved_tpr;
179 extern unsigned int vectors_in_migration[NR_IRQS];
180
181 vector = ia64_get_ivr();
182
183 irq_enter();
184 saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
185 ia64_srlz_d();
186
187 /*
188 * Perform normal interrupt style processing
189 */
190 while (vector != IA64_SPURIOUS_INT_VECTOR) {
191 if (!IS_RESCHEDULE(vector)) {
192 ia64_setreg(_IA64_REG_CR_TPR, vector);
193 ia64_srlz_d();
194
195 /*
196			 * Now try calling normal ia64_handle_irq as it would have been
197			 * called from a real interrupt handler. We pass NULL for pt_regs
198			 * and hope that is sufficient here.
199			 * This could probably share code with ia64_handle_irq().
200 */
201 vectors_in_migration[local_vector_to_irq(vector)]=0;
202 __do_IRQ(local_vector_to_irq(vector), NULL);
203
204 /*
205 * Disable interrupts and send EOI
206 */
207 local_irq_disable();
208 ia64_setreg(_IA64_REG_CR_TPR, saved_tpr);
209 }
210 ia64_eoi();
211 vector = ia64_get_ivr();
212 }
213 irq_exit();
214}
215#endif
216
217
218#ifdef CONFIG_SMP
219extern irqreturn_t handle_IPI (int irq, void *dev_id, struct pt_regs *regs);
220
221static struct irqaction ipi_irqaction = {
222 .handler = handle_IPI,
223 .flags = SA_INTERRUPT,
224 .name = "IPI"
225};
226#endif
227
228void
229register_percpu_irq (ia64_vector vec, struct irqaction *action)
230{
231 irq_desc_t *desc;
232 unsigned int irq;
233
234 for (irq = 0; irq < NR_IRQS; ++irq)
235 if (irq_to_vector(irq) == vec) {
236 desc = irq_descp(irq);
237 desc->status |= IRQ_PER_CPU;
238 desc->handler = &irq_type_ia64_lsapic;
239 if (action)
240 setup_irq(irq, action);
241 }
242}
243
244void __init
245init_IRQ (void)
246{
247 register_percpu_irq(IA64_SPURIOUS_INT_VECTOR, NULL);
248#ifdef CONFIG_SMP
249 register_percpu_irq(IA64_IPI_VECTOR, &ipi_irqaction);
250#endif
251#ifdef CONFIG_PERFMON
252 pfm_init_percpu();
253#endif
254 platform_irq_init();
255}
256
257void
258ia64_send_ipi (int cpu, int vector, int delivery_mode, int redirect)
259{
260 void __iomem *ipi_addr;
261 unsigned long ipi_data;
262 unsigned long phys_cpu_id;
263
264#ifdef CONFIG_SMP
265 phys_cpu_id = cpu_physical_id(cpu);
266#else
267 phys_cpu_id = (ia64_getreg(_IA64_REG_CR_LID) >> 16) & 0xffff;
268#endif
269
270 /*
271	 * The physical cpu number is encoded as an 8-bit ID and an 8-bit EID.
272 */
273
274 ipi_data = (delivery_mode << 8) | (vector & 0xff);
275 ipi_addr = ipi_base_addr + ((phys_cpu_id << 4) | ((redirect & 1) << 3));
276
277 writeq(ipi_data, ipi_addr);
278}
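
An illustrative restatement (not part of the patch; the helper names are hypothetical) of the processor-interrupt-block addressing used by ia64_send_ipi() above: the target's 16-bit physical id (8-bit ID, 8-bit EID) selects a 16-byte slot within the IPI block, bit 3 requests redirection, and the 64-bit data word carries the delivery mode and vector.

static inline void __iomem *ipi_slot_sketch(unsigned long phys_cpu_id, int redirect)
{
	/* each processor owns a 16-byte slot; bit 3 selects the redirection variant */
	return ipi_base_addr + ((phys_cpu_id << 4) | ((redirect & 1) << 3));
}

static inline unsigned long ipi_data_sketch(int delivery_mode, int vector)
{
	/* the vector occupies the low byte, the delivery mode sits just above it */
	return ((unsigned long) delivery_mode << 8) | (vector & 0xff);
}
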
diff --git a/arch/ia64/kernel/irq_lsapic.c b/arch/ia64/kernel/irq_lsapic.c
new file mode 100644
index 000000000000..ea14e6a04409
--- /dev/null
+++ b/arch/ia64/kernel/irq_lsapic.c
@@ -0,0 +1,37 @@
1/*
2 * LSAPIC Interrupt Controller
3 *
4 * This takes care of interrupts that are generated by the CPU's
5 * internal Streamlined Advanced Programmable Interrupt Controller
6 * (LSAPIC), such as the ITC and IPI interrupts.
7 *
8 * Copyright (C) 1999 VA Linux Systems
9 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
10 * Copyright (C) 2000 Hewlett-Packard Co
11 * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
12 */
13
14#include <linux/sched.h>
15#include <linux/irq.h>
16
17static unsigned int
18lsapic_noop_startup (unsigned int irq)
19{
20 return 0;
21}
22
23static void
24lsapic_noop (unsigned int irq)
25{
26	/* nothing to do... */
27}
28
29struct hw_interrupt_type irq_type_ia64_lsapic = {
30 .typename = "LSAPIC",
31 .startup = lsapic_noop_startup,
32 .shutdown = lsapic_noop,
33 .enable = lsapic_noop,
34 .disable = lsapic_noop,
35 .ack = lsapic_noop,
36 .end = lsapic_noop
37};
diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S
new file mode 100644
index 000000000000..d9c05d53435b
--- /dev/null
+++ b/arch/ia64/kernel/ivt.S
@@ -0,0 +1,1619 @@
1/*
2 * arch/ia64/kernel/ivt.S
3 *
4 * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co
5 * Stephane Eranian <eranian@hpl.hp.com>
6 * David Mosberger <davidm@hpl.hp.com>
7 * Copyright (C) 2000, 2002-2003 Intel Co
8 * Asit Mallick <asit.k.mallick@intel.com>
9 * Suresh Siddha <suresh.b.siddha@intel.com>
10 * Kenneth Chen <kenneth.w.chen@intel.com>
11 * Fenghua Yu <fenghua.yu@intel.com>
12 *
13 * 00/08/23 Asit Mallick <asit.k.mallick@intel.com> TLB handling for SMP
14 * 00/12/20 David Mosberger-Tang <davidm@hpl.hp.com> DTLB/ITLB handler now uses virtual PT.
15 */
16/*
17 * This file defines the interruption vector table used by the CPU.
18 * It does not include one entry per possible cause of interruption.
19 *
20 * The first 20 entries of the table contain 64 bundles each while the
21 * remaining 48 entries contain only 16 bundles each.
22 *
23 * The 64 bundles are used to allow inlining the whole handler for critical
24 * interruptions like TLB misses.
25 *
26 * For each entry, the comment is as follows:
27 *
28 * // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
29 * entry offset ----/ / / / /
30 * entry number ---------/ / / /
31 * size of the entry -------------/ / /
32 * vector name -------------------------------------/ /
33 * interruptions triggering this vector ----------------------/
34 *
35 * The table is 32KB in size and must be aligned on 32KB boundary.
36 * (The CPU ignores the 15 lower bits of the address)
37 *
38 * Table is based upon EAS2.6 (Oct 1999)
39 */
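
A quick worked check of the layout described above (illustrative arithmetic only, not kernel code): an IA-64 bundle is 16 bytes, so the two entry sizes, the 0x400-byte ".org" strides of the early entries, and the 32KB total all follow directly.

enum {
	IVT_BUNDLE_BYTES = 16,
	IVT_BIG_ENTRY    = 64 * IVT_BUNDLE_BYTES,			/* 0x400: stride of the first 20 entries */
	IVT_SMALL_ENTRY  = 16 * IVT_BUNDLE_BYTES,			/* 0x100: stride of the remaining 48 entries */
	IVT_TOTAL        = 20 * IVT_BIG_ENTRY + 48 * IVT_SMALL_ENTRY,	/* 0x8000 = 32KB, as stated above */
};
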
40
41#include <linux/config.h>
42
43#include <asm/asmmacro.h>
44#include <asm/break.h>
45#include <asm/ia32.h>
46#include <asm/kregs.h>
47#include <asm/offsets.h>
48#include <asm/pgtable.h>
49#include <asm/processor.h>
50#include <asm/ptrace.h>
51#include <asm/system.h>
52#include <asm/thread_info.h>
53#include <asm/unistd.h>
54#include <asm/errno.h>
55
56#if 1
57# define PSR_DEFAULT_BITS psr.ac
58#else
59# define PSR_DEFAULT_BITS 0
60#endif
61
62#if 0
63 /*
64 * This lets you track the last eight faults that occurred on the CPU. Make sure ar.k2 isn't
65 * needed for something else before enabling this...
66 */
67# define DBG_FAULT(i) mov r16=ar.k2;; shl r16=r16,8;; add r16=(i),r16;;mov ar.k2=r16
68#else
69# define DBG_FAULT(i)
70#endif
71
72#define MINSTATE_VIRT /* needed by minstate.h */
73#include "minstate.h"
74
75#define FAULT(n) \
76 mov r31=pr; \
77 mov r19=n;; /* prepare to save predicates */ \
78 br.sptk.many dispatch_to_fault_handler
79
80 .section .text.ivt,"ax"
81
82 .align 32768 // align on 32KB boundary
83 .global ia64_ivt
84ia64_ivt:
85/////////////////////////////////////////////////////////////////////////////////////////
86// 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47)
87ENTRY(vhpt_miss)
88 DBG_FAULT(0)
89 /*
90 * The VHPT vector is invoked when the TLB entry for the virtual page table
91 * is missing. This happens only as a result of a previous
92 * (the "original") TLB miss, which may either be caused by an instruction
93 * fetch or a data access (or non-access).
94 *
95	 * What we do here is normal TLB miss handling for the _original_ miss, followed
96 * by inserting the TLB entry for the virtual page table page that the VHPT
97 * walker was attempting to access. The latter gets inserted as long
98 * as both L1 and L2 have valid mappings for the faulting address.
99 * The TLB entry for the original miss gets inserted only if
100 * the L3 entry indicates that the page is present.
101 *
102 * do_page_fault gets invoked in the following cases:
103 * - the faulting virtual address uses unimplemented address bits
104 * - the faulting virtual address has no L1, L2, or L3 mapping
105 */
106 mov r16=cr.ifa // get address that caused the TLB miss
107#ifdef CONFIG_HUGETLB_PAGE
108 movl r18=PAGE_SHIFT
109 mov r25=cr.itir
110#endif
111 ;;
112 rsm psr.dt // use physical addressing for data
113 mov r31=pr // save the predicate registers
114 mov r19=IA64_KR(PT_BASE) // get page table base address
115 shl r21=r16,3 // shift bit 60 into sign bit
116 shr.u r17=r16,61 // get the region number into r17
117 ;;
118 shr r22=r21,3
119#ifdef CONFIG_HUGETLB_PAGE
120 extr.u r26=r25,2,6
121 ;;
122 cmp.ne p8,p0=r18,r26
123 sub r27=r26,r18
124 ;;
125(p8) dep r25=r18,r25,2,6
126(p8) shr r22=r22,r27
127#endif
128 ;;
129	cmp.eq p6,p7=5,r17			// is IFA pointing into region 5?
130 shr.u r18=r22,PGDIR_SHIFT // get bits 33-63 of the faulting address
131 ;;
132(p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place
133
134 srlz.d
135 LOAD_PHYSICAL(p6, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir
136
137 .pred.rel "mutex", p6, p7
138(p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
139(p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
140 ;;
141(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8
142(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
143 cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
144 shr.u r18=r22,PMD_SHIFT // shift L2 index into position
145 ;;
146 ld8 r17=[r17] // fetch the L1 entry (may be 0)
147 ;;
148(p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL?
149 dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry
150 ;;
151(p7) ld8 r20=[r17] // fetch the L2 entry (may be 0)
152 shr.u r19=r22,PAGE_SHIFT // shift L3 index into position
153 ;;
154(p7) cmp.eq.or.andcm p6,p7=r20,r0 // was L2 entry NULL?
155 dep r21=r19,r20,3,(PAGE_SHIFT-3) // compute address of L3 page table entry
156 ;;
157(p7) ld8 r18=[r21] // read the L3 PTE
158 mov r19=cr.isr // cr.isr bit 0 tells us if this is an insn miss
159 ;;
160(p7) tbit.z p6,p7=r18,_PAGE_P_BIT // page present bit cleared?
161 mov r22=cr.iha // get the VHPT address that caused the TLB miss
162 ;; // avoid RAW on p7
163(p7) tbit.nz.unc p10,p11=r19,32 // is it an instruction TLB miss?
164 dep r23=0,r20,0,PAGE_SHIFT // clear low bits to get page address
165 ;;
166(p10) itc.i r18 // insert the instruction TLB entry
167(p11) itc.d r18 // insert the data TLB entry
168(p6) br.cond.spnt.many page_fault // handle bad address/page not present (page fault)
169 mov cr.ifa=r22
170
171#ifdef CONFIG_HUGETLB_PAGE
172(p8) mov cr.itir=r25 // change to default page-size for VHPT
173#endif
174
175 /*
176 * Now compute and insert the TLB entry for the virtual page table. We never
177 * execute in a page table page so there is no need to set the exception deferral
178 * bit.
179 */
180 adds r24=__DIRTY_BITS_NO_ED|_PAGE_PL_0|_PAGE_AR_RW,r23
181 ;;
182(p7) itc.d r24
183 ;;
184#ifdef CONFIG_SMP
185 /*
186	 * Tell the assembler's dependency-violation checker that the above "itc" instructions
187 * cannot possibly affect the following loads:
188 */
189 dv_serialize_data
190
191 /*
192 * Re-check L2 and L3 pagetable. If they changed, we may have received a ptc.g
193 * between reading the pagetable and the "itc". If so, flush the entry we
194 * inserted and retry.
195 */
196 ld8 r25=[r21] // read L3 PTE again
197 ld8 r26=[r17] // read L2 entry again
198 ;;
199 cmp.ne p6,p7=r26,r20 // did L2 entry change
200 mov r27=PAGE_SHIFT<<2
201 ;;
202(p6) ptc.l r22,r27 // purge PTE page translation
203(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did L3 PTE change
204 ;;
205(p6) ptc.l r16,r27 // purge translation
206#endif
207
208 mov pr=r31,-1 // restore predicate registers
209 rfi
210END(vhpt_miss)
211
212 .org ia64_ivt+0x400
213/////////////////////////////////////////////////////////////////////////////////////////
214// 0x0400 Entry 1 (size 64 bundles) ITLB (21)
215ENTRY(itlb_miss)
216 DBG_FAULT(1)
217 /*
218 * The ITLB handler accesses the L3 PTE via the virtually mapped linear
219 * page table. If a nested TLB miss occurs, we switch into physical
220 * mode, walk the page table, and then re-execute the L3 PTE read
221 * and go on normally after that.
222 */
223 mov r16=cr.ifa // get virtual address
224 mov r29=b0 // save b0
225 mov r31=pr // save predicates
226.itlb_fault:
227 mov r17=cr.iha // get virtual address of L3 PTE
228 movl r30=1f // load nested fault continuation point
229 ;;
2301: ld8 r18=[r17] // read L3 PTE
231 ;;
232 mov b0=r29
233 tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared?
234(p6) br.cond.spnt page_fault
235 ;;
236 itc.i r18
237 ;;
238#ifdef CONFIG_SMP
239 /*
240	 * Tell the assembler's dependency-violation checker that the above "itc" instructions
241 * cannot possibly affect the following loads:
242 */
243 dv_serialize_data
244
245 ld8 r19=[r17] // read L3 PTE again and see if same
246 mov r20=PAGE_SHIFT<<2 // setup page size for purge
247 ;;
248 cmp.ne p7,p0=r18,r19
249 ;;
250(p7) ptc.l r16,r20
251#endif
252 mov pr=r31,-1
253 rfi
254END(itlb_miss)
255
256 .org ia64_ivt+0x0800
257/////////////////////////////////////////////////////////////////////////////////////////
258// 0x0800 Entry 2 (size 64 bundles) DTLB (9,48)
259ENTRY(dtlb_miss)
260 DBG_FAULT(2)
261 /*
262 * The DTLB handler accesses the L3 PTE via the virtually mapped linear
263 * page table. If a nested TLB miss occurs, we switch into physical
264 * mode, walk the page table, and then re-execute the L3 PTE read
265 * and go on normally after that.
266 */
267 mov r16=cr.ifa // get virtual address
268 mov r29=b0 // save b0
269 mov r31=pr // save predicates
270dtlb_fault:
271 mov r17=cr.iha // get virtual address of L3 PTE
272 movl r30=1f // load nested fault continuation point
273 ;;
2741: ld8 r18=[r17] // read L3 PTE
275 ;;
276 mov b0=r29
277 tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared?
278(p6) br.cond.spnt page_fault
279 ;;
280 itc.d r18
281 ;;
282#ifdef CONFIG_SMP
283 /*
284	 * Tell the assembler's dependency-violation checker that the above "itc" instructions
285 * cannot possibly affect the following loads:
286 */
287 dv_serialize_data
288
289 ld8 r19=[r17] // read L3 PTE again and see if same
290 mov r20=PAGE_SHIFT<<2 // setup page size for purge
291 ;;
292 cmp.ne p7,p0=r18,r19
293 ;;
294(p7) ptc.l r16,r20
295#endif
296 mov pr=r31,-1
297 rfi
298END(dtlb_miss)
299
300 .org ia64_ivt+0x0c00
301/////////////////////////////////////////////////////////////////////////////////////////
302// 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19)
303ENTRY(alt_itlb_miss)
304 DBG_FAULT(3)
305 mov r16=cr.ifa // get address that caused the TLB miss
306 movl r17=PAGE_KERNEL
307 mov r21=cr.ipsr
308 movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
309 mov r31=pr
310 ;;
311#ifdef CONFIG_DISABLE_VHPT
312	shr.u r22=r16,61			// get the region number into r22
313 ;;
314 cmp.gt p8,p0=6,r22 // user mode
315 ;;
316(p8) thash r17=r16
317 ;;
318(p8) mov cr.iha=r17
319(p8) mov r29=b0 // save b0
320(p8) br.cond.dptk .itlb_fault
321#endif
322 extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl
323 and r19=r19,r16 // clear ed, reserved bits, and PTE control bits
324 shr.u r18=r16,57 // move address bit 61 to bit 4
325 ;;
326 andcm r18=0x10,r18 // bit 4=~address-bit(61)
327 cmp.ne p8,p0=r0,r23 // psr.cpl != 0?
328 or r19=r17,r19 // insert PTE control bits into r19
329 ;;
330 or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6
331(p8) br.cond.spnt page_fault
332 ;;
333 itc.i r19 // insert the TLB entry
334 mov pr=r31,-1
335 rfi
336END(alt_itlb_miss)
337
338 .org ia64_ivt+0x1000
339/////////////////////////////////////////////////////////////////////////////////////////
340// 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46)
341ENTRY(alt_dtlb_miss)
342 DBG_FAULT(4)
343 mov r16=cr.ifa // get address that caused the TLB miss
344 movl r17=PAGE_KERNEL
345 mov r20=cr.isr
346 movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
347 mov r21=cr.ipsr
348 mov r31=pr
349 ;;
350#ifdef CONFIG_DISABLE_VHPT
351	shr.u r22=r16,61			// get the region number into r22
352 ;;
353 cmp.gt p8,p0=6,r22 // access to region 0-5
354 ;;
355(p8) thash r17=r16
356 ;;
357(p8) mov cr.iha=r17
358(p8) mov r29=b0 // save b0
359(p8) br.cond.dptk dtlb_fault
360#endif
361 extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl
362 and r22=IA64_ISR_CODE_MASK,r20 // get the isr.code field
363 tbit.nz p6,p7=r20,IA64_ISR_SP_BIT // is speculation bit on?
364 shr.u r18=r16,57 // move address bit 61 to bit 4
365 and r19=r19,r16 // clear ed, reserved bits, and PTE control bits
366 tbit.nz p9,p0=r20,IA64_ISR_NA_BIT // is non-access bit on?
367 ;;
368 andcm r18=0x10,r18 // bit 4=~address-bit(61)
369 cmp.ne p8,p0=r0,r23
370(p9) cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22 // check isr.code field
371(p8) br.cond.spnt page_fault
372
373 dep r21=-1,r21,IA64_PSR_ED_BIT,1
374 or r19=r19,r17 // insert PTE control bits into r19
375 ;;
376 or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6
377(p6) mov cr.ipsr=r21
378 ;;
379(p7) itc.d r19 // insert the TLB entry
380 mov pr=r31,-1
381 rfi
382END(alt_dtlb_miss)
383
384 .org ia64_ivt+0x1400
385/////////////////////////////////////////////////////////////////////////////////////////
386// 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45)
387ENTRY(nested_dtlb_miss)
388 /*
389 * In the absence of kernel bugs, we get here when the virtually mapped linear
390 * page table is accessed non-speculatively (e.g., in the Dirty-bit, Instruction
391 * Access-bit, or Data Access-bit faults). If the DTLB entry for the virtual page
392 * table is missing, a nested TLB miss fault is triggered and control is
393 * transferred to this point. When this happens, we lookup the pte for the
394 * faulting address by walking the page table in physical mode and return to the
395 * continuation point passed in register r30 (or call page_fault if the address is
396 * not mapped).
397 *
398 * Input: r16: faulting address
399 * r29: saved b0
400 * r30: continuation address
401 * r31: saved pr
402 *
403 * Output: r17: physical address of L3 PTE of faulting address
404 * r29: saved b0
405 * r30: continuation address
406 * r31: saved pr
407 *
408 * Clobbered: b0, r18, r19, r21, psr.dt (cleared)
409 */
410 rsm psr.dt // switch to using physical data addressing
411 mov r19=IA64_KR(PT_BASE) // get the page table base address
412 shl r21=r16,3 // shift bit 60 into sign bit
413 ;;
414 shr.u r17=r16,61 // get the region number into r17
415 ;;
416 cmp.eq p6,p7=5,r17 // is faulting address in region 5?
417 shr.u r18=r16,PGDIR_SHIFT // get bits 33-63 of faulting address
418 ;;
419(p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place
420
421 srlz.d
422 LOAD_PHYSICAL(p6, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir
423
424 .pred.rel "mutex", p6, p7
425(p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
426(p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
427 ;;
428(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8
429(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
430 cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
431 shr.u r18=r16,PMD_SHIFT // shift L2 index into position
432 ;;
433 ld8 r17=[r17] // fetch the L1 entry (may be 0)
434 ;;
435(p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL?
436 dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry
437 ;;
438(p7) ld8 r17=[r17] // fetch the L2 entry (may be 0)
439 shr.u r19=r16,PAGE_SHIFT // shift L3 index into position
440 ;;
441(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was L2 entry NULL?
442 dep r17=r19,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry
443(p6) br.cond.spnt page_fault
444 mov b0=r30
445 br.sptk.many b0 // return to continuation point
446END(nested_dtlb_miss)
447
448 .org ia64_ivt+0x1800
449/////////////////////////////////////////////////////////////////////////////////////////
450// 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24)
451ENTRY(ikey_miss)
452 DBG_FAULT(6)
453 FAULT(6)
454END(ikey_miss)
455
456 //-----------------------------------------------------------------------------------
457 // call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address)
458ENTRY(page_fault)
459 ssm psr.dt
460 ;;
461 srlz.i
462 ;;
463 SAVE_MIN_WITH_COVER
464 alloc r15=ar.pfs,0,0,3,0
465 mov out0=cr.ifa
466 mov out1=cr.isr
467 adds r3=8,r2 // set up second base pointer
468 ;;
469 ssm psr.ic | PSR_DEFAULT_BITS
470 ;;
471	srlz.i					// guarantee that interruption collection is on
472 ;;
473(p15) ssm psr.i // restore psr.i
474 movl r14=ia64_leave_kernel
475 ;;
476 SAVE_REST
477 mov rp=r14
478 ;;
479 adds out2=16,r12 // out2 = pointer to pt_regs
480 br.call.sptk.many b6=ia64_do_page_fault // ignore return address
481END(page_fault)
482
483 .org ia64_ivt+0x1c00
484/////////////////////////////////////////////////////////////////////////////////////////
485// 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
486ENTRY(dkey_miss)
487 DBG_FAULT(7)
488 FAULT(7)
489END(dkey_miss)
490
491 .org ia64_ivt+0x2000
492/////////////////////////////////////////////////////////////////////////////////////////
493// 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54)
494ENTRY(dirty_bit)
495 DBG_FAULT(8)
496 /*
497 * What we do here is to simply turn on the dirty bit in the PTE. We need to
498 * update both the page-table and the TLB entry. To efficiently access the PTE,
499 * we address it through the virtual page table. Most likely, the TLB entry for
500 * the relevant virtual page table page is still present in the TLB so we can
501 * normally do this without additional TLB misses. In case the necessary virtual
502 * page table TLB entry isn't present, we take a nested TLB miss hit where we look
503 * up the physical address of the L3 PTE and then continue at label 1 below.
504 */
505 mov r16=cr.ifa // get the address that caused the fault
506 movl r30=1f // load continuation point in case of nested fault
507 ;;
508 thash r17=r16 // compute virtual address of L3 PTE
509 mov r29=b0 // save b0 in case of nested fault
510 mov r31=pr // save pr
511#ifdef CONFIG_SMP
512 mov r28=ar.ccv // save ar.ccv
513 ;;
5141: ld8 r18=[r17]
515 ;; // avoid RAW on r18
516 mov ar.ccv=r18 // set compare value for cmpxchg
517 or r25=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits
518 ;;
519 cmpxchg8.acq r26=[r17],r25,ar.ccv
520 mov r24=PAGE_SHIFT<<2
521 ;;
522 cmp.eq p6,p7=r26,r18
523 ;;
524(p6) itc.d r25 // install updated PTE
525 ;;
526 /*
527	 * Tell the assembler's dependency-violation checker that the above "itc" instructions
528 * cannot possibly affect the following loads:
529 */
530 dv_serialize_data
531
532 ld8 r18=[r17] // read PTE again
533 ;;
534 cmp.eq p6,p7=r18,r25 // is it same as the newly installed
535 ;;
536(p7) ptc.l r16,r24
537 mov b0=r29 // restore b0
538 mov ar.ccv=r28
539#else
540 ;;
5411: ld8 r18=[r17]
542 ;; // avoid RAW on r18
543 or r18=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits
544 mov b0=r29 // restore b0
545 ;;
546 st8 [r17]=r18 // store back updated PTE
547 itc.d r18 // install updated PTE
548#endif
549 mov pr=r31,-1 // restore pr
550 rfi
551END(dirty_bit)
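
The SMP path of dirty_bit above is easier to follow as C. The sketch below is illustrative only: cmpxchg() mirrors the cmpxchg8.acq/ar.ccv pair, while itc_d() and ptc_l() are hypothetical stand-ins for the itc.d and ptc.l instructions (they are not real kernel helpers).

static void dirty_bit_sketch(unsigned long *pte, unsigned long vaddr)
{
	unsigned long old_pte = *pte;				/* ld8 r18=[r17] */
	unsigned long new_pte = old_pte | _PAGE_D | _PAGE_A;	/* set dirty + accessed bits */

	if (cmpxchg(pte, old_pte, new_pte) == old_pte)		/* cmpxchg8.acq against ar.ccv */
		itc_d(new_pte);					/* install the updated PTE in the TLB */

	if (*pte != new_pte)					/* PTE changed under us (e.g. ptc.g)? */
		ptc_l(vaddr, PAGE_SHIFT << 2);			/* purge the possibly stale translation */
}
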
552
553 .org ia64_ivt+0x2400
554/////////////////////////////////////////////////////////////////////////////////////////
555// 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27)
556ENTRY(iaccess_bit)
557 DBG_FAULT(9)
558 // Like Entry 8, except for instruction access
559 mov r16=cr.ifa // get the address that caused the fault
560 movl r30=1f // load continuation point in case of nested fault
561 mov r31=pr // save predicates
562#ifdef CONFIG_ITANIUM
563 /*
564 * Erratum 10 (IFA may contain incorrect address) has "NoFix" status.
565 */
566 mov r17=cr.ipsr
567 ;;
568 mov r18=cr.iip
569 tbit.z p6,p0=r17,IA64_PSR_IS_BIT // IA64 instruction set?
570 ;;
571(p6) mov r16=r18 // if so, use cr.iip instead of cr.ifa
572#endif /* CONFIG_ITANIUM */
573 ;;
574 thash r17=r16 // compute virtual address of L3 PTE
575	mov r29=b0				// save b0 in case of nested fault
576#ifdef CONFIG_SMP
577 mov r28=ar.ccv // save ar.ccv
578 ;;
5791: ld8 r18=[r17]
580 ;;
581 mov ar.ccv=r18 // set compare value for cmpxchg
582 or r25=_PAGE_A,r18 // set the accessed bit
583 ;;
584 cmpxchg8.acq r26=[r17],r25,ar.ccv
585 mov r24=PAGE_SHIFT<<2
586 ;;
587 cmp.eq p6,p7=r26,r18
588 ;;
589(p6) itc.i r25 // install updated PTE
590 ;;
591 /*
592	 * Tell the assembler's dependency-violation checker that the above "itc" instructions
593 * cannot possibly affect the following loads:
594 */
595 dv_serialize_data
596
597 ld8 r18=[r17] // read PTE again
598 ;;
599 cmp.eq p6,p7=r18,r25 // is it same as the newly installed
600 ;;
601(p7) ptc.l r16,r24
602 mov b0=r29 // restore b0
603 mov ar.ccv=r28
604#else /* !CONFIG_SMP */
605 ;;
6061: ld8 r18=[r17]
607 ;;
608 or r18=_PAGE_A,r18 // set the accessed bit
609 mov b0=r29 // restore b0
610 ;;
611 st8 [r17]=r18 // store back updated PTE
612 itc.i r18 // install updated PTE
613#endif /* !CONFIG_SMP */
614 mov pr=r31,-1
615 rfi
616END(iaccess_bit)
617
618 .org ia64_ivt+0x2800
619/////////////////////////////////////////////////////////////////////////////////////////
620// 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55)
621ENTRY(daccess_bit)
622 DBG_FAULT(10)
623 // Like Entry 8, except for data access
624 mov r16=cr.ifa // get the address that caused the fault
625 movl r30=1f // load continuation point in case of nested fault
626 ;;
627 thash r17=r16 // compute virtual address of L3 PTE
628 mov r31=pr
629	mov r29=b0				// save b0 in case of nested fault
630#ifdef CONFIG_SMP
631 mov r28=ar.ccv // save ar.ccv
632 ;;
6331: ld8 r18=[r17]
634 ;; // avoid RAW on r18
635 mov ar.ccv=r18 // set compare value for cmpxchg
636	or r25=_PAGE_A,r18			// set the accessed bit
637 ;;
638 cmpxchg8.acq r26=[r17],r25,ar.ccv
639 mov r24=PAGE_SHIFT<<2
640 ;;
641 cmp.eq p6,p7=r26,r18
642 ;;
643(p6) itc.d r25 // install updated PTE
644 /*
645	 * Tell the assembler's dependency-violation checker that the above "itc" instructions
646 * cannot possibly affect the following loads:
647 */
648 dv_serialize_data
649 ;;
650 ld8 r18=[r17] // read PTE again
651 ;;
652 cmp.eq p6,p7=r18,r25 // is it same as the newly installed
653 ;;
654(p7) ptc.l r16,r24
655 mov ar.ccv=r28
656#else
657 ;;
6581: ld8 r18=[r17]
659 ;; // avoid RAW on r18
660 or r18=_PAGE_A,r18 // set the accessed bit
661 ;;
662 st8 [r17]=r18 // store back updated PTE
663 itc.d r18 // install updated PTE
664#endif
665 mov b0=r29 // restore b0
666 mov pr=r31,-1
667 rfi
668END(daccess_bit)
669
670 .org ia64_ivt+0x2c00
671/////////////////////////////////////////////////////////////////////////////////////////
672// 0x2c00 Entry 11 (size 64 bundles) Break instruction (33)
673ENTRY(break_fault)
674 /*
675 * The streamlined system call entry/exit paths only save/restore the initial part
676 * of pt_regs. This implies that the callers of system-calls must adhere to the
677 * normal procedure calling conventions.
678 *
679 * Registers to be saved & restored:
680 * CR registers: cr.ipsr, cr.iip, cr.ifs
681 * AR registers: ar.unat, ar.pfs, ar.rsc, ar.rnat, ar.bspstore, ar.fpsr
682 * others: pr, b0, b6, loadrs, r1, r11, r12, r13, r15
683 * Registers to be restored only:
684 * r8-r11: output value from the system call.
685 *
686 * During system call exit, scratch registers (including r15) are modified/cleared
687 * to prevent leaking bits from kernel to user level.
688 */
689 DBG_FAULT(11)
690 mov r16=IA64_KR(CURRENT) // r16 = current task; 12 cycle read lat.
691 mov r17=cr.iim
692 mov r18=__IA64_BREAK_SYSCALL
693 mov r21=ar.fpsr
694 mov r29=cr.ipsr
695 mov r19=b6
696 mov r25=ar.unat
697 mov r27=ar.rsc
698 mov r26=ar.pfs
699 mov r28=cr.iip
700 mov r31=pr // prepare to save predicates
701 mov r20=r1
702 ;;
703 adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16
704 cmp.eq p0,p7=r18,r17 // is this a system call? (p7 <- false, if so)
705(p7) br.cond.spnt non_syscall
706 ;;
707 ld1 r17=[r16] // load current->thread.on_ustack flag
708 st1 [r16]=r0 // clear current->thread.on_ustack flag
709 add r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 // set r1 for MINSTATE_START_SAVE_MIN_VIRT
710 ;;
711 invala
712
713 /* adjust return address so we skip over the break instruction: */
714
715 extr.u r8=r29,41,2 // extract ei field from cr.ipsr
716 ;;
717 cmp.eq p6,p7=2,r8 // isr.ei==2?
718 mov r2=r1 // setup r2 for ia64_syscall_setup
719 ;;
720(p6) mov r8=0 // clear ei to 0
721(p6) adds r28=16,r28 // switch cr.iip to next bundle cr.ipsr.ei wrapped
722(p7) adds r8=1,r8 // increment ei to next slot
723 ;;
724 cmp.eq pKStk,pUStk=r0,r17 // are we in kernel mode already?
725 dep r29=r8,r29,41,2 // insert new ei into cr.ipsr
726 ;;
727
728 // switch from user to kernel RBS:
729 MINSTATE_START_SAVE_MIN_VIRT
730 br.call.sptk.many b7=ia64_syscall_setup
731 ;;
732 MINSTATE_END_SAVE_MIN_VIRT // switch to bank 1
733 ssm psr.ic | PSR_DEFAULT_BITS
734 ;;
735 srlz.i // guarantee that interruption collection is on
736 mov r3=NR_syscalls - 1
737 ;;
738(p15) ssm psr.i // restore psr.i
739	// p10==true means there are more than 8 out registers or r15's NaT is true
740(p10) br.cond.spnt.many ia64_ret_from_syscall
741 ;;
742 movl r16=sys_call_table
743
744 adds r15=-1024,r15 // r15 contains the syscall number---subtract 1024
745 movl r2=ia64_ret_from_syscall
746 ;;
747 shladd r20=r15,3,r16 // r20 = sys_call_table + 8*(syscall-1024)
748 cmp.leu p6,p7=r15,r3 // (syscall > 0 && syscall < 1024 + NR_syscalls) ?
749 mov rp=r2 // set the real return addr
750 ;;
751(p6) ld8 r20=[r20] // load address of syscall entry point
752(p7) movl r20=sys_ni_syscall
753
754 add r2=TI_FLAGS+IA64_TASK_SIZE,r13
755 ;;
756 ld4 r2=[r2] // r2 = current_thread_info()->flags
757 ;;
758 and r2=_TIF_SYSCALL_TRACEAUDIT,r2 // mask trace or audit
759 ;;
760 cmp.eq p8,p0=r2,r0
761 mov b6=r20
762 ;;
763(p8) br.call.sptk.many b6=b6 // ignore this return addr
764 br.cond.sptk ia64_trace_syscall
765 // NOT REACHED
766END(break_fault)
767
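The table lookup in break_fault above can be read as follows: r15 carries the syscall number (IA-64 break syscalls start at 1024), the number is bounds-checked against NR_syscalls, and out-of-range values fall back to sys_ni_syscall. A minimal C sketch of that dispatch, assuming a flat table of entry points (the "syscall_fn_t" typedef and the helper name are illustrative only):

	/* Hypothetical sketch of the sys_call_table lookup done in break_fault. */
	typedef long (*syscall_fn_t)(long, long, long, long, long, long);

	extern syscall_fn_t sys_call_table[];	/* entry i handles syscall 1024 + i */
	extern long sys_ni_syscall(void);

	static syscall_fn_t pick_syscall(unsigned long nr)
	{
		unsigned long idx = nr - 1024;		/* "adds r15=-1024,r15" */

		if (idx > NR_syscalls - 1)		/* "cmp.leu p6,p7=r15,r3" */
			return (syscall_fn_t) sys_ni_syscall;
		return sys_call_table[idx];		/* "shladd" + "ld8" */
	}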
768 .org ia64_ivt+0x3000
769/////////////////////////////////////////////////////////////////////////////////////////
770// 0x3000 Entry 12 (size 64 bundles) External Interrupt (4)
771ENTRY(interrupt)
772 DBG_FAULT(12)
773 mov r31=pr // prepare to save predicates
774 ;;
775 SAVE_MIN_WITH_COVER // uses r31; defines r2 and r3
776 ssm psr.ic | PSR_DEFAULT_BITS
777 ;;
778 adds r3=8,r2 // set up second base pointer for SAVE_REST
779 srlz.i // ensure everybody knows psr.ic is back on
780 ;;
781 SAVE_REST
782 ;;
783 alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
784 mov out0=cr.ivr // pass cr.ivr as first arg
785 add out1=16,sp // pass pointer to pt_regs as second arg
786 ;;
787 srlz.d // make sure we see the effect of cr.ivr
788 movl r14=ia64_leave_kernel
789 ;;
790 mov rp=r14
791 br.call.sptk.many b6=ia64_handle_irq
792END(interrupt)
793
794 .org ia64_ivt+0x3400
795/////////////////////////////////////////////////////////////////////////////////////////
796// 0x3400 Entry 13 (size 64 bundles) Reserved
797 DBG_FAULT(13)
798 FAULT(13)
799
800 .org ia64_ivt+0x3800
801/////////////////////////////////////////////////////////////////////////////////////////
802// 0x3800 Entry 14 (size 64 bundles) Reserved
803 DBG_FAULT(14)
804 FAULT(14)
805
806 /*
807 * There is no particular reason for this code to be here, other than that
808 * there happens to be space here that would go unused otherwise. If this
809	 * fault ever gets "unreserved", simply move the following code to a more
810 * suitable spot...
811 *
812 * ia64_syscall_setup() is a separate subroutine so that it can
813 * allocate stacked registers so it can safely demine any
814 * potential NaT values from the input registers.
815 *
816 * On entry:
817 * - executing on bank 0 or bank 1 register set (doesn't matter)
818 * - r1: stack pointer
819 * - r2: current task pointer
820 * - r3: preserved
821 * - r11: original contents (saved ar.pfs to be saved)
822 * - r12: original contents (sp to be saved)
823 * - r13: original contents (tp to be saved)
824 * - r15: original contents (syscall # to be saved)
825 * - r18: saved bsp (after switching to kernel stack)
826 * - r19: saved b6
827 * - r20: saved r1 (gp)
828 * - r21: saved ar.fpsr
829 * - r22: kernel's register backing store base (krbs_base)
830 * - r23: saved ar.bspstore
831 * - r24: saved ar.rnat
832 * - r25: saved ar.unat
833 * - r26: saved ar.pfs
834 * - r27: saved ar.rsc
835 * - r28: saved cr.iip
836 * - r29: saved cr.ipsr
837 * - r31: saved pr
838 * - b0: original contents (to be saved)
839 * On exit:
840 * - executing on bank 1 registers
841 * - psr.ic enabled, interrupts restored
842 * - p10: TRUE if syscall is invoked with more than 8 out
843 * registers or r15's Nat is true
844 * - r1: kernel's gp
845 * - r3: preserved (same as on entry)
846 * - r8: -EINVAL if p10 is true
847 * - r12: points to kernel stack
848 * - r13: points to current task
849 * - p15: TRUE if interrupts need to be re-enabled
850 * - ar.fpsr: set to kernel settings
851 */
852GLOBAL_ENTRY(ia64_syscall_setup)
853#if PT(B6) != 0
854# error This code assumes that b6 is the first field in pt_regs.
855#endif
856 st8 [r1]=r19 // save b6
857 add r16=PT(CR_IPSR),r1 // initialize first base pointer
858 add r17=PT(R11),r1 // initialize second base pointer
859 ;;
860 alloc r19=ar.pfs,8,0,0,0 // ensure in0-in7 are writable
861 st8 [r16]=r29,PT(AR_PFS)-PT(CR_IPSR) // save cr.ipsr
862 tnat.nz p8,p0=in0
863
864 st8.spill [r17]=r11,PT(CR_IIP)-PT(R11) // save r11
865 tnat.nz p9,p0=in1
866(pKStk) mov r18=r0 // make sure r18 isn't NaT
867 ;;
868
869 st8 [r16]=r26,PT(CR_IFS)-PT(AR_PFS) // save ar.pfs
870 st8 [r17]=r28,PT(AR_UNAT)-PT(CR_IIP) // save cr.iip
871 mov r28=b0 // save b0 (2 cyc)
872 ;;
873
874 st8 [r17]=r25,PT(AR_RSC)-PT(AR_UNAT) // save ar.unat
875 dep r19=0,r19,38,26 // clear all bits but 0..37 [I0]
876(p8) mov in0=-1
877 ;;
878
879 st8 [r16]=r19,PT(AR_RNAT)-PT(CR_IFS) // store ar.pfs.pfm in cr.ifs
880 extr.u r11=r19,7,7 // I0 // get sol of ar.pfs
881 and r8=0x7f,r19 // A // get sof of ar.pfs
882
883 st8 [r17]=r27,PT(AR_BSPSTORE)-PT(AR_RSC)// save ar.rsc
884 tbit.nz p15,p0=r29,IA64_PSR_I_BIT // I0
885(p9) mov in1=-1
886 ;;
887
888(pUStk) sub r18=r18,r22 // r18=RSE.ndirty*8
889 tnat.nz p10,p0=in2
890 add r11=8,r11
891 ;;
892(pKStk) adds r16=PT(PR)-PT(AR_RNAT),r16 // skip over ar_rnat field
893(pKStk) adds r17=PT(B0)-PT(AR_BSPSTORE),r17 // skip over ar_bspstore field
894 tnat.nz p11,p0=in3
895 ;;
896(p10) mov in2=-1
897 tnat.nz p12,p0=in4 // [I0]
898(p11) mov in3=-1
899 ;;
900(pUStk) st8 [r16]=r24,PT(PR)-PT(AR_RNAT) // save ar.rnat
901(pUStk) st8 [r17]=r23,PT(B0)-PT(AR_BSPSTORE) // save ar.bspstore
902 shl r18=r18,16 // compute ar.rsc to be used for "loadrs"
903 ;;
904 st8 [r16]=r31,PT(LOADRS)-PT(PR) // save predicates
905 st8 [r17]=r28,PT(R1)-PT(B0) // save b0
906 tnat.nz p13,p0=in5 // [I0]
907 ;;
908 st8 [r16]=r18,PT(R12)-PT(LOADRS) // save ar.rsc value for "loadrs"
909 st8.spill [r17]=r20,PT(R13)-PT(R1) // save original r1
910(p12) mov in4=-1
911 ;;
912
913.mem.offset 0,0; st8.spill [r16]=r12,PT(AR_FPSR)-PT(R12) // save r12
914.mem.offset 8,0; st8.spill [r17]=r13,PT(R15)-PT(R13) // save r13
915(p13) mov in5=-1
916 ;;
917 st8 [r16]=r21,PT(R8)-PT(AR_FPSR) // save ar.fpsr
918 tnat.nz p14,p0=in6
919 cmp.lt p10,p9=r11,r8 // frame size can't be more than local+8
920 ;;
921 stf8 [r16]=f1 // ensure pt_regs.r8 != 0 (see handle_syscall_error)
922(p9) tnat.nz p10,p0=r15
923 adds r12=-16,r1 // switch to kernel memory stack (with 16 bytes of scratch)
924
925 st8.spill [r17]=r15 // save r15
926 tnat.nz p8,p0=in7
927 nop.i 0
928
929 mov r13=r2 // establish `current'
930 movl r1=__gp // establish kernel global pointer
931 ;;
932(p14) mov in6=-1
933(p8) mov in7=-1
934 nop.i 0
935
936 cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0
937 movl r17=FPSR_DEFAULT
938 ;;
939 mov.m ar.fpsr=r17 // set ar.fpsr to kernel default value
940(p10) mov r8=-EINVAL
941 br.ret.sptk.many b7
942END(ia64_syscall_setup)
943
944 .org ia64_ivt+0x3c00
945/////////////////////////////////////////////////////////////////////////////////////////
946// 0x3c00 Entry 15 (size 64 bundles) Reserved
947 DBG_FAULT(15)
948 FAULT(15)
949
950 /*
951 * Squatting in this space ...
952 *
953 * This special case dispatcher for illegal operation faults allows preserved
954 * registers to be modified through a callback function (asm only) that is handed
955 * back from the fault handler in r8. Up to three arguments can be passed to the
956 * callback function by returning an aggregate with the callback as its first
957 * element, followed by the arguments.
958 */
959ENTRY(dispatch_illegal_op_fault)
960 .prologue
961 .body
962 SAVE_MIN_WITH_COVER
963 ssm psr.ic | PSR_DEFAULT_BITS
964 ;;
965 srlz.i // guarantee that interruption collection is on
966 ;;
967(p15) ssm psr.i // restore psr.i
968 adds r3=8,r2 // set up second base pointer for SAVE_REST
969 ;;
970 alloc r14=ar.pfs,0,0,1,0 // must be first in insn group
971 mov out0=ar.ec
972 ;;
973 SAVE_REST
974 PT_REGS_UNWIND_INFO(0)
975 ;;
976 br.call.sptk.many rp=ia64_illegal_op_fault
977.ret0: ;;
978 alloc r14=ar.pfs,0,0,3,0 // must be first in insn group
979 mov out0=r9
980 mov out1=r10
981 mov out2=r11
982 movl r15=ia64_leave_kernel
983 ;;
984 mov rp=r15
985 mov b6=r8
986 ;;
987 cmp.ne p6,p0=0,r8
988(p6) br.call.dpnt.many b6=b6 // call returns to ia64_leave_kernel
989 br.sptk.many ia64_leave_kernel
990END(dispatch_illegal_op_fault)
991
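The callback convention described above maps onto the IA-64 C calling conventions: a small aggregate is returned in r8-r11, so the fault handler can hand back an optional callback plus up to three arguments in a single return value, which the dispatcher then moves into b6 and out0-out2. A sketch of such an aggregate (the struct and field names are illustrative, chosen to match the register use above):

	/* Aggregate returned in r8-r11 by the illegal-op fault handler (sketch). */
	struct illegal_op_return {
		unsigned long fkt;	/* callback entry point, 0 if none (r8)  */
		unsigned long arg1;	/* first callback argument         (r9)  */
		unsigned long arg2;	/* second callback argument        (r10) */
		unsigned long arg3;	/* third callback argument         (r11) */
	};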
992 .org ia64_ivt+0x4000
993/////////////////////////////////////////////////////////////////////////////////////////
994// 0x4000 Entry 16 (size 64 bundles) Reserved
995 DBG_FAULT(16)
996 FAULT(16)
997
998 .org ia64_ivt+0x4400
999/////////////////////////////////////////////////////////////////////////////////////////
1000// 0x4400 Entry 17 (size 64 bundles) Reserved
1001 DBG_FAULT(17)
1002 FAULT(17)
1003
1004ENTRY(non_syscall)
1005 SAVE_MIN_WITH_COVER
1006
1007 // There is no particular reason for this code to be here, other than that
1008 // there happens to be space here that would go unused otherwise. If this
1009	// fault ever gets "unreserved", simply move the following code to a more
1010 // suitable spot...
1011
1012 alloc r14=ar.pfs,0,0,2,0
1013 mov out0=cr.iim
1014 add out1=16,sp
1015 adds r3=8,r2 // set up second base pointer for SAVE_REST
1016
1017 ssm psr.ic | PSR_DEFAULT_BITS
1018 ;;
1019 srlz.i // guarantee that interruption collection is on
1020 ;;
1021(p15) ssm psr.i // restore psr.i
1022 movl r15=ia64_leave_kernel
1023 ;;
1024 SAVE_REST
1025 mov rp=r15
1026 ;;
1027 br.call.sptk.many b6=ia64_bad_break // avoid WAW on CFM and ignore return addr
1028END(non_syscall)
1029
1030 .org ia64_ivt+0x4800
1031/////////////////////////////////////////////////////////////////////////////////////////
1032// 0x4800 Entry 18 (size 64 bundles) Reserved
1033 DBG_FAULT(18)
1034 FAULT(18)
1035
1036 /*
1037 * There is no particular reason for this code to be here, other than that
1038 * there happens to be space here that would go unused otherwise. If this
1039	 * fault ever gets "unreserved", simply move the following code to a more
1040 * suitable spot...
1041 */
1042
1043ENTRY(dispatch_unaligned_handler)
1044 SAVE_MIN_WITH_COVER
1045 ;;
1046 alloc r14=ar.pfs,0,0,2,0 // now it's safe (must be first in insn group!)
1047 mov out0=cr.ifa
1048 adds out1=16,sp
1049
1050 ssm psr.ic | PSR_DEFAULT_BITS
1051 ;;
1052 srlz.i // guarantee that interruption collection is on
1053 ;;
1054(p15) ssm psr.i // restore psr.i
1055 adds r3=8,r2 // set up second base pointer
1056 ;;
1057 SAVE_REST
1058 movl r14=ia64_leave_kernel
1059 ;;
1060 mov rp=r14
1061 br.sptk.many ia64_prepare_handle_unaligned
1062END(dispatch_unaligned_handler)
1063
1064 .org ia64_ivt+0x4c00
1065/////////////////////////////////////////////////////////////////////////////////////////
1066// 0x4c00 Entry 19 (size 64 bundles) Reserved
1067 DBG_FAULT(19)
1068 FAULT(19)
1069
1070 /*
1071 * There is no particular reason for this code to be here, other than that
1072 * there happens to be space here that would go unused otherwise. If this
1073	 * fault ever gets "unreserved", simply move the following code to a more
1074 * suitable spot...
1075 */
1076
1077ENTRY(dispatch_to_fault_handler)
1078 /*
1079 * Input:
1080 * psr.ic: off
1081 * r19: fault vector number (e.g., 24 for General Exception)
1082 * r31: contains saved predicates (pr)
1083 */
1084 SAVE_MIN_WITH_COVER_R19
1085 alloc r14=ar.pfs,0,0,5,0
1086 mov out0=r15
1087 mov out1=cr.isr
1088 mov out2=cr.ifa
1089 mov out3=cr.iim
1090 mov out4=cr.itir
1091 ;;
1092 ssm psr.ic | PSR_DEFAULT_BITS
1093 ;;
1094 srlz.i // guarantee that interruption collection is on
1095 ;;
1096(p15) ssm psr.i // restore psr.i
1097 adds r3=8,r2 // set up second base pointer for SAVE_REST
1098 ;;
1099 SAVE_REST
1100 movl r14=ia64_leave_kernel
1101 ;;
1102 mov rp=r14
1103 br.call.sptk.many b6=ia64_fault
1104END(dispatch_to_fault_handler)
1105
1106//
1107// --- End of long entries, Beginning of short entries
1108//
1109
1110 .org ia64_ivt+0x5000
1111/////////////////////////////////////////////////////////////////////////////////////////
1112// 0x5000 Entry 20 (size 16 bundles) Page Not Present (10,22,49)
1113ENTRY(page_not_present)
1114 DBG_FAULT(20)
1115 mov r16=cr.ifa
1116 rsm psr.dt
1117 /*
1118 * The Linux page fault handler doesn't expect non-present pages to be in
1119 * the TLB. Flush the existing entry now, so we meet that expectation.
1120 */
1121 mov r17=PAGE_SHIFT<<2
1122 ;;
1123 ptc.l r16,r17
1124 ;;
1125 mov r31=pr
1126 srlz.d
1127 br.sptk.many page_fault
1128END(page_not_present)
1129
1130 .org ia64_ivt+0x5100
1131/////////////////////////////////////////////////////////////////////////////////////////
1132// 0x5100 Entry 21 (size 16 bundles) Key Permission (13,25,52)
1133ENTRY(key_permission)
1134 DBG_FAULT(21)
1135 mov r16=cr.ifa
1136 rsm psr.dt
1137 mov r31=pr
1138 ;;
1139 srlz.d
1140 br.sptk.many page_fault
1141END(key_permission)
1142
1143 .org ia64_ivt+0x5200
1144/////////////////////////////////////////////////////////////////////////////////////////
1145// 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26)
1146ENTRY(iaccess_rights)
1147 DBG_FAULT(22)
1148 mov r16=cr.ifa
1149 rsm psr.dt
1150 mov r31=pr
1151 ;;
1152 srlz.d
1153 br.sptk.many page_fault
1154END(iaccess_rights)
1155
1156 .org ia64_ivt+0x5300
1157/////////////////////////////////////////////////////////////////////////////////////////
1158// 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53)
1159ENTRY(daccess_rights)
1160 DBG_FAULT(23)
1161 mov r16=cr.ifa
1162 rsm psr.dt
1163 mov r31=pr
1164 ;;
1165 srlz.d
1166 br.sptk.many page_fault
1167END(daccess_rights)
1168
1169 .org ia64_ivt+0x5400
1170/////////////////////////////////////////////////////////////////////////////////////////
1171// 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39)
1172ENTRY(general_exception)
1173 DBG_FAULT(24)
1174 mov r16=cr.isr
1175 mov r31=pr
1176 ;;
1177 cmp4.eq p6,p0=0,r16
1178(p6) br.sptk.many dispatch_illegal_op_fault
1179 ;;
1180 mov r19=24 // fault number
1181 br.sptk.many dispatch_to_fault_handler
1182END(general_exception)
1183
1184 .org ia64_ivt+0x5500
1185/////////////////////////////////////////////////////////////////////////////////////////
1186// 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35)
1187ENTRY(disabled_fp_reg)
1188 DBG_FAULT(25)
1189 rsm psr.dfh // ensure we can access fph
1190 ;;
1191 srlz.d
1192 mov r31=pr
1193 mov r19=25
1194 br.sptk.many dispatch_to_fault_handler
1195END(disabled_fp_reg)
1196
1197 .org ia64_ivt+0x5600
1198/////////////////////////////////////////////////////////////////////////////////////////
1199// 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50)
1200ENTRY(nat_consumption)
1201 DBG_FAULT(26)
1202 FAULT(26)
1203END(nat_consumption)
1204
1205 .org ia64_ivt+0x5700
1206/////////////////////////////////////////////////////////////////////////////////////////
1207// 0x5700 Entry 27 (size 16 bundles) Speculation (40)
1208ENTRY(speculation_vector)
1209 DBG_FAULT(27)
1210 /*
1211 * A [f]chk.[as] instruction needs to take the branch to the recovery code but
1212 * this part of the architecture is not implemented in hardware on some CPUs, such
1213 * as Itanium. Thus, in general we need to emulate the behavior. IIM contains
1214 * the relative target (not yet sign extended). So after sign extending it we
1215 * simply add it to IIP. We also need to reset the EI field of the IPSR to zero,
1216 * i.e., the slot to restart into.
1217 *
1218	 * cr.iim contains zero_ext(imm21)
1219 */
1220 mov r18=cr.iim
1221 ;;
1222 mov r17=cr.iip
1223 shl r18=r18,43 // put sign bit in position (43=64-21)
1224 ;;
1225
1226 mov r16=cr.ipsr
1227 shr r18=r18,39 // sign extend (39=43-4)
1228 ;;
1229
1230 add r17=r17,r18 // now add the offset
1231 ;;
1232 mov cr.iip=r17
1233 dep r16=0,r16,41,2 // clear EI
1234 ;;
1235
1236 mov cr.ipsr=r16
1237 ;;
1238
1239 rfi // and go back
1240END(speculation_vector)
1241
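Written out in C, the IIM fixup above is a sign extension of the 21-bit immediate followed by a scale to bytes: shifting left by 43 and arithmetically right by 39 is the same as sign-extending imm21 and multiplying by 16, the size of an IA-64 bundle. A small sketch of the computation (helper name is illustrative):

	/* Sketch of the branch-target offset computed in speculation_vector. */
	static long chk_branch_offset(unsigned long iim)
	{
		/* shl 43 puts the sign bit of imm21 into bit 63; the arithmetic
		 * shift right by 39 sign-extends and leaves a 16-byte scale. */
		return ((long) (iim << 43)) >> 39;
	}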
1242 .org ia64_ivt+0x5800
1243/////////////////////////////////////////////////////////////////////////////////////////
1244// 0x5800 Entry 28 (size 16 bundles) Reserved
1245 DBG_FAULT(28)
1246 FAULT(28)
1247
1248 .org ia64_ivt+0x5900
1249/////////////////////////////////////////////////////////////////////////////////////////
1250// 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56)
1251ENTRY(debug_vector)
1252 DBG_FAULT(29)
1253 FAULT(29)
1254END(debug_vector)
1255
1256 .org ia64_ivt+0x5a00
1257/////////////////////////////////////////////////////////////////////////////////////////
1258// 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57)
1259ENTRY(unaligned_access)
1260 DBG_FAULT(30)
1261 mov r16=cr.ipsr
1262 mov r31=pr // prepare to save predicates
1263 ;;
1264 br.sptk.many dispatch_unaligned_handler
1265END(unaligned_access)
1266
1267 .org ia64_ivt+0x5b00
1268/////////////////////////////////////////////////////////////////////////////////////////
1269// 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57)
1270ENTRY(unsupported_data_reference)
1271 DBG_FAULT(31)
1272 FAULT(31)
1273END(unsupported_data_reference)
1274
1275 .org ia64_ivt+0x5c00
1276/////////////////////////////////////////////////////////////////////////////////////////
1277// 0x5c00 Entry 32 (size 16 bundles) Floating-Point Fault (64)
1278ENTRY(floating_point_fault)
1279 DBG_FAULT(32)
1280 FAULT(32)
1281END(floating_point_fault)
1282
1283 .org ia64_ivt+0x5d00
1284/////////////////////////////////////////////////////////////////////////////////////////
1285// 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66)
1286ENTRY(floating_point_trap)
1287 DBG_FAULT(33)
1288 FAULT(33)
1289END(floating_point_trap)
1290
1291 .org ia64_ivt+0x5e00
1292/////////////////////////////////////////////////////////////////////////////////////////
1293// 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66)
1294ENTRY(lower_privilege_trap)
1295 DBG_FAULT(34)
1296 FAULT(34)
1297END(lower_privilege_trap)
1298
1299 .org ia64_ivt+0x5f00
1300/////////////////////////////////////////////////////////////////////////////////////////
1301// 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68)
1302ENTRY(taken_branch_trap)
1303 DBG_FAULT(35)
1304 FAULT(35)
1305END(taken_branch_trap)
1306
1307 .org ia64_ivt+0x6000
1308/////////////////////////////////////////////////////////////////////////////////////////
1309// 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69)
1310ENTRY(single_step_trap)
1311 DBG_FAULT(36)
1312 FAULT(36)
1313END(single_step_trap)
1314
1315 .org ia64_ivt+0x6100
1316/////////////////////////////////////////////////////////////////////////////////////////
1317// 0x6100 Entry 37 (size 16 bundles) Reserved
1318 DBG_FAULT(37)
1319 FAULT(37)
1320
1321 .org ia64_ivt+0x6200
1322/////////////////////////////////////////////////////////////////////////////////////////
1323// 0x6200 Entry 38 (size 16 bundles) Reserved
1324 DBG_FAULT(38)
1325 FAULT(38)
1326
1327 .org ia64_ivt+0x6300
1328/////////////////////////////////////////////////////////////////////////////////////////
1329// 0x6300 Entry 39 (size 16 bundles) Reserved
1330 DBG_FAULT(39)
1331 FAULT(39)
1332
1333 .org ia64_ivt+0x6400
1334/////////////////////////////////////////////////////////////////////////////////////////
1335// 0x6400 Entry 40 (size 16 bundles) Reserved
1336 DBG_FAULT(40)
1337 FAULT(40)
1338
1339 .org ia64_ivt+0x6500
1340/////////////////////////////////////////////////////////////////////////////////////////
1341// 0x6500 Entry 41 (size 16 bundles) Reserved
1342 DBG_FAULT(41)
1343 FAULT(41)
1344
1345 .org ia64_ivt+0x6600
1346/////////////////////////////////////////////////////////////////////////////////////////
1347// 0x6600 Entry 42 (size 16 bundles) Reserved
1348 DBG_FAULT(42)
1349 FAULT(42)
1350
1351 .org ia64_ivt+0x6700
1352/////////////////////////////////////////////////////////////////////////////////////////
1353// 0x6700 Entry 43 (size 16 bundles) Reserved
1354 DBG_FAULT(43)
1355 FAULT(43)
1356
1357 .org ia64_ivt+0x6800
1358/////////////////////////////////////////////////////////////////////////////////////////
1359// 0x6800 Entry 44 (size 16 bundles) Reserved
1360 DBG_FAULT(44)
1361 FAULT(44)
1362
1363 .org ia64_ivt+0x6900
1364/////////////////////////////////////////////////////////////////////////////////////////
1365// 0x6900 Entry 45 (size 16 bundles) IA-32 Exception (17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77)
1366ENTRY(ia32_exception)
1367 DBG_FAULT(45)
1368 FAULT(45)
1369END(ia32_exception)
1370
1371 .org ia64_ivt+0x6a00
1372/////////////////////////////////////////////////////////////////////////////////////////
1373// 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept (30,31,59,70,71)
1374ENTRY(ia32_intercept)
1375 DBG_FAULT(46)
1376#ifdef CONFIG_IA32_SUPPORT
1377 mov r31=pr
1378 mov r16=cr.isr
1379 ;;
1380 extr.u r17=r16,16,8 // get ISR.code
1381 mov r18=ar.eflag
1382 mov r19=cr.iim // old eflag value
1383 ;;
1384 cmp.ne p6,p0=2,r17
1385(p6) br.cond.spnt 1f // not a system flag fault
1386 xor r16=r18,r19
1387 ;;
1388 extr.u r17=r16,18,1 // get the eflags.ac bit
1389 ;;
1390 cmp.eq p6,p0=0,r17
1391(p6) br.cond.spnt 1f // eflags.ac bit didn't change
1392 ;;
1393 mov pr=r31,-1 // restore predicate registers
1394 rfi
1395
13961:
1397#endif // CONFIG_IA32_SUPPORT
1398 FAULT(46)
1399END(ia32_intercept)
1400
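In C terms, the case emulated above is: the intercept is a "system flag" intercept (ISR.code == 2) and the alignment-check bit (eflags bit 18) differs between the old eflags value delivered in cr.iim and the current ar.eflag; only then does the handler return directly with rfi. A sketch of the bit test (names are illustrative):

	/* Sketch of the eflags.ac comparison done in ia32_intercept above. */
	#define IA32_EFLAGS_AC_BIT	18

	static int ac_bit_changed(unsigned long old_eflags, unsigned long new_eflags)
	{
		return ((old_eflags ^ new_eflags) >> IA32_EFLAGS_AC_BIT) & 1;
	}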
1401 .org ia64_ivt+0x6b00
1402/////////////////////////////////////////////////////////////////////////////////////////
1403// 0x6b00 Entry 47 (size 16 bundles) IA-32 Interrupt (74)
1404ENTRY(ia32_interrupt)
1405 DBG_FAULT(47)
1406#ifdef CONFIG_IA32_SUPPORT
1407 mov r31=pr
1408 br.sptk.many dispatch_to_ia32_handler
1409#else
1410 FAULT(47)
1411#endif
1412END(ia32_interrupt)
1413
1414 .org ia64_ivt+0x6c00
1415/////////////////////////////////////////////////////////////////////////////////////////
1416// 0x6c00 Entry 48 (size 16 bundles) Reserved
1417 DBG_FAULT(48)
1418 FAULT(48)
1419
1420 .org ia64_ivt+0x6d00
1421/////////////////////////////////////////////////////////////////////////////////////////
1422// 0x6d00 Entry 49 (size 16 bundles) Reserved
1423 DBG_FAULT(49)
1424 FAULT(49)
1425
1426 .org ia64_ivt+0x6e00
1427/////////////////////////////////////////////////////////////////////////////////////////
1428// 0x6e00 Entry 50 (size 16 bundles) Reserved
1429 DBG_FAULT(50)
1430 FAULT(50)
1431
1432 .org ia64_ivt+0x6f00
1433/////////////////////////////////////////////////////////////////////////////////////////
1434// 0x6f00 Entry 51 (size 16 bundles) Reserved
1435 DBG_FAULT(51)
1436 FAULT(51)
1437
1438 .org ia64_ivt+0x7000
1439/////////////////////////////////////////////////////////////////////////////////////////
1440// 0x7000 Entry 52 (size 16 bundles) Reserved
1441 DBG_FAULT(52)
1442 FAULT(52)
1443
1444 .org ia64_ivt+0x7100
1445/////////////////////////////////////////////////////////////////////////////////////////
1446// 0x7100 Entry 53 (size 16 bundles) Reserved
1447 DBG_FAULT(53)
1448 FAULT(53)
1449
1450 .org ia64_ivt+0x7200
1451/////////////////////////////////////////////////////////////////////////////////////////
1452// 0x7200 Entry 54 (size 16 bundles) Reserved
1453 DBG_FAULT(54)
1454 FAULT(54)
1455
1456 .org ia64_ivt+0x7300
1457/////////////////////////////////////////////////////////////////////////////////////////
1458// 0x7300 Entry 55 (size 16 bundles) Reserved
1459 DBG_FAULT(55)
1460 FAULT(55)
1461
1462 .org ia64_ivt+0x7400
1463/////////////////////////////////////////////////////////////////////////////////////////
1464// 0x7400 Entry 56 (size 16 bundles) Reserved
1465 DBG_FAULT(56)
1466 FAULT(56)
1467
1468 .org ia64_ivt+0x7500
1469/////////////////////////////////////////////////////////////////////////////////////////
1470// 0x7500 Entry 57 (size 16 bundles) Reserved
1471 DBG_FAULT(57)
1472 FAULT(57)
1473
1474 .org ia64_ivt+0x7600
1475/////////////////////////////////////////////////////////////////////////////////////////
1476// 0x7600 Entry 58 (size 16 bundles) Reserved
1477 DBG_FAULT(58)
1478 FAULT(58)
1479
1480 .org ia64_ivt+0x7700
1481/////////////////////////////////////////////////////////////////////////////////////////
1482// 0x7700 Entry 59 (size 16 bundles) Reserved
1483 DBG_FAULT(59)
1484 FAULT(59)
1485
1486 .org ia64_ivt+0x7800
1487/////////////////////////////////////////////////////////////////////////////////////////
1488// 0x7800 Entry 60 (size 16 bundles) Reserved
1489 DBG_FAULT(60)
1490 FAULT(60)
1491
1492 .org ia64_ivt+0x7900
1493/////////////////////////////////////////////////////////////////////////////////////////
1494// 0x7900 Entry 61 (size 16 bundles) Reserved
1495 DBG_FAULT(61)
1496 FAULT(61)
1497
1498 .org ia64_ivt+0x7a00
1499/////////////////////////////////////////////////////////////////////////////////////////
1500// 0x7a00 Entry 62 (size 16 bundles) Reserved
1501 DBG_FAULT(62)
1502 FAULT(62)
1503
1504 .org ia64_ivt+0x7b00
1505/////////////////////////////////////////////////////////////////////////////////////////
1506// 0x7b00 Entry 63 (size 16 bundles) Reserved
1507 DBG_FAULT(63)
1508 FAULT(63)
1509
1510 .org ia64_ivt+0x7c00
1511/////////////////////////////////////////////////////////////////////////////////////////
1512// 0x7c00 Entry 64 (size 16 bundles) Reserved
1513 DBG_FAULT(64)
1514 FAULT(64)
1515
1516 .org ia64_ivt+0x7d00
1517/////////////////////////////////////////////////////////////////////////////////////////
1518// 0x7d00 Entry 65 (size 16 bundles) Reserved
1519 DBG_FAULT(65)
1520 FAULT(65)
1521
1522 .org ia64_ivt+0x7e00
1523/////////////////////////////////////////////////////////////////////////////////////////
1524// 0x7e00 Entry 66 (size 16 bundles) Reserved
1525 DBG_FAULT(66)
1526 FAULT(66)
1527
1528 .org ia64_ivt+0x7f00
1529/////////////////////////////////////////////////////////////////////////////////////////
1530// 0x7f00 Entry 67 (size 16 bundles) Reserved
1531 DBG_FAULT(67)
1532 FAULT(67)
1533
1534#ifdef CONFIG_IA32_SUPPORT
1535
1536 /*
1537 * There is no particular reason for this code to be here, other than that
1538 * there happens to be space here that would go unused otherwise. If this
1539	 * fault ever gets "unreserved", simply move the following code to a more
1540 * suitable spot...
1541 */
1542
1543 // IA32 interrupt entry point
1544
1545ENTRY(dispatch_to_ia32_handler)
1546 SAVE_MIN
1547 ;;
1548 mov r14=cr.isr
1549 ssm psr.ic | PSR_DEFAULT_BITS
1550 ;;
1551 srlz.i // guarantee that interruption collection is on
1552 ;;
1553(p15) ssm psr.i
1554 adds r3=8,r2 // Base pointer for SAVE_REST
1555 ;;
1556 SAVE_REST
1557 ;;
1558 mov r15=0x80
1559 shr r14=r14,16 // Get interrupt number
1560 ;;
1561 cmp.ne p6,p0=r14,r15
1562(p6) br.call.dpnt.many b6=non_ia32_syscall
1563
1564 adds r14=IA64_PT_REGS_R8_OFFSET + 16,sp // 16 byte hole per SW conventions
1565 adds r15=IA64_PT_REGS_R1_OFFSET + 16,sp
1566 ;;
1567 cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0
1568 ld8 r8=[r14] // get r8
1569 ;;
1570 st8 [r15]=r8 // save original EAX in r1 (IA32 procs don't use the GP)
1571 ;;
1572	alloc r15=ar.pfs,0,0,6,0	// must be first in an insn group
1573 ;;
1574 ld4 r8=[r14],8 // r8 == eax (syscall number)
1575 mov r15=IA32_NR_syscalls
1576 ;;
1577 cmp.ltu.unc p6,p7=r8,r15
1578 ld4 out1=[r14],8 // r9 == ecx
1579 ;;
1580 ld4 out2=[r14],8 // r10 == edx
1581 ;;
1582 ld4 out0=[r14] // r11 == ebx
1583 adds r14=(IA64_PT_REGS_R13_OFFSET) + 16,sp
1584 ;;
1585 ld4 out5=[r14],PT(R14)-PT(R13) // r13 == ebp
1586 ;;
1587 ld4 out3=[r14],PT(R15)-PT(R14) // r14 == esi
1588 adds r2=TI_FLAGS+IA64_TASK_SIZE,r13
1589 ;;
1590 ld4 out4=[r14] // r15 == edi
1591 movl r16=ia32_syscall_table
1592 ;;
1593(p6) shladd r16=r8,3,r16 // force ni_syscall if not valid syscall number
1594 ld4 r2=[r2] // r2 = current_thread_info()->flags
1595 ;;
1596 ld8 r16=[r16]
1597 and r2=_TIF_SYSCALL_TRACEAUDIT,r2 // mask trace or audit
1598 ;;
1599 mov b6=r16
1600 movl r15=ia32_ret_from_syscall
1601 cmp.eq p8,p0=r2,r0
1602 ;;
1603 mov rp=r15
1604(p8) br.call.sptk.many b6=b6
1605 br.cond.sptk ia32_trace_syscall
1606
1607non_ia32_syscall:
1608 alloc r15=ar.pfs,0,0,2,0
1609 mov out0=r14 // interrupt #
1610 add out1=16,sp // pointer to pt_regs
1611 ;; // avoid WAW on CFM
1612 br.call.sptk.many rp=ia32_bad_interrupt
1613.ret1: movl r15=ia64_leave_kernel
1614 ;;
1615 mov rp=r15
1616 br.ret.sptk.many rp
1617END(dispatch_to_ia32_handler)
1618
1619#endif /* CONFIG_IA32_SUPPORT */
diff --git a/arch/ia64/kernel/machvec.c b/arch/ia64/kernel/machvec.c
new file mode 100644
index 000000000000..c3a04ee7f4f6
--- /dev/null
+++ b/arch/ia64/kernel/machvec.c
@@ -0,0 +1,70 @@
1#include <linux/config.h>
2#include <linux/module.h>
3
4#include <asm/machvec.h>
5#include <asm/system.h>
6
7#ifdef CONFIG_IA64_GENERIC
8
9#include <linux/kernel.h>
10#include <linux/string.h>
11
12#include <asm/page.h>
13
14struct ia64_machine_vector ia64_mv;
15EXPORT_SYMBOL(ia64_mv);
16
17static struct ia64_machine_vector *
18lookup_machvec (const char *name)
19{
20 extern struct ia64_machine_vector machvec_start[];
21 extern struct ia64_machine_vector machvec_end[];
22 struct ia64_machine_vector *mv;
23
24 for (mv = machvec_start; mv < machvec_end; ++mv)
25 if (strcmp (mv->name, name) == 0)
26 return mv;
27
28	return NULL;
29}
30
31void
32machvec_init (const char *name)
33{
34 struct ia64_machine_vector *mv;
35
36 mv = lookup_machvec(name);
37 if (!mv) {
38 panic("generic kernel failed to find machine vector for platform %s!", name);
39 }
40 ia64_mv = *mv;
41 printk(KERN_INFO "booting generic kernel on platform %s\n", name);
42}
43
44#endif /* CONFIG_IA64_GENERIC */
45
46void
47machvec_setup (char **arg)
48{
49}
50EXPORT_SYMBOL(machvec_setup);
51
52void
53machvec_timer_interrupt (int irq, void *dev_id, struct pt_regs *regs)
54{
55}
56EXPORT_SYMBOL(machvec_timer_interrupt);
57
58void
59machvec_dma_sync_single (struct device *hwdev, dma_addr_t dma_handle, size_t size, int dir)
60{
61 mb();
62}
63EXPORT_SYMBOL(machvec_dma_sync_single);
64
65void
66machvec_dma_sync_sg (struct device *hwdev, struct scatterlist *sg, int n, int dir)
67{
68 mb();
69}
70EXPORT_SYMBOL(machvec_dma_sync_sg);
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
new file mode 100644
index 000000000000..4d6c7b8f667b
--- /dev/null
+++ b/arch/ia64/kernel/mca.c
@@ -0,0 +1,1470 @@
1/*
2 * File: mca.c
3 * Purpose: Generic MCA handling layer
4 *
5 * Updated for latest kernel
6 * Copyright (C) 2003 Hewlett-Packard Co
7 * David Mosberger-Tang <davidm@hpl.hp.com>
8 *
9 * Copyright (C) 2002 Dell Inc.
10 * Copyright (C) Matt Domsch (Matt_Domsch@dell.com)
11 *
12 * Copyright (C) 2002 Intel
13 * Copyright (C) Jenna Hall (jenna.s.hall@intel.com)
14 *
15 * Copyright (C) 2001 Intel
16 * Copyright (C) Fred Lewis (frederick.v.lewis@intel.com)
17 *
18 * Copyright (C) 2000 Intel
19 * Copyright (C) Chuck Fleckenstein (cfleck@co.intel.com)
20 *
21 * Copyright (C) 1999, 2004 Silicon Graphics, Inc.
22 * Copyright (C) Vijay Chander(vijay@engr.sgi.com)
23 *
24 * 03/04/15 D. Mosberger Added INIT backtrace support.
25 * 02/03/25 M. Domsch GUID cleanups
26 *
27 * 02/01/04 J. Hall Aligned MCA stack to 16 bytes, added platform vs. CPU
28 * error flag, set SAL default return values, changed
29 * error record structure to linked list, added init call
30 * to sal_get_state_info_size().
31 *
32 * 01/01/03 F. Lewis Added setup of CMCI and CPEI IRQs, logging of corrected
33 * platform errors, completed code for logging of
34 * corrected & uncorrected machine check errors, and
35 * updated for conformance with Nov. 2000 revision of the
36 * SAL 3.0 spec.
37 * 00/03/29 C. Fleckenstein Fixed PAL/SAL update issues, began MCA bug fixes, logging issues,
38 * added min save state dump, added INIT handler.
39 *
40 * 2003-12-08 Keith Owens <kaos@sgi.com>
41 * smp_call_function() must not be called from interrupt context (can
42 * deadlock on tasklist_lock). Use keventd to call smp_call_function().
43 *
44 * 2004-02-01 Keith Owens <kaos@sgi.com>
45 * Avoid deadlock when using printk() for MCA and INIT records.
46 * Delete all record printing code, moved to salinfo_decode in user space.
47 * Mark variables and functions static where possible.
48 * Delete dead variables and functions.
49 * Reorder to remove the need for forward declarations and to consolidate
50 * related code.
51 */
52#include <linux/config.h>
53#include <linux/types.h>
54#include <linux/init.h>
55#include <linux/sched.h>
56#include <linux/interrupt.h>
57#include <linux/irq.h>
58#include <linux/kallsyms.h>
59#include <linux/smp_lock.h>
60#include <linux/bootmem.h>
61#include <linux/acpi.h>
62#include <linux/timer.h>
63#include <linux/module.h>
64#include <linux/kernel.h>
65#include <linux/smp.h>
66#include <linux/workqueue.h>
67
68#include <asm/delay.h>
69#include <asm/machvec.h>
70#include <asm/meminit.h>
71#include <asm/page.h>
72#include <asm/ptrace.h>
73#include <asm/system.h>
74#include <asm/sal.h>
75#include <asm/mca.h>
76
77#include <asm/irq.h>
78#include <asm/hw_irq.h>
79
80#if defined(IA64_MCA_DEBUG_INFO)
81# define IA64_MCA_DEBUG(fmt...) printk(fmt)
82#else
83# define IA64_MCA_DEBUG(fmt...)
84#endif
85
86/* Used by mca_asm.S */
87ia64_mca_sal_to_os_state_t ia64_sal_to_os_handoff_state;
88ia64_mca_os_to_sal_state_t ia64_os_to_sal_handoff_state;
89u64 ia64_mca_serialize;
90DEFINE_PER_CPU(u64, ia64_mca_data); /* == __per_cpu_mca[smp_processor_id()] */
91DEFINE_PER_CPU(u64, ia64_mca_per_cpu_pte); /* PTE to map per-CPU area */
92DEFINE_PER_CPU(u64, ia64_mca_pal_pte); /* PTE to map PAL code */
93DEFINE_PER_CPU(u64, ia64_mca_pal_base); /* vaddr PAL code granule */
94
95unsigned long __per_cpu_mca[NR_CPUS];
96
97/* In mca_asm.S */
98extern void ia64_monarch_init_handler (void);
99extern void ia64_slave_init_handler (void);
100
101static ia64_mc_info_t ia64_mc_info;
102
103#define MAX_CPE_POLL_INTERVAL (15*60*HZ) /* 15 minutes */
104#define MIN_CPE_POLL_INTERVAL (2*60*HZ) /* 2 minutes */
105#define CMC_POLL_INTERVAL (1*60*HZ) /* 1 minute */
106#define CPE_HISTORY_LENGTH 5
107#define CMC_HISTORY_LENGTH 5
108
109static struct timer_list cpe_poll_timer;
110static struct timer_list cmc_poll_timer;
111/*
112 * This variable tells whether we are currently in polling mode.
113 * Start with this in the wrong state so we won't play w/ timers
114 * before the system is ready.
115 */
116static int cmc_polling_enabled = 1;
117
118/*
119 * Clearing this variable prevents CPE polling from getting activated
120 * in mca_late_init. Use it if your system doesn't provide a CPEI,
121 * but encounters problems retrieving CPE logs. This should only be
122 * necessary for debugging.
123 */
124static int cpe_poll_enabled = 1;
125
126extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe);
127
128static int mca_init;
129
130/*
131 * IA64_MCA log support
132 */
133#define IA64_MAX_LOGS 2 /* Double-buffering for nested MCAs */
134#define IA64_MAX_LOG_TYPES 4 /* MCA, INIT, CMC, CPE */
135
136typedef struct ia64_state_log_s
137{
138 spinlock_t isl_lock;
139 int isl_index;
140 unsigned long isl_count;
141 ia64_err_rec_t *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */
142} ia64_state_log_t;
143
144static ia64_state_log_t ia64_state_log[IA64_MAX_LOG_TYPES];
145
146#define IA64_LOG_ALLOCATE(it, size) \
147 {ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)] = \
148 (ia64_err_rec_t *)alloc_bootmem(size); \
149 ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)] = \
150 (ia64_err_rec_t *)alloc_bootmem(size);}
151#define IA64_LOG_LOCK_INIT(it) spin_lock_init(&ia64_state_log[it].isl_lock)
152#define IA64_LOG_LOCK(it) spin_lock_irqsave(&ia64_state_log[it].isl_lock, s)
153#define IA64_LOG_UNLOCK(it) spin_unlock_irqrestore(&ia64_state_log[it].isl_lock,s)
154#define IA64_LOG_NEXT_INDEX(it) ia64_state_log[it].isl_index
155#define IA64_LOG_CURR_INDEX(it)    (1 - ia64_state_log[it].isl_index)
156#define IA64_LOG_INDEX_INC(it) \
157 {ia64_state_log[it].isl_index = 1 - ia64_state_log[it].isl_index; \
158 ia64_state_log[it].isl_count++;}
159#define IA64_LOG_INDEX_DEC(it) \
160 ia64_state_log[it].isl_index = 1 - ia64_state_log[it].isl_index
161#define IA64_LOG_NEXT_BUFFER(it) (void *)((ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)]))
162#define IA64_LOG_CURR_BUFFER(it) (void *)((ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)]))
163#define IA64_LOG_COUNT(it) ia64_state_log[it].isl_count
164
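The macros above double-buffer each record type: isl_index names the buffer the next SAL record is read into, 1 - isl_index the one most recently filled, and IA64_LOG_INDEX_INC both flips the pair and bumps the count. A standalone illustration of the same flip (simplified, without the locking the real macros are used under):

	/* Simplified sketch of the double-buffer flip used by the IA64_LOG_* macros. */
	struct two_buf {
		int	next;		/* buffer the next record is read into */
		void	*buf[2];
	};

	static inline void *curr_buf(struct two_buf *b)	{ return b->buf[1 - b->next]; }
	static inline void *next_buf(struct two_buf *b)	{ return b->buf[b->next]; }
	static inline void  flip_bufs(struct two_buf *b)	{ b->next = 1 - b->next; }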
165/*
166 * ia64_log_init
167 * Reset the OS ia64 log buffer
168 * Inputs : info_type (SAL_INFO_TYPE_{MCA,INIT,CMC,CPE})
169 * Outputs : None
170 */
171static void
172ia64_log_init(int sal_info_type)
173{
174 u64 max_size = 0;
175
176 IA64_LOG_NEXT_INDEX(sal_info_type) = 0;
177 IA64_LOG_LOCK_INIT(sal_info_type);
178
179 // SAL will tell us the maximum size of any error record of this type
180 max_size = ia64_sal_get_state_info_size(sal_info_type);
181 if (!max_size)
182 /* alloc_bootmem() doesn't like zero-sized allocations! */
183 return;
184
185 // set up OS data structures to hold error info
186 IA64_LOG_ALLOCATE(sal_info_type, max_size);
187 memset(IA64_LOG_CURR_BUFFER(sal_info_type), 0, max_size);
188 memset(IA64_LOG_NEXT_BUFFER(sal_info_type), 0, max_size);
189}
190
191/*
192 * ia64_log_get
193 *
194 * Get the current MCA log from SAL and copy it into the OS log buffer.
195 *
196 * Inputs : info_type (SAL_INFO_TYPE_{MCA,INIT,CMC,CPE})
197 * irq_safe whether you can use printk at this point
198 * Outputs : size (total record length)
199 * *buffer (ptr to error record)
200 *
201 */
202static u64
203ia64_log_get(int sal_info_type, u8 **buffer, int irq_safe)
204{
205 sal_log_record_header_t *log_buffer;
206 u64 total_len = 0;
207 int s;
208
209 IA64_LOG_LOCK(sal_info_type);
210
211 /* Get the process state information */
212 log_buffer = IA64_LOG_NEXT_BUFFER(sal_info_type);
213
214 total_len = ia64_sal_get_state_info(sal_info_type, (u64 *)log_buffer);
215
216 if (total_len) {
217 IA64_LOG_INDEX_INC(sal_info_type);
218 IA64_LOG_UNLOCK(sal_info_type);
219 if (irq_safe) {
220 IA64_MCA_DEBUG("%s: SAL error record type %d retrieved. "
221 "Record length = %ld\n", __FUNCTION__, sal_info_type, total_len);
222 }
223 *buffer = (u8 *) log_buffer;
224 return total_len;
225 } else {
226 IA64_LOG_UNLOCK(sal_info_type);
227 return 0;
228 }
229}
230
231/*
232 * ia64_mca_log_sal_error_record
233 *
234 * This function retrieves a specified error record type from SAL
235 * and wakes up any processes waiting for error records.
236 *
237 * Inputs : sal_info_type (Type of error record MCA/CMC/CPE/INIT)
238 */
239static void
240ia64_mca_log_sal_error_record(int sal_info_type)
241{
242 u8 *buffer;
243 sal_log_record_header_t *rh;
244 u64 size;
245 int irq_safe = sal_info_type != SAL_INFO_TYPE_MCA && sal_info_type != SAL_INFO_TYPE_INIT;
246#ifdef IA64_MCA_DEBUG_INFO
247 static const char * const rec_name[] = { "MCA", "INIT", "CMC", "CPE" };
248#endif
249
250 size = ia64_log_get(sal_info_type, &buffer, irq_safe);
251 if (!size)
252 return;
253
254 salinfo_log_wakeup(sal_info_type, buffer, size, irq_safe);
255
256 if (irq_safe)
257 IA64_MCA_DEBUG("CPU %d: SAL log contains %s error record\n",
258 smp_processor_id(),
259 sal_info_type < ARRAY_SIZE(rec_name) ? rec_name[sal_info_type] : "UNKNOWN");
260
261 /* Clear logs from corrected errors in case there's no user-level logger */
262 rh = (sal_log_record_header_t *)buffer;
263 if (rh->severity == sal_log_severity_corrected)
264 ia64_sal_clear_state_info(sal_info_type);
265}
266
267/*
268 * platform dependent error handling
269 */
270#ifndef PLATFORM_MCA_HANDLERS
271
272#ifdef CONFIG_ACPI
273
274static int cpe_vector = -1;
275
276static irqreturn_t
277ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs)
278{
279 static unsigned long cpe_history[CPE_HISTORY_LENGTH];
280 static int index;
281 static DEFINE_SPINLOCK(cpe_history_lock);
282
283 IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
284 __FUNCTION__, cpe_irq, smp_processor_id());
285
286 /* SAL spec states this should run w/ interrupts enabled */
287 local_irq_enable();
288
289 /* Get the CPE error record and log it */
290 ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
291
292 spin_lock(&cpe_history_lock);
293 if (!cpe_poll_enabled && cpe_vector >= 0) {
294
295 int i, count = 1; /* we know 1 happened now */
296 unsigned long now = jiffies;
297
298 for (i = 0; i < CPE_HISTORY_LENGTH; i++) {
299 if (now - cpe_history[i] <= HZ)
300 count++;
301 }
302
303 IA64_MCA_DEBUG(KERN_INFO "CPE threshold %d/%d\n", count, CPE_HISTORY_LENGTH);
304 if (count >= CPE_HISTORY_LENGTH) {
305
306 cpe_poll_enabled = 1;
307 spin_unlock(&cpe_history_lock);
308 disable_irq_nosync(local_vector_to_irq(IA64_CPE_VECTOR));
309
310 /*
311 * Corrected errors will still be corrected, but
312 * make sure there's a log somewhere that indicates
313 * something is generating more than we can handle.
314 */
315 printk(KERN_WARNING "WARNING: Switching to polling CPE handler; error records may be lost\n");
316
317 mod_timer(&cpe_poll_timer, jiffies + MIN_CPE_POLL_INTERVAL);
318
319 /* lock already released, get out now */
320 return IRQ_HANDLED;
321 } else {
322 cpe_history[index++] = now;
323 if (index == CPE_HISTORY_LENGTH)
324 index = 0;
325 }
326 }
327 spin_unlock(&cpe_history_lock);
328 return IRQ_HANDLED;
329}
330
331#endif /* CONFIG_ACPI */
332
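Both the CPE handler above and the CMC handler below use the same throttle: remember the timestamps of the last few interrupts and, if the current one is the Nth within one second, stop taking interrupts and fall back to the poll timer. A standalone sketch of that check (names and the free-standing form are illustrative; the real code keeps the history in static arrays under a spinlock):

	/* Sketch of the "N interrupts within one second" storm check. */
	#define HIST_LEN	5	/* CPE_HISTORY_LENGTH / CMC_HISTORY_LENGTH */

	static int interrupt_storm(unsigned long *history, int *idx, unsigned long now)
	{
		int i, count = 1;		/* the interrupt being handled */

		for (i = 0; i < HIST_LEN; i++)
			if (now - history[i] <= HZ)
				count++;
		if (count >= HIST_LEN)
			return 1;		/* caller switches to polling */

		history[(*idx)++] = now;
		if (*idx == HIST_LEN)
			*idx = 0;
		return 0;
	}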
333static void
334show_min_state (pal_min_state_area_t *minstate)
335{
336 u64 iip = minstate->pmsa_iip + ((struct ia64_psr *)(&minstate->pmsa_ipsr))->ri;
337 u64 xip = minstate->pmsa_xip + ((struct ia64_psr *)(&minstate->pmsa_xpsr))->ri;
338
339 printk("NaT bits\t%016lx\n", minstate->pmsa_nat_bits);
340 printk("pr\t\t%016lx\n", minstate->pmsa_pr);
341 printk("b0\t\t%016lx ", minstate->pmsa_br0); print_symbol("%s\n", minstate->pmsa_br0);
342 printk("ar.rsc\t\t%016lx\n", minstate->pmsa_rsc);
343 printk("cr.iip\t\t%016lx ", iip); print_symbol("%s\n", iip);
344 printk("cr.ipsr\t\t%016lx\n", minstate->pmsa_ipsr);
345 printk("cr.ifs\t\t%016lx\n", minstate->pmsa_ifs);
346 printk("xip\t\t%016lx ", xip); print_symbol("%s\n", xip);
347 printk("xpsr\t\t%016lx\n", minstate->pmsa_xpsr);
348 printk("xfs\t\t%016lx\n", minstate->pmsa_xfs);
349 printk("b1\t\t%016lx ", minstate->pmsa_br1);
350 print_symbol("%s\n", minstate->pmsa_br1);
351
352 printk("\nstatic registers r0-r15:\n");
353 printk(" r0- 3 %016lx %016lx %016lx %016lx\n",
354 0UL, minstate->pmsa_gr[0], minstate->pmsa_gr[1], minstate->pmsa_gr[2]);
355 printk(" r4- 7 %016lx %016lx %016lx %016lx\n",
356 minstate->pmsa_gr[3], minstate->pmsa_gr[4],
357 minstate->pmsa_gr[5], minstate->pmsa_gr[6]);
358 printk(" r8-11 %016lx %016lx %016lx %016lx\n",
359 minstate->pmsa_gr[7], minstate->pmsa_gr[8],
360 minstate->pmsa_gr[9], minstate->pmsa_gr[10]);
361 printk("r12-15 %016lx %016lx %016lx %016lx\n",
362 minstate->pmsa_gr[11], minstate->pmsa_gr[12],
363 minstate->pmsa_gr[13], minstate->pmsa_gr[14]);
364
365 printk("\nbank 0:\n");
366 printk("r16-19 %016lx %016lx %016lx %016lx\n",
367 minstate->pmsa_bank0_gr[0], minstate->pmsa_bank0_gr[1],
368 minstate->pmsa_bank0_gr[2], minstate->pmsa_bank0_gr[3]);
369 printk("r20-23 %016lx %016lx %016lx %016lx\n",
370 minstate->pmsa_bank0_gr[4], minstate->pmsa_bank0_gr[5],
371 minstate->pmsa_bank0_gr[6], minstate->pmsa_bank0_gr[7]);
372 printk("r24-27 %016lx %016lx %016lx %016lx\n",
373 minstate->pmsa_bank0_gr[8], minstate->pmsa_bank0_gr[9],
374 minstate->pmsa_bank0_gr[10], minstate->pmsa_bank0_gr[11]);
375 printk("r28-31 %016lx %016lx %016lx %016lx\n",
376 minstate->pmsa_bank0_gr[12], minstate->pmsa_bank0_gr[13],
377 minstate->pmsa_bank0_gr[14], minstate->pmsa_bank0_gr[15]);
378
379 printk("\nbank 1:\n");
380 printk("r16-19 %016lx %016lx %016lx %016lx\n",
381 minstate->pmsa_bank1_gr[0], minstate->pmsa_bank1_gr[1],
382 minstate->pmsa_bank1_gr[2], minstate->pmsa_bank1_gr[3]);
383 printk("r20-23 %016lx %016lx %016lx %016lx\n",
384 minstate->pmsa_bank1_gr[4], minstate->pmsa_bank1_gr[5],
385 minstate->pmsa_bank1_gr[6], minstate->pmsa_bank1_gr[7]);
386 printk("r24-27 %016lx %016lx %016lx %016lx\n",
387 minstate->pmsa_bank1_gr[8], minstate->pmsa_bank1_gr[9],
388 minstate->pmsa_bank1_gr[10], minstate->pmsa_bank1_gr[11]);
389 printk("r28-31 %016lx %016lx %016lx %016lx\n",
390 minstate->pmsa_bank1_gr[12], minstate->pmsa_bank1_gr[13],
391 minstate->pmsa_bank1_gr[14], minstate->pmsa_bank1_gr[15]);
392}
393
394static void
395fetch_min_state (pal_min_state_area_t *ms, struct pt_regs *pt, struct switch_stack *sw)
396{
397 u64 *dst_banked, *src_banked, bit, shift, nat_bits;
398 int i;
399
400 /*
401 * First, update the pt-regs and switch-stack structures with the contents stored
402 * in the min-state area:
403 */
404 if (((struct ia64_psr *) &ms->pmsa_ipsr)->ic == 0) {
405 pt->cr_ipsr = ms->pmsa_xpsr;
406 pt->cr_iip = ms->pmsa_xip;
407 pt->cr_ifs = ms->pmsa_xfs;
408 } else {
409 pt->cr_ipsr = ms->pmsa_ipsr;
410 pt->cr_iip = ms->pmsa_iip;
411 pt->cr_ifs = ms->pmsa_ifs;
412 }
413 pt->ar_rsc = ms->pmsa_rsc;
414 pt->pr = ms->pmsa_pr;
415 pt->r1 = ms->pmsa_gr[0];
416 pt->r2 = ms->pmsa_gr[1];
417 pt->r3 = ms->pmsa_gr[2];
418 sw->r4 = ms->pmsa_gr[3];
419 sw->r5 = ms->pmsa_gr[4];
420 sw->r6 = ms->pmsa_gr[5];
421 sw->r7 = ms->pmsa_gr[6];
422 pt->r8 = ms->pmsa_gr[7];
423 pt->r9 = ms->pmsa_gr[8];
424 pt->r10 = ms->pmsa_gr[9];
425 pt->r11 = ms->pmsa_gr[10];
426 pt->r12 = ms->pmsa_gr[11];
427 pt->r13 = ms->pmsa_gr[12];
428 pt->r14 = ms->pmsa_gr[13];
429 pt->r15 = ms->pmsa_gr[14];
430 dst_banked = &pt->r16; /* r16-r31 are contiguous in struct pt_regs */
431 src_banked = ms->pmsa_bank1_gr;
432 for (i = 0; i < 16; ++i)
433 dst_banked[i] = src_banked[i];
434 pt->b0 = ms->pmsa_br0;
435 sw->b1 = ms->pmsa_br1;
436
437 /* construct the NaT bits for the pt-regs structure: */
438# define PUT_NAT_BIT(dst, addr) \
439 do { \
440 bit = nat_bits & 1; nat_bits >>= 1; \
441 shift = ((unsigned long) addr >> 3) & 0x3f; \
442 dst = ((dst) & ~(1UL << shift)) | (bit << shift); \
443 } while (0)
444
445 /* Rotate the saved NaT bits such that bit 0 corresponds to pmsa_gr[0]: */
446 shift = ((unsigned long) &ms->pmsa_gr[0] >> 3) & 0x3f;
447 nat_bits = (ms->pmsa_nat_bits >> shift) | (ms->pmsa_nat_bits << (64 - shift));
448
449 PUT_NAT_BIT(sw->caller_unat, &pt->r1);
450 PUT_NAT_BIT(sw->caller_unat, &pt->r2);
451 PUT_NAT_BIT(sw->caller_unat, &pt->r3);
452 PUT_NAT_BIT(sw->ar_unat, &sw->r4);
453 PUT_NAT_BIT(sw->ar_unat, &sw->r5);
454 PUT_NAT_BIT(sw->ar_unat, &sw->r6);
455 PUT_NAT_BIT(sw->ar_unat, &sw->r7);
456 PUT_NAT_BIT(sw->caller_unat, &pt->r8); PUT_NAT_BIT(sw->caller_unat, &pt->r9);
457 PUT_NAT_BIT(sw->caller_unat, &pt->r10); PUT_NAT_BIT(sw->caller_unat, &pt->r11);
458 PUT_NAT_BIT(sw->caller_unat, &pt->r12); PUT_NAT_BIT(sw->caller_unat, &pt->r13);
459 PUT_NAT_BIT(sw->caller_unat, &pt->r14); PUT_NAT_BIT(sw->caller_unat, &pt->r15);
460 nat_bits >>= 16; /* skip over bank0 NaT bits */
461 PUT_NAT_BIT(sw->caller_unat, &pt->r16); PUT_NAT_BIT(sw->caller_unat, &pt->r17);
462 PUT_NAT_BIT(sw->caller_unat, &pt->r18); PUT_NAT_BIT(sw->caller_unat, &pt->r19);
463 PUT_NAT_BIT(sw->caller_unat, &pt->r20); PUT_NAT_BIT(sw->caller_unat, &pt->r21);
464 PUT_NAT_BIT(sw->caller_unat, &pt->r22); PUT_NAT_BIT(sw->caller_unat, &pt->r23);
465 PUT_NAT_BIT(sw->caller_unat, &pt->r24); PUT_NAT_BIT(sw->caller_unat, &pt->r25);
466 PUT_NAT_BIT(sw->caller_unat, &pt->r26); PUT_NAT_BIT(sw->caller_unat, &pt->r27);
467 PUT_NAT_BIT(sw->caller_unat, &pt->r28); PUT_NAT_BIT(sw->caller_unat, &pt->r29);
468 PUT_NAT_BIT(sw->caller_unat, &pt->r30); PUT_NAT_BIT(sw->caller_unat, &pt->r31);
469}
470
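The rotation at the top of the PUT_NAT_BIT sequence lines the saved NaT bits up with pmsa_gr[0]: NaT bits are indexed by UNAT position, i.e. bits 8..3 of the register's address, so the word is rotated right by that amount. A sketch of just the rotation (with an explicit guard for a zero shift, which plain C shifts cannot express):

	/* Sketch of the NaT-bit rotation performed in fetch_min_state(). */
	static unsigned long align_nat_bits(unsigned long nat_bits, const void *pmsa_gr0)
	{
		unsigned long shift = ((unsigned long) pmsa_gr0 >> 3) & 0x3f;

		if (!shift)
			return nat_bits;
		return (nat_bits >> shift) | (nat_bits << (64 - shift));
	}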
471static void
472init_handler_platform (pal_min_state_area_t *ms,
473 struct pt_regs *pt, struct switch_stack *sw)
474{
475 struct unw_frame_info info;
476
477 /* if a kernel debugger is available call it here else just dump the registers */
478
479 /*
480	 * Wait for a bit.  On some machines (e.g., HP's zx2000 and zx6000), INIT can be
481 * generated via the BMC's command-line interface, but since the console is on the
482 * same serial line, the user will need some time to switch out of the BMC before
483 * the dump begins.
484 */
485 printk("Delaying for 5 seconds...\n");
486 udelay(5*1000000);
487 show_min_state(ms);
488
489 printk("Backtrace of current task (pid %d, %s)\n", current->pid, current->comm);
490 fetch_min_state(ms, pt, sw);
491 unw_init_from_interruption(&info, current, pt, sw);
492 ia64_do_show_stack(&info, NULL);
493
494#ifdef CONFIG_SMP
495 /* read_trylock() would be handy... */
496 if (!tasklist_lock.write_lock)
497 read_lock(&tasklist_lock);
498#endif
499 {
500 struct task_struct *g, *t;
501 do_each_thread (g, t) {
502 if (t == current)
503 continue;
504
505 printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm);
506 show_stack(t, NULL);
507 } while_each_thread (g, t);
508 }
509#ifdef CONFIG_SMP
510 if (!tasklist_lock.write_lock)
511 read_unlock(&tasklist_lock);
512#endif
513
514 printk("\nINIT dump complete. Please reboot now.\n");
515 while (1); /* hang city if no debugger */
516}
517
518#ifdef CONFIG_ACPI
519/*
520 * ia64_mca_register_cpev
521 *
522 * Register the corrected platform error vector with SAL.
523 *
524 * Inputs
525 * cpev Corrected Platform Error Vector number
526 *
527 * Outputs
528 * None
529 */
530static void
531ia64_mca_register_cpev (int cpev)
532{
533 /* Register the CPE interrupt vector with SAL */
534 struct ia64_sal_retval isrv;
535
536 isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_CPE_INT, SAL_MC_PARAM_MECHANISM_INT, cpev, 0, 0);
537 if (isrv.status) {
538 printk(KERN_ERR "Failed to register Corrected Platform "
539 "Error interrupt vector with SAL (status %ld)\n", isrv.status);
540 return;
541 }
542
543 IA64_MCA_DEBUG("%s: corrected platform error "
544 "vector %#x registered\n", __FUNCTION__, cpev);
545}
546#endif /* CONFIG_ACPI */
547
548#endif /* PLATFORM_MCA_HANDLERS */
549
550/*
551 * ia64_mca_cmc_vector_setup
552 *
553 * Set up the corrected machine check vector register in the processor.
554 * (The interrupt is masked on boot; ia64_mca_late_init() unmasks it.)
555 * This function is invoked on a per-processor basis.
556 *
557 * Inputs
558 * None
559 *
560 * Outputs
561 * None
562 */
563void
564ia64_mca_cmc_vector_setup (void)
565{
566 cmcv_reg_t cmcv;
567
568 cmcv.cmcv_regval = 0;
569 cmcv.cmcv_mask = 1; /* Mask/disable interrupt at first */
570 cmcv.cmcv_vector = IA64_CMC_VECTOR;
571 ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval);
572
573 IA64_MCA_DEBUG("%s: CPU %d corrected "
574 "machine check vector %#x registered.\n",
575 __FUNCTION__, smp_processor_id(), IA64_CMC_VECTOR);
576
577 IA64_MCA_DEBUG("%s: CPU %d CMCV = %#016lx\n",
578 __FUNCTION__, smp_processor_id(), ia64_getreg(_IA64_REG_CR_CMCV));
579}
580
581/*
582 * ia64_mca_cmc_vector_disable
583 *
584 * Mask the corrected machine check vector register in the processor.
585 * This function is invoked on a per-processor basis.
586 *
587 * Inputs
588 * dummy(unused)
589 *
590 * Outputs
591 * None
592 */
593static void
594ia64_mca_cmc_vector_disable (void *dummy)
595{
596 cmcv_reg_t cmcv;
597
598 cmcv.cmcv_regval = ia64_getreg(_IA64_REG_CR_CMCV);
599
600 cmcv.cmcv_mask = 1; /* Mask/disable interrupt */
601 ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval);
602
603 IA64_MCA_DEBUG("%s: CPU %d corrected "
604 "machine check vector %#x disabled.\n",
605 __FUNCTION__, smp_processor_id(), cmcv.cmcv_vector);
606}
607
608/*
609 * ia64_mca_cmc_vector_enable
610 *
611 * Unmask the corrected machine check vector register in the processor.
612 * This function is invoked on a per-processor basis.
613 *
614 * Inputs
615 * dummy(unused)
616 *
617 * Outputs
618 * None
619 */
620static void
621ia64_mca_cmc_vector_enable (void *dummy)
622{
623 cmcv_reg_t cmcv;
624
625 cmcv.cmcv_regval = ia64_getreg(_IA64_REG_CR_CMCV);
626
627 cmcv.cmcv_mask = 0; /* Unmask/enable interrupt */
628 ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval);
629
630 IA64_MCA_DEBUG("%s: CPU %d corrected "
631 "machine check vector %#x enabled.\n",
632 __FUNCTION__, smp_processor_id(), cmcv.cmcv_vector);
633}
634
635/*
636 * ia64_mca_cmc_vector_disable_keventd
637 *
638 * Called via keventd (smp_call_function() is not safe in interrupt context) to
639 * disable the cmc interrupt vector.
640 */
641static void
642ia64_mca_cmc_vector_disable_keventd(void *unused)
643{
644 on_each_cpu(ia64_mca_cmc_vector_disable, NULL, 1, 0);
645}
646
647/*
648 * ia64_mca_cmc_vector_enable_keventd
649 *
650 * Called via keventd (smp_call_function() is not safe in interrupt context) to
651 * enable the cmc interrupt vector.
652 */
653static void
654ia64_mca_cmc_vector_enable_keventd(void *unused)
655{
656 on_each_cpu(ia64_mca_cmc_vector_enable, NULL, 1, 0);
657}
658
659/*
660 * ia64_mca_wakeup_ipi_wait
661 *
662 * Wait for the inter-cpu interrupt to be sent by the
663 * monarch processor once it is done with handling the
664 * MCA.
665 *
666 * Inputs : None
667 * Outputs : None
668 */
669static void
670ia64_mca_wakeup_ipi_wait(void)
671{
672 int irr_num = (IA64_MCA_WAKEUP_VECTOR >> 6);
673 int irr_bit = (IA64_MCA_WAKEUP_VECTOR & 0x3f);
674 u64 irr = 0;
675
676 do {
677 switch(irr_num) {
678 case 0:
679 irr = ia64_getreg(_IA64_REG_CR_IRR0);
680 break;
681 case 1:
682 irr = ia64_getreg(_IA64_REG_CR_IRR1);
683 break;
684 case 2:
685 irr = ia64_getreg(_IA64_REG_CR_IRR2);
686 break;
687 case 3:
688 irr = ia64_getreg(_IA64_REG_CR_IRR3);
689 break;
690 }
691 cpu_relax();
692 } while (!(irr & (1UL << irr_bit))) ;
693}
694
695/*
696 * ia64_mca_wakeup
697 *
698 * Send an inter-cpu interrupt to wake up a particular cpu
699 * and mark that cpu as out of rendezvous.
700 *
701 * Inputs : cpuid
702 * Outputs : None
703 */
704static void
705ia64_mca_wakeup(int cpu)
706{
707 platform_send_ipi(cpu, IA64_MCA_WAKEUP_VECTOR, IA64_IPI_DM_INT, 0);
708 ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
709
710}
711
712/*
713 * ia64_mca_wakeup_all
714 *
715 * Wake up all the cpus which have previously rendezvoused.
716 *
717 * Inputs : None
718 * Outputs : None
719 */
720static void
721ia64_mca_wakeup_all(void)
722{
723 int cpu;
724
725 /* Clear the Rendez checkin flag for all cpus */
726 for(cpu = 0; cpu < NR_CPUS; cpu++) {
727 if (!cpu_online(cpu))
728 continue;
729 if (ia64_mc_info.imi_rendez_checkin[cpu] == IA64_MCA_RENDEZ_CHECKIN_DONE)
730 ia64_mca_wakeup(cpu);
731 }
732
733}
734
735/*
736 * ia64_mca_rendez_interrupt_handler
737 *
738 * This is the handler used to put slave processors into a spin loop
739 * while the monarch processor does the MCA handling; each slave is
740 * woken up once the monarch is done.
741 *
742 * Inputs : None
743 * Outputs : None
744 */
745static irqreturn_t
746ia64_mca_rendez_int_handler(int rendez_irq, void *arg, struct pt_regs *ptregs)
747{
748 unsigned long flags;
749 int cpu = smp_processor_id();
750
751 /* Mask all interrupts */
752 local_irq_save(flags);
753
754 ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_DONE;
755 /* Register with the SAL monarch that the slave has
756 * reached SAL
757 */
758 ia64_sal_mc_rendez();
759
760 /* Wait for the wakeup IPI from the monarch
761 * This waiting is done by polling on the wakeup-interrupt
762 * vector bit in the processor's IRRs
763 */
764 ia64_mca_wakeup_ipi_wait();
765
766 /* Enable all interrupts */
767 local_irq_restore(flags);
768 return IRQ_HANDLED;
769}
770
771/*
772 * ia64_mca_wakeup_int_handler
773 *
774 * The interrupt handler for processing the inter-cpu interrupt to the
775 * slave cpu which was spinning in the rendez loop.
776 * Since this spinning is done by turning off the interrupts and
777 * polling on the wakeup-interrupt bit in the IRR, there is
778 * nothing useful to be done in the handler.
779 *
780 * Inputs : wakeup_irq (Wakeup-interrupt bit)
781 * arg (Interrupt handler specific argument)
782 * ptregs (Exception frame at the time of the interrupt)
783 * Outputs : None
784 *
785 */
786static irqreturn_t
787ia64_mca_wakeup_int_handler(int wakeup_irq, void *arg, struct pt_regs *ptregs)
788{
789 return IRQ_HANDLED;
790}
791
792/*
793 * ia64_return_to_sal_check
794 *
795 * This function is called before returning from the OS_MCA handler
796 * to the OS_MCA dispatch code, which finally hands control back
797 * to SAL.
798 * Its main purpose is to set up the OS_MCA-to-SAL return state,
799 * which the OS_MCA dispatch code uses just before going back
800 * to SAL.
801 *
802 * Inputs : None
803 * Outputs : None
804 */
805
806static void
807ia64_return_to_sal_check(int recover)
808{
809
810	/* Copy the relevant fields from the SAL-to-OS handoff state
811	 * so that they can be used at the time of the OS-to-SAL handoff
812 */
813 ia64_os_to_sal_handoff_state.imots_sal_gp =
814 ia64_sal_to_os_handoff_state.imsto_sal_gp;
815
816 ia64_os_to_sal_handoff_state.imots_sal_check_ra =
817 ia64_sal_to_os_handoff_state.imsto_sal_check_ra;
818
819 if (recover)
820 ia64_os_to_sal_handoff_state.imots_os_status = IA64_MCA_CORRECTED;
821 else
822 ia64_os_to_sal_handoff_state.imots_os_status = IA64_MCA_COLD_BOOT;
823
824 /* Default = tell SAL to return to same context */
825 ia64_os_to_sal_handoff_state.imots_context = IA64_MCA_SAME_CONTEXT;
826
827 ia64_os_to_sal_handoff_state.imots_new_min_state =
828 (u64 *)ia64_sal_to_os_handoff_state.pal_min_state;
829
830}
831
832/* Function pointer for extra MCA recovery */
833int (*ia64_mca_ucmc_extension)
834 (void*,ia64_mca_sal_to_os_state_t*,ia64_mca_os_to_sal_state_t*)
835 = NULL;
836
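/*
 * Register/unregister an extra recovery hook; ia64_mca_ucmc_handler calls
 * it after logging an MCA.  Only one extension can be registered at a time.
 * A recovery driver would use it roughly like this (a sketch only -- see
 * mca_drv.c later in this patch for the real in-tree user; my_recover_fn
 * is a hypothetical callback matching the ia64_mca_ucmc_extension
 * prototype above, returning non-zero when it managed to recover):
 *
 *	if (ia64_reg_MCA_extension(my_recover_fn))
 *		return -EBUSY;		(someone else got there first)
 *	...
 *	ia64_unreg_MCA_extension();
 */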
837int
838ia64_reg_MCA_extension(void *fn)
839{
840 if (ia64_mca_ucmc_extension)
841 return 1;
842
843 ia64_mca_ucmc_extension = fn;
844 return 0;
845}
846
847void
848ia64_unreg_MCA_extension(void)
849{
850 if (ia64_mca_ucmc_extension)
851 ia64_mca_ucmc_extension = NULL;
852}
853
854EXPORT_SYMBOL(ia64_reg_MCA_extension);
855EXPORT_SYMBOL(ia64_unreg_MCA_extension);
856
857/*
858 * ia64_mca_ucmc_handler
859 *
860 * This is the uncorrectable machine check handler, called from the
861 * OS_MCA dispatch code, which is in turn called from SAL_CHECK().
862 * This is where the core of OS MCA handling is done.
863 * Right now the logs are extracted and displayed in a well-defined
864 * format. This handler code is supposed to run only on the
865 * monarch processor. Once the monarch is done with MCA handling,
866 * further MCA logging is enabled by clearing the logs.
867 * The monarch also has the duty of sending wakeup IPIs to pull the
868 * slave processors out of the rendezvous spinloop.
869 *
870 * Inputs : None
871 * Outputs : None
872 */
873void
874ia64_mca_ucmc_handler(void)
875{
876 pal_processor_state_info_t *psp = (pal_processor_state_info_t *)
877 &ia64_sal_to_os_handoff_state.proc_state_param;
878 int recover;
879
880 /* Get the MCA error record and log it */
881 ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA);
882
883	/* A TLB error is the only error present in this SAL error record */
884 recover = (psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc))
885 /* other error recovery */
886 || (ia64_mca_ucmc_extension
887 && ia64_mca_ucmc_extension(
888 IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA),
889 &ia64_sal_to_os_handoff_state,
890 &ia64_os_to_sal_handoff_state));
891
892 if (recover) {
893 sal_log_record_header_t *rh = IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA);
894 rh->severity = sal_log_severity_corrected;
895 ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA);
896 }
897 /*
898 * Wakeup all the processors which are spinning in the rendezvous
899 * loop.
900 */
901 ia64_mca_wakeup_all();
902
903 /* Return to SAL */
904 ia64_return_to_sal_check(recover);
905}
906
907static DECLARE_WORK(cmc_disable_work, ia64_mca_cmc_vector_disable_keventd, NULL);
908static DECLARE_WORK(cmc_enable_work, ia64_mca_cmc_vector_enable_keventd, NULL);
909
910/*
911 * ia64_mca_cmc_int_handler
912 *
913 * This is the corrected machine check interrupt handler.
914 * Right now the logs are extracted and displayed in a well-defined
915 * format.
916 *
917 * Inputs
918 * interrupt number
919 * client data arg ptr
920 * saved registers ptr
921 *
922 * Outputs
923 * None
924 */
925static irqreturn_t
926ia64_mca_cmc_int_handler(int cmc_irq, void *arg, struct pt_regs *ptregs)
927{
928 static unsigned long cmc_history[CMC_HISTORY_LENGTH];
929 static int index;
930 static DEFINE_SPINLOCK(cmc_history_lock);
931
932 IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
933 __FUNCTION__, cmc_irq, smp_processor_id());
934
935 /* SAL spec states this should run w/ interrupts enabled */
936 local_irq_enable();
937
938 /* Get the CMC error record and log it */
939 ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CMC);
940
941 spin_lock(&cmc_history_lock);
942 if (!cmc_polling_enabled) {
943 int i, count = 1; /* we know 1 happened now */
944 unsigned long now = jiffies;
945
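		/*
		 * Flood control: count how many CMCs landed within the last
		 * second (HZ jiffies) using the cmc_history ring of
		 * timestamps.  Once CMC_HISTORY_LENGTH of them fall inside
		 * that window, give up on interrupts and switch to the
		 * polling handler via cmc_disable_work.
		 */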
946 for (i = 0; i < CMC_HISTORY_LENGTH; i++) {
947 if (now - cmc_history[i] <= HZ)
948 count++;
949 }
950
951 IA64_MCA_DEBUG(KERN_INFO "CMC threshold %d/%d\n", count, CMC_HISTORY_LENGTH);
952 if (count >= CMC_HISTORY_LENGTH) {
953
954 cmc_polling_enabled = 1;
955 spin_unlock(&cmc_history_lock);
956 schedule_work(&cmc_disable_work);
957
958 /*
959 * Corrected errors will still be corrected, but
960 * make sure there's a log somewhere that indicates
961 * something is generating more than we can handle.
962 */
963 printk(KERN_WARNING "WARNING: Switching to polling CMC handler; error records may be lost\n");
964
965 mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL);
966
967 /* lock already released, get out now */
968 return IRQ_HANDLED;
969 } else {
970 cmc_history[index++] = now;
971 if (index == CMC_HISTORY_LENGTH)
972 index = 0;
973 }
974 }
975 spin_unlock(&cmc_history_lock);
976 return IRQ_HANDLED;
977}
978
979/*
980 * ia64_mca_cmc_int_caller
981 *
982 * Triggered by sw interrupt from CMC polling routine. Calls
983 * real interrupt handler and either triggers a sw interrupt
984 * on the next cpu or does cleanup at the end.
985 *
986 * Inputs
987 * interrupt number
988 * client data arg ptr
989 * saved registers ptr
990 * Outputs
991 * handled
992 */
993static irqreturn_t
994ia64_mca_cmc_int_caller(int cmc_irq, void *arg, struct pt_regs *ptregs)
995{
996 static int start_count = -1;
997 unsigned int cpuid;
998
999 cpuid = smp_processor_id();
1000
1001 /* If first cpu, update count */
1002 if (start_count == -1)
1003 start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CMC);
1004
1005 ia64_mca_cmc_int_handler(cmc_irq, arg, ptregs);
1006
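	/*
	 * Cascade the poll: find the next online cpu and forward the CMCP
	 * IPI to it.  The last cpu in the chain decides whether to stay in
	 * polling mode (re-arm cmc_poll_timer) or hand control back to the
	 * interrupt-driven handler via cmc_enable_work.
	 */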
1007 for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++);
1008
1009 if (cpuid < NR_CPUS) {
1010 platform_send_ipi(cpuid, IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0);
1011 } else {
1012 /* If no log record, switch out of polling mode */
1013 if (start_count == IA64_LOG_COUNT(SAL_INFO_TYPE_CMC)) {
1014
1015 printk(KERN_WARNING "Returning to interrupt driven CMC handler\n");
1016 schedule_work(&cmc_enable_work);
1017 cmc_polling_enabled = 0;
1018
1019 } else {
1020
1021 mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL);
1022 }
1023
1024 start_count = -1;
1025 }
1026
1027 return IRQ_HANDLED;
1028}
1029
1030/*
1031 * ia64_mca_cmc_poll
1032 *
1033 * Poll for Corrected Machine Checks (CMCs)
1034 *
1035 * Inputs : dummy(unused)
1036 * Outputs : None
1037 *
1038 */
1039static void
1040ia64_mca_cmc_poll (unsigned long dummy)
1041{
1042 /* Trigger a CMC interrupt cascade */
1043 platform_send_ipi(first_cpu(cpu_online_map), IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0);
1044}
1045
1046/*
1047 * ia64_mca_cpe_int_caller
1048 *
1049 * Triggered by sw interrupt from CPE polling routine. Calls
1050 * real interrupt handler and either triggers a sw interrupt
1051 * on the next cpu or does cleanup at the end.
1052 *
1053 * Inputs
1054 * interrupt number
1055 * client data arg ptr
1056 * saved registers ptr
1057 * Outputs
1058 * handled
1059 */
1060#ifdef CONFIG_ACPI
1061
1062static irqreturn_t
1063ia64_mca_cpe_int_caller(int cpe_irq, void *arg, struct pt_regs *ptregs)
1064{
1065 static int start_count = -1;
1066 static int poll_time = MIN_CPE_POLL_INTERVAL;
1067 unsigned int cpuid;
1068
1069 cpuid = smp_processor_id();
1070
1071 /* If first cpu, update count */
1072 if (start_count == -1)
1073 start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CPE);
1074
1075 ia64_mca_cpe_int_handler(cpe_irq, arg, ptregs);
1076
1077 for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++);
1078
1079 if (cpuid < NR_CPUS) {
1080 platform_send_ipi(cpuid, IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0);
1081 } else {
1082 /*
1083 * If a log was recorded, increase our polling frequency,
1084 * otherwise, backoff or return to interrupt mode.
1085 */
1086 if (start_count != IA64_LOG_COUNT(SAL_INFO_TYPE_CPE)) {
1087 poll_time = max(MIN_CPE_POLL_INTERVAL, poll_time / 2);
1088 } else if (cpe_vector < 0) {
1089 poll_time = min(MAX_CPE_POLL_INTERVAL, poll_time * 2);
1090 } else {
1091 poll_time = MIN_CPE_POLL_INTERVAL;
1092
1093 printk(KERN_WARNING "Returning to interrupt driven CPE handler\n");
1094 enable_irq(local_vector_to_irq(IA64_CPE_VECTOR));
1095 cpe_poll_enabled = 0;
1096 }
1097
1098 if (cpe_poll_enabled)
1099 mod_timer(&cpe_poll_timer, jiffies + poll_time);
1100 start_count = -1;
1101 }
1102
1103 return IRQ_HANDLED;
1104}
1105
1106#endif /* CONFIG_ACPI */
1107
1108/*
1109 * ia64_mca_cpe_poll
1110 *
1111 * Poll for Corrected Platform Errors (CPEs), trigger interrupt
1112 * on first cpu, from there it will trickle through all the cpus.
1113 *
1114 * Inputs : dummy(unused)
1115 * Outputs : None
1116 *
1117 */
1118static void
1119ia64_mca_cpe_poll (unsigned long dummy)
1120{
1121 /* Trigger a CPE interrupt cascade */
1122 platform_send_ipi(first_cpu(cpu_online_map), IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0);
1123}
1124
1125/*
1126 * C portion of the OS INIT handler
1127 *
1128 * Called from ia64_monarch_init_handler
1129 *
1130 * Inputs: pointer to pt_regs where processor info was saved.
1131 *
1132 * Returns:
1133 * 0 if SAL must warm boot the System
1134 * 1 if SAL must return to interrupted context using PAL_MC_RESUME
1135 *
1136 */
1137void
1138ia64_init_handler (struct pt_regs *pt, struct switch_stack *sw)
1139{
1140 pal_min_state_area_t *ms;
1141
1142 oops_in_progress = 1; /* avoid deadlock in printk, but it makes recovery dodgy */
1143 console_loglevel = 15; /* make sure printks make it to console */
1144
1145 printk(KERN_INFO "Entered OS INIT handler. PSP=%lx\n",
1146 ia64_sal_to_os_handoff_state.proc_state_param);
1147
1148 /*
1149 * Address of minstate area provided by PAL is physical,
1150 * uncacheable (bit 63 set). Convert to Linux virtual
1151 * address in region 6.
1152 */
1153 ms = (pal_min_state_area_t *)(ia64_sal_to_os_handoff_state.pal_min_state | (6ul<<61));
1154
1155 init_handler_platform(ms, pt, sw); /* call platform specific routines */
1156}
1157
1158static int __init
1159ia64_mca_disable_cpe_polling(char *str)
1160{
1161 cpe_poll_enabled = 0;
1162 return 1;
1163}
1164
1165__setup("disable_cpe_poll", ia64_mca_disable_cpe_polling);
1166
1167static struct irqaction cmci_irqaction = {
1168 .handler = ia64_mca_cmc_int_handler,
1169 .flags = SA_INTERRUPT,
1170 .name = "cmc_hndlr"
1171};
1172
1173static struct irqaction cmcp_irqaction = {
1174 .handler = ia64_mca_cmc_int_caller,
1175 .flags = SA_INTERRUPT,
1176 .name = "cmc_poll"
1177};
1178
1179static struct irqaction mca_rdzv_irqaction = {
1180 .handler = ia64_mca_rendez_int_handler,
1181 .flags = SA_INTERRUPT,
1182 .name = "mca_rdzv"
1183};
1184
1185static struct irqaction mca_wkup_irqaction = {
1186 .handler = ia64_mca_wakeup_int_handler,
1187 .flags = SA_INTERRUPT,
1188 .name = "mca_wkup"
1189};
1190
1191#ifdef CONFIG_ACPI
1192static struct irqaction mca_cpe_irqaction = {
1193 .handler = ia64_mca_cpe_int_handler,
1194 .flags = SA_INTERRUPT,
1195 .name = "cpe_hndlr"
1196};
1197
1198static struct irqaction mca_cpep_irqaction = {
1199 .handler = ia64_mca_cpe_int_caller,
1200 .flags = SA_INTERRUPT,
1201 .name = "cpe_poll"
1202};
1203#endif /* CONFIG_ACPI */
1204
1205/* Do per-CPU MCA-related initialization. */
1206
1207void __devinit
1208ia64_mca_cpu_init(void *cpu_data)
1209{
1210 void *pal_vaddr;
1211
1212 if (smp_processor_id() == 0) {
1213 void *mca_data;
1214 int cpu;
1215
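		/*
		 * The first (boot) cpu allocates one MCA save area per
		 * possible cpu out of bootmem and records each area's
		 * physical address in __per_cpu_mca[]; every cpu, including
		 * this one, then picks up its own entry below.
		 */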
1216 mca_data = alloc_bootmem(sizeof(struct ia64_mca_cpu)
1217 * NR_CPUS);
1218 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1219 __per_cpu_mca[cpu] = __pa(mca_data);
1220 mca_data += sizeof(struct ia64_mca_cpu);
1221 }
1222 }
1223
1224 /*
1225 * The MCA info structure was allocated earlier and its
1226 * physical address saved in __per_cpu_mca[cpu]. Copy that
1227	 * address to ia64_mca_data so we can access it as a per-CPU
1228 * variable.
1229 */
1230 __get_cpu_var(ia64_mca_data) = __per_cpu_mca[smp_processor_id()];
1231
1232 /*
1233 * Stash away a copy of the PTE needed to map the per-CPU page.
1234 * We may need it during MCA recovery.
1235 */
1236 __get_cpu_var(ia64_mca_per_cpu_pte) =
1237 pte_val(mk_pte_phys(__pa(cpu_data), PAGE_KERNEL));
1238
1239 /*
1240 * Also, stash away a copy of the PAL address and the PTE
1241 * needed to map it.
1242 */
1243 pal_vaddr = efi_get_pal_addr();
1244 if (!pal_vaddr)
1245 return;
1246 __get_cpu_var(ia64_mca_pal_base) =
1247 GRANULEROUNDDOWN((unsigned long) pal_vaddr);
1248 __get_cpu_var(ia64_mca_pal_pte) = pte_val(mk_pte_phys(__pa(pal_vaddr),
1249 PAGE_KERNEL));
1250}
1251
1252/*
1253 * ia64_mca_init
1254 *
1255 * Do all the system level mca specific initialization.
1256 *
1257 * 1. Register spinloop and wakeup request interrupt vectors
1258 *
1259 * 2. Register OS_MCA handler entry point
1260 *
1261 * 3. Register OS_INIT handler entry point
1262 *
1263 * 4. Initialize MCA/CMC/INIT related log buffers maintained by the OS.
1264 *
1265 * Note that this initialization is done very early before some kernel
1266 * services are available.
1267 *
1268 * Inputs : None
1269 *
1270 * Outputs : None
1271 */
1272void __init
1273ia64_mca_init(void)
1274{
1275 ia64_fptr_t *mon_init_ptr = (ia64_fptr_t *)ia64_monarch_init_handler;
1276 ia64_fptr_t *slave_init_ptr = (ia64_fptr_t *)ia64_slave_init_handler;
1277 ia64_fptr_t *mca_hldlr_ptr = (ia64_fptr_t *)ia64_os_mca_dispatch;
1278 int i;
1279 s64 rc;
1280 struct ia64_sal_retval isrv;
1281 u64 timeout = IA64_MCA_RENDEZ_TIMEOUT; /* platform specific */
1282
1283 IA64_MCA_DEBUG("%s: begin\n", __FUNCTION__);
1284
1285 /* Clear the Rendez checkin flag for all cpus */
1286 for(i = 0 ; i < NR_CPUS; i++)
1287 ia64_mc_info.imi_rendez_checkin[i] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
1288
1289 /*
1290 * Register the rendezvous spinloop and wakeup mechanism with SAL
1291 */
1292
1293 /* Register the rendezvous interrupt vector with SAL */
1294 while (1) {
1295 isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_RENDEZ_INT,
1296 SAL_MC_PARAM_MECHANISM_INT,
1297 IA64_MCA_RENDEZ_VECTOR,
1298 timeout,
1299 SAL_MC_PARAM_RZ_ALWAYS);
1300 rc = isrv.status;
1301 if (rc == 0)
1302 break;
1303 if (rc == -2) {
1304 printk(KERN_INFO "Increasing MCA rendezvous timeout from "
1305 "%ld to %ld milliseconds\n", timeout, isrv.v0);
1306 timeout = isrv.v0;
1307 continue;
1308 }
1309 printk(KERN_ERR "Failed to register rendezvous interrupt "
1310 "with SAL (status %ld)\n", rc);
1311 return;
1312 }
1313
1314 /* Register the wakeup interrupt vector with SAL */
1315 isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_RENDEZ_WAKEUP,
1316 SAL_MC_PARAM_MECHANISM_INT,
1317 IA64_MCA_WAKEUP_VECTOR,
1318 0, 0);
1319 rc = isrv.status;
1320 if (rc) {
1321 printk(KERN_ERR "Failed to register wakeup interrupt with SAL "
1322 "(status %ld)\n", rc);
1323 return;
1324 }
1325
1326 IA64_MCA_DEBUG("%s: registered MCA rendezvous spinloop and wakeup mech.\n", __FUNCTION__);
1327
1328 ia64_mc_info.imi_mca_handler = ia64_tpa(mca_hldlr_ptr->fp);
1329 /*
1330 * XXX - disable SAL checksum by setting size to 0; should be
1331 * ia64_tpa(ia64_os_mca_dispatch_end) - ia64_tpa(ia64_os_mca_dispatch);
1332 */
1333 ia64_mc_info.imi_mca_handler_size = 0;
1334
1335 /* Register the os mca handler with SAL */
1336 if ((rc = ia64_sal_set_vectors(SAL_VECTOR_OS_MCA,
1337 ia64_mc_info.imi_mca_handler,
1338 ia64_tpa(mca_hldlr_ptr->gp),
1339 ia64_mc_info.imi_mca_handler_size,
1340 0, 0, 0)))
1341 {
1342 printk(KERN_ERR "Failed to register OS MCA handler with SAL "
1343 "(status %ld)\n", rc);
1344 return;
1345 }
1346
1347 IA64_MCA_DEBUG("%s: registered OS MCA handler with SAL at 0x%lx, gp = 0x%lx\n", __FUNCTION__,
1348 ia64_mc_info.imi_mca_handler, ia64_tpa(mca_hldlr_ptr->gp));
1349
1350 /*
1351 * XXX - disable SAL checksum by setting size to 0, should be
1352 * size of the actual init handler in mca_asm.S.
1353 */
1354 ia64_mc_info.imi_monarch_init_handler = ia64_tpa(mon_init_ptr->fp);
1355 ia64_mc_info.imi_monarch_init_handler_size = 0;
1356 ia64_mc_info.imi_slave_init_handler = ia64_tpa(slave_init_ptr->fp);
1357 ia64_mc_info.imi_slave_init_handler_size = 0;
1358
1359 IA64_MCA_DEBUG("%s: OS INIT handler at %lx\n", __FUNCTION__,
1360 ia64_mc_info.imi_monarch_init_handler);
1361
1362 /* Register the os init handler with SAL */
1363 if ((rc = ia64_sal_set_vectors(SAL_VECTOR_OS_INIT,
1364 ia64_mc_info.imi_monarch_init_handler,
1365 ia64_tpa(ia64_getreg(_IA64_REG_GP)),
1366 ia64_mc_info.imi_monarch_init_handler_size,
1367 ia64_mc_info.imi_slave_init_handler,
1368 ia64_tpa(ia64_getreg(_IA64_REG_GP)),
1369 ia64_mc_info.imi_slave_init_handler_size)))
1370 {
1371 printk(KERN_ERR "Failed to register m/s INIT handlers with SAL "
1372 "(status %ld)\n", rc);
1373 return;
1374 }
1375
1376 IA64_MCA_DEBUG("%s: registered OS INIT handler with SAL\n", __FUNCTION__);
1377
1378 /*
1379 * Configure the CMCI/P vector and handler. Interrupts for CMC are
1380 * per-processor, so AP CMC interrupts are setup in smp_callin() (smpboot.c).
1381 */
1382 register_percpu_irq(IA64_CMC_VECTOR, &cmci_irqaction);
1383 register_percpu_irq(IA64_CMCP_VECTOR, &cmcp_irqaction);
1384 ia64_mca_cmc_vector_setup(); /* Setup vector on BSP */
1385
1386 /* Setup the MCA rendezvous interrupt vector */
1387 register_percpu_irq(IA64_MCA_RENDEZ_VECTOR, &mca_rdzv_irqaction);
1388
1389 /* Setup the MCA wakeup interrupt vector */
1390 register_percpu_irq(IA64_MCA_WAKEUP_VECTOR, &mca_wkup_irqaction);
1391
1392#ifdef CONFIG_ACPI
1393 /* Setup the CPEI/P vector and handler */
1394 cpe_vector = acpi_request_vector(ACPI_INTERRUPT_CPEI);
1395 register_percpu_irq(IA64_CPEP_VECTOR, &mca_cpep_irqaction);
1396#endif
1397
1398 /* Initialize the areas set aside by the OS to buffer the
1399 * platform/processor error states for MCA/INIT/CMC
1400 * handling.
1401 */
1402 ia64_log_init(SAL_INFO_TYPE_MCA);
1403 ia64_log_init(SAL_INFO_TYPE_INIT);
1404 ia64_log_init(SAL_INFO_TYPE_CMC);
1405 ia64_log_init(SAL_INFO_TYPE_CPE);
1406
1407 mca_init = 1;
1408 printk(KERN_INFO "MCA related initialization done\n");
1409}
1410
1411/*
1412 * ia64_mca_late_init
1413 *
1414 * Opportunity to setup things that require initialization later
1415 * than ia64_mca_init. Setup a timer to poll for CPEs if the
1416 * platform doesn't support an interrupt driven mechanism.
1417 *
1418 * Inputs : None
1419 * Outputs : Status
1420 */
1421static int __init
1422ia64_mca_late_init(void)
1423{
1424 if (!mca_init)
1425 return 0;
1426
1427 /* Setup the CMCI/P vector and handler */
1428 init_timer(&cmc_poll_timer);
1429 cmc_poll_timer.function = ia64_mca_cmc_poll;
1430
1431 /* Unmask/enable the vector */
1432 cmc_polling_enabled = 0;
1433 schedule_work(&cmc_enable_work);
1434
1435 IA64_MCA_DEBUG("%s: CMCI/P setup and enabled.\n", __FUNCTION__);
1436
1437#ifdef CONFIG_ACPI
1438 /* Setup the CPEI/P vector and handler */
1439 init_timer(&cpe_poll_timer);
1440 cpe_poll_timer.function = ia64_mca_cpe_poll;
1441
1442 {
1443 irq_desc_t *desc;
1444 unsigned int irq;
1445
1446 if (cpe_vector >= 0) {
1447 /* If platform supports CPEI, enable the irq. */
1448 cpe_poll_enabled = 0;
1449 for (irq = 0; irq < NR_IRQS; ++irq)
1450 if (irq_to_vector(irq) == cpe_vector) {
1451 desc = irq_descp(irq);
1452 desc->status |= IRQ_PER_CPU;
1453 setup_irq(irq, &mca_cpe_irqaction);
1454 }
1455 ia64_mca_register_cpev(cpe_vector);
1456 IA64_MCA_DEBUG("%s: CPEI/P setup and enabled.\n", __FUNCTION__);
1457 } else {
1458 /* If platform doesn't support CPEI, get the timer going. */
1459 if (cpe_poll_enabled) {
1460 ia64_mca_cpe_poll(0UL);
1461 IA64_MCA_DEBUG("%s: CPEP setup and enabled.\n", __FUNCTION__);
1462 }
1463 }
1464 }
1465#endif
1466
1467 return 0;
1468}
1469
1470device_initcall(ia64_mca_late_init);
diff --git a/arch/ia64/kernel/mca_asm.S b/arch/ia64/kernel/mca_asm.S
new file mode 100644
index 000000000000..cf3f8014f9ad
--- /dev/null
+++ b/arch/ia64/kernel/mca_asm.S
@@ -0,0 +1,928 @@
1//
2// assembly portion of the IA64 MCA handling
3//
4// Mods by cfleck to integrate into kernel build
5// 00/03/15 davidm Added various stop bits to get a clean compile
6//
7// 00/03/29 cfleck Added code to save INIT handoff state in pt_regs format, switch to temp
8// kstack, switch modes, jump to C INIT handler
9//
10// 02/01/04 J.Hall <jenna.s.hall@intel.com>
11// Before entering virtual mode code:
12// 1. Check for TLB CPU error
13// 2. Restore current thread pointer to kr6
14// 3. Move stack ptr 16 bytes to conform to C calling convention
15//
16// 04/11/12 Russ Anderson <rja@sgi.com>
17// Added per cpu MCA/INIT stack save areas.
18//
19#include <linux/config.h>
20#include <linux/threads.h>
21
22#include <asm/asmmacro.h>
23#include <asm/pgtable.h>
24#include <asm/processor.h>
25#include <asm/mca_asm.h>
26#include <asm/mca.h>
27
28/*
29 * When we get a machine check, the kernel stack pointer is no longer
30 * valid, so we need to set a new stack pointer.
31 */
32#define MINSTATE_PHYS /* Make sure stack access is physical for MINSTATE */
33
34/*
35 * Needed for return context to SAL
36 */
37#define IA64_MCA_SAME_CONTEXT 0
38#define IA64_MCA_COLD_BOOT -2
39
40#include "minstate.h"
41
42/*
43 * SAL_TO_OS_MCA_HANDOFF_STATE (SAL 3.0 spec)
44 * 1. GR1 = OS GP
45 * 2. GR8 = PAL_PROC physical address
46 * 3. GR9 = SAL_PROC physical address
47 * 4. GR10 = SAL GP (physical)
48 * 5. GR11 = Rendez state
49 * 6. GR12 = Return address to location within SAL_CHECK
50 */
51#define SAL_TO_OS_MCA_HANDOFF_STATE_SAVE(_tmp) \
52 LOAD_PHYSICAL(p0, _tmp, ia64_sal_to_os_handoff_state);; \
53 st8 [_tmp]=r1,0x08;; \
54 st8 [_tmp]=r8,0x08;; \
55 st8 [_tmp]=r9,0x08;; \
56 st8 [_tmp]=r10,0x08;; \
57 st8 [_tmp]=r11,0x08;; \
58 st8 [_tmp]=r12,0x08;; \
59 st8 [_tmp]=r17,0x08;; \
60 st8 [_tmp]=r18,0x08
61
62/*
63 * OS_MCA_TO_SAL_HANDOFF_STATE (SAL 3.0 spec)
64 * (p6) is executed if we never entered virtual mode (TLB error)
65 * (p7) is executed if we entered virtual mode as expected (normal case)
66 * 1. GR8 = OS_MCA return status
67 * 2. GR9 = SAL GP (physical)
68 * 3. GR10 = 0/1 returning same/new context
69 * 4. GR22 = New min state save area pointer
70 * returns ptr to SAL rtn save loc in _tmp
71 */
72#define OS_MCA_TO_SAL_HANDOFF_STATE_RESTORE(_tmp) \
73 movl _tmp=ia64_os_to_sal_handoff_state;; \
74 DATA_VA_TO_PA(_tmp);; \
75 ld8 r8=[_tmp],0x08;; \
76 ld8 r9=[_tmp],0x08;; \
77 ld8 r10=[_tmp],0x08;; \
78 ld8 r22=[_tmp],0x08;;
79 // now _tmp is pointing to SAL rtn save location
80
81/*
82 * COLD_BOOT_HANDOFF_STATE() sets ia64_mca_os_to_sal_state
83 * imots_os_status=IA64_MCA_COLD_BOOT
84 * imots_sal_gp=SAL GP
85 * imots_context=IA64_MCA_SAME_CONTEXT
86 * imots_new_min_state=Min state save area pointer
87 * imots_sal_check_ra=Return address to location within SAL_CHECK
88 *
89 */
90#define COLD_BOOT_HANDOFF_STATE(sal_to_os_handoff,os_to_sal_handoff,tmp)\
91 movl tmp=IA64_MCA_COLD_BOOT; \
92 movl sal_to_os_handoff=__pa(ia64_sal_to_os_handoff_state); \
93 movl os_to_sal_handoff=__pa(ia64_os_to_sal_handoff_state);; \
94 st8 [os_to_sal_handoff]=tmp,8;; \
95 ld8 tmp=[sal_to_os_handoff],48;; \
96 st8 [os_to_sal_handoff]=tmp,8;; \
97 movl tmp=IA64_MCA_SAME_CONTEXT;; \
98 st8 [os_to_sal_handoff]=tmp,8;; \
99 ld8 tmp=[sal_to_os_handoff],-8;; \
100 st8 [os_to_sal_handoff]=tmp,8;; \
101 ld8 tmp=[sal_to_os_handoff];; \
102 st8 [os_to_sal_handoff]=tmp;;
103
104#define GET_IA64_MCA_DATA(reg) \
105 GET_THIS_PADDR(reg, ia64_mca_data) \
106 ;; \
107 ld8 reg=[reg]
108
109 .global ia64_os_mca_dispatch
110 .global ia64_os_mca_dispatch_end
111 .global ia64_sal_to_os_handoff_state
112 .global ia64_os_to_sal_handoff_state
113
114 .text
115 .align 16
116
117ia64_os_mca_dispatch:
118
119 // Serialize all MCA processing
120 mov r3=1;;
121 LOAD_PHYSICAL(p0,r2,ia64_mca_serialize);;
122ia64_os_mca_spin:
123 xchg8 r4=[r2],r3;;
124 cmp.ne p6,p0=r4,r0
125(p6) br ia64_os_mca_spin
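	// ia64_mca_serialize acts as a simple physical-mode spinlock: the
	// xchg8 above spins until the old value is 0.  The lock is released
	// with st8.rel just before branching back to SAL (see
	// ia64_os_mca_done_restore below).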
126
127 // Save the SAL to OS MCA handoff state as defined
128 // by SAL SPEC 3.0
129 // NOTE : The order in which the state gets saved
130 // is dependent on the way the C-structure
131 // for ia64_mca_sal_to_os_state_t has been
132 // defined in include/asm/mca.h
133 SAL_TO_OS_MCA_HANDOFF_STATE_SAVE(r2)
134 ;;
135
136 // LOG PROCESSOR STATE INFO FROM HERE ON..
137begin_os_mca_dump:
138 br ia64_os_mca_proc_state_dump;;
139
140ia64_os_mca_done_dump:
141
142 LOAD_PHYSICAL(p0,r16,ia64_sal_to_os_handoff_state+56)
143 ;;
144 ld8 r18=[r16] // Get processor state parameter on existing PALE_CHECK.
145 ;;
146 tbit.nz p6,p7=r18,60
147(p7) br.spnt done_tlb_purge_and_reload
148
149 // The following code purges TC and TR entries. Then reload all TC entries.
150 // Purge percpu data TC entries.
151begin_tlb_purge_and_reload:
152
153#define O(member) IA64_CPUINFO_##member##_OFFSET
154
155 GET_THIS_PADDR(r2, cpu_info) // load phys addr of cpu_info into r2
156 ;;
157 addl r17=O(PTCE_STRIDE),r2
158 addl r2=O(PTCE_BASE),r2
159 ;;
160 ld8 r18=[r2],(O(PTCE_COUNT)-O(PTCE_BASE));; // r18=ptce_base
161 ld4 r19=[r2],4 // r19=ptce_count[0]
162 ld4 r21=[r17],4 // r21=ptce_stride[0]
163 ;;
164 ld4 r20=[r2] // r20=ptce_count[1]
165 ld4 r22=[r17] // r22=ptce_stride[1]
166 mov r24=0
167 ;;
168 adds r20=-1,r20
169 ;;
170#undef O
171
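// Two nested loops walk the ptc.e parameters read from cpu_info above:
// the outer loop (label 2) runs ptce_count[0] (r19) times, advancing the
// purge address by stride[0] (r21); the inner loop (label 3) issues ptc.e
// ptce_count[1] times (ar.lc is loaded with count[1]-1 from r20), advancing
// by stride[1] (r22).  Together they flush the entire local TC.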
1722:
173 cmp.ltu p6,p7=r24,r19
174(p7) br.cond.dpnt.few 4f
175 mov ar.lc=r20
1763:
177 ptc.e r18
178 ;;
179 add r18=r22,r18
180 br.cloop.sptk.few 3b
181 ;;
182 add r18=r21,r18
183 add r24=1,r24
184 ;;
185 br.sptk.few 2b
1864:
187 srlz.i // srlz.i implies srlz.d
188 ;;
189
190 // Now purge addresses formerly mapped by TR registers
191 // 1. Purge ITR&DTR for kernel.
192 movl r16=KERNEL_START
193 mov r18=KERNEL_TR_PAGE_SHIFT<<2
194 ;;
195 ptr.i r16, r18
196 ptr.d r16, r18
197 ;;
198 srlz.i
199 ;;
200 srlz.d
201 ;;
202 // 2. Purge DTR for PERCPU data.
203 movl r16=PERCPU_ADDR
204 mov r18=PERCPU_PAGE_SHIFT<<2
205 ;;
206 ptr.d r16,r18
207 ;;
208 srlz.d
209 ;;
210 // 3. Purge ITR for PAL code.
211 GET_THIS_PADDR(r2, ia64_mca_pal_base)
212 ;;
213 ld8 r16=[r2]
214 mov r18=IA64_GRANULE_SHIFT<<2
215 ;;
216 ptr.i r16,r18
217 ;;
218 srlz.i
219 ;;
220 // 4. Purge DTR for stack.
221 mov r16=IA64_KR(CURRENT_STACK)
222 ;;
223 shl r16=r16,IA64_GRANULE_SHIFT
224 movl r19=PAGE_OFFSET
225 ;;
226 add r16=r19,r16
227 mov r18=IA64_GRANULE_SHIFT<<2
228 ;;
229 ptr.d r16,r18
230 ;;
231 srlz.i
232 ;;
233 // Finally reload the TR registers.
234 // 1. Reload DTR/ITR registers for kernel.
235 mov r18=KERNEL_TR_PAGE_SHIFT<<2
236 movl r17=KERNEL_START
237 ;;
238 mov cr.itir=r18
239 mov cr.ifa=r17
240 mov r16=IA64_TR_KERNEL
241 mov r19=ip
242 movl r18=PAGE_KERNEL
243 ;;
244 dep r17=0,r19,0, KERNEL_TR_PAGE_SHIFT
245 ;;
246 or r18=r17,r18
247 ;;
248 itr.i itr[r16]=r18
249 ;;
250 itr.d dtr[r16]=r18
251 ;;
252 srlz.i
253 srlz.d
254 ;;
255 // 2. Reload DTR register for PERCPU data.
256 GET_THIS_PADDR(r2, ia64_mca_per_cpu_pte)
257 ;;
258 movl r16=PERCPU_ADDR // vaddr
259 movl r18=PERCPU_PAGE_SHIFT<<2
260 ;;
261 mov cr.itir=r18
262 mov cr.ifa=r16
263 ;;
264 ld8 r18=[r2] // load per-CPU PTE
265 mov r16=IA64_TR_PERCPU_DATA;
266 ;;
267 itr.d dtr[r16]=r18
268 ;;
269 srlz.d
270 ;;
271 // 3. Reload ITR for PAL code.
272 GET_THIS_PADDR(r2, ia64_mca_pal_pte)
273 ;;
274 ld8 r18=[r2] // load PAL PTE
275 ;;
276 GET_THIS_PADDR(r2, ia64_mca_pal_base)
277 ;;
278 ld8 r16=[r2] // load PAL vaddr
279 mov r19=IA64_GRANULE_SHIFT<<2
280 ;;
281 mov cr.itir=r19
282 mov cr.ifa=r16
283 mov r20=IA64_TR_PALCODE
284 ;;
285 itr.i itr[r20]=r18
286 ;;
287 srlz.i
288 ;;
289 // 4. Reload DTR for stack.
290 mov r16=IA64_KR(CURRENT_STACK)
291 ;;
292 shl r16=r16,IA64_GRANULE_SHIFT
293 movl r19=PAGE_OFFSET
294 ;;
295 add r18=r19,r16
296 movl r20=PAGE_KERNEL
297 ;;
298 add r16=r20,r16
299 mov r19=IA64_GRANULE_SHIFT<<2
300 ;;
301 mov cr.itir=r19
302 mov cr.ifa=r18
303 mov r20=IA64_TR_CURRENT_STACK
304 ;;
305 itr.d dtr[r20]=r16
306 ;;
307 srlz.d
308 ;;
309 br.sptk.many done_tlb_purge_and_reload
310err:
311 COLD_BOOT_HANDOFF_STATE(r20,r21,r22)
312 br.sptk.many ia64_os_mca_done_restore
313
314done_tlb_purge_and_reload:
315
316 // Setup new stack frame for OS_MCA handling
317 GET_IA64_MCA_DATA(r2)
318 ;;
319 add r3 = IA64_MCA_CPU_STACKFRAME_OFFSET, r2
320 add r2 = IA64_MCA_CPU_RBSTORE_OFFSET, r2
321 ;;
322 rse_switch_context(r6,r3,r2);; // RSC management in this new context
323
324 GET_IA64_MCA_DATA(r2)
325 ;;
326 add r2 = IA64_MCA_CPU_STACK_OFFSET+IA64_MCA_STACK_SIZE-16, r2
327 ;;
328 mov r12=r2 // establish new stack-pointer
329
330 // Enter virtual mode from physical mode
331 VIRTUAL_MODE_ENTER(r2, r3, ia64_os_mca_virtual_begin, r4)
332ia64_os_mca_virtual_begin:
333
334 // Call virtual mode handler
335 movl r2=ia64_mca_ucmc_handler;;
336 mov b6=r2;;
337 br.call.sptk.many b0=b6;;
338.ret0:
339 // Revert back to physical mode before going back to SAL
340 PHYSICAL_MODE_ENTER(r2, r3, ia64_os_mca_virtual_end, r4)
341ia64_os_mca_virtual_end:
342
343 // restore the original stack frame here
344 GET_IA64_MCA_DATA(r2)
345 ;;
346 add r2 = IA64_MCA_CPU_STACKFRAME_OFFSET, r2
347 ;;
348 movl r4=IA64_PSR_MC
349 ;;
350 rse_return_context(r4,r3,r2) // switch from interrupt context for RSE
351
352 // let us restore all the registers from our PSI structure
353 mov r8=gp
354 ;;
355begin_os_mca_restore:
356 br ia64_os_mca_proc_state_restore;;
357
358ia64_os_mca_done_restore:
359 OS_MCA_TO_SAL_HANDOFF_STATE_RESTORE(r2);;
360 // branch back to SALE_CHECK
361 ld8 r3=[r2];;
362 mov b0=r3;; // SAL_CHECK return address
363
364 // release lock
365 movl r3=ia64_mca_serialize;;
366 DATA_VA_TO_PA(r3);;
367 st8.rel [r3]=r0
368
369 br b0
370 ;;
371ia64_os_mca_dispatch_end:
372//EndMain//////////////////////////////////////////////////////////////////////
373
374
375//++
376// Name:
377// ia64_os_mca_proc_state_dump()
378//
379// Stub Description:
380//
381// This stub dumps the processor state during MCHK to a data area
382//
383//--
384
385ia64_os_mca_proc_state_dump:
386// Save bank 1 GRs 16-31 which will be used by c-language code when we switch
387// to virtual addressing mode.
388 GET_IA64_MCA_DATA(r2)
389 ;;
390 add r2 = IA64_MCA_CPU_PROC_STATE_DUMP_OFFSET, r2
391 ;;
392// save ar.NaT
393 mov r5=ar.unat // ar.unat
394
395// save banked GRs 16-31 along with NaT bits
396 bsw.1;;
397 st8.spill [r2]=r16,8;;
398 st8.spill [r2]=r17,8;;
399 st8.spill [r2]=r18,8;;
400 st8.spill [r2]=r19,8;;
401 st8.spill [r2]=r20,8;;
402 st8.spill [r2]=r21,8;;
403 st8.spill [r2]=r22,8;;
404 st8.spill [r2]=r23,8;;
405 st8.spill [r2]=r24,8;;
406 st8.spill [r2]=r25,8;;
407 st8.spill [r2]=r26,8;;
408 st8.spill [r2]=r27,8;;
409 st8.spill [r2]=r28,8;;
410 st8.spill [r2]=r29,8;;
411 st8.spill [r2]=r30,8;;
412 st8.spill [r2]=r31,8;;
413
414 mov r4=ar.unat;;
415 st8 [r2]=r4,8 // save User NaT bits for r16-r31
416 mov ar.unat=r5 // restore original unat
417 bsw.0;;
418
419//save BRs
420 add r4=8,r2 // duplicate r2 in r4
421	add r6=2*8,r2		// duplicate r2 in r6
422
423 mov r3=b0
424 mov r5=b1
425 mov r7=b2;;
426 st8 [r2]=r3,3*8
427 st8 [r4]=r5,3*8
428 st8 [r6]=r7,3*8;;
429
430 mov r3=b3
431 mov r5=b4
432 mov r7=b5;;
433 st8 [r2]=r3,3*8
434 st8 [r4]=r5,3*8
435 st8 [r6]=r7,3*8;;
436
437 mov r3=b6
438 mov r5=b7;;
439 st8 [r2]=r3,2*8
440 st8 [r4]=r5,2*8;;
441
442cSaveCRs:
443// save CRs
444 add r4=8,r2 // duplicate r2 in r4
445	add r6=2*8,r2		// duplicate r2 in r6
446
447 mov r3=cr.dcr
448 mov r5=cr.itm
449 mov r7=cr.iva;;
450
451 st8 [r2]=r3,8*8
452 st8 [r4]=r5,3*8
453	st8 [r6]=r7,3*8;;	// 48 byte increments
454
455 mov r3=cr.pta;;
456	st8 [r2]=r3,8*8;;	// 64 byte increments
457
458// if PSR.ic=0, reading interruption registers causes an illegal operation fault
459 mov r3=psr;;
460 tbit.nz.unc p6,p0=r3,PSR_IC;; // PSI Valid Log bit pos. test
461(p6)	st8 [r2]=r0,9*8+160	// increment by 232 bytes
462begin_skip_intr_regs:
463(p6) br SkipIntrRegs;;
464
465 add r4=8,r2 // duplicate r2 in r4
466 add r6=2*8,r2 // duplicate r2 in r6
467
468 mov r3=cr.ipsr
469 mov r5=cr.isr
470 mov r7=r0;;
471 st8 [r2]=r3,3*8
472 st8 [r4]=r5,3*8
473 st8 [r6]=r7,3*8;;
474
475 mov r3=cr.iip
476 mov r5=cr.ifa
477 mov r7=cr.itir;;
478 st8 [r2]=r3,3*8
479 st8 [r4]=r5,3*8
480 st8 [r6]=r7,3*8;;
481
482 mov r3=cr.iipa
483 mov r5=cr.ifs
484 mov r7=cr.iim;;
485 st8 [r2]=r3,3*8
486 st8 [r4]=r5,3*8
487 st8 [r6]=r7,3*8;;
488
489 mov r3=cr25;; // cr.iha
490	st8 [r2]=r3,160;;	// 160 byte increment
491
492SkipIntrRegs:
493	st8 [r2]=r0,152;;	// another 152 byte increment
494
495 add r4=8,r2 // duplicate r2 in r4
496 add r6=2*8,r2 // duplicate r2 in r6
497
498 mov r3=cr.lid
499// mov r5=cr.ivr // cr.ivr, don't read it
500 mov r7=cr.tpr;;
501 st8 [r2]=r3,3*8
502 st8 [r4]=r5,3*8
503 st8 [r6]=r7,3*8;;
504
505 mov r3=r0 // cr.eoi => cr67
506 mov r5=r0 // cr.irr0 => cr68
507 mov r7=r0;; // cr.irr1 => cr69
508 st8 [r2]=r3,3*8
509 st8 [r4]=r5,3*8
510 st8 [r6]=r7,3*8;;
511
512 mov r3=r0 // cr.irr2 => cr70
513 mov r5=r0 // cr.irr3 => cr71
514 mov r7=cr.itv;;
515 st8 [r2]=r3,3*8
516 st8 [r4]=r5,3*8
517 st8 [r6]=r7,3*8;;
518
519 mov r3=cr.pmv
520 mov r5=cr.cmcv;;
521 st8 [r2]=r3,7*8
522 st8 [r4]=r5,7*8;;
523
524 mov r3=r0 // cr.lrr0 => cr80
525 mov r5=r0;; // cr.lrr1 => cr81
526 st8 [r2]=r3,23*8
527 st8 [r4]=r5,23*8;;
528
529 adds r2=25*8,r2;;
530
531cSaveARs:
532// save ARs
533 add r4=8,r2 // duplicate r2 in r4
534 add r6=2*8,r2 // duplicate r2 in r6
535
536 mov r3=ar.k0
537 mov r5=ar.k1
538 mov r7=ar.k2;;
539 st8 [r2]=r3,3*8
540 st8 [r4]=r5,3*8
541 st8 [r6]=r7,3*8;;
542
543 mov r3=ar.k3
544 mov r5=ar.k4
545 mov r7=ar.k5;;
546 st8 [r2]=r3,3*8
547 st8 [r4]=r5,3*8
548 st8 [r6]=r7,3*8;;
549
550 mov r3=ar.k6
551 mov r5=ar.k7
552 mov r7=r0;; // ar.kr8
553 st8 [r2]=r3,10*8
554 st8 [r4]=r5,10*8
555	st8 [r6]=r7,10*8;;	// increment by 72 bytes
556
557 mov r3=ar.rsc
558 mov ar.rsc=r0 // put RSE in enforced lazy mode
559 mov r5=ar.bsp
560 ;;
561 mov r7=ar.bspstore;;
562 st8 [r2]=r3,3*8
563 st8 [r4]=r5,3*8
564 st8 [r6]=r7,3*8;;
565
566 mov r3=ar.rnat;;
567 st8 [r2]=r3,8*13 // increment by 13x8 bytes
568
569 mov r3=ar.ccv;;
570 st8 [r2]=r3,8*4
571
572 mov r3=ar.unat;;
573 st8 [r2]=r3,8*4
574
575 mov r3=ar.fpsr;;
576 st8 [r2]=r3,8*4
577
578 mov r3=ar.itc;;
579 st8 [r2]=r3,160 // 160
580
581 mov r3=ar.pfs;;
582 st8 [r2]=r3,8
583
584 mov r3=ar.lc;;
585 st8 [r2]=r3,8
586
587 mov r3=ar.ec;;
588 st8 [r2]=r3
589 add r2=8*62,r2 //padding
590
591// save RRs
592 mov ar.lc=0x08-1
593 movl r4=0x00;;
594
595cStRR:
596 dep.z r5=r4,61,3;;
597 mov r3=rr[r5];;
598 st8 [r2]=r3,8
599 add r4=1,r4
600 br.cloop.sptk.few cStRR
601 ;;
602end_os_mca_dump:
603 br ia64_os_mca_done_dump;;
604
605//EndStub//////////////////////////////////////////////////////////////////////
606
607
608//++
609// Name:
610// ia64_os_mca_proc_state_restore()
611//
612// Stub Description:
613//
614// This is a stub to restore the saved processor state during MCHK
615//
616//--
617
618ia64_os_mca_proc_state_restore:
619
620// Restore bank1 GR16-31
621 GET_IA64_MCA_DATA(r2)
622 ;;
623 add r2 = IA64_MCA_CPU_PROC_STATE_DUMP_OFFSET, r2
624
625restore_GRs: // restore bank-1 GRs 16-31
626 bsw.1;;
627 add r3=16*8,r2;; // to get to NaT of GR 16-31
628 ld8 r3=[r3];;
629 mov ar.unat=r3;; // first restore NaT
630
631 ld8.fill r16=[r2],8;;
632 ld8.fill r17=[r2],8;;
633 ld8.fill r18=[r2],8;;
634 ld8.fill r19=[r2],8;;
635 ld8.fill r20=[r2],8;;
636 ld8.fill r21=[r2],8;;
637 ld8.fill r22=[r2],8;;
638 ld8.fill r23=[r2],8;;
639 ld8.fill r24=[r2],8;;
640 ld8.fill r25=[r2],8;;
641 ld8.fill r26=[r2],8;;
642 ld8.fill r27=[r2],8;;
643 ld8.fill r28=[r2],8;;
644 ld8.fill r29=[r2],8;;
645 ld8.fill r30=[r2],8;;
646 ld8.fill r31=[r2],8;;
647
648 ld8 r3=[r2],8;; // increment to skip NaT
649 bsw.0;;
650
651restore_BRs:
652 add r4=8,r2 // duplicate r2 in r4
653	add r6=2*8,r2;;		// duplicate r2 in r6
654
655 ld8 r3=[r2],3*8
656 ld8 r5=[r4],3*8
657 ld8 r7=[r6],3*8;;
658 mov b0=r3
659 mov b1=r5
660 mov b2=r7;;
661
662 ld8 r3=[r2],3*8
663 ld8 r5=[r4],3*8
664 ld8 r7=[r6],3*8;;
665 mov b3=r3
666 mov b4=r5
667 mov b5=r7;;
668
669 ld8 r3=[r2],2*8
670 ld8 r5=[r4],2*8;;
671 mov b6=r3
672 mov b7=r5;;
673
674restore_CRs:
675 add r4=8,r2 // duplicate r2 in r4
676	add r6=2*8,r2;;		// duplicate r2 in r6
677
678 ld8 r3=[r2],8*8
679 ld8 r5=[r4],3*8
680 ld8 r7=[r6],3*8;; // 48 byte increments
681 mov cr.dcr=r3
682 mov cr.itm=r5
683 mov cr.iva=r7;;
684
685 ld8 r3=[r2],8*8;; // 64 byte increments
686// mov cr.pta=r3
687
688
689// if PSR.ic=1, reading interruption registers causes an illegal operation fault
690 mov r3=psr;;
691 tbit.nz.unc p6,p0=r3,PSR_IC;; // PSI Valid Log bit pos. test
692(p6)	st8 [r2]=r0,9*8+160	// increment by 232 bytes
693
694begin_rskip_intr_regs:
695(p6) br rSkipIntrRegs;;
696
697 add r4=8,r2 // duplicate r2 in r4
698	add r6=2*8,r2;;		// duplicate r2 in r6
699
700 ld8 r3=[r2],3*8
701 ld8 r5=[r4],3*8
702 ld8 r7=[r6],3*8;;
703 mov cr.ipsr=r3
704// mov cr.isr=r5 // cr.isr is read only
705
706 ld8 r3=[r2],3*8
707 ld8 r5=[r4],3*8
708 ld8 r7=[r6],3*8;;
709 mov cr.iip=r3
710 mov cr.ifa=r5
711 mov cr.itir=r7;;
712
713 ld8 r3=[r2],3*8
714 ld8 r5=[r4],3*8
715 ld8 r7=[r6],3*8;;
716 mov cr.iipa=r3
717 mov cr.ifs=r5
718 mov cr.iim=r7
719
720 ld8 r3=[r2],160;; // 160 byte increment
721 mov cr.iha=r3
722
723rSkipIntrRegs:
724 ld8 r3=[r2],152;; // another 152 byte inc.
725
726 add r4=8,r2 // duplicate r2 in r4
727 add r6=2*8,r2;; // duplicate r2 in r6
728
729 ld8 r3=[r2],8*3
730 ld8 r5=[r4],8*3
731 ld8 r7=[r6],8*3;;
732 mov cr.lid=r3
733// mov cr.ivr=r5 // cr.ivr is read only
734 mov cr.tpr=r7;;
735
736 ld8 r3=[r2],8*3
737 ld8 r5=[r4],8*3
738 ld8 r7=[r6],8*3;;
739// mov cr.eoi=r3
740// mov cr.irr0=r5 // cr.irr0 is read only
741// mov cr.irr1=r7;; // cr.irr1 is read only
742
743 ld8 r3=[r2],8*3
744 ld8 r5=[r4],8*3
745 ld8 r7=[r6],8*3;;
746// mov cr.irr2=r3 // cr.irr2 is read only
747// mov cr.irr3=r5 // cr.irr3 is read only
748 mov cr.itv=r7;;
749
750 ld8 r3=[r2],8*7
751 ld8 r5=[r4],8*7;;
752 mov cr.pmv=r3
753 mov cr.cmcv=r5;;
754
755 ld8 r3=[r2],8*23
756 ld8 r5=[r4],8*23;;
757 adds r2=8*23,r2
758 adds r4=8*23,r4;;
759// mov cr.lrr0=r3
760// mov cr.lrr1=r5
761
762 adds r2=8*2,r2;;
763
764restore_ARs:
765 add r4=8,r2 // duplicate r2 in r4
766	add r6=2*8,r2;;		// duplicate r2 in r6
767
768 ld8 r3=[r2],3*8
769 ld8 r5=[r4],3*8
770 ld8 r7=[r6],3*8;;
771 mov ar.k0=r3
772 mov ar.k1=r5
773 mov ar.k2=r7;;
774
775 ld8 r3=[r2],3*8
776 ld8 r5=[r4],3*8
777 ld8 r7=[r6],3*8;;
778 mov ar.k3=r3
779 mov ar.k4=r5
780 mov ar.k5=r7;;
781
782 ld8 r3=[r2],10*8
783 ld8 r5=[r4],10*8
784 ld8 r7=[r6],10*8;;
785 mov ar.k6=r3
786 mov ar.k7=r5
787 ;;
788
789 ld8 r3=[r2],3*8
790 ld8 r5=[r4],3*8
791 ld8 r7=[r6],3*8;;
792// mov ar.rsc=r3
793// mov ar.bsp=r5 // ar.bsp is read only
794 mov ar.rsc=r0 // make sure that RSE is in enforced lazy mode
795 ;;
796 mov ar.bspstore=r7;;
797
798 ld8 r9=[r2],8*13;;
799 mov ar.rnat=r9
800
801 mov ar.rsc=r3
802 ld8 r3=[r2],8*4;;
803 mov ar.ccv=r3
804
805 ld8 r3=[r2],8*4;;
806 mov ar.unat=r3
807
808 ld8 r3=[r2],8*4;;
809 mov ar.fpsr=r3
810
811 ld8 r3=[r2],160;; // 160
812// mov ar.itc=r3
813
814 ld8 r3=[r2],8;;
815 mov ar.pfs=r3
816
817 ld8 r3=[r2],8;;
818 mov ar.lc=r3
819
820 ld8 r3=[r2];;
821 mov ar.ec=r3
822 add r2=8*62,r2;; // padding
823
824restore_RRs:
825 mov r5=ar.lc
826 mov ar.lc=0x08-1
827 movl r4=0x00;;
828cStRRr:
829 dep.z r7=r4,61,3
830 ld8 r3=[r2],8;;
831	mov rr[r7]=r3		// what are its access privileges?
832 add r4=1,r4
833 br.cloop.sptk.few cStRRr
834 ;;
835 mov ar.lc=r5
836 ;;
837end_os_mca_restore:
838 br ia64_os_mca_done_restore;;
839
840//EndStub//////////////////////////////////////////////////////////////////////
841
842
843// OK, the issue here is that we need to save state information so
844// it can be used by the kernel debugger and the show_regs routines.
845// In order to do this, our best bet is to save the current state (plus
846// the state information obtained from the MIN_STATE_AREA) in pt_regs
847// format. This way we can pass it on in a usable format.
848//
849
850//
851// SAL to OS entry point for INIT on the monarch processor
852// This has been defined for registration purposes with SAL
853// as a part of ia64_mca_init.
854//
855// When we get here, the following registers have been
856// set by the SAL for our use
857//
858// 1. GR1 = OS INIT GP
859// 2. GR8 = PAL_PROC physical address
860// 3. GR9 = SAL_PROC physical address
861// 4. GR10 = SAL GP (physical)
862// 5. GR11 = Init Reason
863// 0 = Received INIT for event other than crash dump switch
864// 1 = Received wakeup at the end of an OS_MCA corrected machine check
865//		2 = Received INIT due to CrashDump switch assertion
866//
867// 6. GR12 = Return address to location within SAL_INIT procedure
868
869
870GLOBAL_ENTRY(ia64_monarch_init_handler)
871 .prologue
872 // stash the information the SAL passed to os
873 SAL_TO_OS_MCA_HANDOFF_STATE_SAVE(r2)
874 ;;
875 SAVE_MIN_WITH_COVER
876 ;;
877 mov r8=cr.ifa
878 mov r9=cr.isr
879 adds r3=8,r2 // set up second base pointer
880 ;;
881 SAVE_REST
882
883// ok, enough should be saved at this point to be dangerous, and supply
884// information for a dump
885// We need to switch to Virtual mode before hitting the C functions.
886
887 movl r2=IA64_PSR_IT|IA64_PSR_IC|IA64_PSR_DT|IA64_PSR_RT|IA64_PSR_DFH|IA64_PSR_BN
888 mov r3=psr // get the current psr, minimum enabled at this point
889 ;;
890 or r2=r2,r3
891 ;;
892 movl r3=IVirtual_Switch
893 ;;
894 mov cr.iip=r3 // short return to set the appropriate bits
895 mov cr.ipsr=r2 // need to do an rfi to set appropriate bits
896 ;;
897 rfi
898 ;;
899IVirtual_Switch:
900 //
901 // We should now be running virtual
902 //
903 // Let's call the C handler to get the rest of the state info
904 //
905 alloc r14=ar.pfs,0,0,2,0 // now it's safe (must be first in insn group!)
906 ;;
907 adds out0=16,sp // out0 = pointer to pt_regs
908 ;;
909 DO_SAVE_SWITCH_STACK
910 .body
911	adds out1=16,sp				// out1 = pointer to switch_stack
912
913 br.call.sptk.many rp=ia64_init_handler
914.ret1:
915
916return_from_init:
917 br.sptk return_from_init
918END(ia64_monarch_init_handler)
919
920//
921// SAL to OS entry point for INIT on the slave processor
922// This has been defined for registration purposes with SAL
923// as a part of ia64_mca_init.
924//
925
926GLOBAL_ENTRY(ia64_slave_init_handler)
9271: br.sptk 1b
928END(ia64_slave_init_handler)
diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c
new file mode 100644
index 000000000000..ab478172c349
--- /dev/null
+++ b/arch/ia64/kernel/mca_drv.c
@@ -0,0 +1,639 @@
1/*
2 * File: mca_drv.c
3 * Purpose: Generic MCA handling layer
4 *
5 * Copyright (C) 2004 FUJITSU LIMITED
6 * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com)
7 */
8#include <linux/config.h>
9#include <linux/types.h>
10#include <linux/init.h>
11#include <linux/sched.h>
12#include <linux/interrupt.h>
13#include <linux/irq.h>
14#include <linux/kallsyms.h>
15#include <linux/smp_lock.h>
16#include <linux/bootmem.h>
17#include <linux/acpi.h>
18#include <linux/timer.h>
19#include <linux/module.h>
20#include <linux/kernel.h>
21#include <linux/smp.h>
22#include <linux/workqueue.h>
23#include <linux/mm.h>
24
25#include <asm/delay.h>
26#include <asm/machvec.h>
27#include <asm/page.h>
28#include <asm/ptrace.h>
29#include <asm/system.h>
30#include <asm/sal.h>
31#include <asm/mca.h>
32
33#include <asm/irq.h>
34#include <asm/hw_irq.h>
35
36#include "mca_drv.h"
37
38/* max size of SAL error record (default) */
39static int sal_rec_max = 10000;
40
41/* from mca.c */
42static ia64_mca_sal_to_os_state_t *sal_to_os_handoff_state;
43static ia64_mca_os_to_sal_state_t *os_to_sal_handoff_state;
44
45/* from mca_drv_asm.S */
46extern void *mca_handler_bhhook(void);
47
48static DEFINE_SPINLOCK(mca_bh_lock);
49
50typedef enum {
51 MCA_IS_LOCAL = 0,
52 MCA_IS_GLOBAL = 1
53} mca_type_t;
54
55#define MAX_PAGE_ISOLATE 1024
56
57static struct page *page_isolate[MAX_PAGE_ISOLATE];
58static int num_page_isolate = 0;
59
60typedef enum {
61 ISOLATE_NG = 0,
62 ISOLATE_OK = 1
63} isolate_status_t;
64
65/*
66 * This pool keeps pointers to the section part of SAL error record
67 */
68static struct {
69 slidx_list_t *buffer; /* section pointer list pool */
70 int cur_idx; /* Current index of section pointer list pool */
71 int max_idx; /* Maximum index of section pointer list pool */
72} slidx_pool;
73
74/**
75 * mca_page_isolate - isolate a poisoned page in order not to use it later
76 * @paddr: poisoned memory location
77 *
78 * Return value:
79 * ISOLATE_OK / ISOLATE_NG
80 */
81
82static isolate_status_t
83mca_page_isolate(unsigned long paddr)
84{
85 int i;
86 struct page *p;
87
88 /* whether physical address is valid or not */
89 if ( !ia64_phys_addr_valid(paddr) )
90 return ISOLATE_NG;
91
92 /* convert physical address to physical page number */
93 p = pfn_to_page(paddr>>PAGE_SHIFT);
94
95	/* check whether the page has already been registered */
96 for( i = 0; i < num_page_isolate; i++ )
97 if( page_isolate[i] == p )
98 return ISOLATE_OK; /* already listed */
99
100 /* limitation check */
101 if( num_page_isolate == MAX_PAGE_ISOLATE )
102 return ISOLATE_NG;
103
104	/* reject pages with the 'SLAB' or 'Reserved' attribute */
105 if( PageSlab(p) || PageReserved(p) )
106 return ISOLATE_NG;
107
108 /* add attribute 'Reserved' and register the page */
109 SetPageReserved(p);
110 page_isolate[num_page_isolate++] = p;
111
112 return ISOLATE_OK;
113}
114
115/**
116 * mca_handler_bh - Kill the process that hit a memory read error
117 * @paddr: poisoned address received from MCA Handler
118 */
119
120void
121mca_handler_bh(unsigned long paddr)
122{
123 printk(KERN_DEBUG "OS_MCA: process [pid: %d](%s) encounters MCA.\n",
124 current->pid, current->comm);
125
126 spin_lock(&mca_bh_lock);
127 if (mca_page_isolate(paddr) == ISOLATE_OK) {
128 printk(KERN_DEBUG "Page isolation: ( %lx ) success.\n", paddr);
129 } else {
130 printk(KERN_DEBUG "Page isolation: ( %lx ) failure.\n", paddr);
131 }
132 spin_unlock(&mca_bh_lock);
133
134	/* The current process is about to kill itself (SIGKILL) */
135 force_sig(SIGKILL, current);
136 schedule();
137}
138
139/**
140 * mca_make_peidx - Make index of processor error section
141 * @slpi: pointer to record of processor error section
142 * @peidx: pointer to index of processor error section
143 */
144
145static void
146mca_make_peidx(sal_log_processor_info_t *slpi, peidx_table_t *peidx)
147{
148 /*
149 * calculate the start address of
150 * "struct cpuid_info" and "sal_processor_static_info_t".
151 */
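	/*
	 * Resulting layout of the processor error section (sketch):
	 *   head   -> sal_log_processor_info_t followed by the check entries
	 *             (sal_log_mod_error_info_t x total_check_num)
	 *   mid    -> struct sal_cpuid_info         (only if cpuid_info is valid)
	 *   bottom -> sal_processor_static_info_t   (only if psi_static_struct is valid)
	 */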
152 u64 total_check_num = slpi->valid.num_cache_check
153 + slpi->valid.num_tlb_check
154 + slpi->valid.num_bus_check
155 + slpi->valid.num_reg_file_check
156 + slpi->valid.num_ms_check;
157 u64 head_size = sizeof(sal_log_mod_error_info_t) * total_check_num
158 + sizeof(sal_log_processor_info_t);
159 u64 mid_size = slpi->valid.cpuid_info * sizeof(struct sal_cpuid_info);
160
161 peidx_head(peidx) = slpi;
162 peidx_mid(peidx) = (struct sal_cpuid_info *)
163 (slpi->valid.cpuid_info ? ((char*)slpi + head_size) : NULL);
164 peidx_bottom(peidx) = (sal_processor_static_info_t *)
165 (slpi->valid.psi_static_struct ?
166 ((char*)slpi + head_size + mid_size) : NULL);
167}
168
169/**
170 * mca_make_slidx - Make index of SAL error record
171 * @buffer: pointer to SAL error record
172 * @slidx: pointer to index of SAL error record
173 *
174 * Return value:
175 * 1 if record has platform error / 0 if not
176 */
177#define LOG_INDEX_ADD_SECT_PTR(sect, ptr) \
178 { slidx_list_t *hl = &slidx_pool.buffer[slidx_pool.cur_idx]; \
179 hl->hdr = ptr; \
180 list_add(&hl->list, &(sect)); \
181 slidx_pool.cur_idx = (slidx_pool.cur_idx + 1)%slidx_pool.max_idx; }
182
183static int
184mca_make_slidx(void *buffer, slidx_table_t *slidx)
185{
186 int platform_err = 0;
187 int record_len = ((sal_log_record_header_t*)buffer)->len;
188 u32 ercd_pos;
189 int sects;
190 sal_log_section_hdr_t *sp;
191
192 /*
193 * Initialize index referring current record
194 */
195 INIT_LIST_HEAD(&(slidx->proc_err));
196 INIT_LIST_HEAD(&(slidx->mem_dev_err));
197 INIT_LIST_HEAD(&(slidx->sel_dev_err));
198 INIT_LIST_HEAD(&(slidx->pci_bus_err));
199 INIT_LIST_HEAD(&(slidx->smbios_dev_err));
200 INIT_LIST_HEAD(&(slidx->pci_comp_err));
201 INIT_LIST_HEAD(&(slidx->plat_specific_err));
202 INIT_LIST_HEAD(&(slidx->host_ctlr_err));
203 INIT_LIST_HEAD(&(slidx->plat_bus_err));
204 INIT_LIST_HEAD(&(slidx->unsupported));
205
206 /*
207 * Extract a Record Header
208 */
209 slidx->header = buffer;
210
211 /*
212 * Extract each section records
213 * (arranged from "int ia64_log_platform_info_print()")
214 */
215 for (ercd_pos = sizeof(sal_log_record_header_t), sects = 0;
216 ercd_pos < record_len; ercd_pos += sp->len, sects++) {
217 sp = (sal_log_section_hdr_t *)((char*)buffer + ercd_pos);
218 if (!efi_guidcmp(sp->guid, SAL_PROC_DEV_ERR_SECT_GUID)) {
219 LOG_INDEX_ADD_SECT_PTR(slidx->proc_err, sp);
220 } else if (!efi_guidcmp(sp->guid, SAL_PLAT_MEM_DEV_ERR_SECT_GUID)) {
221 platform_err = 1;
222 LOG_INDEX_ADD_SECT_PTR(slidx->mem_dev_err, sp);
223 } else if (!efi_guidcmp(sp->guid, SAL_PLAT_SEL_DEV_ERR_SECT_GUID)) {
224 platform_err = 1;
225 LOG_INDEX_ADD_SECT_PTR(slidx->sel_dev_err, sp);
226 } else if (!efi_guidcmp(sp->guid, SAL_PLAT_PCI_BUS_ERR_SECT_GUID)) {
227 platform_err = 1;
228 LOG_INDEX_ADD_SECT_PTR(slidx->pci_bus_err, sp);
229 } else if (!efi_guidcmp(sp->guid, SAL_PLAT_SMBIOS_DEV_ERR_SECT_GUID)) {
230 platform_err = 1;
231 LOG_INDEX_ADD_SECT_PTR(slidx->smbios_dev_err, sp);
232 } else if (!efi_guidcmp(sp->guid, SAL_PLAT_PCI_COMP_ERR_SECT_GUID)) {
233 platform_err = 1;
234 LOG_INDEX_ADD_SECT_PTR(slidx->pci_comp_err, sp);
235 } else if (!efi_guidcmp(sp->guid, SAL_PLAT_SPECIFIC_ERR_SECT_GUID)) {
236 platform_err = 1;
237 LOG_INDEX_ADD_SECT_PTR(slidx->plat_specific_err, sp);
238 } else if (!efi_guidcmp(sp->guid, SAL_PLAT_HOST_CTLR_ERR_SECT_GUID)) {
239 platform_err = 1;
240 LOG_INDEX_ADD_SECT_PTR(slidx->host_ctlr_err, sp);
241 } else if (!efi_guidcmp(sp->guid, SAL_PLAT_BUS_ERR_SECT_GUID)) {
242 platform_err = 1;
243 LOG_INDEX_ADD_SECT_PTR(slidx->plat_bus_err, sp);
244 } else {
245 LOG_INDEX_ADD_SECT_PTR(slidx->unsupported, sp);
246 }
247 }
248 slidx->n_sections = sects;
249
250 return platform_err;
251}
252
253/**
254 * init_record_index_pools - Initialize pool of lists for SAL record index
255 *
256 * Return value:
257 * 0 on Success / -ENOMEM on Failure
258 */
259static int
260init_record_index_pools(void)
261{
262 int i;
263 int rec_max_size; /* Maximum size of SAL error records */
264 int sect_min_size; /* Minimum size of SAL error sections */
265 /* minimum size table of each section */
266 static int sal_log_sect_min_sizes[] = {
267 sizeof(sal_log_processor_info_t) + sizeof(sal_processor_static_info_t),
268 sizeof(sal_log_mem_dev_err_info_t),
269 sizeof(sal_log_sel_dev_err_info_t),
270 sizeof(sal_log_pci_bus_err_info_t),
271 sizeof(sal_log_smbios_dev_err_info_t),
272 sizeof(sal_log_pci_comp_err_info_t),
273 sizeof(sal_log_plat_specific_err_info_t),
274 sizeof(sal_log_host_ctlr_err_info_t),
275 sizeof(sal_log_plat_bus_err_info_t),
276 };
277
278 /*
279	 * The MCA handler cannot allocate new memory on the fly,
280	 * so we preallocate enough memory to handle a SAL record.
281	 *
282	 * Initialize the slidx_pool handling set:
283	 * 1. Pick up the max size of SAL error records
284	 * 2. Pick up the min size of SAL error sections
285	 * 3. Allocate enough pool space for 2 SAL records
286	 *    (now we can estimate the maximum number of sections in a record.)
287 */
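	/*
	 * For illustration only (the section size is an assumption, not a
	 * spec value): with the default sal_rec_max of 10000 bytes and a
	 * minimum section size of, say, 40 bytes, max_idx works out to
	 * (10000 / 40) * 2 + 1 = 501 slidx_list_t entries.
	 */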
288
289 /* - 1 - */
290 rec_max_size = sal_rec_max;
291
292 /* - 2 - */
293 sect_min_size = sal_log_sect_min_sizes[0];
294 for (i = 1; i < sizeof sal_log_sect_min_sizes/sizeof(size_t); i++)
295 if (sect_min_size > sal_log_sect_min_sizes[i])
296 sect_min_size = sal_log_sect_min_sizes[i];
297
298 /* - 3 - */
299 slidx_pool.max_idx = (rec_max_size/sect_min_size) * 2 + 1;
300 slidx_pool.buffer = (slidx_list_t *) kmalloc(slidx_pool.max_idx * sizeof(slidx_list_t), GFP_KERNEL);
301
302 return slidx_pool.buffer ? 0 : -ENOMEM;
303}
304
305
306/*****************************************************************************
307 * Recovery functions *
308 *****************************************************************************/
309
310/**
311 * is_mca_global - Check whether this MCA is global or not
312 * @peidx: pointer of index of processor error section
313 * @pbci: pointer to pal_bus_check_info_t
314 *
315 * Return value:
316 * MCA_IS_LOCAL / MCA_IS_GLOBAL
317 */
318
319static mca_type_t
320is_mca_global(peidx_table_t *peidx, pal_bus_check_info_t *pbci)
321{
322 pal_processor_state_info_t *psp = (pal_processor_state_info_t*)peidx_psp(peidx);
323
324 /*
325	 * PAL can request a rendezvous if the MCA has global scope.
326	 * If the "rz_always" flag is set, SAL requests an MCA rendezvous
327	 * regardless of whether the MCA is global.
328	 * Therefore the MCA is local when no rendezvous has been requested.
329	 * If the rendezvous fails, the system must go down.
330 */
331 switch (sal_to_os_handoff_state->imsto_rendez_state) {
332 case -1: /* SAL rendezvous unsuccessful */
333 return MCA_IS_GLOBAL;
334 case 0: /* SAL rendezvous not required */
335 return MCA_IS_LOCAL;
336 case 1: /* SAL rendezvous successful int */
337 case 2: /* SAL rendezvous successful int with init */
338 default:
339 break;
340 }
341
342 /*
343	 * If one or more Cache/TLB/Reg_File/Uarch_Check entries are present,
344	 * this is a local MCA (i.e. a processor-internal error).
345 */
346 if (psp->tc || psp->cc || psp->rc || psp->uc)
347 return MCA_IS_LOCAL;
348
349 /*
350 * Bus_Check structure with Bus_Check.ib (internal bus error) flag set
351 * would be a global MCA. (e.g. a system bus address parity error)
352 */
353 if (!pbci || pbci->ib)
354 return MCA_IS_GLOBAL;
355
356 /*
357 * Bus_Check structure with Bus_Check.eb (external bus error) flag set
358 * could be either a local MCA or a global MCA.
359 *
360 * Referring Bus_Check.bsi:
361 * 0: Unknown/unclassified
362 * 1: BERR#
363 * 2: BINIT#
364 * 3: Hard Fail
365 * (FIXME: Are these SGI specific or generic bsi values?)
366 */
367 if (pbci->eb)
368 switch (pbci->bsi) {
369 case 0:
370 /* e.g. a load from poisoned memory */
371 return MCA_IS_LOCAL;
372 case 1:
373 case 2:
374 case 3:
375 return MCA_IS_GLOBAL;
376 }
377
378 return MCA_IS_GLOBAL;
379}
380
381/**
382 * recover_from_read_error - Try to recover from errors whose type is "read".
383 * @slidx: pointer of index of SAL error record
384 * @peidx: pointer of index of processor error section
385 * @pbci: pointer of pal_bus_check_info
386 *
387 * Return value:
388 * 1 on Success / 0 on Failure
389 */
390
391static int
392recover_from_read_error(slidx_table_t *slidx, peidx_table_t *peidx, pal_bus_check_info_t *pbci)
393{
394 sal_log_mod_error_info_t *smei;
395 pal_min_state_area_t *pmsa;
396 struct ia64_psr *psr1, *psr2;
397 ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook;
398
399 /* Is target address valid? */
400 if (!pbci->tv)
401 return 0;
402
403 /*
404 * cpu read or memory-mapped io read
405 *
406 * offending process affected process OS MCA do
407 * kernel mode kernel mode down system
408 * kernel mode user mode kill the process
409 * user mode kernel mode down system (*)
410 * user mode user mode kill the process
411 *
412	 * (*) The offending user-mode process could be terminated
413	 *     if (pbci->pv && pbci->pl != 0) *and* you are sure it
414	 *     does not hold any kernel locks.
415 */
416
417 psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr);
418
419 /*
420 * Check the privilege level of interrupted context.
421 * If it is user-mode, then terminate affected process.
422 */
423 if (psr1->cpl != 0) {
424 smei = peidx_bus_check(peidx, 0);
425 if (smei->valid.target_identifier) {
426 /*
427 * setup for resume to bottom half of MCA,
428 * "mca_handler_bhhook"
429 */
430 pmsa = (pal_min_state_area_t *)(sal_to_os_handoff_state->pal_min_state | (6ul<<61));
431 /* pass to bhhook as 1st argument (gr8) */
432 pmsa->pmsa_gr[8-1] = smei->target_identifier;
433 /* set interrupted return address (but no use) */
434 pmsa->pmsa_br0 = pmsa->pmsa_iip;
435 /* change resume address to bottom half */
436 pmsa->pmsa_iip = mca_hdlr_bh->fp;
437 pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp;
438 /* set cpl with kernel mode */
439 psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
440 psr2->cpl = 0;
441 psr2->ri = 0;
442
443 return 1;
444 }
445
446 }
447
448 return 0;
449}
450
451/**
452 * recover_from_platform_error - Recover from platform error.
453 * @slidx: pointer of index of SAL error record
454 * @peidx: pointer of index of processor error section
455 * @pbci: pointer of pal_bus_check_info
456 *
457 * Return value:
458 * 1 on Success / 0 on Failure
459 */
460
461static int
462recover_from_platform_error(slidx_table_t *slidx, peidx_table_t *peidx, pal_bus_check_info_t *pbci)
463{
464 int status = 0;
465 pal_processor_state_info_t *psp = (pal_processor_state_info_t*)peidx_psp(peidx);
466
467 if (psp->bc && pbci->eb && pbci->bsi == 0) {
468 switch(pbci->type) {
469 case 1: /* partial read */
470 case 3: /* full line(cpu) read */
471 case 9: /* I/O space read */
472 status = recover_from_read_error(slidx, peidx, pbci);
473 break;
474 case 0: /* unknown */
475 case 2: /* partial write */
476 case 4: /* full line write */
477 case 5: /* implicit or explicit write-back operation */
478 case 6: /* snoop probe */
479 case 7: /* incoming or outgoing ptc.g */
480 case 8: /* write coalescing transactions */
481 case 10: /* I/O space write */
482 case 11: /* inter-processor interrupt message(IPI) */
483 case 12: /* interrupt acknowledge or external task priority cycle */
484 default:
485 break;
486 }
487 }
488
489 return status;
490}
491
492/**
493 * recover_from_processor_error
494 * @platform: whether there are some platform error section or not
495 * @slidx: pointer of index of SAL error record
496 * @peidx: pointer of index of processor error section
497 * @pbci: pointer of pal_bus_check_info
498 *
499 * Return value:
500 * 1 on Success / 0 on Failure
501 */
502/*
503 * Later we try to recover only when all of the conditions below are satisfied:
504 *  1. Exactly one processor error section exists.
505 *  2. A BUS_CHECK is present and no other checks are (except TLB_CHECK).
506 *  3. There is exactly one BUS_CHECK_INFO entry.
507 *  4. The "external bus error" flag is set and the other flags are not.
508 */
509
510static int
511recover_from_processor_error(int platform, slidx_table_t *slidx, peidx_table_t *peidx, pal_bus_check_info_t *pbci)
512{
513 pal_processor_state_info_t *psp = (pal_processor_state_info_t*)peidx_psp(peidx);
514
515 /*
516	 * We cannot recover from errors other than bus_check.
517 */
518 if (psp->cc || psp->rc || psp->uc)
519 return 0;
520
521 /*
522	 * If there is no bus error, the record is odd, but there is nothing to recover from.
523 */
524 if (psp->bc == 0 || pbci == NULL)
525 return 1;
526
527 /*
528	 * Sorry, we cannot handle more than one bus check.
529 */
530 if (peidx_bus_check_num(peidx) > 1)
531 return 0;
532 /*
533	 * At this point there is exactly one bus error.
534 */
535 if (pbci->ib || pbci->cc)
536 return 0;
537 if (pbci->eb && pbci->bsi > 0)
538 return 0;
539 if (psp->ci == 0)
540 return 0;
541
542 /*
543	 * This is a local MCA and is estimated to be a recoverable external
544	 * bus error (e.g. a load from poisoned memory), which implies that
545	 * some platform error sections are present.
546 */
547 if (platform)
548 return recover_from_platform_error(slidx, peidx, pbci);
549 /*
550	 * The SAL error record is strange, so we cannot recover.
551 */
552 return 0;
553}
554
555/**
556 * mca_try_to_recover - Try to recover from MCA
557 * @rec: pointer to a SAL error record
558 *
559 * Return value:
560 * 1 on Success / 0 on Failure
561 */
562
563static int
564mca_try_to_recover(void *rec,
565 ia64_mca_sal_to_os_state_t *sal_to_os_state,
566 ia64_mca_os_to_sal_state_t *os_to_sal_state)
567{
568 int platform_err;
569 int n_proc_err;
570 slidx_table_t slidx;
571 peidx_table_t peidx;
572 pal_bus_check_info_t pbci;
573
574 /* handoff state from/to mca.c */
575 sal_to_os_handoff_state = sal_to_os_state;
576 os_to_sal_handoff_state = os_to_sal_state;
577
578 /* Make index of SAL error record */
579 platform_err = mca_make_slidx(rec, &slidx);
580
581 /* Count processor error sections */
582 n_proc_err = slidx_count(&slidx, proc_err);
583
584	/* For now, the OS can only recover when there is exactly one processor error section */
585 if (n_proc_err > 1)
586 return 0;
587 else if (n_proc_err == 0) {
588		/* Weird SAL record ... nothing to recover from */
589
590 return 1;
591 }
592
593 /* Make index of processor error section */
594 mca_make_peidx((sal_log_processor_info_t*)slidx_first_entry(&slidx.proc_err)->hdr, &peidx);
595
596 /* Extract Processor BUS_CHECK[0] */
597 *((u64*)&pbci) = peidx_check_info(&peidx, bus_check, 0);
598
599 /* Check whether MCA is global or not */
600 if (is_mca_global(&peidx, &pbci))
601 return 0;
602
603 /* Try to recover a processor error */
604 return recover_from_processor_error(platform_err, &slidx, &peidx, &pbci);
605}
606
607/*
608 * =============================================================================
609 */
610
611int __init mca_external_handler_init(void)
612{
613 if (init_record_index_pools())
614 return -ENOMEM;
615
616 /* register external mca handlers */
617 if (ia64_reg_MCA_extension(mca_try_to_recover)){
618 printk(KERN_ERR "ia64_reg_MCA_extension failed.\n");
619 kfree(slidx_pool.buffer);
620 return -EFAULT;
621 }
622 return 0;
623}
624
625void __exit mca_external_handler_exit(void)
626{
627 /* unregister external mca handlers */
628 ia64_unreg_MCA_extension();
629 kfree(slidx_pool.buffer);
630}
631
632module_init(mca_external_handler_init);
633module_exit(mca_external_handler_exit);
634
635module_param(sal_rec_max, int, 0644);
636MODULE_PARM_DESC(sal_rec_max, "Max size of SAL error record");
637
638MODULE_DESCRIPTION("ia64 platform dependent mca handler driver");
639MODULE_LICENSE("GPL");
diff --git a/arch/ia64/kernel/mca_drv.h b/arch/ia64/kernel/mca_drv.h
new file mode 100644
index 000000000000..0227b761f2c4
--- /dev/null
+++ b/arch/ia64/kernel/mca_drv.h
@@ -0,0 +1,113 @@
1/*
2 * File: mca_drv.h
3 * Purpose: Define helpers for Generic MCA handling
4 *
5 * Copyright (C) 2004 FUJITSU LIMITED
6 * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com)
7 */
8/*
9 * Processor error section:
10 *
11 * +-sal_log_processor_info_t *info-------------+
12 * | sal_log_section_hdr_t header; |
13 * | ... |
14 * | sal_log_mod_error_info_t info[0]; |
15 * +-+----------------+-------------------------+
16 * | CACHE_CHECK | ^ num_cache_check v
17 * +----------------+
18 * | TLB_CHECK | ^ num_tlb_check v
19 * +----------------+
20 * | BUS_CHECK | ^ num_bus_check v
21 * +----------------+
22 * | REG_FILE_CHECK | ^ num_reg_file_check v
23 * +----------------+
24 * | MS_CHECK | ^ num_ms_check v
25 * +-struct cpuid_info *id----------------------+
26 * | regs[5]; |
27 * | reserved; |
28 * +-sal_processor_static_info_t *regs----------+
29 * | valid; |
30 * | ... |
31 * | fr[128]; |
32 * +--------------------------------------------+
33 */
34
35/* peidx: index of processor error section */
36typedef struct peidx_table {
37 sal_log_processor_info_t *info;
38 struct sal_cpuid_info *id;
39 sal_processor_static_info_t *regs;
40} peidx_table_t;
41
42#define peidx_head(p) (((p)->info))
43#define peidx_mid(p) (((p)->id))
44#define peidx_bottom(p) (((p)->regs))
45
46#define peidx_psp(p) (&(peidx_head(p)->proc_state_parameter))
47#define peidx_field_valid(p) (&(peidx_head(p)->valid))
48#define peidx_minstate_area(p) (&(peidx_bottom(p)->min_state_area))
49
50#define peidx_cache_check_num(p) (peidx_head(p)->valid.num_cache_check)
51#define peidx_tlb_check_num(p) (peidx_head(p)->valid.num_tlb_check)
52#define peidx_bus_check_num(p) (peidx_head(p)->valid.num_bus_check)
53#define peidx_reg_file_check_num(p) (peidx_head(p)->valid.num_reg_file_check)
54#define peidx_ms_check_num(p) (peidx_head(p)->valid.num_ms_check)
55
56#define peidx_cache_check_idx(p, n) (n)
57#define peidx_tlb_check_idx(p, n) (peidx_cache_check_idx(p, peidx_cache_check_num(p)) + n)
58#define peidx_bus_check_idx(p, n) (peidx_tlb_check_idx(p, peidx_tlb_check_num(p)) + n)
59#define peidx_reg_file_check_idx(p, n) (peidx_bus_check_idx(p, peidx_bus_check_num(p)) + n)
60#define peidx_ms_check_idx(p, n) (peidx_reg_file_check_idx(p, peidx_reg_file_check_num(p)) + n)
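/*
 * Illustration (hypothetical counts): with num_cache_check = 2,
 * num_tlb_check = 1 and num_bus_check = 1, info[] is laid out as
 * [cache0, cache1, tlb0, bus0], so peidx_bus_check_idx(p, 0)
 * evaluates to 2 + 1 + 0 = 3 and peidx_bus_check(p, 0) yields &info[3].
 */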
61
62#define peidx_mod_error_info(p, name, n) \
63({ int __idx = peidx_##name##_idx(p, n); \
64 sal_log_mod_error_info_t *__ret = NULL; \
65 if (peidx_##name##_num(p) > n) /*BUG*/ \
66 __ret = &(peidx_head(p)->info[__idx]); \
67 __ret; })
68
69#define peidx_cache_check(p, n) peidx_mod_error_info(p, cache_check, n)
70#define peidx_tlb_check(p, n) peidx_mod_error_info(p, tlb_check, n)
71#define peidx_bus_check(p, n) peidx_mod_error_info(p, bus_check, n)
72#define peidx_reg_file_check(p, n) peidx_mod_error_info(p, reg_file_check, n)
73#define peidx_ms_check(p, n) peidx_mod_error_info(p, ms_check, n)
74
75#define peidx_check_info(proc, name, n) \
76({ \
77 sal_log_mod_error_info_t *__info = peidx_mod_error_info(proc, name, n);\
78 u64 __temp = __info && __info->valid.check_info \
79 ? __info->check_info : 0; \
80 __temp; })
81
82/* slidx: index of SAL log error record */
83
84typedef struct slidx_list {
85 struct list_head list;
86 sal_log_section_hdr_t *hdr;
87} slidx_list_t;
88
89typedef struct slidx_table {
90 sal_log_record_header_t *header;
91 int n_sections; /* # of section headers */
92 struct list_head proc_err;
93 struct list_head mem_dev_err;
94 struct list_head sel_dev_err;
95 struct list_head pci_bus_err;
96 struct list_head smbios_dev_err;
97 struct list_head pci_comp_err;
98 struct list_head plat_specific_err;
99 struct list_head host_ctlr_err;
100 struct list_head plat_bus_err;
101 struct list_head unsupported; /* list of unsupported sections */
102} slidx_table_t;
103
104#define slidx_foreach_entry(pos, head) \
105 list_for_each_entry(pos, head, list)
106#define slidx_first_entry(head) \
107 (((head)->next != (head)) ? list_entry((head)->next, typeof(slidx_list_t), list) : NULL)
108#define slidx_count(slidx, sec) \
109({ int __count = 0; \
110 slidx_list_t *__pos; \
111 slidx_foreach_entry(__pos, &((slidx)->sec)) { __count++; }\
112 __count; })
113
diff --git a/arch/ia64/kernel/mca_drv_asm.S b/arch/ia64/kernel/mca_drv_asm.S
new file mode 100644
index 000000000000..bcfa05acc561
--- /dev/null
+++ b/arch/ia64/kernel/mca_drv_asm.S
@@ -0,0 +1,45 @@
1/*
2 * File: mca_drv_asm.S
3 * Purpose: Assembly portion of Generic MCA handling
4 *
5 * Copyright (C) 2004 FUJITSU LIMITED
6 * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com)
7 */
8#include <linux/config.h>
9#include <linux/threads.h>
10
11#include <asm/asmmacro.h>
12#include <asm/processor.h>
13
14GLOBAL_ENTRY(mca_handler_bhhook)
15 invala // clear RSE ?
16 ;; //
17 cover //
18 ;; //
19 clrrrb //
20 ;;
21 alloc r16=ar.pfs,0,2,1,0 // make a new frame
22 ;;
23 mov r13=IA64_KR(CURRENT) // current task pointer
24 ;;
25 adds r12=IA64_TASK_THREAD_KSP_OFFSET,r13
26 ;;
27 ld8 r12=[r12] // stack pointer
28 ;;
29 mov loc0=r16
30 movl loc1=mca_handler_bh // recovery C function
31 ;;
32 mov out0=r8 // poisoned address
33 mov b6=loc1
34 ;;
35 mov loc1=rp
36 ;;
37	br.call.sptk.many rp=b6	// does not return ...
38 ;;
39 mov ar.pfs=loc0
40 mov rp=loc1
41 ;;
42 mov r8=r0
43 br.ret.sptk.many rp
44 ;;
45END(mca_handler_bhhook)
diff --git a/arch/ia64/kernel/minstate.h b/arch/ia64/kernel/minstate.h
new file mode 100644
index 000000000000..1dbc7b2497c9
--- /dev/null
+++ b/arch/ia64/kernel/minstate.h
@@ -0,0 +1,251 @@
1#include <linux/config.h>
2
3#include <asm/cache.h>
4
5#include "entry.h"
6
7/*
8 * For ivt.s we want to access the stack virtually so we don't have to disable translation
9 * on interrupts.
10 *
11 * On entry:
12 * r1: pointer to current task (ar.k6)
13 */
14#define MINSTATE_START_SAVE_MIN_VIRT \
15(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \
16 ;; \
17(pUStk) mov.m r24=ar.rnat; \
18(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of RBS */ \
19(pKStk) mov r1=sp; /* get sp */ \
20 ;; \
21(pUStk) lfetch.fault.excl.nt1 [r22]; \
22(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \
23(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \
24 ;; \
25(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \
26(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \
27 ;; \
28(pUStk) mov r18=ar.bsp; \
29(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */
30
31#define MINSTATE_END_SAVE_MIN_VIRT \
32 bsw.1; /* switch back to bank 1 (must be last in insn group) */ \
33 ;;
34
35/*
36 * For mca_asm.S we want to access the stack physically since the state is saved before we
37 * go virtual and don't want to destroy the iip or ipsr.
38 */
39#define MINSTATE_START_SAVE_MIN_PHYS \
40(pKStk) mov r3=IA64_KR(PER_CPU_DATA);; \
41(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;; \
42(pKStk) ld8 r3 = [r3];; \
43(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;; \
44(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3; \
45(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \
46(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of register backing store */ \
47 ;; \
48(pUStk) mov r24=ar.rnat; \
49(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \
50(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \
51(pUStk) dep r22=-1,r22,61,3; /* compute kernel virtual addr of RBS */ \
52 ;; \
53(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \
54(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \
55 ;; \
56(pUStk) mov r18=ar.bsp; \
57(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \
58
59#define MINSTATE_END_SAVE_MIN_PHYS \
60 dep r12=-1,r12,61,3; /* make sp a kernel virtual address */ \
61 ;;
62
63#ifdef MINSTATE_VIRT
64# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT)
65# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_VIRT
66# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_VIRT
67#endif
68
69#ifdef MINSTATE_PHYS
70# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT);; tpa reg=reg
71# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_PHYS
72# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_PHYS
73#endif
74
75/*
76 * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
77 * the minimum state necessary that allows us to turn psr.ic back
78 * on.
79 *
80 * Assumed state upon entry:
81 * psr.ic: off
82 * r31: contains saved predicates (pr)
83 *
84 * Upon exit, the state is as follows:
85 * psr.ic: off
86 * r2 = points to &pt_regs.r16
87 * r8 = contents of ar.ccv
88 * r9 = contents of ar.csd
89 * r10 = contents of ar.ssd
90 * r11 = FPSR_DEFAULT
91 * r12 = kernel sp (kernel virtual address)
92 * r13 = points to current task_struct (kernel virtual address)
93 * p15 = TRUE if psr.i is set in cr.ipsr
94 * predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
95 * preserved
96 *
97 * Note that psr.ic is NOT turned on by this macro. This is so that
98 * we can pass interruption state as arguments to a handler.
99 */
100#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA) \
101 MINSTATE_GET_CURRENT(r16); /* M (or M;;I) */ \
102 mov r27=ar.rsc; /* M */ \
103 mov r20=r1; /* A */ \
104 mov r25=ar.unat; /* M */ \
105 mov r29=cr.ipsr; /* M */ \
106 mov r26=ar.pfs; /* I */ \
107 mov r28=cr.iip; /* M */ \
108 mov r21=ar.fpsr; /* M */ \
109 COVER; /* B;; (or nothing) */ \
110 ;; \
111 adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16; \
112 ;; \
113 ld1 r17=[r16]; /* load current->thread.on_ustack flag */ \
114 st1 [r16]=r0; /* clear current->thread.on_ustack flag */ \
115 adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 \
116 /* switch from user to kernel RBS: */ \
117 ;; \
118 invala; /* M */ \
119 SAVE_IFS; \
120 cmp.eq pKStk,pUStk=r0,r17; /* are we in kernel mode already? */ \
121 ;; \
122 MINSTATE_START_SAVE_MIN \
123 adds r17=2*L1_CACHE_BYTES,r1; /* really: biggest cache-line size */ \
124 adds r16=PT(CR_IPSR),r1; \
125 ;; \
126 lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES; \
127 st8 [r16]=r29; /* save cr.ipsr */ \
128 ;; \
129 lfetch.fault.excl.nt1 [r17]; \
130 tbit.nz p15,p0=r29,IA64_PSR_I_BIT; \
131 mov r29=b0 \
132 ;; \
133 adds r16=PT(R8),r1; /* initialize first base pointer */ \
134 adds r17=PT(R9),r1; /* initialize second base pointer */ \
135(pKStk) mov r18=r0; /* make sure r18 isn't NaT */ \
136 ;; \
137.mem.offset 0,0; st8.spill [r16]=r8,16; \
138.mem.offset 8,0; st8.spill [r17]=r9,16; \
139 ;; \
140.mem.offset 0,0; st8.spill [r16]=r10,24; \
141.mem.offset 8,0; st8.spill [r17]=r11,24; \
142 ;; \
143 st8 [r16]=r28,16; /* save cr.iip */ \
144 st8 [r17]=r30,16; /* save cr.ifs */ \
145(pUStk) sub r18=r18,r22; /* r18=RSE.ndirty*8 */ \
146 mov r8=ar.ccv; \
147 mov r9=ar.csd; \
148 mov r10=ar.ssd; \
149 movl r11=FPSR_DEFAULT; /* L-unit */ \
150 ;; \
151 st8 [r16]=r25,16; /* save ar.unat */ \
152 st8 [r17]=r26,16; /* save ar.pfs */ \
153 shl r18=r18,16; /* compute ar.rsc to be used for "loadrs" */ \
154 ;; \
155 st8 [r16]=r27,16; /* save ar.rsc */ \
156(pUStk) st8 [r17]=r24,16; /* save ar.rnat */ \
157(pKStk) adds r17=16,r17; /* skip over ar_rnat field */ \
158 ;; /* avoid RAW on r16 & r17 */ \
159(pUStk) st8 [r16]=r23,16; /* save ar.bspstore */ \
160 st8 [r17]=r31,16; /* save predicates */ \
161(pKStk) adds r16=16,r16; /* skip over ar_bspstore field */ \
162 ;; \
163 st8 [r16]=r29,16; /* save b0 */ \
164 st8 [r17]=r18,16; /* save ar.rsc value for "loadrs" */ \
165 cmp.eq pNonSys,pSys=r0,r0 /* initialize pSys=0, pNonSys=1 */ \
166 ;; \
167.mem.offset 0,0; st8.spill [r16]=r20,16; /* save original r1 */ \
168.mem.offset 8,0; st8.spill [r17]=r12,16; \
169 adds r12=-16,r1; /* switch to kernel memory stack (with 16 bytes of scratch) */ \
170 ;; \
171.mem.offset 0,0; st8.spill [r16]=r13,16; \
172.mem.offset 8,0; st8.spill [r17]=r21,16; /* save ar.fpsr */ \
173 mov r13=IA64_KR(CURRENT); /* establish `current' */ \
174 ;; \
175.mem.offset 0,0; st8.spill [r16]=r15,16; \
176.mem.offset 8,0; st8.spill [r17]=r14,16; \
177 ;; \
178.mem.offset 0,0; st8.spill [r16]=r2,16; \
179.mem.offset 8,0; st8.spill [r17]=r3,16; \
180 adds r2=IA64_PT_REGS_R16_OFFSET,r1; \
181 ;; \
182 EXTRA; \
183 movl r1=__gp; /* establish kernel global pointer */ \
184 ;; \
185 MINSTATE_END_SAVE_MIN
186
187/*
188 * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
189 *
190 * Assumed state upon entry:
191 * psr.ic: on
192 * r2: points to &pt_regs.r16
193 * r3: points to &pt_regs.r17
194 * r8: contents of ar.ccv
195 * r9: contents of ar.csd
196 * r10: contents of ar.ssd
197 * r11: FPSR_DEFAULT
198 *
199 * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
200 */
201#define SAVE_REST \
202.mem.offset 0,0; st8.spill [r2]=r16,16; \
203.mem.offset 8,0; st8.spill [r3]=r17,16; \
204 ;; \
205.mem.offset 0,0; st8.spill [r2]=r18,16; \
206.mem.offset 8,0; st8.spill [r3]=r19,16; \
207 ;; \
208.mem.offset 0,0; st8.spill [r2]=r20,16; \
209.mem.offset 8,0; st8.spill [r3]=r21,16; \
210 mov r18=b6; \
211 ;; \
212.mem.offset 0,0; st8.spill [r2]=r22,16; \
213.mem.offset 8,0; st8.spill [r3]=r23,16; \
214 mov r19=b7; \
215 ;; \
216.mem.offset 0,0; st8.spill [r2]=r24,16; \
217.mem.offset 8,0; st8.spill [r3]=r25,16; \
218 ;; \
219.mem.offset 0,0; st8.spill [r2]=r26,16; \
220.mem.offset 8,0; st8.spill [r3]=r27,16; \
221 ;; \
222.mem.offset 0,0; st8.spill [r2]=r28,16; \
223.mem.offset 8,0; st8.spill [r3]=r29,16; \
224 ;; \
225.mem.offset 0,0; st8.spill [r2]=r30,16; \
226.mem.offset 8,0; st8.spill [r3]=r31,32; \
227 ;; \
228 mov ar.fpsr=r11; /* M-unit */ \
229 st8 [r2]=r8,8; /* ar.ccv */ \
230 adds r24=PT(B6)-PT(F7),r3; \
231 ;; \
232 stf.spill [r2]=f6,32; \
233 stf.spill [r3]=f7,32; \
234 ;; \
235 stf.spill [r2]=f8,32; \
236 stf.spill [r3]=f9,32; \
237 ;; \
238 stf.spill [r2]=f10; \
239 stf.spill [r3]=f11; \
240 adds r25=PT(B7)-PT(F11),r3; \
241 ;; \
242 st8 [r24]=r18,16; /* b6 */ \
243 st8 [r25]=r19,16; /* b7 */ \
244 ;; \
245 st8 [r24]=r9; /* ar.csd */ \
246 st8 [r25]=r10; /* ar.ssd */ \
247 ;;
248
249#define SAVE_MIN_WITH_COVER DO_SAVE_MIN(cover, mov r30=cr.ifs,)
250#define SAVE_MIN_WITH_COVER_R19 DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
251#define SAVE_MIN DO_SAVE_MIN( , mov r30=r0, )
diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c
new file mode 100644
index 000000000000..febc091c2f02
--- /dev/null
+++ b/arch/ia64/kernel/module.c
@@ -0,0 +1,952 @@
1/*
2 * IA-64-specific support for kernel module loader.
3 *
4 * Copyright (C) 2003 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com>
6 *
7 * Loosely based on patch by Rusty Russell.
8 */
9
10/* relocs tested so far:
11
12 DIR64LSB
13 FPTR64LSB
14 GPREL22
15 LDXMOV
16 LDXMOV
17 LTOFF22
18 LTOFF22X
19 LTOFF22X
20 LTOFF_FPTR22
21 PCREL21B (for br.call only; br.cond is not supported out of modules!)
22 PCREL60B (for brl.cond only; brl.call is not supported for modules!)
23 PCREL64LSB
24 SECREL32LSB
25 SEGREL64LSB
26 */
27
28#include <linux/config.h>
29
30#include <linux/kernel.h>
31#include <linux/sched.h>
32#include <linux/elf.h>
33#include <linux/moduleloader.h>
34#include <linux/string.h>
35#include <linux/vmalloc.h>
36
37#include <asm/patch.h>
38#include <asm/unaligned.h>
39
40#define ARCH_MODULE_DEBUG 0
41
42#if ARCH_MODULE_DEBUG
43# define DEBUGP printk
44# define inline
45#else
46# define DEBUGP(fmt , a...)
47#endif
48
49#ifdef CONFIG_ITANIUM
50# define USE_BRL 0
51#else
52# define USE_BRL 1
53#endif
54
55#define MAX_LTOFF ((uint64_t) (1 << 22)) /* max. allowable linkage-table offset */
56
57/* Define some relocation helper macros/types: */
58
59#define FORMAT_SHIFT 0
60#define FORMAT_BITS 3
61#define FORMAT_MASK ((1 << FORMAT_BITS) - 1)
62#define VALUE_SHIFT 3
63#define VALUE_BITS 5
64#define VALUE_MASK ((1 << VALUE_BITS) - 1)
65
66enum reloc_target_format {
67 /* direct encoded formats: */
68 RF_NONE = 0,
69 RF_INSN14 = 1,
70 RF_INSN22 = 2,
71 RF_INSN64 = 3,
72 RF_32MSB = 4,
73 RF_32LSB = 5,
74 RF_64MSB = 6,
75 RF_64LSB = 7,
76
77 /* formats that cannot be directly decoded: */
78 RF_INSN60,
79 RF_INSN21B, /* imm21 form 1 */
80 RF_INSN21M, /* imm21 form 2 */
81 RF_INSN21F /* imm21 form 3 */
82};
83
84enum reloc_value_formula {
85 RV_DIRECT = 4, /* S + A */
86 RV_GPREL = 5, /* @gprel(S + A) */
87 RV_LTREL = 6, /* @ltoff(S + A) */
88 RV_PLTREL = 7, /* @pltoff(S + A) */
89 RV_FPTR = 8, /* @fptr(S + A) */
90 RV_PCREL = 9, /* S + A - P */
91 RV_LTREL_FPTR = 10, /* @ltoff(@fptr(S + A)) */
92 RV_SEGREL = 11, /* @segrel(S + A) */
93 RV_SECREL = 12, /* @secrel(S + A) */
94 RV_BDREL = 13, /* BD + A */
95 RV_LTV = 14, /* S + A (like RV_DIRECT, except frozen at static link-time) */
96 RV_PCREL2 = 15, /* S + A - P */
97 RV_SPECIAL = 16, /* various (see below) */
98 RV_RSVD17 = 17,
99 RV_TPREL = 18, /* @tprel(S + A) */
100 RV_LTREL_TPREL = 19, /* @ltoff(@tprel(S + A)) */
101 RV_DTPMOD = 20, /* @dtpmod(S + A) */
102 RV_LTREL_DTPMOD = 21, /* @ltoff(@dtpmod(S + A)) */
103 RV_DTPREL = 22, /* @dtprel(S + A) */
104 RV_LTREL_DTPREL = 23, /* @ltoff(@dtprel(S + A)) */
105 RV_RSVD24 = 24,
106 RV_RSVD25 = 25,
107 RV_RSVD26 = 26,
108 RV_RSVD27 = 27
109 /* 28-31 reserved for implementation-specific purposes. */
110};
111
112#define N(reloc) [R_IA64_##reloc] = #reloc
113
114static const char *reloc_name[256] = {
115 N(NONE), N(IMM14), N(IMM22), N(IMM64),
116 N(DIR32MSB), N(DIR32LSB), N(DIR64MSB), N(DIR64LSB),
117 N(GPREL22), N(GPREL64I), N(GPREL32MSB), N(GPREL32LSB),
118 N(GPREL64MSB), N(GPREL64LSB), N(LTOFF22), N(LTOFF64I),
119 N(PLTOFF22), N(PLTOFF64I), N(PLTOFF64MSB), N(PLTOFF64LSB),
120 N(FPTR64I), N(FPTR32MSB), N(FPTR32LSB), N(FPTR64MSB),
121 N(FPTR64LSB), N(PCREL60B), N(PCREL21B), N(PCREL21M),
122 N(PCREL21F), N(PCREL32MSB), N(PCREL32LSB), N(PCREL64MSB),
123 N(PCREL64LSB), N(LTOFF_FPTR22), N(LTOFF_FPTR64I), N(LTOFF_FPTR32MSB),
124 N(LTOFF_FPTR32LSB), N(LTOFF_FPTR64MSB), N(LTOFF_FPTR64LSB), N(SEGREL32MSB),
125 N(SEGREL32LSB), N(SEGREL64MSB), N(SEGREL64LSB), N(SECREL32MSB),
126 N(SECREL32LSB), N(SECREL64MSB), N(SECREL64LSB), N(REL32MSB),
127 N(REL32LSB), N(REL64MSB), N(REL64LSB), N(LTV32MSB),
128 N(LTV32LSB), N(LTV64MSB), N(LTV64LSB), N(PCREL21BI),
129 N(PCREL22), N(PCREL64I), N(IPLTMSB), N(IPLTLSB),
130 N(COPY), N(LTOFF22X), N(LDXMOV), N(TPREL14),
131 N(TPREL22), N(TPREL64I), N(TPREL64MSB), N(TPREL64LSB),
132 N(LTOFF_TPREL22), N(DTPMOD64MSB), N(DTPMOD64LSB), N(LTOFF_DTPMOD22),
133 N(DTPREL14), N(DTPREL22), N(DTPREL64I), N(DTPREL32MSB),
134 N(DTPREL32LSB), N(DTPREL64MSB), N(DTPREL64LSB), N(LTOFF_DTPREL22)
135};
136
137#undef N
138
139struct got_entry {
140 uint64_t val;
141};
142
143struct fdesc {
144 uint64_t ip;
145 uint64_t gp;
146};
147
148/* Opaque struct for insns, to protect against derefs. */
149struct insn;
150
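/*
 * An insn pointer encodes the 16-byte bundle address in its upper bits and
 * the slot number (0-2) in its two low bits, as decoded by the helpers below.
 */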
151static inline uint64_t
152bundle (const struct insn *insn)
153{
154 return (uint64_t) insn & ~0xfUL;
155}
156
157static inline int
158slot (const struct insn *insn)
159{
160 return (uint64_t) insn & 0x3;
161}
162
163static int
164apply_imm64 (struct module *mod, struct insn *insn, uint64_t val)
165{
166 if (slot(insn) != 2) {
167 printk(KERN_ERR "%s: invalid slot number %d for IMM64\n",
168 mod->name, slot(insn));
169 return 0;
170 }
171 ia64_patch_imm64((u64) insn, val);
172 return 1;
173}
174
175static int
176apply_imm60 (struct module *mod, struct insn *insn, uint64_t val)
177{
178 if (slot(insn) != 2) {
179 printk(KERN_ERR "%s: invalid slot number %d for IMM60\n",
180 mod->name, slot(insn));
181 return 0;
182 }
183 if (val + ((uint64_t) 1 << 59) >= (1UL << 60)) {
184 printk(KERN_ERR "%s: value %ld out of IMM60 range\n", mod->name, (int64_t) val);
185 return 0;
186 }
187 ia64_patch_imm60((u64) insn, val);
188 return 1;
189}
190
191static int
192apply_imm22 (struct module *mod, struct insn *insn, uint64_t val)
193{
194 if (val + (1 << 21) >= (1 << 22)) {
195 printk(KERN_ERR "%s: value %li out of IMM22 range\n", mod->name, (int64_t)val);
196 return 0;
197 }
198 ia64_patch((u64) insn, 0x01fffcfe000UL, ( ((val & 0x200000UL) << 15) /* bit 21 -> 36 */
199 | ((val & 0x1f0000UL) << 6) /* bit 16 -> 22 */
200 | ((val & 0x00ff80UL) << 20) /* bit 7 -> 27 */
201 | ((val & 0x00007fUL) << 13) /* bit 0 -> 13 */));
202 return 1;
203}
204
205static int
206apply_imm21b (struct module *mod, struct insn *insn, uint64_t val)
207{
208 if (val + (1 << 20) >= (1 << 21)) {
209 printk(KERN_ERR "%s: value %li out of IMM21b range\n", mod->name, (int64_t)val);
210 return 0;
211 }
212 ia64_patch((u64) insn, 0x11ffffe000UL, ( ((val & 0x100000UL) << 16) /* bit 20 -> 36 */
213 | ((val & 0x0fffffUL) << 13) /* bit 0 -> 13 */));
214 return 1;
215}
216
217#if USE_BRL
218
219struct plt_entry {
220	/* Two instruction bundles in PLT. */
221 unsigned char bundle[2][16];
222};
223
224static const struct plt_entry ia64_plt_template = {
225 {
226 {
227 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */
228 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, /* movl gp=TARGET_GP */
229 0x00, 0x00, 0x00, 0x60
230 },
231 {
232 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */
233			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /*	     brl.many TARGET_IP */
234 0x08, 0x00, 0x00, 0xc0
235 }
236 }
237};
238
239static int
240patch_plt (struct module *mod, struct plt_entry *plt, long target_ip, unsigned long target_gp)
241{
242 if (apply_imm64(mod, (struct insn *) (plt->bundle[0] + 2), target_gp)
243 && apply_imm60(mod, (struct insn *) (plt->bundle[1] + 2),
244 (target_ip - (int64_t) plt->bundle[1]) / 16))
245 return 1;
246 return 0;
247}
248
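/* Read back the branch target currently encoded in a PLT entry (the inverse of patch_plt). */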
249unsigned long
250plt_target (struct plt_entry *plt)
251{
252 uint64_t b0, b1, *b = (uint64_t *) plt->bundle[1];
253 long off;
254
255 b0 = b[0]; b1 = b[1];
256 off = ( ((b1 & 0x00fffff000000000UL) >> 36) /* imm20b -> bit 0 */
257 | ((b0 >> 48) << 20) | ((b1 & 0x7fffffUL) << 36) /* imm39 -> bit 20 */
258 | ((b1 & 0x0800000000000000UL) << 0)); /* i -> bit 59 */
259 return (long) plt->bundle[1] + 16*off;
260}
261
262#else /* !USE_BRL */
263
264struct plt_entry {
265 /* Three instruction bundles in PLT. */
266 unsigned char bundle[3][16];
267};
268
269static const struct plt_entry ia64_plt_template = {
270 {
271 {
272 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */
273 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* movl r16=TARGET_IP */
274 0x02, 0x00, 0x00, 0x60
275 },
276 {
277 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */
278 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, /* movl gp=TARGET_GP */
279 0x00, 0x00, 0x00, 0x60
280 },
281 {
282 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MIB] nop.m 0 */
283 0x60, 0x80, 0x04, 0x80, 0x03, 0x00, /* mov b6=r16 */
284 0x60, 0x00, 0x80, 0x00 /* br.few b6 */
285 }
286 }
287};
288
289static int
290patch_plt (struct module *mod, struct plt_entry *plt, long target_ip, unsigned long target_gp)
291{
292 if (apply_imm64(mod, (struct insn *) (plt->bundle[0] + 2), target_ip)
293 && apply_imm64(mod, (struct insn *) (plt->bundle[1] + 2), target_gp))
294 return 1;
295 return 0;
296}
297
298unsigned long
299plt_target (struct plt_entry *plt)
300{
301 uint64_t b0, b1, *b = (uint64_t *) plt->bundle[0];
302
303 b0 = b[0]; b1 = b[1];
304 return ( ((b1 & 0x000007f000000000) >> 36) /* imm7b -> bit 0 */
305 | ((b1 & 0x07fc000000000000) >> 43) /* imm9d -> bit 7 */
306 | ((b1 & 0x0003e00000000000) >> 29) /* imm5c -> bit 16 */
307 | ((b1 & 0x0000100000000000) >> 23) /* ic -> bit 21 */
308 | ((b0 >> 46) << 22) | ((b1 & 0x7fffff) << 40) /* imm41 -> bit 22 */
309 | ((b1 & 0x0800000000000000) << 4)); /* i -> bit 63 */
310}
311
312#endif /* !USE_BRL */
313
314void *
315module_alloc (unsigned long size)
316{
317 if (!size)
318 return NULL;
319 return vmalloc(size);
320}
321
322void
323module_free (struct module *mod, void *module_region)
324{
325 if (mod->arch.init_unw_table && module_region == mod->module_init) {
326 unw_remove_unwind_table(mod->arch.init_unw_table);
327 mod->arch.init_unw_table = NULL;
328 }
329 vfree(module_region);
330}
331
332/* Have we already seen one of these relocations? */
333/* FIXME: we could look in other sections, too --RR */
334static int
335duplicate_reloc (const Elf64_Rela *rela, unsigned int num)
336{
337 unsigned int i;
338
339 for (i = 0; i < num; i++) {
340 if (rela[i].r_info == rela[num].r_info && rela[i].r_addend == rela[num].r_addend)
341 return 1;
342 }
343 return 0;
344}
345
346/* Count how many GOT entries we may need */
347static unsigned int
348count_gots (const Elf64_Rela *rela, unsigned int num)
349{
350 unsigned int i, ret = 0;
351
352 /* Sure, this is order(n^2), but it's usually short, and not
353 time critical */
354 for (i = 0; i < num; i++) {
355 switch (ELF64_R_TYPE(rela[i].r_info)) {
356 case R_IA64_LTOFF22:
357 case R_IA64_LTOFF22X:
358 case R_IA64_LTOFF64I:
359 case R_IA64_LTOFF_FPTR22:
360 case R_IA64_LTOFF_FPTR64I:
361 case R_IA64_LTOFF_FPTR32MSB:
362 case R_IA64_LTOFF_FPTR32LSB:
363 case R_IA64_LTOFF_FPTR64MSB:
364 case R_IA64_LTOFF_FPTR64LSB:
365 if (!duplicate_reloc(rela, i))
366 ret++;
367 break;
368 }
369 }
370 return ret;
371}
372
373/* Count how many PLT entries we may need */
374static unsigned int
375count_plts (const Elf64_Rela *rela, unsigned int num)
376{
377 unsigned int i, ret = 0;
378
379 /* Sure, this is order(n^2), but it's usually short, and not
380 time critical */
381 for (i = 0; i < num; i++) {
382 switch (ELF64_R_TYPE(rela[i].r_info)) {
383 case R_IA64_PCREL21B:
384 case R_IA64_PLTOFF22:
385 case R_IA64_PLTOFF64I:
386 case R_IA64_PLTOFF64MSB:
387 case R_IA64_PLTOFF64LSB:
388 case R_IA64_IPLTMSB:
389 case R_IA64_IPLTLSB:
390 if (!duplicate_reloc(rela, i))
391 ret++;
392 break;
393 }
394 }
395 return ret;
396}
397
398/* We need to create a function descriptor for any internal function
399   which is referenced. */
400static unsigned int
401count_fdescs (const Elf64_Rela *rela, unsigned int num)
402{
403 unsigned int i, ret = 0;
404
405 /* Sure, this is order(n^2), but it's usually short, and not time critical. */
406 for (i = 0; i < num; i++) {
407 switch (ELF64_R_TYPE(rela[i].r_info)) {
408 case R_IA64_FPTR64I:
409 case R_IA64_FPTR32LSB:
410 case R_IA64_FPTR32MSB:
411 case R_IA64_FPTR64LSB:
412 case R_IA64_FPTR64MSB:
413 case R_IA64_LTOFF_FPTR22:
414 case R_IA64_LTOFF_FPTR32LSB:
415 case R_IA64_LTOFF_FPTR32MSB:
416 case R_IA64_LTOFF_FPTR64I:
417 case R_IA64_LTOFF_FPTR64LSB:
418 case R_IA64_LTOFF_FPTR64MSB:
419 case R_IA64_IPLTMSB:
420 case R_IA64_IPLTLSB:
421 /*
422 * Jumps to static functions sometimes go straight to their
423 * offset. Of course, that may not be possible if the jump is
424			 * from init -> core or vice versa, so we need to generate an
425 * FDESC (and PLT etc) for that.
426 */
427 case R_IA64_PCREL21B:
428 if (!duplicate_reloc(rela, i))
429 ret++;
430 break;
431 }
432 }
433 return ret;
434}
435
436int
437module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings,
438 struct module *mod)
439{
440 unsigned long core_plts = 0, init_plts = 0, gots = 0, fdescs = 0;
441 Elf64_Shdr *s, *sechdrs_end = sechdrs + ehdr->e_shnum;
442
443 /*
444 * To store the PLTs and function-descriptors, we expand the .text section for
445 * core module-code and the .init.text section for initialization code.
446 */
447 for (s = sechdrs; s < sechdrs_end; ++s)
448 if (strcmp(".core.plt", secstrings + s->sh_name) == 0)
449 mod->arch.core_plt = s;
450 else if (strcmp(".init.plt", secstrings + s->sh_name) == 0)
451 mod->arch.init_plt = s;
452 else if (strcmp(".got", secstrings + s->sh_name) == 0)
453 mod->arch.got = s;
454 else if (strcmp(".opd", secstrings + s->sh_name) == 0)
455 mod->arch.opd = s;
456 else if (strcmp(".IA_64.unwind", secstrings + s->sh_name) == 0)
457 mod->arch.unwind = s;
458
459 if (!mod->arch.core_plt || !mod->arch.init_plt || !mod->arch.got || !mod->arch.opd) {
460 printk(KERN_ERR "%s: sections missing\n", mod->name);
461 return -ENOEXEC;
462 }
463
464 /* GOT and PLTs can occur in any relocated section... */
465 for (s = sechdrs + 1; s < sechdrs_end; ++s) {
466 const Elf64_Rela *rels = (void *)ehdr + s->sh_offset;
467 unsigned long numrels = s->sh_size/sizeof(Elf64_Rela);
468
469 if (s->sh_type != SHT_RELA)
470 continue;
471
472 gots += count_gots(rels, numrels);
473 fdescs += count_fdescs(rels, numrels);
474 if (strstr(secstrings + s->sh_name, ".init"))
475 init_plts += count_plts(rels, numrels);
476 else
477 core_plts += count_plts(rels, numrels);
478 }
479
480 mod->arch.core_plt->sh_type = SHT_NOBITS;
481 mod->arch.core_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
482 mod->arch.core_plt->sh_addralign = 16;
483 mod->arch.core_plt->sh_size = core_plts * sizeof(struct plt_entry);
484 mod->arch.init_plt->sh_type = SHT_NOBITS;
485 mod->arch.init_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
486 mod->arch.init_plt->sh_addralign = 16;
487 mod->arch.init_plt->sh_size = init_plts * sizeof(struct plt_entry);
488 mod->arch.got->sh_type = SHT_NOBITS;
489 mod->arch.got->sh_flags = ARCH_SHF_SMALL | SHF_ALLOC;
490 mod->arch.got->sh_addralign = 8;
491 mod->arch.got->sh_size = gots * sizeof(struct got_entry);
492 mod->arch.opd->sh_type = SHT_NOBITS;
493 mod->arch.opd->sh_flags = SHF_ALLOC;
494 mod->arch.opd->sh_addralign = 8;
495 mod->arch.opd->sh_size = fdescs * sizeof(struct fdesc);
496 DEBUGP("%s: core.plt=%lx, init.plt=%lx, got=%lx, fdesc=%lx\n",
497 __FUNCTION__, mod->arch.core_plt->sh_size, mod->arch.init_plt->sh_size,
498 mod->arch.got->sh_size, mod->arch.opd->sh_size);
499 return 0;
500}
501
502static inline int
503in_init (const struct module *mod, uint64_t addr)
504{
505 return addr - (uint64_t) mod->module_init < mod->init_size;
506}
507
508static inline int
509in_core (const struct module *mod, uint64_t addr)
510{
511 return addr - (uint64_t) mod->module_core < mod->core_size;
512}
513
514static inline int
515is_internal (const struct module *mod, uint64_t value)
516{
517 return in_init(mod, value) || in_core(mod, value);
518}
519
520/*
521 * Get gp-relative offset for the linkage-table entry of VALUE.
522 */
523static uint64_t
524get_ltoff (struct module *mod, uint64_t value, int *okp)
525{
526 struct got_entry *got, *e;
527
528 if (!*okp)
529 return 0;
530
531 got = (void *) mod->arch.got->sh_addr;
532 for (e = got; e < got + mod->arch.next_got_entry; ++e)
533 if (e->val == value)
534 goto found;
535
536 /* Not enough GOT entries? */
537 if (e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size))
538 BUG();
539
540 e->val = value;
541 ++mod->arch.next_got_entry;
542 found:
543 return (uint64_t) e - mod->arch.gp;
544}
545
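/*
 * True when VALUE lies within +/- MAX_LTOFF/2 (2MB) of the module's gp,
 * i.e. it is reachable with a signed 22-bit gp-relative offset.
 */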
546static inline int
547gp_addressable (struct module *mod, uint64_t value)
548{
549 return value - mod->arch.gp + MAX_LTOFF/2 < MAX_LTOFF;
550}
551
552/* Get PC-relative PLT entry for this value. Returns 0 on failure. */
553static uint64_t
554get_plt (struct module *mod, const struct insn *insn, uint64_t value, int *okp)
555{
556 struct plt_entry *plt, *plt_end;
557 uint64_t target_ip, target_gp;
558
559 if (!*okp)
560 return 0;
561
562 if (in_init(mod, (uint64_t) insn)) {
563 plt = (void *) mod->arch.init_plt->sh_addr;
564 plt_end = (void *) plt + mod->arch.init_plt->sh_size;
565 } else {
566 plt = (void *) mod->arch.core_plt->sh_addr;
567 plt_end = (void *) plt + mod->arch.core_plt->sh_size;
568 }
569
570 /* "value" is a pointer to a function-descriptor; fetch the target ip/gp from it: */
571 target_ip = ((uint64_t *) value)[0];
572 target_gp = ((uint64_t *) value)[1];
573
574 /* Look for existing PLT entry. */
575 while (plt->bundle[0][0]) {
576 if (plt_target(plt) == target_ip)
577 goto found;
578 if (++plt >= plt_end)
579 BUG();
580 }
581 *plt = ia64_plt_template;
582 if (!patch_plt(mod, plt, target_ip, target_gp)) {
583 *okp = 0;
584 return 0;
585 }
586#if ARCH_MODULE_DEBUG
587 if (plt_target(plt) != target_ip) {
588 printk("%s: mistargeted PLT: wanted %lx, got %lx\n",
589 __FUNCTION__, target_ip, plt_target(plt));
590 *okp = 0;
591 return 0;
592 }
593#endif
594 found:
595 return (uint64_t) plt;
596}
597
598/* Get function descriptor for VALUE. */
599static uint64_t
600get_fdesc (struct module *mod, uint64_t value, int *okp)
601{
602 struct fdesc *fdesc = (void *) mod->arch.opd->sh_addr;
603
604 if (!*okp)
605 return 0;
606
607 if (!value) {
608 printk(KERN_ERR "%s: fdesc for zero requested!\n", mod->name);
609 return 0;
610 }
611
612 if (!is_internal(mod, value))
613 /*
614 * If it's not a module-local entry-point, "value" already points to a
615 * function-descriptor.
616 */
617 return value;
618
619 /* Look for existing function descriptor. */
620 while (fdesc->ip) {
621 if (fdesc->ip == value)
622 return (uint64_t)fdesc;
623 if ((uint64_t) ++fdesc >= mod->arch.opd->sh_addr + mod->arch.opd->sh_size)
624 BUG();
625 }
626
627 /* Create new one */
628 fdesc->ip = value;
629 fdesc->gp = mod->arch.gp;
630 return (uint64_t) fdesc;
631}
632
633static inline int
634do_reloc (struct module *mod, uint8_t r_type, Elf64_Sym *sym, uint64_t addend,
635 Elf64_Shdr *sec, void *location)
636{
637 enum reloc_target_format format = (r_type >> FORMAT_SHIFT) & FORMAT_MASK;
638 enum reloc_value_formula formula = (r_type >> VALUE_SHIFT) & VALUE_MASK;
639 uint64_t val;
640 int ok = 1;
641
642 val = sym->st_value + addend;
643
644 switch (formula) {
645 case RV_SEGREL: /* segment base is arbitrarily chosen to be 0 for kernel modules */
646 case RV_DIRECT:
647 break;
648
649 case RV_GPREL: val -= mod->arch.gp; break;
650 case RV_LTREL: val = get_ltoff(mod, val, &ok); break;
651 case RV_PLTREL: val = get_plt(mod, location, val, &ok); break;
652 case RV_FPTR: val = get_fdesc(mod, val, &ok); break;
653 case RV_SECREL: val -= sec->sh_addr; break;
654 case RV_LTREL_FPTR: val = get_ltoff(mod, get_fdesc(mod, val, &ok), &ok); break;
655
656 case RV_PCREL:
657 switch (r_type) {
658 case R_IA64_PCREL21B:
659 if ((in_init(mod, val) && in_core(mod, (uint64_t)location)) ||
660 (in_core(mod, val) && in_init(mod, (uint64_t)location))) {
661 /*
662				 * The init section may have been allocated far away from core,
663				 * so if the branch won't reach, allocate a plt for it.
664 */
665 uint64_t delta = ((int64_t)val - (int64_t)location) / 16;
666 if (delta + (1 << 20) >= (1 << 21)) {
667 val = get_fdesc(mod, val, &ok);
668 val = get_plt(mod, location, val, &ok);
669 }
670 } else if (!is_internal(mod, val))
671 val = get_plt(mod, location, val, &ok);
672 /* FALL THROUGH */
673 default:
674 val -= bundle(location);
675 break;
676
677 case R_IA64_PCREL32MSB:
678 case R_IA64_PCREL32LSB:
679 case R_IA64_PCREL64MSB:
680 case R_IA64_PCREL64LSB:
681 val -= (uint64_t) location;
682 break;
683
684 }
685 switch (r_type) {
686 case R_IA64_PCREL60B: format = RF_INSN60; break;
687 case R_IA64_PCREL21B: format = RF_INSN21B; break;
688 case R_IA64_PCREL21M: format = RF_INSN21M; break;
689 case R_IA64_PCREL21F: format = RF_INSN21F; break;
690 default: break;
691 }
692 break;
693
694 case RV_BDREL:
695 val -= (uint64_t) (in_init(mod, val) ? mod->module_init : mod->module_core);
696 break;
697
698 case RV_LTV:
699 /* can link-time value relocs happen here? */
700 BUG();
701 break;
702
703 case RV_PCREL2:
704 if (r_type == R_IA64_PCREL21BI) {
705 if (!is_internal(mod, val)) {
706 printk(KERN_ERR "%s: %s reloc against non-local symbol (%lx)\n",
707 __FUNCTION__, reloc_name[r_type], val);
708 return -ENOEXEC;
709 }
710 format = RF_INSN21B;
711 }
712 val -= bundle(location);
713 break;
714
715 case RV_SPECIAL:
716 switch (r_type) {
717 case R_IA64_IPLTMSB:
718 case R_IA64_IPLTLSB:
719 val = get_fdesc(mod, get_plt(mod, location, val, &ok), &ok);
720 format = RF_64LSB;
721 if (r_type == R_IA64_IPLTMSB)
722 format = RF_64MSB;
723 break;
724
725 case R_IA64_SUB:
726 val = addend - sym->st_value;
727 format = RF_INSN64;
728 break;
729
730 case R_IA64_LTOFF22X:
731 if (gp_addressable(mod, val))
732 val -= mod->arch.gp;
733 else
734 val = get_ltoff(mod, val, &ok);
735 format = RF_INSN22;
736 break;
737
738 case R_IA64_LDXMOV:
739 if (gp_addressable(mod, val)) {
740 /* turn "ld8" into "mov": */
741 DEBUGP("%s: patching ld8 at %p to mov\n", __FUNCTION__, location);
742 ia64_patch((u64) location, 0x1fff80fe000UL, 0x10000000000UL);
743 }
744 return 0;
745
746 default:
747 if (reloc_name[r_type])
748 printk(KERN_ERR "%s: special reloc %s not supported",
749 mod->name, reloc_name[r_type]);
750 else
751 printk(KERN_ERR "%s: unknown special reloc %x\n",
752 mod->name, r_type);
753 return -ENOEXEC;
754 }
755 break;
756
757 case RV_TPREL:
758 case RV_LTREL_TPREL:
759 case RV_DTPMOD:
760 case RV_LTREL_DTPMOD:
761 case RV_DTPREL:
762 case RV_LTREL_DTPREL:
763 printk(KERN_ERR "%s: %s reloc not supported\n",
764 mod->name, reloc_name[r_type] ? reloc_name[r_type] : "?");
765 return -ENOEXEC;
766
767 default:
768 printk(KERN_ERR "%s: unknown reloc %x\n", mod->name, r_type);
769 return -ENOEXEC;
770 }
771
772 if (!ok)
773 return -ENOEXEC;
774
775 DEBUGP("%s: [%p]<-%016lx = %s(%lx)\n", __FUNCTION__, location, val,
776 reloc_name[r_type] ? reloc_name[r_type] : "?", sym->st_value + addend);
777
778 switch (format) {
779 case RF_INSN21B: ok = apply_imm21b(mod, location, (int64_t) val / 16); break;
780 case RF_INSN22: ok = apply_imm22(mod, location, val); break;
781 case RF_INSN64: ok = apply_imm64(mod, location, val); break;
782 case RF_INSN60: ok = apply_imm60(mod, location, (int64_t) val / 16); break;
783 case RF_32LSB: put_unaligned(val, (uint32_t *) location); break;
784 case RF_64LSB: put_unaligned(val, (uint64_t *) location); break;
785 case RF_32MSB: /* ia64 Linux is little-endian... */
786 case RF_64MSB: /* ia64 Linux is little-endian... */
787 case RF_INSN14: /* must be within-module, i.e., resolved by "ld -r" */
788 case RF_INSN21M: /* must be within-module, i.e., resolved by "ld -r" */
789 case RF_INSN21F: /* must be within-module, i.e., resolved by "ld -r" */
790 printk(KERN_ERR "%s: format %u needed by %s reloc is not supported\n",
791 mod->name, format, reloc_name[r_type] ? reloc_name[r_type] : "?");
792 return -ENOEXEC;
793
794 default:
795 printk(KERN_ERR "%s: relocation %s resulted in unknown format %u\n",
796 mod->name, reloc_name[r_type] ? reloc_name[r_type] : "?", format);
797 return -ENOEXEC;
798 }
799 return ok ? 0 : -ENOEXEC;
800}
801
802int
803apply_relocate_add (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex,
804 unsigned int relsec, struct module *mod)
805{
806 unsigned int i, n = sechdrs[relsec].sh_size / sizeof(Elf64_Rela);
807 Elf64_Rela *rela = (void *) sechdrs[relsec].sh_addr;
808 Elf64_Shdr *target_sec;
809 int ret;
810
811 DEBUGP("%s: applying section %u (%u relocs) to %u\n", __FUNCTION__,
812 relsec, n, sechdrs[relsec].sh_info);
813
814 target_sec = sechdrs + sechdrs[relsec].sh_info;
815
816 if (target_sec->sh_entsize == ~0UL)
817 /*
818 * If target section wasn't allocated, we don't need to relocate it.
819 * Happens, e.g., for debug sections.
820 */
821 return 0;
822
823 if (!mod->arch.gp) {
824 /*
825 * XXX Should have an arch-hook for running this after final section
826 * addresses have been selected...
827 */
828 /* See if gp can cover the entire core module: */
829 uint64_t gp = (uint64_t) mod->module_core + MAX_LTOFF / 2;
830 if (mod->core_size >= MAX_LTOFF)
831 /*
832			 * This takes advantage of the fact that ARCH_SHF_SMALL sections get allocated
833 * at the end of the module.
834 */
835 gp = (uint64_t) mod->module_core + mod->core_size - MAX_LTOFF / 2;
836 mod->arch.gp = gp;
837 DEBUGP("%s: placing gp at 0x%lx\n", __FUNCTION__, gp);
838 }
839
840 for (i = 0; i < n; i++) {
841 ret = do_reloc(mod, ELF64_R_TYPE(rela[i].r_info),
842 ((Elf64_Sym *) sechdrs[symindex].sh_addr
843 + ELF64_R_SYM(rela[i].r_info)),
844 rela[i].r_addend, target_sec,
845 (void *) target_sec->sh_addr + rela[i].r_offset);
846 if (ret < 0)
847 return ret;
848 }
849 return 0;
850}
851
852int
853apply_relocate (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex,
854 unsigned int relsec, struct module *mod)
855{
856 printk(KERN_ERR "module %s: REL relocs in section %u unsupported\n", mod->name, relsec);
857 return -ENOEXEC;
858}
859
860/*
861 * Modules contain a single unwind table which covers both the core and the init text
862 * sections but since the two are not contiguous, we need to split this table up such that
863 * we can register (and unregister) each "segment" seperately. Fortunately, this sounds
864 * more complicated than it really is.
865 */
866static void
867register_unwind_table (struct module *mod)
868{
869 struct unw_table_entry *start = (void *) mod->arch.unwind->sh_addr;
870 struct unw_table_entry *end = start + mod->arch.unwind->sh_size / sizeof (*start);
871 struct unw_table_entry tmp, *e1, *e2, *core, *init;
872 unsigned long num_init = 0, num_core = 0;
873
874 /* First, count how many init and core unwind-table entries there are. */
875 for (e1 = start; e1 < end; ++e1)
876 if (in_init(mod, e1->start_offset))
877 ++num_init;
878 else
879 ++num_core;
880 /*
881 * Second, sort the table such that all unwind-table entries for the init and core
882 * text sections are nicely separated. We do this with a stupid bubble sort
883 * (unwind tables don't get ridiculously huge).
884 */
885 for (e1 = start; e1 < end; ++e1) {
886 for (e2 = e1 + 1; e2 < end; ++e2) {
887 if (e2->start_offset < e1->start_offset) {
888 tmp = *e1;
889 *e1 = *e2;
890 *e2 = tmp;
891 }
892 }
893 }
894 /*
895 * Third, locate the init and core segments in the unwind table:
896 */
897 if (in_init(mod, start->start_offset)) {
898 init = start;
899 core = start + num_init;
900 } else {
901 core = start;
902 init = start + num_core;
903 }
904
905 DEBUGP("%s: name=%s, gp=%lx, num_init=%lu, num_core=%lu\n", __FUNCTION__,
906 mod->name, mod->arch.gp, num_init, num_core);
907
908 /*
909 * Fourth, register both tables (if not empty).
910 */
911 if (num_core > 0) {
912 mod->arch.core_unw_table = unw_add_unwind_table(mod->name, 0, mod->arch.gp,
913 core, core + num_core);
914 DEBUGP("%s: core: handle=%p [%p-%p)\n", __FUNCTION__,
915 mod->arch.core_unw_table, core, core + num_core);
916 }
917 if (num_init > 0) {
918 mod->arch.init_unw_table = unw_add_unwind_table(mod->name, 0, mod->arch.gp,
919 init, init + num_init);
920 DEBUGP("%s: init: handle=%p [%p-%p)\n", __FUNCTION__,
921 mod->arch.init_unw_table, init, init + num_init);
922 }
923}
924
925int
926module_finalize (const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod)
927{
928 DEBUGP("%s: init: entry=%p\n", __FUNCTION__, mod->init);
929 if (mod->arch.unwind)
930 register_unwind_table(mod);
931 return 0;
932}
933
934void
935module_arch_cleanup (struct module *mod)
936{
937 if (mod->arch.init_unw_table)
938 unw_remove_unwind_table(mod->arch.init_unw_table);
939 if (mod->arch.core_unw_table)
940 unw_remove_unwind_table(mod->arch.core_unw_table);
941}
942
943#ifdef CONFIG_SMP
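/* Copy the per-CPU template data into each possible CPU's per-CPU area. */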
944void
945percpu_modcopy (void *pcpudst, const void *src, unsigned long size)
946{
947 unsigned int i;
948 for (i = 0; i < NR_CPUS; i++)
949 if (cpu_possible(i))
950 memcpy(pcpudst + __per_cpu_offset[i], src, size);
951}
952#endif /* CONFIG_SMP */
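For orientation only (this is not part of the kernel source): a minimal user-space sketch of the range check that apply_imm22() applies to a gp-relative value, i.e. @gprel(S + A) = S + A - gp as listed in the RV_GPREL formula above. The addresses below are made up.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t gp  = 0xa000000000200000UL;	/* hypothetical module gp */
	uint64_t S_A = 0xa000000000123408UL;	/* hypothetical symbol value + addend */
	uint64_t val = S_A - gp;		/* @gprel(S + A) */

	/* Same unsigned-wraparound trick as apply_imm22(): accept -2^21 <= val < 2^21. */
	if (val + (1 << 21) >= (1 << 22))
		printf("value out of IMM22 range\n");
	else
		printf("gp-relative offset %lld fits in imm22\n", (long long)(int64_t) val);
	return 0;
}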
diff --git a/arch/ia64/kernel/pal.S b/arch/ia64/kernel/pal.S
new file mode 100644
index 000000000000..5018c7f2e7a8
--- /dev/null
+++ b/arch/ia64/kernel/pal.S
@@ -0,0 +1,302 @@
1/*
2 * PAL Firmware support
3 * IA-64 Processor Programmers Reference Vol 2
4 *
5 * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
6 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
7 * Copyright (C) 1999-2001, 2003 Hewlett-Packard Co
8 * David Mosberger <davidm@hpl.hp.com>
9 * Stephane Eranian <eranian@hpl.hp.com>
10 *
11 * 05/22/2000 eranian Added support for stacked register calls
12 * 05/24/2000 eranian Added support for physical mode static calls
13 */
14
15#include <asm/asmmacro.h>
16#include <asm/processor.h>
17
18 .data
19pal_entry_point:
20 data8 ia64_pal_default_handler
21 .text
22
23/*
24 * Set the PAL entry point address. This could be written in C code, but we do it here
25 * to keep it all in one module (besides, it's so trivial that it's
26 * not a big deal).
27 *
28 * in0 Address of the PAL entry point (text address, NOT a function descriptor).
29 */
30GLOBAL_ENTRY(ia64_pal_handler_init)
31 alloc r3=ar.pfs,1,0,0,0
32 movl r2=pal_entry_point
33 ;;
34 st8 [r2]=in0
35 br.ret.sptk.many rp
36END(ia64_pal_handler_init)
37
38/*
39 * Default PAL call handler. This needs to be coded in assembly because it uses
40 * the static calling convention, i.e., the RSE may not be used and calls are
41 * done via "br.cond" (not "br.call").
42 */
43GLOBAL_ENTRY(ia64_pal_default_handler)
44 mov r8=-1
45 br.cond.sptk.many rp
46END(ia64_pal_default_handler)
47
48/*
49 * Make a PAL call using the static calling convention.
50 *
51 * in0 Index of PAL service
52 * in1 - in3 Remaining PAL arguments
53 * in4 1 ==> clear psr.ic, 0 ==> don't clear psr.ic
54 *
55 */
56GLOBAL_ENTRY(ia64_pal_call_static)
57 .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
58 alloc loc1 = ar.pfs,5,5,0,0
59 movl loc2 = pal_entry_point
601: {
61 mov r28 = in0
62 mov r29 = in1
63 mov r8 = ip
64 }
65 ;;
66 ld8 loc2 = [loc2] // loc2 <- entry point
67 tbit.nz p6,p7 = in4, 0
68 adds r8 = 1f-1b,r8
69 mov loc4=ar.rsc // save RSE configuration
70 ;;
71 mov ar.rsc=0 // put RSE in enforced lazy, LE mode
72 mov loc3 = psr
73 mov loc0 = rp
74 .body
75 mov r30 = in2
76
77(p6) rsm psr.i | psr.ic
78 mov r31 = in3
79 mov b7 = loc2
80
81(p7) rsm psr.i
82 ;;
83(p6) srlz.i
84 mov rp = r8
85 br.cond.sptk.many b7
861: mov psr.l = loc3
87 mov ar.rsc = loc4 // restore RSE configuration
88 mov ar.pfs = loc1
89 mov rp = loc0
90 ;;
91	srlz.d				// serialize restoration of psr.l
92 br.ret.sptk.many b0
93END(ia64_pal_call_static)
94
95/*
96 * Make a PAL call using the stacked registers calling convention.
97 *
98 * Inputs:
99 * in0 Index of PAL service
100 * in1 - in3	Remaining PAL arguments
101 */
102GLOBAL_ENTRY(ia64_pal_call_stacked)
103 .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4)
104 alloc loc1 = ar.pfs,4,4,4,0
105 movl loc2 = pal_entry_point
106
107 mov r28 = in0 // Index MUST be copied to r28
108 mov out0 = in0 // AND in0 of PAL function
109 mov loc0 = rp
110 .body
111 ;;
112 ld8 loc2 = [loc2] // loc2 <- entry point
113 mov out1 = in1
114 mov out2 = in2
115 mov out3 = in3
116 mov loc3 = psr
117 ;;
118 rsm psr.i
119 mov b7 = loc2
120 ;;
121 br.call.sptk.many rp=b7 // now make the call
122.ret0: mov psr.l = loc3
123 mov ar.pfs = loc1
124 mov rp = loc0
125 ;;
126 srlz.d // serialize restoration of psr.l
127 br.ret.sptk.many b0
128END(ia64_pal_call_stacked)
129
130/*
131 * Make a physical mode PAL call using the static registers calling convention.
132 *
133 * Inputs:
134 * in0 Index of PAL service
135 * in1 - in3	Remaining PAL arguments
136 *
137 * PSR_LP, PSR_TB, PSR_ID, PSR_DA are never set by the kernel.
138 * So we don't need to clear them.
139 */
140#define PAL_PSR_BITS_TO_CLEAR \
141 (IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_DB | IA64_PSR_RT | \
142 IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED | \
143 IA64_PSR_DFL | IA64_PSR_DFH)
144
145#define PAL_PSR_BITS_TO_SET \
146 (IA64_PSR_BN)
147
148
149GLOBAL_ENTRY(ia64_pal_call_phys_static)
150 .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4)
151 alloc loc1 = ar.pfs,4,7,0,0
152 movl loc2 = pal_entry_point
1531: {
154 mov r28 = in0 // copy procedure index
155 mov r8 = ip // save ip to compute branch
156 mov loc0 = rp // save rp
157 }
158 .body
159 ;;
160 ld8 loc2 = [loc2] // loc2 <- entry point
161 mov r29 = in1 // first argument
162 mov r30 = in2 // copy arg2
163 mov r31 = in3 // copy arg3
164 ;;
165 mov loc3 = psr // save psr
166 adds r8 = 1f-1b,r8 // calculate return address for call
167 ;;
168 mov loc4=ar.rsc // save RSE configuration
169 dep.z loc2=loc2,0,61 // convert pal entry point to physical
170 tpa r8=r8 // convert rp to physical
171 ;;
172 mov b7 = loc2 // install target to branch reg
173 mov ar.rsc=0 // put RSE in enforced lazy, LE mode
174 movl r16=PAL_PSR_BITS_TO_CLEAR
175 movl r17=PAL_PSR_BITS_TO_SET
176 ;;
177 or loc3=loc3,r17 // add in psr the bits to set
178 ;;
179 andcm r16=loc3,r16 // removes bits to clear from psr
180 br.call.sptk.many rp=ia64_switch_mode_phys
181.ret1: mov rp = r8 // install return address (physical)
182 mov loc5 = r19
183 mov loc6 = r20
184 br.cond.sptk.many b7
1851:
186 mov ar.rsc=0 // put RSE in enforced lazy, LE mode
187 mov r16=loc3 // r16= original psr
188 mov r19=loc5
189 mov r20=loc6
190 br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
191.ret2:
192 mov psr.l = loc3 // restore init PSR
193
194 mov ar.pfs = loc1
195 mov rp = loc0
196 ;;
197 mov ar.rsc=loc4 // restore RSE configuration
198	srlz.d				// serialize restoration of psr.l
199 br.ret.sptk.many b0
200END(ia64_pal_call_phys_static)
201
202/*
203 * Make a PAL call using the stacked registers in physical mode.
204 *
205 * Inputs:
206 * in0 Index of PAL service
207 * in1 - in3	Remaining PAL arguments
208 */
209GLOBAL_ENTRY(ia64_pal_call_phys_stacked)
210 .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
211 alloc loc1 = ar.pfs,5,7,4,0
212 movl loc2 = pal_entry_point
2131: {
214 mov r28 = in0 // copy procedure index
215 mov loc0 = rp // save rp
216 }
217 .body
218 ;;
219 ld8 loc2 = [loc2] // loc2 <- entry point
220 mov out0 = in0 // first argument
221 mov out1 = in1 // copy arg2
222 mov out2 = in2 // copy arg3
223	mov out3 = in3		// copy arg4
224 ;;
225 mov loc3 = psr // save psr
226 ;;
227 mov loc4=ar.rsc // save RSE configuration
228 dep.z loc2=loc2,0,61 // convert pal entry point to physical
229 ;;
230 mov ar.rsc=0 // put RSE in enforced lazy, LE mode
231 movl r16=PAL_PSR_BITS_TO_CLEAR
232 movl r17=PAL_PSR_BITS_TO_SET
233 ;;
234 or loc3=loc3,r17 // add in psr the bits to set
235 mov b7 = loc2 // install target to branch reg
236 ;;
237 andcm r16=loc3,r16 // removes bits to clear from psr
238 br.call.sptk.many rp=ia64_switch_mode_phys
239.ret6:
240 mov loc5 = r19
241 mov loc6 = r20
242 br.call.sptk.many rp=b7 // now make the call
243.ret7:
244 mov ar.rsc=0 // put RSE in enforced lazy, LE mode
245 mov r16=loc3 // r16= original psr
246 mov r19=loc5
247 mov r20=loc6
248 br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
249
250.ret8: mov psr.l = loc3 // restore init PSR
251 mov ar.pfs = loc1
252 mov rp = loc0
253 ;;
254 mov ar.rsc=loc4 // restore RSE configuration
255	srlz.d				// serialize restoration of psr.l
256 br.ret.sptk.many b0
257END(ia64_pal_call_phys_stacked)
258
259/*
260 * Save the scratch fp regs which aren't already saved in pt_regs (f10-f15).
261 *
262 * NOTE: We need to do this since firmware (SAL and PAL) may use any of the
263 * scratch regs in the low floating-point partition.
264 *
265 * Inputs:
266 * in0 Address of stack storage for fp regs
267 */
268GLOBAL_ENTRY(ia64_save_scratch_fpregs)
269 alloc r3=ar.pfs,1,0,0,0
270 add r2=16,in0
271 ;;
272 stf.spill [in0] = f10,32
273 stf.spill [r2] = f11,32
274 ;;
275 stf.spill [in0] = f12,32
276 stf.spill [r2] = f13,32
277 ;;
278 stf.spill [in0] = f14,32
279 stf.spill [r2] = f15,32
280 br.ret.sptk.many rp
281END(ia64_save_scratch_fpregs)
282
283/*
284 * Load the scratch fp regs (f10-f15).
285 *
286 * Inputs:
287 * in0 Address of stack storage for fp regs
288 */
289GLOBAL_ENTRY(ia64_load_scratch_fpregs)
290 alloc r3=ar.pfs,1,0,0,0
291 add r2=16,in0
292 ;;
293 ldf.fill f10 = [in0],32
294 ldf.fill f11 = [r2],32
295 ;;
296 ldf.fill f12 = [in0],32
297 ldf.fill f13 = [r2],32
298 ;;
299 ldf.fill f14 = [in0],32
300 ldf.fill f15 = [r2],32
301 br.ret.sptk.many rp
302END(ia64_load_scratch_fpregs)
diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c
new file mode 100644
index 000000000000..25e7c8344564
--- /dev/null
+++ b/arch/ia64/kernel/palinfo.c
@@ -0,0 +1,1023 @@
1/*
2 * palinfo.c
3 *
4 * Prints processor specific information reported by PAL.
5 * This code is based on the specification of PAL as of the
6 * Intel IA-64 Architecture Software Developer's Manual v1.0.
7 *
8 *
9 * Copyright (C) 2000-2001, 2003 Hewlett-Packard Co
10 * Stephane Eranian <eranian@hpl.hp.com>
11 * Copyright (C) 2004 Intel Corporation
12 * Ashok Raj <ashok.raj@intel.com>
13 *
14 * 05/26/2000 S.Eranian initial release
15 * 08/21/2000 S.Eranian updated to July 2000 PAL specs
16 * 02/05/2001 S.Eranian fixed module support
17 * 10/23/2001 S.Eranian updated pal_perf_mon_info bug fixes
18 * 03/24/2004 Ashok Raj updated to work with CPU Hotplug
19 */
20#include <linux/config.h>
21#include <linux/types.h>
22#include <linux/errno.h>
23#include <linux/init.h>
24#include <linux/proc_fs.h>
25#include <linux/mm.h>
26#include <linux/module.h>
27#include <linux/efi.h>
28#include <linux/notifier.h>
29#include <linux/cpu.h>
30#include <linux/cpumask.h>
31
32#include <asm/pal.h>
33#include <asm/sal.h>
34#include <asm/page.h>
35#include <asm/processor.h>
36#include <linux/smp.h>
37
38MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
39MODULE_DESCRIPTION("/proc interface to IA-64 PAL");
40MODULE_LICENSE("GPL");
41
42#define PALINFO_VERSION "0.5"
43
44typedef int (*palinfo_func_t)(char*);
45
46typedef struct {
47 const char *name; /* name of the proc entry */
48 palinfo_func_t proc_read; /* function to call for reading */
49 struct proc_dir_entry *entry; /* registered entry (removal) */
50} palinfo_entry_t;
51
52
53/*
54 * A bunch of string arrays used for pretty printing
55 */
56
57static char *cache_types[] = {
58 "", /* not used */
59 "Instruction",
60 "Data",
61 "Data/Instruction" /* unified */
62};
63
64static const char *cache_mattrib[]={
65 "WriteThrough",
66 "WriteBack",
67 "", /* reserved */
68 "" /* reserved */
69};
70
71static const char *cache_st_hints[]={
72 "Temporal, level 1",
73 "Reserved",
74 "Reserved",
75 "Non-temporal, all levels",
76 "Reserved",
77 "Reserved",
78 "Reserved",
79 "Reserved"
80};
81
82static const char *cache_ld_hints[]={
83 "Temporal, level 1",
84 "Non-temporal, level 1",
85 "Reserved",
86 "Non-temporal, all levels",
87 "Reserved",
88 "Reserved",
89 "Reserved",
90 "Reserved"
91};
92
93static const char *rse_hints[]={
94 "enforced lazy",
95 "eager stores",
96 "eager loads",
97 "eager loads and stores"
98};
99
100#define RSE_HINTS_COUNT ARRAY_SIZE(rse_hints)
101
102static const char *mem_attrib[]={
103 "WB", /* 000 */
104 "SW", /* 001 */
105 "010", /* 010 */
106 "011", /* 011 */
107 "UC", /* 100 */
108 "UCE", /* 101 */
109 "WC", /* 110 */
110 "NaTPage" /* 111 */
111};
112
113/*
114 * Takes a 64-bit vector and produces a string such that
115 * if bit n is set then 2^n in clear text is generated. The adjustment
116 * to the right unit is also done.
117 *
118 * Input:
119 * - a pointer to a buffer to hold the string
120 * - a 64-bit vector
121 * Output:
122 * - a pointer to the end of the buffer
123 *
124 */
125static char *
126bitvector_process(char *p, u64 vector)
127{
128 int i,j;
129 const char *units[]={ "", "K", "M", "G", "T" };
130
131 for (i=0, j=0; i < 64; i++ , j=i/10) {
132 if (vector & 0x1) {
133 p += sprintf(p, "%d%s ", 1 << (i-j*10), units[j]);
134 }
135 vector >>= 1;
136 }
137 return p;
138}
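/*
 * Worked example (illustrative): for vector = (1UL<<12) | (1UL<<16),
 * bit 12 yields 1<<(12-10) with unit "K" and bit 16 yields 1<<(16-10)
 * with unit "K", so the buffer receives "4K 64K ".
 */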
139
140/*
141 * Takes a 64-bit vector and produces a string such that
142 * if bit n is set then register n is present. The function
143 * takes into account consecutive registers and prints out ranges.
144 *
145 * Input:
146 * - a pointer to a buffer to hold the string
147 * - a 64-bit vector
148 * Output:
149 * - a pointer to the end of the buffer
150 *
151 */
152static char *
153bitregister_process(char *p, u64 *reg_info, int max)
154{
155 int i, begin, skip = 0;
156 u64 value = reg_info[0];
157
158 value >>= i = begin = ffs(value) - 1;
159
160 for(; i < max; i++ ) {
161
162 if (i != 0 && (i%64) == 0) value = *++reg_info;
163
164 if ((value & 0x1) == 0 && skip == 0) {
165 if (begin <= i - 2)
166 p += sprintf(p, "%d-%d ", begin, i-1);
167 else
168 p += sprintf(p, "%d ", i-1);
169 skip = 1;
170 begin = -1;
171 } else if ((value & 0x1) && skip == 1) {
172 skip = 0;
173 begin = i;
174 }
175 value >>=1;
176 }
177 if (begin > -1) {
178 if (begin < 127)
179 p += sprintf(p, "%d-127", begin);
180 else
181 p += sprintf(p, "127");
182 }
183
184 return p;
185}
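/*
 * Worked example (illustrative): with reg_info[0] = 0xff and the remaining
 * words zero, only registers 0-7 are present and the buffer receives "0-7 ".
 */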
186
187static int
188power_info(char *page)
189{
190 s64 status;
191 char *p = page;
192 u64 halt_info_buffer[8];
193 pal_power_mgmt_info_u_t *halt_info =(pal_power_mgmt_info_u_t *)halt_info_buffer;
194 int i;
195
196 status = ia64_pal_halt_info(halt_info);
197 if (status != 0) return 0;
198
199 for (i=0; i < 8 ; i++ ) {
200 if (halt_info[i].pal_power_mgmt_info_s.im == 1) {
201 p += sprintf(p, "Power level %d:\n"
202 "\tentry_latency : %d cycles\n"
203 "\texit_latency : %d cycles\n"
204 "\tpower consumption : %d mW\n"
205 "\tCache+TLB coherency : %s\n", i,
206 halt_info[i].pal_power_mgmt_info_s.entry_latency,
207 halt_info[i].pal_power_mgmt_info_s.exit_latency,
208 halt_info[i].pal_power_mgmt_info_s.power_consumption,
209 halt_info[i].pal_power_mgmt_info_s.co ? "Yes" : "No");
210 } else {
211 p += sprintf(p,"Power level %d: not implemented\n",i);
212 }
213 }
214 return p - page;
215}
216
217static int
218cache_info(char *page)
219{
220 char *p = page;
221 u64 i, levels, unique_caches;
222 pal_cache_config_info_t cci;
223 int j, k;
224 s64 status;
225
226 if ((status = ia64_pal_cache_summary(&levels, &unique_caches)) != 0) {
227 printk(KERN_ERR "ia64_pal_cache_summary=%ld\n", status);
228 return 0;
229 }
230
231 p += sprintf(p, "Cache levels : %ld\nUnique caches : %ld\n\n", levels, unique_caches);
232
233 for (i=0; i < levels; i++) {
234
235 for (j=2; j >0 ; j--) {
236
237			/* even without unification, some levels may not be present */
238 if ((status=ia64_pal_cache_config_info(i,j, &cci)) != 0) {
239 continue;
240 }
241 p += sprintf(p,
242 "%s Cache level %lu:\n"
243 "\tSize : %lu bytes\n"
244 "\tAttributes : ",
245 cache_types[j+cci.pcci_unified], i+1,
246 cci.pcci_cache_size);
247
248 if (cci.pcci_unified) p += sprintf(p, "Unified ");
249
250 p += sprintf(p, "%s\n", cache_mattrib[cci.pcci_cache_attr]);
251
252 p += sprintf(p,
253 "\tAssociativity : %d\n"
254 "\tLine size : %d bytes\n"
255 "\tStride : %d bytes\n",
256 cci.pcci_assoc, 1<<cci.pcci_line_size, 1<<cci.pcci_stride);
257 if (j == 1)
258 p += sprintf(p, "\tStore latency : N/A\n");
259 else
260 p += sprintf(p, "\tStore latency : %d cycle(s)\n",
261 cci.pcci_st_latency);
262
263 p += sprintf(p,
264 "\tLoad latency : %d cycle(s)\n"
265 "\tStore hints : ", cci.pcci_ld_latency);
266
267 for(k=0; k < 8; k++ ) {
268 if ( cci.pcci_st_hints & 0x1)
269 p += sprintf(p, "[%s]", cache_st_hints[k]);
270 cci.pcci_st_hints >>=1;
271 }
272 p += sprintf(p, "\n\tLoad hints : ");
273
274 for(k=0; k < 8; k++ ) {
275 if (cci.pcci_ld_hints & 0x1)
276 p += sprintf(p, "[%s]", cache_ld_hints[k]);
277 cci.pcci_ld_hints >>=1;
278 }
279 p += sprintf(p,
280 "\n\tAlias boundary : %d byte(s)\n"
281 "\tTag LSB : %d\n"
282 "\tTag MSB : %d\n",
283 1<<cci.pcci_alias_boundary, cci.pcci_tag_lsb,
284 cci.pcci_tag_msb);
285
286 /* when unified, data(j=2) is enough */
287 if (cci.pcci_unified) break;
288 }
289 }
290 return p - page;
291}
292
293
294static int
295vm_info(char *page)
296{
297 char *p = page;
298 u64 tr_pages =0, vw_pages=0, tc_pages;
299 u64 attrib;
300 pal_vm_info_1_u_t vm_info_1;
301 pal_vm_info_2_u_t vm_info_2;
302 pal_tc_info_u_t tc_info;
303 ia64_ptce_info_t ptce;
304 const char *sep;
305 int i, j;
306 s64 status;
307
308 if ((status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2)) !=0) {
309 printk(KERN_ERR "ia64_pal_vm_summary=%ld\n", status);
310 return 0;
311 }
312
313
314 p += sprintf(p,
315 "Physical Address Space : %d bits\n"
316 "Virtual Address Space : %d bits\n"
317 "Protection Key Registers(PKR) : %d\n"
318 "Implemented bits in PKR.key : %d\n"
319 "Hash Tag ID : 0x%x\n"
320 "Size of RR.rid : %d\n",
321 vm_info_1.pal_vm_info_1_s.phys_add_size,
322 vm_info_2.pal_vm_info_2_s.impl_va_msb+1, vm_info_1.pal_vm_info_1_s.max_pkr+1,
323 vm_info_1.pal_vm_info_1_s.key_size, vm_info_1.pal_vm_info_1_s.hash_tag_id,
324 vm_info_2.pal_vm_info_2_s.rid_size);
325
326 if (ia64_pal_mem_attrib(&attrib) != 0)
327 return 0;
328
329 p += sprintf(p, "Supported memory attributes : ");
330 sep = "";
331 for (i = 0; i < 8; i++) {
332 if (attrib & (1 << i)) {
333 p += sprintf(p, "%s%s", sep, mem_attrib[i]);
334 sep = ", ";
335 }
336 }
337 p += sprintf(p, "\n");
338
339 if ((status = ia64_pal_vm_page_size(&tr_pages, &vw_pages)) !=0) {
340 printk(KERN_ERR "ia64_pal_vm_page_size=%ld\n", status);
341 return 0;
342 }
343
344 p += sprintf(p,
345 "\nTLB walker : %simplemented\n"
346 "Number of DTR : %d\n"
347 "Number of ITR : %d\n"
348 "TLB insertable page sizes : ",
349 vm_info_1.pal_vm_info_1_s.vw ? "" : "not ",
350 vm_info_1.pal_vm_info_1_s.max_dtr_entry+1,
351 vm_info_1.pal_vm_info_1_s.max_itr_entry+1);
352
353
354 p = bitvector_process(p, tr_pages);
355
356 p += sprintf(p, "\nTLB purgeable page sizes : ");
357
358 p = bitvector_process(p, vw_pages);
359
360 if ((status=ia64_get_ptce(&ptce)) != 0) {
361 printk(KERN_ERR "ia64_get_ptce=%ld\n", status);
362 return 0;
363 }
364
365 p += sprintf(p,
366 "\nPurge base address : 0x%016lx\n"
367 "Purge outer loop count : %d\n"
368 "Purge inner loop count : %d\n"
369 "Purge outer loop stride : %d\n"
370 "Purge inner loop stride : %d\n",
371 ptce.base, ptce.count[0], ptce.count[1], ptce.stride[0], ptce.stride[1]);
372
373 p += sprintf(p,
374 "TC Levels : %d\n"
375 "Unique TC(s) : %d\n",
376 vm_info_1.pal_vm_info_1_s.num_tc_levels,
377 vm_info_1.pal_vm_info_1_s.max_unique_tcs);
378
379 for(i=0; i < vm_info_1.pal_vm_info_1_s.num_tc_levels; i++) {
380 for (j=2; j>0 ; j--) {
381 tc_pages = 0; /* just in case */
382
383
384 /* even without unification, some levels may not be present */
385 if ((status=ia64_pal_vm_info(i,j, &tc_info, &tc_pages)) != 0) {
386 continue;
387 }
388
389 p += sprintf(p,
390 "\n%s Translation Cache Level %d:\n"
391 "\tHash sets : %d\n"
392 "\tAssociativity : %d\n"
393 "\tNumber of entries : %d\n"
394 "\tFlags : ",
395 cache_types[j+tc_info.tc_unified], i+1, tc_info.tc_num_sets,
396 tc_info.tc_associativity, tc_info.tc_num_entries);
397
398 if (tc_info.tc_pf) p += sprintf(p, "PreferredPageSizeOptimized ");
399 if (tc_info.tc_unified) p += sprintf(p, "Unified ");
400 if (tc_info.tc_reduce_tr) p += sprintf(p, "TCReduction");
401
402 p += sprintf(p, "\n\tSupported page sizes: ");
403
404 p = bitvector_process(p, tc_pages);
405
406			/* when unified, data (j=2) is enough */
407 if (tc_info.tc_unified) break;
408 }
409 }
410 p += sprintf(p, "\n");
411
412 return p - page;
413}
414
415
416static int
417register_info(char *page)
418{
419 char *p = page;
420 u64 reg_info[2];
421 u64 info;
422 u64 phys_stacked;
423 pal_hints_u_t hints;
424 u64 iregs, dregs;
425 char *info_type[]={
426 "Implemented AR(s)",
427 "AR(s) with read side-effects",
428 "Implemented CR(s)",
429 "CR(s) with read side-effects",
430 };
431
432 for(info=0; info < 4; info++) {
433
434 if (ia64_pal_register_info(info, &reg_info[0], &reg_info[1]) != 0) return 0;
435
436 p += sprintf(p, "%-32s : ", info_type[info]);
437
438 p = bitregister_process(p, reg_info, 128);
439
440 p += sprintf(p, "\n");
441 }
442
443 if (ia64_pal_rse_info(&phys_stacked, &hints) != 0) return 0;
444
445 p += sprintf(p,
446 "RSE stacked physical registers : %ld\n"
447 "RSE load/store hints : %ld (%s)\n",
448 phys_stacked, hints.ph_data,
449 hints.ph_data < RSE_HINTS_COUNT ? rse_hints[hints.ph_data]: "(??)");
450
451 if (ia64_pal_debug_info(&iregs, &dregs))
452 return 0;
453
454 p += sprintf(p,
455 "Instruction debug register pairs : %ld\n"
456 "Data debug register pairs : %ld\n", iregs, dregs);
457
458 return p - page;
459}
460
461static const char *proc_features[]={
462 NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
463 NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL,
464 NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
465 NULL,NULL,NULL,NULL,NULL, NULL,NULL,NULL,NULL,
466 NULL,NULL,NULL,NULL,NULL,
467 "XIP,XPSR,XFS implemented",
468 "XR1-XR3 implemented",
469 "Disable dynamic predicate prediction",
470 "Disable processor physical number",
471 "Disable dynamic data cache prefetch",
472 "Disable dynamic inst cache prefetch",
473 "Disable dynamic branch prediction",
474 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
475 "Disable BINIT on processor time-out",
476 "Disable dynamic power management (DPM)",
477 "Disable coherency",
478 "Disable cache",
479 "Enable CMCI promotion",
480 "Enable MCA to BINIT promotion",
481 "Enable MCA promotion",
482 "Enable BERR promotion"
483};
484
485
486static int
487processor_info(char *page)
488{
489 char *p = page;
490 const char **v = proc_features;
491 u64 avail=1, status=1, control=1;
492 int i;
493 s64 ret;
494
495 if ((ret=ia64_pal_proc_get_features(&avail, &status, &control)) != 0) return 0;
496
497 for(i=0; i < 64; i++, v++,avail >>=1, status >>=1, control >>=1) {
498 if ( ! *v ) continue;
499 p += sprintf(p, "%-40s : %s%s %s\n", *v,
500 avail & 0x1 ? "" : "NotImpl",
501 avail & 0x1 ? (status & 0x1 ? "On" : "Off"): "",
502 avail & 0x1 ? (control & 0x1 ? "Ctrl" : "NoCtrl"): "");
503 }
504 return p - page;
505}
506
507static const char *bus_features[]={
508 NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
509 NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL,
510 NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
511 NULL,NULL,
512 "Request Bus Parking",
513 "Bus Lock Mask",
514 "Enable Half Transfer",
515 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
516 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
517 NULL, NULL, NULL, NULL,
518 "Enable Cache Line Repl. Shared",
519 "Enable Cache Line Repl. Exclusive",
520 "Disable Transaction Queuing",
521 "Disable Response Error Checking",
522 "Disable Bus Error Checking",
523 "Disable Bus Requester Internal Error Signalling",
524 "Disable Bus Requester Error Signalling",
525 "Disable Bus Initialization Event Checking",
526 "Disable Bus Initialization Event Signalling",
527 "Disable Bus Address Error Checking",
528 "Disable Bus Address Error Signalling",
529 "Disable Bus Data Error Checking"
530};
531
532
533static int
534bus_info(char *page)
535{
536 char *p = page;
537 const char **v = bus_features;
538 pal_bus_features_u_t av, st, ct;
539 u64 avail, status, control;
540 int i;
541 s64 ret;
542
543 if ((ret=ia64_pal_bus_get_features(&av, &st, &ct)) != 0) return 0;
544
545 avail = av.pal_bus_features_val;
546 status = st.pal_bus_features_val;
547 control = ct.pal_bus_features_val;
548
549 for(i=0; i < 64; i++, v++, avail >>=1, status >>=1, control >>=1) {
550 if ( ! *v ) continue;
551 p += sprintf(p, "%-48s : %s%s %s\n", *v,
552 avail & 0x1 ? "" : "NotImpl",
553 avail & 0x1 ? (status & 0x1 ? "On" : "Off"): "",
554 avail & 0x1 ? (control & 0x1 ? "Ctrl" : "NoCtrl"): "");
555 }
556 return p - page;
557}
558
559static int
560version_info(char *page)
561{
562 pal_version_u_t min_ver, cur_ver;
563 char *p = page;
564
565 /* The PAL_VERSION call is advertised as being able to support
566 * both physical and virtual mode calls. This seems to be a documentation
567 * bug rather than a firmware bug. In fact, it only supports physical mode.
568 * So now the code reflects this fact and the pal_version() has been updated
569 * accordingly.
570 */
571 if (ia64_pal_version(&min_ver, &cur_ver) != 0) return 0;
572
573 p += sprintf(p,
574 "PAL_vendor : 0x%02x (min=0x%02x)\n"
575 "PAL_A : %x.%x.%x (min=%x.%x.%x)\n"
576 "PAL_B : %x.%x.%x (min=%x.%x.%x)\n",
577 cur_ver.pal_version_s.pv_pal_vendor, min_ver.pal_version_s.pv_pal_vendor,
578
579 cur_ver.pal_version_s.pv_pal_a_model>>4,
580 cur_ver.pal_version_s.pv_pal_a_model&0xf, cur_ver.pal_version_s.pv_pal_a_rev,
581 min_ver.pal_version_s.pv_pal_a_model>>4,
582 min_ver.pal_version_s.pv_pal_a_model&0xf, min_ver.pal_version_s.pv_pal_a_rev,
583
584 cur_ver.pal_version_s.pv_pal_b_model>>4,
585 cur_ver.pal_version_s.pv_pal_b_model&0xf, cur_ver.pal_version_s.pv_pal_b_rev,
586 min_ver.pal_version_s.pv_pal_b_model>>4,
587 min_ver.pal_version_s.pv_pal_b_model&0xf, min_ver.pal_version_s.pv_pal_b_rev);
588 return p - page;
589}
590
591static int
592perfmon_info(char *page)
593{
594 char *p = page;
595 u64 pm_buffer[16];
596 pal_perf_mon_info_u_t pm_info;
597
598 if (ia64_pal_perf_mon_info(pm_buffer, &pm_info) != 0) return 0;
599
600 p += sprintf(p,
601 "PMC/PMD pairs : %d\n"
602 "Counter width : %d bits\n"
603 "Cycle event number : %d\n"
604 "Retired event number : %d\n"
605 "Implemented PMC : ",
606 pm_info.pal_perf_mon_info_s.generic, pm_info.pal_perf_mon_info_s.width,
607 pm_info.pal_perf_mon_info_s.cycles, pm_info.pal_perf_mon_info_s.retired);
608
609 p = bitregister_process(p, pm_buffer, 256);
610 p += sprintf(p, "\nImplemented PMD : ");
611 p = bitregister_process(p, pm_buffer+4, 256);
612 p += sprintf(p, "\nCycles count capable : ");
613 p = bitregister_process(p, pm_buffer+8, 256);
614 p += sprintf(p, "\nRetired bundles count capable : ");
615
616#ifdef CONFIG_ITANIUM
617 /*
618 * PAL_PERF_MON_INFO reports that only PMC4 can be used to count CPU_CYCLES
619	 * which is wrong: both PMC4 and PMD5 support it.
620 */
621 if (pm_buffer[12] == 0x10) pm_buffer[12]=0x30;
622#endif
623
624 p = bitregister_process(p, pm_buffer+12, 256);
625
626 p += sprintf(p, "\n");
627
628 return p - page;
629}
630
631static int
632frequency_info(char *page)
633{
634 char *p = page;
635 struct pal_freq_ratio proc, itc, bus;
636 u64 base;
637
638 if (ia64_pal_freq_base(&base) == -1)
639 p += sprintf(p, "Output clock : not implemented\n");
640 else
641 p += sprintf(p, "Output clock : %ld ticks/s\n", base);
642
643 if (ia64_pal_freq_ratios(&proc, &bus, &itc) != 0) return 0;
644
645 p += sprintf(p,
646 "Processor/Clock ratio : %ld/%ld\n"
647 "Bus/Clock ratio : %ld/%ld\n"
648 "ITC/Clock ratio : %ld/%ld\n",
649 proc.num, proc.den, bus.num, bus.den, itc.num, itc.den);
650
651 return p - page;
652}
653
654static int
655tr_info(char *page)
656{
657 char *p = page;
658 s64 status;
659 pal_tr_valid_u_t tr_valid;
660 u64 tr_buffer[4];
661 pal_vm_info_1_u_t vm_info_1;
662 pal_vm_info_2_u_t vm_info_2;
663 u64 i, j;
664 u64 max[3], pgm;
665 struct ifa_reg {
666 u64 valid:1;
667 u64 ig:11;
668 u64 vpn:52;
669 } *ifa_reg;
670 struct itir_reg {
671 u64 rv1:2;
672 u64 ps:6;
673 u64 key:24;
674 u64 rv2:32;
675 } *itir_reg;
676 struct gr_reg {
677 u64 p:1;
678 u64 rv1:1;
679 u64 ma:3;
680 u64 a:1;
681 u64 d:1;
682 u64 pl:2;
683 u64 ar:3;
684 u64 ppn:38;
685 u64 rv2:2;
686 u64 ed:1;
687 u64 ig:11;
688 } *gr_reg;
689 struct rid_reg {
690 u64 ig1:1;
691 u64 rv1:1;
692 u64 ig2:6;
693 u64 rid:24;
694 u64 rv2:32;
695 } *rid_reg;
696
697 if ((status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2)) !=0) {
698 printk(KERN_ERR "ia64_pal_vm_summary=%ld\n", status);
699 return 0;
700 }
701 max[0] = vm_info_1.pal_vm_info_1_s.max_itr_entry+1;
702 max[1] = vm_info_1.pal_vm_info_1_s.max_dtr_entry+1;
703
704 for (i=0; i < 2; i++ ) {
705 for (j=0; j < max[i]; j++) {
706
707 status = ia64_pal_tr_read(j, i, tr_buffer, &tr_valid);
708 if (status != 0) {
709 printk(KERN_ERR "palinfo: pal call failed on tr[%lu:%lu]=%ld\n",
710 i, j, status);
711 continue;
712 }
713
714 ifa_reg = (struct ifa_reg *)&tr_buffer[2];
715
716 if (ifa_reg->valid == 0) continue;
717
718 gr_reg = (struct gr_reg *)tr_buffer;
719 itir_reg = (struct itir_reg *)&tr_buffer[1];
720 rid_reg = (struct rid_reg *)&tr_buffer[3];
721
722 pgm = -1 << (itir_reg->ps - 12);
723 p += sprintf(p,
724 "%cTR%lu: av=%d pv=%d dv=%d mv=%d\n"
725 "\tppn : 0x%lx\n"
726 "\tvpn : 0x%lx\n"
727 "\tps : ",
728 "ID"[i], j,
729 tr_valid.pal_tr_valid_s.access_rights_valid,
730 tr_valid.pal_tr_valid_s.priv_level_valid,
731 tr_valid.pal_tr_valid_s.dirty_bit_valid,
732 tr_valid.pal_tr_valid_s.mem_attr_valid,
733 (gr_reg->ppn & pgm)<< 12, (ifa_reg->vpn & pgm)<< 12);
734
735 p = bitvector_process(p, 1<< itir_reg->ps);
736
737 p += sprintf(p,
738 "\n\tpl : %d\n"
739 "\tar : %d\n"
740 "\trid : %x\n"
741 "\tp : %d\n"
742 "\tma : %d\n"
743 "\td : %d\n",
744 gr_reg->pl, gr_reg->ar, rid_reg->rid, gr_reg->p, gr_reg->ma,
745 gr_reg->d);
746 }
747 }
748 return p - page;
749}
750
751
752
753/*
754 * List {name,function} pairs for every entry in /proc/palinfo/cpu*
755 */
756static palinfo_entry_t palinfo_entries[]={
757 { "version_info", version_info, },
758 { "vm_info", vm_info, },
759 { "cache_info", cache_info, },
760 { "power_info", power_info, },
761 { "register_info", register_info, },
762 { "processor_info", processor_info, },
763 { "perfmon_info", perfmon_info, },
764 { "frequency_info", frequency_info, },
765 { "bus_info", bus_info },
766 { "tr_info", tr_info, }
767};
768
769#define NR_PALINFO_ENTRIES (int) ARRAY_SIZE(palinfo_entries)
770
771/*
772 * this array is used to keep track of the proc entries we create. This is
773 * required in module mode when we need to remove all entries. The procfs code
774 * does not do recursive deletion
775 *
776 * Notes:
777 * - +1 accounts for the cpuN directory entry in /proc/pal
778 */
779#define NR_PALINFO_PROC_ENTRIES (NR_CPUS*(NR_PALINFO_ENTRIES+1))
780
781static struct proc_dir_entry *palinfo_proc_entries[NR_PALINFO_PROC_ENTRIES];
782static struct proc_dir_entry *palinfo_dir;
783
784/*
785 * This data structure is used to pass which cpu/function is being requested.
786 * It must fit in a 64-bit quantity to be passed to the proc callback routine.
787 *
788 * In SMP mode, when we get a request for another CPU, we must call that
789 * other CPU using IPI and wait for the result before returning.
790 */
791typedef union {
792 u64 value;
793 struct {
794 unsigned req_cpu: 32; /* for which CPU this info is */
795 unsigned func_id: 32; /* which function is requested */
796 } pal_func_cpu;
797} pal_func_cpu_u_t;
798
799#define req_cpu pal_func_cpu.req_cpu
800#define func_id pal_func_cpu.func_id
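/*
 * Illustrative sketch of how the 64-bit cookie is built and decoded (see
 * create_palinfo_proc_entries() and palinfo_read_entry() below; the values
 * used here are made up):
 *
 *	pal_func_cpu_u_t f;
 *
 *	f.req_cpu = 3;			// info requested for CPU 3
 *	f.func_id = 1;			// index of "vm_info" in palinfo_entries[]
 *	data = (void *)f.value;		// fits in the proc entry data pointer
 *	...
 *	f = *(pal_func_cpu_u_t *)&data;	// recover cpu/function at read time
 */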
801
802#ifdef CONFIG_SMP
803
804/*
805 * used to hold information about final function to call
806 */
807typedef struct {
808 palinfo_func_t func; /* pointer to function to call */
809 char *page; /* buffer to store results */
810 int ret; /* return value from call */
811} palinfo_smp_data_t;
812
813
814/*
815 * this function does the actual final call and is called
816 * from the smp code, i.e., this is the palinfo callback routine
817 */
818static void
819palinfo_smp_call(void *info)
820{
821 palinfo_smp_data_t *data = (palinfo_smp_data_t *)info;
822 if (data == NULL) {
823 printk(KERN_ERR "palinfo: data pointer is NULL\n");
824		/* no output: cannot store a result through a NULL pointer */
825 return;
826 }
827 /* does this actual call */
828 data->ret = (*data->func)(data->page);
829}
830
831/*
832 * function called to trigger the IPI, we need to access a remote CPU
833 * Return:
834 * 0 : error or nothing to output
835 * otherwise how many bytes in the "page" buffer were written
836 */
837static
838int palinfo_handle_smp(pal_func_cpu_u_t *f, char *page)
839{
840 palinfo_smp_data_t ptr;
841 int ret;
842
843 ptr.func = palinfo_entries[f->func_id].proc_read;
844 ptr.page = page;
845 ptr.ret = 0; /* just in case */
846
847
848 /* will send IPI to other CPU and wait for completion of remote call */
849 if ((ret=smp_call_function_single(f->req_cpu, palinfo_smp_call, &ptr, 0, 1))) {
850 printk(KERN_ERR "palinfo: remote CPU call from %d to %d on function %d: "
851 "error %d\n", smp_processor_id(), f->req_cpu, f->func_id, ret);
852 return 0;
853 }
854 return ptr.ret;
855}
856#else /* ! CONFIG_SMP */
857static
858int palinfo_handle_smp(pal_func_cpu_u_t *f, char *page)
859{
860	printk(KERN_ERR "palinfo: should not be called on a non-SMP kernel\n");
861 return 0;
862}
863#endif /* CONFIG_SMP */
864
865/*
866 * Entry point routine: all calls go through this function
867 */
868static int
869palinfo_read_entry(char *page, char **start, off_t off, int count, int *eof, void *data)
870{
871 int len=0;
872 pal_func_cpu_u_t *f = (pal_func_cpu_u_t *)&data;
873
874 /*
875 * in SMP mode, we may need to call another CPU to get correct
876 * information. PAL, by definition, is processor specific
877 */
878 if (f->req_cpu == get_cpu())
879 len = (*palinfo_entries[f->func_id].proc_read)(page);
880 else
881 len = palinfo_handle_smp(f, page);
882
883 put_cpu();
884
885 if (len <= off+count) *eof = 1;
886
887 *start = page + off;
888 len -= off;
889
890 if (len>count) len = count;
891 if (len<0) len = 0;
892
893 return len;
894}
895
896static void
897create_palinfo_proc_entries(unsigned int cpu)
898{
899# define CPUSTR "cpu%d"
900
901 pal_func_cpu_u_t f;
902 struct proc_dir_entry **pdir;
903 struct proc_dir_entry *cpu_dir;
904 int j;
905 char cpustr[sizeof(CPUSTR)];
906
907
908 /*
909 * we keep track of created entries in a depth-first order for
910 * cleanup purposes. Each entry is stored into palinfo_proc_entries
911 */
912 sprintf(cpustr,CPUSTR, cpu);
913
914 cpu_dir = proc_mkdir(cpustr, palinfo_dir);
915
916 f.req_cpu = cpu;
917
918 /*
919	 * Compute the location to store per cpu entries.
920	 * We don't store the top level /proc/pal entry in this list, but
921	 * remove it last, after removing all cpu entries.
922 */
923 pdir = &palinfo_proc_entries[cpu*(NR_PALINFO_ENTRIES+1)];
924 *pdir++ = cpu_dir;
925 for (j=0; j < NR_PALINFO_ENTRIES; j++) {
926 f.func_id = j;
927 *pdir = create_proc_read_entry(
928 palinfo_entries[j].name, 0, cpu_dir,
929 palinfo_read_entry, (void *)f.value);
930 if (*pdir)
931 (*pdir)->owner = THIS_MODULE;
932 pdir++;
933 }
934}
935
936static void
937remove_palinfo_proc_entries(unsigned int hcpu)
938{
939 int j;
940 struct proc_dir_entry *cpu_dir, **pdir;
941
942 pdir = &palinfo_proc_entries[hcpu*(NR_PALINFO_ENTRIES+1)];
943 cpu_dir = *pdir;
944 *pdir++=NULL;
945	for (j=0; j < (NR_PALINFO_ENTRIES); j++, pdir++) {
946		if (*pdir) {
947			remove_proc_entry((*pdir)->name, cpu_dir);
948			*pdir = NULL;
949		}
950 }
951
952 if (cpu_dir) {
953 remove_proc_entry(cpu_dir->name, palinfo_dir);
954 }
955}
956
957static int __devinit palinfo_cpu_callback(struct notifier_block *nfb,
958 unsigned long action,
959 void *hcpu)
960{
961 unsigned int hotcpu = (unsigned long)hcpu;
962
963 switch (action) {
964 case CPU_ONLINE:
965 create_palinfo_proc_entries(hotcpu);
966 break;
967#ifdef CONFIG_HOTPLUG_CPU
968 case CPU_DEAD:
969 remove_palinfo_proc_entries(hotcpu);
970 break;
971#endif
972 }
973 return NOTIFY_OK;
974}
975
976static struct notifier_block palinfo_cpu_notifier =
977{
978 .notifier_call = palinfo_cpu_callback,
979 .priority = 0,
980};
981
982static int __init
983palinfo_init(void)
984{
985 int i = 0;
986
987 printk(KERN_INFO "PAL Information Facility v%s\n", PALINFO_VERSION);
988 palinfo_dir = proc_mkdir("pal", NULL);
989
990 /* Create palinfo dirs in /proc for all online cpus */
991 for_each_online_cpu(i) {
992 create_palinfo_proc_entries(i);
993 }
994
995 /* Register for future delivery via notify registration */
996 register_cpu_notifier(&palinfo_cpu_notifier);
997
998 return 0;
999}
1000
1001static void __exit
1002palinfo_exit(void)
1003{
1004 int i = 0;
1005
1006 /* remove all nodes: depth first pass. Could optimize this */
1007 for_each_online_cpu(i) {
1008 remove_palinfo_proc_entries(i);
1009 }
1010
1011 /*
1012 * Remove the top level entry finally
1013 */
1014 remove_proc_entry(palinfo_dir->name, NULL);
1015
1016 /*
1017 * Unregister from cpu notifier callbacks
1018 */
1019 unregister_cpu_notifier(&palinfo_cpu_notifier);
1020}
1021
1022module_init(palinfo_init);
1023module_exit(palinfo_exit);
diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c
new file mode 100644
index 000000000000..367804a605fa
--- /dev/null
+++ b/arch/ia64/kernel/patch.c
@@ -0,0 +1,189 @@
1/*
2 * Instruction-patching support.
3 *
4 * Copyright (C) 2003 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com>
6 */
7#include <linux/init.h>
8#include <linux/string.h>
9
10#include <asm/patch.h>
11#include <asm/processor.h>
12#include <asm/sections.h>
13#include <asm/system.h>
14#include <asm/unistd.h>
15
16/*
17 * This was adapted from code written by Tony Luck:
18 *
19 * The 64-bit value in a "movl reg=value" is scattered between the two words of the bundle
20 * like this:
21 *
22 * 6 6 5 4 3 2 1
23 * 3210987654321098765432109876543210987654321098765432109876543210
24 * ABBBBBBBBBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCCCDEEEEEFFFFFFFFFGGGGGGG
25 *
26 * CCCCCCCCCCCCCCCCCCxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
27 * xxxxAFFFFFFFFFEEEEEDxGGGGGGGxxxxxxxxxxxxxBBBBBBBBBBBBBBBBBBBBBBB
28 */
29static u64
30get_imm64 (u64 insn_addr)
31{
32 u64 *p = (u64 *) (insn_addr & -16); /* mask out slot number */
33
34 return ( (p[1] & 0x0800000000000000UL) << 4) | /*A*/
35 ((p[1] & 0x00000000007fffffUL) << 40) | /*B*/
36 ((p[0] & 0xffffc00000000000UL) >> 24) | /*C*/
37 ((p[1] & 0x0000100000000000UL) >> 23) | /*D*/
38 ((p[1] & 0x0003e00000000000UL) >> 29) | /*E*/
39 ((p[1] & 0x07fc000000000000UL) >> 43) | /*F*/
40 ((p[1] & 0x000007f000000000UL) >> 36); /*G*/
41}
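/*
 * Worked reading of the extraction above (illustrative): field A (bit 63 of
 * the immediate) is held in bit 59 of the second bundle word and shifted up
 * by 4; field B (bits 40-62) sits in bits 0-22 of the second word; field C
 * (bits 22-39) is the only piece kept in the first word, in its top 18 bits;
 * D, E, F and G supply bits 21, 16-20, 7-15 and 0-6 respectively.
 */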
42
43/* Patch instruction with "val" where "mask" has 1 bits. */
44void
45ia64_patch (u64 insn_addr, u64 mask, u64 val)
46{
47 u64 m0, m1, v0, v1, b0, b1, *b = (u64 *) (insn_addr & -16);
48# define insn_mask ((1UL << 41) - 1)
49 unsigned long shift;
50
51 b0 = b[0]; b1 = b[1];
52 shift = 5 + 41 * (insn_addr % 16); /* 5 bits of template, then 3 x 41-bit instructions */
53 if (shift >= 64) {
54 m1 = mask << (shift - 64);
55 v1 = val << (shift - 64);
56 } else {
57 m0 = mask << shift; m1 = mask >> (64 - shift);
58 v0 = val << shift; v1 = val >> (64 - shift);
59 b[0] = (b0 & ~m0) | (v0 & m0);
60 }
61 b[1] = (b1 & ~m1) | (v1 & m1);
62}
63
64void
65ia64_patch_imm64 (u64 insn_addr, u64 val)
66{
67 ia64_patch(insn_addr,
68 0x01fffefe000UL, ( ((val & 0x8000000000000000UL) >> 27) /* bit 63 -> 36 */
69 | ((val & 0x0000000000200000UL) << 0) /* bit 21 -> 21 */
70 | ((val & 0x00000000001f0000UL) << 6) /* bit 16 -> 22 */
71 | ((val & 0x000000000000ff80UL) << 20) /* bit 7 -> 27 */
72 | ((val & 0x000000000000007fUL) << 13) /* bit 0 -> 13 */));
73 ia64_patch(insn_addr - 1, 0x1ffffffffffUL, val >> 22);
74}
75
76void
77ia64_patch_imm60 (u64 insn_addr, u64 val)
78{
79 ia64_patch(insn_addr,
80 0x011ffffe000UL, ( ((val & 0x0800000000000000UL) >> 23) /* bit 59 -> 36 */
81 | ((val & 0x00000000000fffffUL) << 13) /* bit 0 -> 13 */));
82 ia64_patch(insn_addr - 1, 0x1fffffffffcUL, val >> 18);
83}
84
85/*
86 * We sometimes need to load the physical address of a kernel
87 * object. Often we can convert the virtual address to physical
88 * at execution time, but sometimes (either for performance reasons
89 * or during error recovery) we cannot do this. Patch the marked
90 * bundles to load the physical address.
91 */
92void __init
93ia64_patch_vtop (unsigned long start, unsigned long end)
94{
95 s32 *offp = (s32 *) start;
96 u64 ip;
97
98 while (offp < (s32 *) end) {
99 ip = (u64) offp + *offp;
100
101 /* replace virtual address with corresponding physical address: */
102 ia64_patch_imm64(ip, ia64_tpa(get_imm64(ip)));
103 ia64_fc((void *) ip);
104 ++offp;
105 }
106 ia64_sync_i();
107 ia64_srlz_i();
108}
109
110void
111ia64_patch_mckinley_e9 (unsigned long start, unsigned long end)
112{
113 static int first_time = 1;
114 int need_workaround;
115 s32 *offp = (s32 *) start;
116 u64 *wp;
117
118 need_workaround = (local_cpu_data->family == 0x1f && local_cpu_data->model == 0);
119
120 if (first_time) {
121 first_time = 0;
122 if (need_workaround)
123 printk(KERN_INFO "Leaving McKinley Errata 9 workaround enabled\n");
124 else
125 printk(KERN_INFO "McKinley Errata 9 workaround not needed; "
126 "disabling it\n");
127 }
128 if (need_workaround)
129 return;
130
131 while (offp < (s32 *) end) {
132 wp = (u64 *) ia64_imva((char *) offp + *offp);
133 wp[0] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */
134 wp[1] = 0x0004000000000200UL;
135 wp[2] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */
136 wp[3] = 0x0084006880000200UL;
137 ia64_fc(wp); ia64_fc(wp + 2);
138 ++offp;
139 }
140 ia64_sync_i();
141 ia64_srlz_i();
142}
143
144static void
145patch_fsyscall_table (unsigned long start, unsigned long end)
146{
147 extern unsigned long fsyscall_table[NR_syscalls];
148 s32 *offp = (s32 *) start;
149 u64 ip;
150
151 while (offp < (s32 *) end) {
152 ip = (u64) ia64_imva((char *) offp + *offp);
153 ia64_patch_imm64(ip, (u64) fsyscall_table);
154 ia64_fc((void *) ip);
155 ++offp;
156 }
157 ia64_sync_i();
158 ia64_srlz_i();
159}
160
161static void
162patch_brl_fsys_bubble_down (unsigned long start, unsigned long end)
163{
164 extern char fsys_bubble_down[];
165 s32 *offp = (s32 *) start;
166 u64 ip;
167
168 while (offp < (s32 *) end) {
169 ip = (u64) offp + *offp;
170 ia64_patch_imm60((u64) ia64_imva((void *) ip),
171 (u64) (fsys_bubble_down - (ip & -16)) / 16);
172 ia64_fc((void *) ip);
173 ++offp;
174 }
175 ia64_sync_i();
176 ia64_srlz_i();
177}
178
179void
180ia64_patch_gate (void)
181{
182# define START(name) ((unsigned long) __start_gate_##name##_patchlist)
183# define END(name) ((unsigned long)__end_gate_##name##_patchlist)
184
185 patch_fsyscall_table(START(fsyscall), END(fsyscall));
186 patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down));
187 ia64_patch_vtop(START(vtop), END(vtop));
188 ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9));
189}
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
new file mode 100644
index 000000000000..71147be3279c
--- /dev/null
+++ b/arch/ia64/kernel/perfmon.c
@@ -0,0 +1,6676 @@
1/*
2 * This file implements the perfmon-2 subsystem which is used
3 * to program the IA-64 Performance Monitoring Unit (PMU).
4 *
5 * The initial version of perfmon.c was written by
6 * Ganesh Venkitachalam, IBM Corp.
7 *
8 * Then it was modified for perfmon-1.x by Stephane Eranian and
9 * David Mosberger, Hewlett Packard Co.
10 *
11 * Version Perfmon-2.x is a rewrite of perfmon-1.x
12 * by Stephane Eranian, Hewlett Packard Co.
13 *
14 * Copyright (C) 1999-2003, 2005 Hewlett Packard Co
15 * Stephane Eranian <eranian@hpl.hp.com>
16 * David Mosberger-Tang <davidm@hpl.hp.com>
17 *
18 * More information about perfmon available at:
19 * http://www.hpl.hp.com/research/linux/perfmon
20 */
21
22#include <linux/config.h>
23#include <linux/module.h>
24#include <linux/kernel.h>
25#include <linux/sched.h>
26#include <linux/interrupt.h>
27#include <linux/smp_lock.h>
28#include <linux/proc_fs.h>
29#include <linux/seq_file.h>
30#include <linux/init.h>
31#include <linux/vmalloc.h>
32#include <linux/mm.h>
33#include <linux/sysctl.h>
34#include <linux/list.h>
35#include <linux/file.h>
36#include <linux/poll.h>
37#include <linux/vfs.h>
38#include <linux/pagemap.h>
39#include <linux/mount.h>
40#include <linux/version.h>
41#include <linux/bitops.h>
42
43#include <asm/errno.h>
44#include <asm/intrinsics.h>
45#include <asm/page.h>
46#include <asm/perfmon.h>
47#include <asm/processor.h>
48#include <asm/signal.h>
49#include <asm/system.h>
50#include <asm/uaccess.h>
51#include <asm/delay.h>
52
53#ifdef CONFIG_PERFMON
54/*
55 * perfmon context state
56 */
57#define PFM_CTX_UNLOADED 1 /* context is not loaded onto any task */
58#define PFM_CTX_LOADED 2 /* context is loaded onto a task */
59#define PFM_CTX_MASKED 3 /* context is loaded but monitoring is masked due to overflow */
60#define PFM_CTX_ZOMBIE 4 /* owner of the context is closing it */
61
62#define PFM_INVALID_ACTIVATION (~0UL)
63
64/*
65 * depth of message queue
66 */
67#define PFM_MAX_MSGS 32
68#define PFM_CTXQ_EMPTY(g) ((g)->ctx_msgq_head == (g)->ctx_msgq_tail)
69
70/*
71 * type of a PMU register (bitmask).
72 * bitmask structure:
73 * bit0 : register implemented
74 * bit1 : end marker
75 * bit2-3 : reserved
76 * bit4 : pmc has pmc.pm
77 * bit5 : pmc controls a counter (has pmc.oi), pmd is used as counter
78 * bit6-7 : register type
79 * bit8-31: reserved
80 */
81#define PFM_REG_NOTIMPL 0x0 /* not implemented at all */
82#define PFM_REG_IMPL 0x1 /* register implemented */
83#define PFM_REG_END 0x2 /* end marker */
84#define PFM_REG_MONITOR (0x1<<4|PFM_REG_IMPL) /* a PMC with a pmc.pm field only */
85#define PFM_REG_COUNTING (0x2<<4|PFM_REG_MONITOR) /* a monitor + pmc.oi+ PMD used as a counter */
86#define PFM_REG_CONTROL (0x4<<4|PFM_REG_IMPL) /* PMU control register */
87#define PFM_REG_CONFIG (0x8<<4|PFM_REG_IMPL) /* configuration register */
88#define PFM_REG_BUFFER (0xc<<4|PFM_REG_IMPL) /* PMD used as buffer */
89
90#define PMC_IS_LAST(i) (pmu_conf->pmc_desc[i].type & PFM_REG_END)
91#define PMD_IS_LAST(i) (pmu_conf->pmd_desc[i].type & PFM_REG_END)
92
93#define PMC_OVFL_NOTIFY(ctx, i) ((ctx)->ctx_pmds[i].flags & PFM_REGFL_OVFL_NOTIFY)
94
95/* i assumed unsigned */
96#define PMC_IS_IMPL(i) (i< PMU_MAX_PMCS && (pmu_conf->pmc_desc[i].type & PFM_REG_IMPL))
97#define PMD_IS_IMPL(i) (i< PMU_MAX_PMDS && (pmu_conf->pmd_desc[i].type & PFM_REG_IMPL))
98
99/* XXX: these assume that register i is implemented */
100#define PMD_IS_COUNTING(i) ((pmu_conf->pmd_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING)
101#define PMC_IS_COUNTING(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING)
102#define PMC_IS_MONITOR(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_MONITOR) == PFM_REG_MONITOR)
103#define PMC_IS_CONTROL(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_CONTROL) == PFM_REG_CONTROL)
104
105#define PMC_DFL_VAL(i) pmu_conf->pmc_desc[i].default_value
106#define PMC_RSVD_MASK(i) pmu_conf->pmc_desc[i].reserved_mask
107#define PMD_PMD_DEP(i) pmu_conf->pmd_desc[i].dep_pmd[0]
108#define PMC_PMD_DEP(i) pmu_conf->pmc_desc[i].dep_pmd[0]
109
110#define PFM_NUM_IBRS IA64_NUM_DBG_REGS
111#define PFM_NUM_DBRS IA64_NUM_DBG_REGS
112
113#define CTX_OVFL_NOBLOCK(c) ((c)->ctx_fl_block == 0)
114#define CTX_HAS_SMPL(c) ((c)->ctx_fl_is_sampling)
115#define PFM_CTX_TASK(h) (h)->ctx_task
116
117#define PMU_PMC_OI 5 /* position of pmc.oi bit */
118
119/* XXX: does not support more than 64 PMDs */
120#define CTX_USED_PMD(ctx, mask) (ctx)->ctx_used_pmds[0] |= (mask)
121#define CTX_IS_USED_PMD(ctx, c) (((ctx)->ctx_used_pmds[0] & (1UL << (c))) != 0UL)
122
123#define CTX_USED_MONITOR(ctx, mask) (ctx)->ctx_used_monitors[0] |= (mask)
124
125#define CTX_USED_IBR(ctx,n) (ctx)->ctx_used_ibrs[(n)>>6] |= 1UL<< ((n) % 64)
126#define CTX_USED_DBR(ctx,n) (ctx)->ctx_used_dbrs[(n)>>6] |= 1UL<< ((n) % 64)
127#define CTX_USES_DBREGS(ctx) (((pfm_context_t *)(ctx))->ctx_fl_using_dbreg==1)
128#define PFM_CODE_RR 0 /* requesting code range restriction */
129#define PFM_DATA_RR		1	/* requesting data range restriction */
130
131#define PFM_CPUINFO_CLEAR(v) pfm_get_cpu_var(pfm_syst_info) &= ~(v)
132#define PFM_CPUINFO_SET(v) pfm_get_cpu_var(pfm_syst_info) |= (v)
133#define PFM_CPUINFO_GET() pfm_get_cpu_var(pfm_syst_info)
134
135#define RDEP(x) (1UL<<(x))
136
137/*
138 * context protection macros
139 * in SMP:
140 * - we need to protect against CPU concurrency (spin_lock)
141 * - we need to protect against PMU overflow interrupts (local_irq_disable)
142 * in UP:
143 * - we need to protect against PMU overflow interrupts (local_irq_disable)
144 *
145 * spin_lock_irqsave()/spin_unlock_irqrestore():
146 * in SMP: local_irq_disable + spin_lock
147 * in UP : local_irq_disable
148 *
149 * spin_lock()/spin_unlock():
150 * in UP : removed automatically
151 * 	in SMP: protect against context accesses from other CPUs. Interrupts
152 * are not masked. This is useful for the PMU interrupt handler
153 * because we know we will not get PMU concurrency in that code.
154 */
155#define PROTECT_CTX(c, f) \
156 do { \
157 DPRINT(("spinlock_irq_save ctx %p by [%d]\n", c, current->pid)); \
158 spin_lock_irqsave(&(c)->ctx_lock, f); \
159 DPRINT(("spinlocked ctx %p by [%d]\n", c, current->pid)); \
160 } while(0)
161
162#define UNPROTECT_CTX(c, f) \
163 do { \
164 DPRINT(("spinlock_irq_restore ctx %p by [%d]\n", c, current->pid)); \
165 spin_unlock_irqrestore(&(c)->ctx_lock, f); \
166 } while(0)
167
168#define PROTECT_CTX_NOPRINT(c, f) \
169 do { \
170 spin_lock_irqsave(&(c)->ctx_lock, f); \
171 } while(0)
172
173
174#define UNPROTECT_CTX_NOPRINT(c, f) \
175 do { \
176 spin_unlock_irqrestore(&(c)->ctx_lock, f); \
177 } while(0)
178
179
180#define PROTECT_CTX_NOIRQ(c) \
181 do { \
182 spin_lock(&(c)->ctx_lock); \
183 } while(0)
184
185#define UNPROTECT_CTX_NOIRQ(c) \
186 do { \
187 spin_unlock(&(c)->ctx_lock); \
188 } while(0)
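/*
 * Illustrative usage (mirrors the pattern used throughout this file):
 *
 *	unsigned long flags;
 *
 *	PROTECT_CTX(ctx, flags);
 *	... access or modify ctx state ...
 *	UNPROTECT_CTX(ctx, flags);
 *
 * The _NOIRQ variants take only the spinlock and are meant for code that
 * cannot see PMU interrupt concurrency, as explained in the comment above.
 */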
189
190
191#ifdef CONFIG_SMP
192
193#define GET_ACTIVATION() pfm_get_cpu_var(pmu_activation_number)
194#define INC_ACTIVATION() pfm_get_cpu_var(pmu_activation_number)++
195#define SET_ACTIVATION(c) (c)->ctx_last_activation = GET_ACTIVATION()
196
197#else /* !CONFIG_SMP */
198#define SET_ACTIVATION(t) do {} while(0)
199#define GET_ACTIVATION(t) do {} while(0)
200#define INC_ACTIVATION(t) do {} while(0)
201#endif /* CONFIG_SMP */
202
203#define SET_PMU_OWNER(t, c) do { pfm_get_cpu_var(pmu_owner) = (t); pfm_get_cpu_var(pmu_ctx) = (c); } while(0)
204#define GET_PMU_OWNER() pfm_get_cpu_var(pmu_owner)
205#define GET_PMU_CTX() pfm_get_cpu_var(pmu_ctx)
206
207#define LOCK_PFS(g) spin_lock_irqsave(&pfm_sessions.pfs_lock, g)
208#define UNLOCK_PFS(g) spin_unlock_irqrestore(&pfm_sessions.pfs_lock, g)
209
210#define PFM_REG_RETFLAG_SET(flags, val) do { flags &= ~PFM_REG_RETFL_MASK; flags |= (val); } while(0)
211
212/*
213 * cmp0 must be the value of pmc0
214 */
215#define PMC0_HAS_OVFL(cmp0) (cmp0 & ~0x1UL)
216
217#define PFMFS_MAGIC 0xa0b4d889
218
219/*
220 * debugging
221 */
222#define PFM_DEBUGGING 1
223#ifdef PFM_DEBUGGING
224#define DPRINT(a) \
225 do { \
226 if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \
227 } while (0)
228
229#define DPRINT_ovfl(a) \
230 do { \
231 if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \
232 } while (0)
233#endif
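/*
 * Usage note (illustrative): DPRINT takes a doubly-parenthesized printf-style
 * argument list so the whole list can be forwarded to printk as a single
 * macro parameter, e.g.
 *
 *	DPRINT(("ctx=%p state=%d\n", ctx, ctx->ctx_state));
 *
 * Output is emitted only when pfm_sysctl.debug is non-zero (see the sysctl
 * tables further down).
 */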
234
235/*
236 * 64-bit software counter structure
237 *
238 * the next_reset_type is applied to the next call to pfm_reset_regs()
239 */
240typedef struct {
241 unsigned long val; /* virtual 64bit counter value */
242 unsigned long lval; /* last reset value */
243 unsigned long long_reset; /* reset value on sampling overflow */
244 unsigned long short_reset; /* reset value on overflow */
245 unsigned long reset_pmds[4]; /* which other pmds to reset when this counter overflows */
246 unsigned long smpl_pmds[4]; /* which pmds are accessed when counter overflow */
247 unsigned long seed; /* seed for random-number generator */
248 unsigned long mask; /* mask for random-number generator */
249 unsigned int flags; /* notify/do not notify */
250 unsigned long eventid; /* overflow event identifier */
251} pfm_counter_t;
252
253/*
254 * context flags
255 */
256typedef struct {
257	unsigned int block:1;		/* when 1, task will be blocked on user notifications */
258 unsigned int system:1; /* do system wide monitoring */
259 unsigned int using_dbreg:1; /* using range restrictions (debug registers) */
260 unsigned int is_sampling:1; /* true if using a custom format */
261 unsigned int excl_idle:1; /* exclude idle task in system wide session */
262 unsigned int going_zombie:1; /* context is zombie (MASKED+blocking) */
263 unsigned int trap_reason:2; /* reason for going into pfm_handle_work() */
264 unsigned int no_msg:1; /* no message sent on overflow */
265 unsigned int can_restart:1; /* allowed to issue a PFM_RESTART */
266 unsigned int reserved:22;
267} pfm_context_flags_t;
268
269#define PFM_TRAP_REASON_NONE 0x0 /* default value */
270#define PFM_TRAP_REASON_BLOCK 0x1 /* we need to block on overflow */
271#define PFM_TRAP_REASON_RESET 0x2 /* we need to reset PMDs */
272
273
274/*
275 * perfmon context: encapsulates all the state of a monitoring session
276 */
277
278typedef struct pfm_context {
279 spinlock_t ctx_lock; /* context protection */
280
281 pfm_context_flags_t ctx_flags; /* bitmask of flags (block reason incl.) */
282 unsigned int ctx_state; /* state: active/inactive (no bitfield) */
283
284 struct task_struct *ctx_task; /* task to which context is attached */
285
286 unsigned long ctx_ovfl_regs[4]; /* which registers overflowed (notification) */
287
288 struct semaphore ctx_restart_sem; /* use for blocking notification mode */
289
290 unsigned long ctx_used_pmds[4]; /* bitmask of PMD used */
291 unsigned long ctx_all_pmds[4]; /* bitmask of all accessible PMDs */
292 unsigned long ctx_reload_pmds[4]; /* bitmask of force reload PMD on ctxsw in */
293
294 unsigned long ctx_all_pmcs[4]; /* bitmask of all accessible PMCs */
295 unsigned long ctx_reload_pmcs[4]; /* bitmask of force reload PMC on ctxsw in */
296 unsigned long ctx_used_monitors[4]; /* bitmask of monitor PMC being used */
297
298 unsigned long ctx_pmcs[IA64_NUM_PMC_REGS]; /* saved copies of PMC values */
299
300 unsigned int ctx_used_ibrs[1]; /* bitmask of used IBR (speedup ctxsw in) */
301 unsigned int ctx_used_dbrs[1]; /* bitmask of used DBR (speedup ctxsw in) */
302 unsigned long ctx_dbrs[IA64_NUM_DBG_REGS]; /* DBR values (cache) when not loaded */
303 unsigned long ctx_ibrs[IA64_NUM_DBG_REGS]; /* IBR values (cache) when not loaded */
304
305 pfm_counter_t ctx_pmds[IA64_NUM_PMD_REGS]; /* software state for PMDS */
306
307 u64 ctx_saved_psr_up; /* only contains psr.up value */
308
309 unsigned long ctx_last_activation; /* context last activation number for last_cpu */
310 unsigned int ctx_last_cpu; /* CPU id of current or last CPU used (SMP only) */
311 unsigned int ctx_cpu; /* cpu to which perfmon is applied (system wide) */
312
313	int			ctx_fd;			/* file descriptor used by this context */
314 pfm_ovfl_arg_t ctx_ovfl_arg; /* argument to custom buffer format handler */
315
316 pfm_buffer_fmt_t *ctx_buf_fmt; /* buffer format callbacks */
317 void *ctx_smpl_hdr; /* points to sampling buffer header kernel vaddr */
318 unsigned long ctx_smpl_size; /* size of sampling buffer */
319 void *ctx_smpl_vaddr; /* user level virtual address of smpl buffer */
320
321 wait_queue_head_t ctx_msgq_wait;
322 pfm_msg_t ctx_msgq[PFM_MAX_MSGS];
323 int ctx_msgq_head;
324 int ctx_msgq_tail;
325 struct fasync_struct *ctx_async_queue;
326
327 wait_queue_head_t ctx_zombieq; /* termination cleanup wait queue */
328} pfm_context_t;
329
330/*
331 * magic number used to verify that structure is really
332 * a perfmon context
333 */
334#define PFM_IS_FILE(f) ((f)->f_op == &pfm_file_ops)
335
336#define PFM_GET_CTX(t) ((pfm_context_t *)(t)->thread.pfm_context)
337
338#ifdef CONFIG_SMP
339#define SET_LAST_CPU(ctx, v) (ctx)->ctx_last_cpu = (v)
340#define GET_LAST_CPU(ctx) (ctx)->ctx_last_cpu
341#else
342#define SET_LAST_CPU(ctx, v) do {} while(0)
343#define GET_LAST_CPU(ctx) do {} while(0)
344#endif
345
346
347#define ctx_fl_block ctx_flags.block
348#define ctx_fl_system ctx_flags.system
349#define ctx_fl_using_dbreg ctx_flags.using_dbreg
350#define ctx_fl_is_sampling ctx_flags.is_sampling
351#define ctx_fl_excl_idle ctx_flags.excl_idle
352#define ctx_fl_going_zombie ctx_flags.going_zombie
353#define ctx_fl_trap_reason ctx_flags.trap_reason
354#define ctx_fl_no_msg ctx_flags.no_msg
355#define ctx_fl_can_restart ctx_flags.can_restart
356
357#define PFM_SET_WORK_PENDING(t, v) do { (t)->thread.pfm_needs_checking = v; } while(0);
358#define PFM_GET_WORK_PENDING(t) (t)->thread.pfm_needs_checking
359
360/*
361 * global information about all sessions
362 * mostly used to synchronize between system wide and per-process
363 */
364typedef struct {
365 spinlock_t pfs_lock; /* lock the structure */
366
367 unsigned int pfs_task_sessions; /* number of per task sessions */
368 unsigned int pfs_sys_sessions; /* number of per system wide sessions */
369 unsigned int pfs_sys_use_dbregs; /* incremented when a system wide session uses debug regs */
370 unsigned int pfs_ptrace_use_dbregs; /* incremented when a process uses debug regs */
371 struct task_struct *pfs_sys_session[NR_CPUS]; /* point to task owning a system-wide session */
372} pfm_session_t;
373
374/*
375 * information about a PMC or PMD.
376 * dep_pmd[]: a bitmask of dependent PMD registers
377 * dep_pmc[]: a bitmask of dependent PMC registers
378 */
379typedef int (*pfm_reg_check_t)(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
380typedef struct {
381 unsigned int type;
382 int pm_pos;
383 unsigned long default_value; /* power-on default value */
384 unsigned long reserved_mask; /* bitmask of reserved bits */
385 pfm_reg_check_t read_check;
386 pfm_reg_check_t write_check;
387 unsigned long dep_pmd[4];
388 unsigned long dep_pmc[4];
389} pfm_reg_desc_t;
390
391/* assume cnum is a valid monitor */
392#define PMC_PM(cnum, val) (((val) >> (pmu_conf->pmc_desc[cnum].pm_pos)) & 0x1)
393
394/*
395 * This structure is initialized at boot time and contains
396 * a description of the PMU main characteristics.
397 *
398 * If the probe function is defined, detection is based
399 * on its return value:
400 * - 0 means recognized PMU
401 * - anything else means not supported
402 * When the probe function is not defined, then the pmu_family field
403 * is used and it must match the host CPU family such that:
404 * - cpu->family & config->pmu_family != 0
405 */
406typedef struct {
407 unsigned long ovfl_val; /* overflow value for counters */
408
409 pfm_reg_desc_t *pmc_desc; /* detailed PMC register dependencies descriptions */
410 pfm_reg_desc_t *pmd_desc; /* detailed PMD register dependencies descriptions */
411
412 unsigned int num_pmcs; /* number of PMCS: computed at init time */
413 unsigned int num_pmds; /* number of PMDS: computed at init time */
414 unsigned long impl_pmcs[4]; /* bitmask of implemented PMCS */
415 unsigned long impl_pmds[4]; /* bitmask of implemented PMDS */
416
417 char *pmu_name; /* PMU family name */
418 unsigned int pmu_family; /* cpuid family pattern used to identify pmu */
419 unsigned int flags; /* pmu specific flags */
420 unsigned int num_ibrs; /* number of IBRS: computed at init time */
421 unsigned int num_dbrs; /* number of DBRS: computed at init time */
422 unsigned int num_counters; /* PMC/PMD counting pairs : computed at init time */
423 int (*probe)(void); /* customized probe routine */
424 unsigned int use_rr_dbregs:1; /* set if debug registers used for range restriction */
425} pmu_config_t;
426/*
427 * PMU specific flags
428 */
429#define PFM_PMU_IRQ_RESEND 1 /* PMU needs explicit IRQ resend */
430
431/*
432 * debug register related type definitions
433 */
434typedef struct {
435 unsigned long ibr_mask:56;
436 unsigned long ibr_plm:4;
437 unsigned long ibr_ig:3;
438 unsigned long ibr_x:1;
439} ibr_mask_reg_t;
440
441typedef struct {
442 unsigned long dbr_mask:56;
443 unsigned long dbr_plm:4;
444 unsigned long dbr_ig:2;
445 unsigned long dbr_w:1;
446 unsigned long dbr_r:1;
447} dbr_mask_reg_t;
448
449typedef union {
450 unsigned long val;
451 ibr_mask_reg_t ibr;
452 dbr_mask_reg_t dbr;
453} dbreg_t;
454
455
456/*
457 * perfmon command descriptions
458 */
459typedef struct {
460 int (*cmd_func)(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);
461 char *cmd_name;
462 int cmd_flags;
463 unsigned int cmd_narg;
464 size_t cmd_argsize;
465 int (*cmd_getsize)(void *arg, size_t *sz);
466} pfm_cmd_desc_t;
467
468#define PFM_CMD_FD 0x01 /* command requires a file descriptor */
469#define PFM_CMD_ARG_READ 0x02 /* command must read argument(s) */
470#define PFM_CMD_ARG_RW 0x04 /* command must read/write argument(s) */
471#define PFM_CMD_STOP 0x08 /* command does not work on zombie context */
472
473
474#define PFM_CMD_NAME(cmd) pfm_cmd_tab[(cmd)].cmd_name
475#define PFM_CMD_READ_ARG(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_READ)
476#define PFM_CMD_RW_ARG(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_RW)
477#define PFM_CMD_USE_FD(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_FD)
478#define PFM_CMD_STOPPED(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_STOP)
479
480#define PFM_CMD_ARG_MANY -1 /* cannot be zero */
481
482typedef struct {
483 int debug; /* turn on/off debugging via syslog */
484 int debug_ovfl; /* turn on/off debug printk in overflow handler */
485	int	fastctxsw;	/* turn on/off fast (insecure) ctxsw */
486 int expert_mode; /* turn on/off value checking */
487 int debug_pfm_read;
488} pfm_sysctl_t;
489
490typedef struct {
491 unsigned long pfm_spurious_ovfl_intr_count; /* keep track of spurious ovfl interrupts */
492 unsigned long pfm_replay_ovfl_intr_count; /* keep track of replayed ovfl interrupts */
493 unsigned long pfm_ovfl_intr_count; /* keep track of ovfl interrupts */
494 unsigned long pfm_ovfl_intr_cycles; /* cycles spent processing ovfl interrupts */
495 unsigned long pfm_ovfl_intr_cycles_min; /* min cycles spent processing ovfl interrupts */
496 unsigned long pfm_ovfl_intr_cycles_max; /* max cycles spent processing ovfl interrupts */
497 unsigned long pfm_smpl_handler_calls;
498 unsigned long pfm_smpl_handler_cycles;
499 char pad[SMP_CACHE_BYTES] ____cacheline_aligned;
500} pfm_stats_t;
501
502/*
503 * perfmon internal variables
504 */
505static pfm_stats_t pfm_stats[NR_CPUS];
506static pfm_session_t pfm_sessions; /* global sessions information */
507
508static struct proc_dir_entry *perfmon_dir;
509static pfm_uuid_t pfm_null_uuid = {0,};
510
511static spinlock_t pfm_buffer_fmt_lock;
512static LIST_HEAD(pfm_buffer_fmt_list);
513
514static pmu_config_t *pmu_conf;
515
516/* sysctl() controls */
517static pfm_sysctl_t pfm_sysctl;
518int pfm_debug_var;
519
520static ctl_table pfm_ctl_table[]={
521 {1, "debug", &pfm_sysctl.debug, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},
522 {2, "debug_ovfl", &pfm_sysctl.debug_ovfl, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},
523 {3, "fastctxsw", &pfm_sysctl.fastctxsw, sizeof(int), 0600, NULL, &proc_dointvec, NULL,},
524 {4, "expert_mode", &pfm_sysctl.expert_mode, sizeof(int), 0600, NULL, &proc_dointvec, NULL,},
525 { 0, },
526};
527static ctl_table pfm_sysctl_dir[] = {
528 {1, "perfmon", NULL, 0, 0755, pfm_ctl_table, },
529 {0,},
530};
531static ctl_table pfm_sysctl_root[] = {
532 {1, "kernel", NULL, 0, 0755, pfm_sysctl_dir, },
533 {0,},
534};
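/*
 * Illustrative usage (assuming the tables above are registered at init time):
 * the knobs appear as /proc/sys/kernel/perfmon/{debug,debug_ovfl,fastctxsw,
 * expert_mode}, so e.g.
 *
 *	echo 1 > /proc/sys/kernel/perfmon/debug
 *
 * enables the DPRINT() output defined earlier in this file.
 */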
535static struct ctl_table_header *pfm_sysctl_header;
536
537static int pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);
538static int pfm_flush(struct file *filp);
539
540#define pfm_get_cpu_var(v) __ia64_per_cpu_var(v)
541#define pfm_get_cpu_data(a,b) per_cpu(a, b)
542
543static inline void
544pfm_put_task(struct task_struct *task)
545{
546 if (task != current) put_task_struct(task);
547}
548
549static inline void
550pfm_set_task_notify(struct task_struct *task)
551{
552 struct thread_info *info;
553
554 info = (struct thread_info *) ((char *) task + IA64_TASK_SIZE);
555 set_bit(TIF_NOTIFY_RESUME, &info->flags);
556}
557
558static inline void
559pfm_clear_task_notify(void)
560{
561 clear_thread_flag(TIF_NOTIFY_RESUME);
562}
563
564static inline void
565pfm_reserve_page(unsigned long a)
566{
567 SetPageReserved(vmalloc_to_page((void *)a));
568}
569static inline void
570pfm_unreserve_page(unsigned long a)
571{
572 ClearPageReserved(vmalloc_to_page((void*)a));
573}
574
575static inline unsigned long
576pfm_protect_ctx_ctxsw(pfm_context_t *x)
577{
578 spin_lock(&(x)->ctx_lock);
579 return 0UL;
580}
581
582static inline unsigned long
583pfm_unprotect_ctx_ctxsw(pfm_context_t *x, unsigned long f)
584{
585 spin_unlock(&(x)->ctx_lock);
586}
587
588static inline unsigned int
589pfm_do_munmap(struct mm_struct *mm, unsigned long addr, size_t len, int acct)
590{
591 return do_munmap(mm, addr, len);
592}
593
594static inline unsigned long
595pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long exec)
596{
597 return get_unmapped_area(file, addr, len, pgoff, flags);
598}
599
600
601static struct super_block *
602pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
603{
604 return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC);
605}
606
607static struct file_system_type pfm_fs_type = {
608 .name = "pfmfs",
609 .get_sb = pfmfs_get_sb,
610 .kill_sb = kill_anon_super,
611};
612
613DEFINE_PER_CPU(unsigned long, pfm_syst_info);
614DEFINE_PER_CPU(struct task_struct *, pmu_owner);
615DEFINE_PER_CPU(pfm_context_t *, pmu_ctx);
616DEFINE_PER_CPU(unsigned long, pmu_activation_number);
617
618
619/* forward declaration */
620static struct file_operations pfm_file_ops;
621
622/*
623 * forward declarations
624 */
625#ifndef CONFIG_SMP
626static void pfm_lazy_save_regs (struct task_struct *ta);
627#endif
628
629void dump_pmu_state(const char *);
630static int pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);
631
632#include "perfmon_itanium.h"
633#include "perfmon_mckinley.h"
634#include "perfmon_generic.h"
635
636static pmu_config_t *pmu_confs[]={
637 &pmu_conf_mck,
638 &pmu_conf_ita,
639 &pmu_conf_gen, /* must be last */
640 NULL
641};
642
643
644static int pfm_end_notify_user(pfm_context_t *ctx);
645
646static inline void
647pfm_clear_psr_pp(void)
648{
649 ia64_rsm(IA64_PSR_PP);
650 ia64_srlz_i();
651}
652
653static inline void
654pfm_set_psr_pp(void)
655{
656 ia64_ssm(IA64_PSR_PP);
657 ia64_srlz_i();
658}
659
660static inline void
661pfm_clear_psr_up(void)
662{
663 ia64_rsm(IA64_PSR_UP);
664 ia64_srlz_i();
665}
666
667static inline void
668pfm_set_psr_up(void)
669{
670 ia64_ssm(IA64_PSR_UP);
671 ia64_srlz_i();
672}
673
674static inline unsigned long
675pfm_get_psr(void)
676{
677 unsigned long tmp;
678 tmp = ia64_getreg(_IA64_REG_PSR);
679 ia64_srlz_i();
680 return tmp;
681}
682
683static inline void
684pfm_set_psr_l(unsigned long val)
685{
686 ia64_setreg(_IA64_REG_PSR_L, val);
687 ia64_srlz_i();
688}
689
690static inline void
691pfm_freeze_pmu(void)
692{
693 ia64_set_pmc(0,1UL);
694 ia64_srlz_d();
695}
696
697static inline void
698pfm_unfreeze_pmu(void)
699{
700 ia64_set_pmc(0,0UL);
701 ia64_srlz_d();
702}
703
704static inline void
705pfm_restore_ibrs(unsigned long *ibrs, unsigned int nibrs)
706{
707 int i;
708
709 for (i=0; i < nibrs; i++) {
710 ia64_set_ibr(i, ibrs[i]);
711 ia64_dv_serialize_instruction();
712 }
713 ia64_srlz_i();
714}
715
716static inline void
717pfm_restore_dbrs(unsigned long *dbrs, unsigned int ndbrs)
718{
719 int i;
720
721 for (i=0; i < ndbrs; i++) {
722 ia64_set_dbr(i, dbrs[i]);
723 ia64_dv_serialize_data();
724 }
725 ia64_srlz_d();
726}
727
728/*
729 * PMD[i] must be a counter. no check is made
730 */
731static inline unsigned long
732pfm_read_soft_counter(pfm_context_t *ctx, int i)
733{
734 return ctx->ctx_pmds[i].val + (ia64_get_pmd(i) & pmu_conf->ovfl_val);
735}
736
737/*
738 * PMD[i] must be a counter. no check is made
739 */
740static inline void
741pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val)
742{
743 unsigned long ovfl_val = pmu_conf->ovfl_val;
744
745 ctx->ctx_pmds[i].val = val & ~ovfl_val;
746 /*
747	 * writing to the unimplemented part is ignored, so we do not need to
748 * mask off top part
749 */
750 ia64_set_pmd(i, val & ovfl_val);
751}
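
/*
 * Example of the software counter split, assuming a 47-bit wide
 * hardware counter (ovfl_val == (1UL << 47) - 1):
 *
 *	full 64-bit value   0x0000800000001234
 *	ctx_pmds[i].val   = 0x0000800000000000	(upper bits, kept in software)
 *	PMD[i]            =             0x1234	(lower bits, kept in hardware)
 *
 * pfm_write_soft_counter() performs the split shown above and
 * pfm_read_soft_counter() adds the two parts back together.
 */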
752
753static pfm_msg_t *
754pfm_get_new_msg(pfm_context_t *ctx)
755{
756 int idx, next;
757
758 next = (ctx->ctx_msgq_tail+1) % PFM_MAX_MSGS;
759
760	DPRINT(("ctx=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail));
761 if (next == ctx->ctx_msgq_head) return NULL;
762
763 idx = ctx->ctx_msgq_tail;
764 ctx->ctx_msgq_tail = next;
765
766 DPRINT(("ctx=%p head=%d tail=%d msg=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, idx));
767
768 return ctx->ctx_msgq+idx;
769}
770
771static pfm_msg_t *
772pfm_get_next_msg(pfm_context_t *ctx)
773{
774 pfm_msg_t *msg;
775
776 DPRINT(("ctx=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail));
777
778 if (PFM_CTXQ_EMPTY(ctx)) return NULL;
779
780 /*
781 * get oldest message
782 */
783 msg = ctx->ctx_msgq+ctx->ctx_msgq_head;
784
785 /*
786 * and move forward
787 */
788 ctx->ctx_msgq_head = (ctx->ctx_msgq_head+1) % PFM_MAX_MSGS;
789
790 DPRINT(("ctx=%p head=%d tail=%d type=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, msg->pfm_gen_msg.msg_type));
791
792 return msg;
793}
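
/*
 * The notification queue is a fixed ring of PFM_MAX_MSGS entries indexed
 * by ctx_msgq_head (oldest message) and ctx_msgq_tail (next free slot).
 * One slot is intentionally left unused: pfm_get_new_msg() reports a full
 * queue (returns NULL) when advancing the tail would make it catch up
 * with the head, which is how a full ring is distinguished from an empty
 * one.
 */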
794
795static void
796pfm_reset_msgq(pfm_context_t *ctx)
797{
798 ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0;
799 DPRINT(("ctx=%p msgq reset\n", ctx));
800}
801
802static void *
803pfm_rvmalloc(unsigned long size)
804{
805 void *mem;
806 unsigned long addr;
807
808 size = PAGE_ALIGN(size);
809 mem = vmalloc(size);
810 if (mem) {
811 //printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem);
812 memset(mem, 0, size);
813 addr = (unsigned long)mem;
814 while (size > 0) {
815 pfm_reserve_page(addr);
816 addr+=PAGE_SIZE;
817 size-=PAGE_SIZE;
818 }
819 }
820 return mem;
821}
822
823static void
824pfm_rvfree(void *mem, unsigned long size)
825{
826 unsigned long addr;
827
828 if (mem) {
829 DPRINT(("freeing physical buffer @%p size=%lu\n", mem, size));
830 addr = (unsigned long) mem;
831 while ((long) size > 0) {
832 pfm_unreserve_page(addr);
833 addr+=PAGE_SIZE;
834 size-=PAGE_SIZE;
835 }
836 vfree(mem);
837 }
838 return;
839}
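
/*
 * The sampling buffer is vmalloc()'ed, so it is virtually contiguous but
 * physically scattered. pfm_rvmalloc() marks every page as reserved so
 * that it can later be safely remapped, page by page, into the user
 * address space by pfm_remap_buffer(); pfm_rvfree() clears the reserved
 * bits again before handing the memory back to vfree().
 */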
840
841static pfm_context_t *
842pfm_context_alloc(void)
843{
844 pfm_context_t *ctx;
845
846 /*
847 * allocate context descriptor
848 * must be able to free with interrupts disabled
849 */
850 ctx = kmalloc(sizeof(pfm_context_t), GFP_KERNEL);
851 if (ctx) {
852 memset(ctx, 0, sizeof(pfm_context_t));
853 DPRINT(("alloc ctx @%p\n", ctx));
854 }
855 return ctx;
856}
857
858static void
859pfm_context_free(pfm_context_t *ctx)
860{
861 if (ctx) {
862 DPRINT(("free ctx @%p\n", ctx));
863 kfree(ctx);
864 }
865}
866
867static void
868pfm_mask_monitoring(struct task_struct *task)
869{
870 pfm_context_t *ctx = PFM_GET_CTX(task);
871 struct thread_struct *th = &task->thread;
872 unsigned long mask, val, ovfl_mask;
873 int i;
874
875 DPRINT_ovfl(("masking monitoring for [%d]\n", task->pid));
876
877 ovfl_mask = pmu_conf->ovfl_val;
878 /*
879 * monitoring can only be masked as a result of a valid
880 * counter overflow. In UP, it means that the PMU still
881 * has an owner. Note that the owner can be different
882 * from the current task. However the PMU state belongs
883 * to the owner.
884 * In SMP, a valid overflow only happens when task is
885 * current. Therefore if we come here, we know that
886 * the PMU state belongs to the current task, therefore
887 * we can access the live registers.
888 *
889 * So in both cases, the live register contains the owner's
890 * state. We can ONLY touch the PMU registers and NOT the PSR.
891 *
892	 * As a consequence of this call, the thread->pmds[] array
893 * contains stale information which must be ignored
894 * when context is reloaded AND monitoring is active (see
895 * pfm_restart).
896 */
897 mask = ctx->ctx_used_pmds[0];
898 for (i = 0; mask; i++, mask>>=1) {
899 /* skip non used pmds */
900 if ((mask & 0x1) == 0) continue;
901 val = ia64_get_pmd(i);
902
903 if (PMD_IS_COUNTING(i)) {
904 /*
905 * we rebuild the full 64 bit value of the counter
906 */
907 ctx->ctx_pmds[i].val += (val & ovfl_mask);
908 } else {
909 ctx->ctx_pmds[i].val = val;
910 }
911 DPRINT_ovfl(("pmd[%d]=0x%lx hw_pmd=0x%lx\n",
912 i,
913 ctx->ctx_pmds[i].val,
914 val & ovfl_mask));
915 }
916 /*
917 * mask monitoring by setting the privilege level to 0
918 * we cannot use psr.pp/psr.up for this, it is controlled by
919 * the user
920 *
921 * if task is current, modify actual registers, otherwise modify
922 * thread save state, i.e., what will be restored in pfm_load_regs()
923 */
924 mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER;
925 for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) {
926 if ((mask & 0x1) == 0UL) continue;
927 ia64_set_pmc(i, th->pmcs[i] & ~0xfUL);
928 th->pmcs[i] &= ~0xfUL;
929 DPRINT_ovfl(("pmc[%d]=0x%lx\n", i, th->pmcs[i]));
930 }
931 /*
932 * make all of this visible
933 */
934 ia64_srlz_d();
935}
936
937/*
938 * must always be done with task == current
939 *
940 * context must be in MASKED state when calling
941 */
942static void
943pfm_restore_monitoring(struct task_struct *task)
944{
945 pfm_context_t *ctx = PFM_GET_CTX(task);
946 struct thread_struct *th = &task->thread;
947 unsigned long mask, ovfl_mask;
948 unsigned long psr, val;
949 int i, is_system;
950
951 is_system = ctx->ctx_fl_system;
952 ovfl_mask = pmu_conf->ovfl_val;
953
954 if (task != current) {
955 printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task->pid, current->pid);
956 return;
957 }
958 if (ctx->ctx_state != PFM_CTX_MASKED) {
959 printk(KERN_ERR "perfmon.%d: task[%d] current[%d] invalid state=%d\n", __LINE__,
960 task->pid, current->pid, ctx->ctx_state);
961 return;
962 }
963 psr = pfm_get_psr();
964 /*
965 * monitoring is masked via the PMC.
966 * As we restore their value, we do not want each counter to
967 * restart right away. We stop monitoring using the PSR,
968 * restore the PMC (and PMD) and then re-establish the psr
969 * as it was. Note that there can be no pending overflow at
970 * this point, because monitoring was MASKED.
971 *
972	 * system-wide sessions are pinned and self-monitoring
973 */
974 if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) {
975 /* disable dcr pp */
976 ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) & ~IA64_DCR_PP);
977 pfm_clear_psr_pp();
978 } else {
979 pfm_clear_psr_up();
980 }
981 /*
982 * first, we restore the PMD
983 */
984 mask = ctx->ctx_used_pmds[0];
985 for (i = 0; mask; i++, mask>>=1) {
986 /* skip non used pmds */
987 if ((mask & 0x1) == 0) continue;
988
989 if (PMD_IS_COUNTING(i)) {
990 /*
991 * we split the 64bit value according to
992 * counter width
993 */
994 val = ctx->ctx_pmds[i].val & ovfl_mask;
995 ctx->ctx_pmds[i].val &= ~ovfl_mask;
996 } else {
997 val = ctx->ctx_pmds[i].val;
998 }
999 ia64_set_pmd(i, val);
1000
1001 DPRINT(("pmd[%d]=0x%lx hw_pmd=0x%lx\n",
1002 i,
1003 ctx->ctx_pmds[i].val,
1004 val));
1005 }
1006 /*
1007 * restore the PMCs
1008 */
1009 mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER;
1010 for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) {
1011 if ((mask & 0x1) == 0UL) continue;
1012 th->pmcs[i] = ctx->ctx_pmcs[i];
1013 ia64_set_pmc(i, th->pmcs[i]);
1014 DPRINT(("[%d] pmc[%d]=0x%lx\n", task->pid, i, th->pmcs[i]));
1015 }
1016 ia64_srlz_d();
1017
1018 /*
1019	 * must restore DBR/IBR because they could be modified while masked
1020 * XXX: need to optimize
1021 */
1022 if (ctx->ctx_fl_using_dbreg) {
1023 pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
1024 pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
1025 }
1026
1027 /*
1028 * now restore PSR
1029 */
1030 if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) {
1031 /* enable dcr pp */
1032 ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) | IA64_DCR_PP);
1033 ia64_srlz_i();
1034 }
1035 pfm_set_psr_l(psr);
1036}
1037
1038static inline void
1039pfm_save_pmds(unsigned long *pmds, unsigned long mask)
1040{
1041 int i;
1042
1043 ia64_srlz_d();
1044
1045 for (i=0; mask; i++, mask>>=1) {
1046 if (mask & 0x1) pmds[i] = ia64_get_pmd(i);
1047 }
1048}
1049
1050/*
1051 * reload from thread state (used for ctxsw only)
1052 */
1053static inline void
1054pfm_restore_pmds(unsigned long *pmds, unsigned long mask)
1055{
1056 int i;
1057 unsigned long val, ovfl_val = pmu_conf->ovfl_val;
1058
1059 for (i=0; mask; i++, mask>>=1) {
1060 if ((mask & 0x1) == 0) continue;
1061 val = PMD_IS_COUNTING(i) ? pmds[i] & ovfl_val : pmds[i];
1062 ia64_set_pmd(i, val);
1063 }
1064 ia64_srlz_d();
1065}
1066
1067/*
1068 * propagate PMD from context to thread-state
1069 */
1070static inline void
1071pfm_copy_pmds(struct task_struct *task, pfm_context_t *ctx)
1072{
1073 struct thread_struct *thread = &task->thread;
1074 unsigned long ovfl_val = pmu_conf->ovfl_val;
1075 unsigned long mask = ctx->ctx_all_pmds[0];
1076 unsigned long val;
1077 int i;
1078
1079 DPRINT(("mask=0x%lx\n", mask));
1080
1081 for (i=0; mask; i++, mask>>=1) {
1082
1083 val = ctx->ctx_pmds[i].val;
1084
1085 /*
1086 * We break up the 64 bit value into 2 pieces
1087 * the lower bits go to the machine state in the
1088 * thread (will be reloaded on ctxsw in).
1089 * The upper part stays in the soft-counter.
1090 */
1091 if (PMD_IS_COUNTING(i)) {
1092 ctx->ctx_pmds[i].val = val & ~ovfl_val;
1093 val &= ovfl_val;
1094 }
1095 thread->pmds[i] = val;
1096
1097 DPRINT(("pmd[%d]=0x%lx soft_val=0x%lx\n",
1098 i,
1099 thread->pmds[i],
1100 ctx->ctx_pmds[i].val));
1101 }
1102}
1103
1104/*
1105 * propagate PMC from context to thread-state
1106 */
1107static inline void
1108pfm_copy_pmcs(struct task_struct *task, pfm_context_t *ctx)
1109{
1110 struct thread_struct *thread = &task->thread;
1111 unsigned long mask = ctx->ctx_all_pmcs[0];
1112 int i;
1113
1114 DPRINT(("mask=0x%lx\n", mask));
1115
1116 for (i=0; mask; i++, mask>>=1) {
1117 /* masking 0 with ovfl_val yields 0 */
1118 thread->pmcs[i] = ctx->ctx_pmcs[i];
1119 DPRINT(("pmc[%d]=0x%lx\n", i, thread->pmcs[i]));
1120 }
1121}
1122
1123
1124
1125static inline void
1126pfm_restore_pmcs(unsigned long *pmcs, unsigned long mask)
1127{
1128 int i;
1129
1130 for (i=0; mask; i++, mask>>=1) {
1131 if ((mask & 0x1) == 0) continue;
1132 ia64_set_pmc(i, pmcs[i]);
1133 }
1134 ia64_srlz_d();
1135}
1136
1137static inline int
1138pfm_uuid_cmp(pfm_uuid_t a, pfm_uuid_t b)
1139{
1140 return memcmp(a, b, sizeof(pfm_uuid_t));
1141}
1142
1143static inline int
1144pfm_buf_fmt_exit(pfm_buffer_fmt_t *fmt, struct task_struct *task, void *buf, struct pt_regs *regs)
1145{
1146 int ret = 0;
1147 if (fmt->fmt_exit) ret = (*fmt->fmt_exit)(task, buf, regs);
1148 return ret;
1149}
1150
1151static inline int
1152pfm_buf_fmt_getsize(pfm_buffer_fmt_t *fmt, struct task_struct *task, unsigned int flags, int cpu, void *arg, unsigned long *size)
1153{
1154 int ret = 0;
1155 if (fmt->fmt_getsize) ret = (*fmt->fmt_getsize)(task, flags, cpu, arg, size);
1156 return ret;
1157}
1158
1159
1160static inline int
1161pfm_buf_fmt_validate(pfm_buffer_fmt_t *fmt, struct task_struct *task, unsigned int flags,
1162 int cpu, void *arg)
1163{
1164 int ret = 0;
1165 if (fmt->fmt_validate) ret = (*fmt->fmt_validate)(task, flags, cpu, arg);
1166 return ret;
1167}
1168
1169static inline int
1170pfm_buf_fmt_init(pfm_buffer_fmt_t *fmt, struct task_struct *task, void *buf, unsigned int flags,
1171 int cpu, void *arg)
1172{
1173 int ret = 0;
1174 if (fmt->fmt_init) ret = (*fmt->fmt_init)(task, buf, flags, cpu, arg);
1175 return ret;
1176}
1177
1178static inline int
1179pfm_buf_fmt_restart(pfm_buffer_fmt_t *fmt, struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs)
1180{
1181 int ret = 0;
1182 if (fmt->fmt_restart) ret = (*fmt->fmt_restart)(task, ctrl, buf, regs);
1183 return ret;
1184}
1185
1186static inline int
1187pfm_buf_fmt_restart_active(pfm_buffer_fmt_t *fmt, struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs)
1188{
1189 int ret = 0;
1190 if (fmt->fmt_restart_active) ret = (*fmt->fmt_restart_active)(task, ctrl, buf, regs);
1191 return ret;
1192}
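
/*
 * The pfm_buf_fmt_*() helpers above simply dispatch to the corresponding
 * callback of a sampling-buffer format when the format provides one, and
 * report success (0) when the callback is left undefined, so callers do
 * not have to check for optional handlers themselves.
 */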
1193
1194static pfm_buffer_fmt_t *
1195__pfm_find_buffer_fmt(pfm_uuid_t uuid)
1196{
1197 struct list_head * pos;
1198 pfm_buffer_fmt_t * entry;
1199
1200 list_for_each(pos, &pfm_buffer_fmt_list) {
1201 entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list);
1202 if (pfm_uuid_cmp(uuid, entry->fmt_uuid) == 0)
1203 return entry;
1204 }
1205 return NULL;
1206}
1207
1208/*
1209 * find a buffer format based on its uuid
1210 */
1211static pfm_buffer_fmt_t *
1212pfm_find_buffer_fmt(pfm_uuid_t uuid)
1213{
1214 pfm_buffer_fmt_t * fmt;
1215 spin_lock(&pfm_buffer_fmt_lock);
1216 fmt = __pfm_find_buffer_fmt(uuid);
1217 spin_unlock(&pfm_buffer_fmt_lock);
1218 return fmt;
1219}
1220
1221int
1222pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt)
1223{
1224 int ret = 0;
1225
1226 /* some sanity checks */
1227 if (fmt == NULL || fmt->fmt_name == NULL) return -EINVAL;
1228
1229 /* we need at least a handler */
1230 if (fmt->fmt_handler == NULL) return -EINVAL;
1231
1232 /*
1233	 * XXX: need to check validity of fmt_arg_size
1234 */
1235
1236 spin_lock(&pfm_buffer_fmt_lock);
1237
1238 if (__pfm_find_buffer_fmt(fmt->fmt_uuid)) {
1239 printk(KERN_ERR "perfmon: duplicate sampling format: %s\n", fmt->fmt_name);
1240 ret = -EBUSY;
1241 goto out;
1242 }
1243 list_add(&fmt->fmt_list, &pfm_buffer_fmt_list);
1244 printk(KERN_INFO "perfmon: added sampling format %s\n", fmt->fmt_name);
1245
1246out:
1247 spin_unlock(&pfm_buffer_fmt_lock);
1248 return ret;
1249}
1250EXPORT_SYMBOL(pfm_register_buffer_fmt);
1251
1252int
1253pfm_unregister_buffer_fmt(pfm_uuid_t uuid)
1254{
1255 pfm_buffer_fmt_t *fmt;
1256 int ret = 0;
1257
1258 spin_lock(&pfm_buffer_fmt_lock);
1259
1260 fmt = __pfm_find_buffer_fmt(uuid);
1261 if (!fmt) {
1262 printk(KERN_ERR "perfmon: cannot unregister format, not found\n");
1263 ret = -EINVAL;
1264 goto out;
1265 }
1266 list_del_init(&fmt->fmt_list);
1267 printk(KERN_INFO "perfmon: removed sampling format: %s\n", fmt->fmt_name);
1268
1269out:
1270 spin_unlock(&pfm_buffer_fmt_lock);
1271 return ret;
1272
1273}
1274EXPORT_SYMBOL(pfm_unregister_buffer_fmt);
1275
1276static int
1277pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu)
1278{
1279 unsigned long flags;
1280 /*
1281	 * validity checks on cpu_mask have been done upstream
1282 */
1283 LOCK_PFS(flags);
1284
1285 DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
1286 pfm_sessions.pfs_sys_sessions,
1287 pfm_sessions.pfs_task_sessions,
1288 pfm_sessions.pfs_sys_use_dbregs,
1289 is_syswide,
1290 cpu));
1291
1292 if (is_syswide) {
1293 /*
1294 * cannot mix system wide and per-task sessions
1295 */
1296 if (pfm_sessions.pfs_task_sessions > 0UL) {
1297 DPRINT(("system wide not possible, %u conflicting task_sessions\n",
1298 pfm_sessions.pfs_task_sessions));
1299 goto abort;
1300 }
1301
1302 if (pfm_sessions.pfs_sys_session[cpu]) goto error_conflict;
1303
1304 DPRINT(("reserving system wide session on CPU%u currently on CPU%u\n", cpu, smp_processor_id()));
1305
1306 pfm_sessions.pfs_sys_session[cpu] = task;
1307
1308		pfm_sessions.pfs_sys_sessions++;
1309
1310 } else {
1311 if (pfm_sessions.pfs_sys_sessions) goto abort;
1312 pfm_sessions.pfs_task_sessions++;
1313 }
1314
1315 DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
1316 pfm_sessions.pfs_sys_sessions,
1317 pfm_sessions.pfs_task_sessions,
1318 pfm_sessions.pfs_sys_use_dbregs,
1319 is_syswide,
1320 cpu));
1321
1322 UNLOCK_PFS(flags);
1323
1324 return 0;
1325
1326error_conflict:
1327 DPRINT(("system wide not possible, conflicting session [%d] on CPU%d\n",
1328 pfm_sessions.pfs_sys_session[cpu]->pid,
1329 smp_processor_id()));
1330abort:
1331 UNLOCK_PFS(flags);
1332
1333 return -EBUSY;
1334
1335}
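
/*
 * Session accounting rules enforced above: per-task and system-wide
 * sessions are mutually exclusive at the global level, and each CPU can
 * host at most one system-wide session (pfs_sys_session[cpu]), all of it
 * serialized under LOCK_PFS()/UNLOCK_PFS().
 */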
1336
1337static int
1338pfm_unreserve_session(pfm_context_t *ctx, int is_syswide, unsigned int cpu)
1339{
1340 unsigned long flags;
1341 /*
1342	 * validity checks on cpu_mask have been done upstream
1343 */
1344 LOCK_PFS(flags);
1345
1346 DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
1347 pfm_sessions.pfs_sys_sessions,
1348 pfm_sessions.pfs_task_sessions,
1349 pfm_sessions.pfs_sys_use_dbregs,
1350 is_syswide,
1351 cpu));
1352
1353
1354 if (is_syswide) {
1355 pfm_sessions.pfs_sys_session[cpu] = NULL;
1356 /*
1357 * would not work with perfmon+more than one bit in cpu_mask
1358 */
1359 if (ctx && ctx->ctx_fl_using_dbreg) {
1360 if (pfm_sessions.pfs_sys_use_dbregs == 0) {
1361 printk(KERN_ERR "perfmon: invalid release for ctx %p sys_use_dbregs=0\n", ctx);
1362 } else {
1363 pfm_sessions.pfs_sys_use_dbregs--;
1364 }
1365 }
1366 pfm_sessions.pfs_sys_sessions--;
1367 } else {
1368 pfm_sessions.pfs_task_sessions--;
1369 }
1370 DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
1371 pfm_sessions.pfs_sys_sessions,
1372 pfm_sessions.pfs_task_sessions,
1373 pfm_sessions.pfs_sys_use_dbregs,
1374 is_syswide,
1375 cpu));
1376
1377 UNLOCK_PFS(flags);
1378
1379 return 0;
1380}
1381
1382/*
1383 * removes virtual mapping of the sampling buffer.
1384 * IMPORTANT: cannot be called with interrupts disabled, e.g. inside
1385 * a PROTECT_CTX() section.
1386 */
1387static int
1388pfm_remove_smpl_mapping(struct task_struct *task, void *vaddr, unsigned long size)
1389{
1390 int r;
1391
1392 /* sanity checks */
1393 if (task->mm == NULL || size == 0UL || vaddr == NULL) {
1394 printk(KERN_ERR "perfmon: pfm_remove_smpl_mapping [%d] invalid context mm=%p\n", task->pid, task->mm);
1395 return -EINVAL;
1396 }
1397
1398 DPRINT(("smpl_vaddr=%p size=%lu\n", vaddr, size));
1399
1400 /*
1401 * does the actual unmapping
1402 */
1403 down_write(&task->mm->mmap_sem);
1404
1405 DPRINT(("down_write done smpl_vaddr=%p size=%lu\n", vaddr, size));
1406
1407 r = pfm_do_munmap(task->mm, (unsigned long)vaddr, size, 0);
1408
1409 up_write(&task->mm->mmap_sem);
1410	if (r != 0) {
1411 printk(KERN_ERR "perfmon: [%d] unable to unmap sampling buffer @%p size=%lu\n", task->pid, vaddr, size);
1412 }
1413
1414 DPRINT(("do_unmap(%p, %lu)=%d\n", vaddr, size, r));
1415
1416 return 0;
1417}
1418
1419/*
1420 * free actual physical storage used by sampling buffer
1421 */
1422#if 0
1423static int
1424pfm_free_smpl_buffer(pfm_context_t *ctx)
1425{
1426 pfm_buffer_fmt_t *fmt;
1427
1428 if (ctx->ctx_smpl_hdr == NULL) goto invalid_free;
1429
1430 /*
1431 * we won't use the buffer format anymore
1432 */
1433 fmt = ctx->ctx_buf_fmt;
1434
1435 DPRINT(("sampling buffer @%p size %lu vaddr=%p\n",
1436 ctx->ctx_smpl_hdr,
1437 ctx->ctx_smpl_size,
1438 ctx->ctx_smpl_vaddr));
1439
1440 pfm_buf_fmt_exit(fmt, current, NULL, NULL);
1441
1442 /*
1443 * free the buffer
1444 */
1445 pfm_rvfree(ctx->ctx_smpl_hdr, ctx->ctx_smpl_size);
1446
1447 ctx->ctx_smpl_hdr = NULL;
1448 ctx->ctx_smpl_size = 0UL;
1449
1450 return 0;
1451
1452invalid_free:
1453 printk(KERN_ERR "perfmon: pfm_free_smpl_buffer [%d] no buffer\n", current->pid);
1454 return -EINVAL;
1455}
1456#endif
1457
1458static inline void
1459pfm_exit_smpl_buffer(pfm_buffer_fmt_t *fmt)
1460{
1461 if (fmt == NULL) return;
1462
1463 pfm_buf_fmt_exit(fmt, current, NULL, NULL);
1464
1465}
1466
1467/*
1468 * pfmfs should _never_ be mounted by userland - too much of security hassle,
1469 * no real gain from having the whole whorehouse mounted. So we don't need
1470 * any operations on the root directory. However, we need a non-trivial
1471 * d_name - pfm: will go nicely and kill the special-casing in procfs.
1472 */
1473static struct vfsmount *pfmfs_mnt;
1474
1475static int __init
1476init_pfm_fs(void)
1477{
1478 int err = register_filesystem(&pfm_fs_type);
1479 if (!err) {
1480 pfmfs_mnt = kern_mount(&pfm_fs_type);
1481 err = PTR_ERR(pfmfs_mnt);
1482 if (IS_ERR(pfmfs_mnt))
1483 unregister_filesystem(&pfm_fs_type);
1484 else
1485 err = 0;
1486 }
1487 return err;
1488}
1489
1490static void __exit
1491exit_pfm_fs(void)
1492{
1493 unregister_filesystem(&pfm_fs_type);
1494 mntput(pfmfs_mnt);
1495}
1496
1497static ssize_t
1498pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos)
1499{
1500 pfm_context_t *ctx;
1501 pfm_msg_t *msg;
1502 ssize_t ret;
1503 unsigned long flags;
1504 DECLARE_WAITQUEUE(wait, current);
1505 if (PFM_IS_FILE(filp) == 0) {
1506		printk(KERN_ERR "perfmon: pfm_read: bad magic [%d]\n", current->pid);
1507 return -EINVAL;
1508 }
1509
1510 ctx = (pfm_context_t *)filp->private_data;
1511 if (ctx == NULL) {
1512 printk(KERN_ERR "perfmon: pfm_read: NULL ctx [%d]\n", current->pid);
1513 return -EINVAL;
1514 }
1515
1516 /*
1517 * check even when there is no message
1518 */
1519 if (size < sizeof(pfm_msg_t)) {
1520 DPRINT(("message is too small ctx=%p (>=%ld)\n", ctx, sizeof(pfm_msg_t)));
1521 return -EINVAL;
1522 }
1523
1524 PROTECT_CTX(ctx, flags);
1525
1526 /*
1527 * put ourselves on the wait queue
1528 */
1529 add_wait_queue(&ctx->ctx_msgq_wait, &wait);
1530
1531
1532 for(;;) {
1533 /*
1534 * check wait queue
1535 */
1536
1537 set_current_state(TASK_INTERRUPTIBLE);
1538
1539 DPRINT(("head=%d tail=%d\n", ctx->ctx_msgq_head, ctx->ctx_msgq_tail));
1540
1541 ret = 0;
1542 if(PFM_CTXQ_EMPTY(ctx) == 0) break;
1543
1544 UNPROTECT_CTX(ctx, flags);
1545
1546 /*
1547 * check non-blocking read
1548 */
1549 ret = -EAGAIN;
1550 if(filp->f_flags & O_NONBLOCK) break;
1551
1552 /*
1553 * check pending signals
1554 */
1555 if(signal_pending(current)) {
1556 ret = -EINTR;
1557 break;
1558 }
1559 /*
1560 * no message, so wait
1561 */
1562 schedule();
1563
1564 PROTECT_CTX(ctx, flags);
1565 }
1566 DPRINT(("[%d] back to running ret=%ld\n", current->pid, ret));
1567 set_current_state(TASK_RUNNING);
1568 remove_wait_queue(&ctx->ctx_msgq_wait, &wait);
1569
1570 if (ret < 0) goto abort;
1571
1572 ret = -EINVAL;
1573 msg = pfm_get_next_msg(ctx);
1574 if (msg == NULL) {
1575 printk(KERN_ERR "perfmon: pfm_read no msg for ctx=%p [%d]\n", ctx, current->pid);
1576 goto abort_locked;
1577 }
1578
1579 DPRINT(("[%d] fd=%d type=%d\n", current->pid, msg->pfm_gen_msg.msg_ctx_fd, msg->pfm_gen_msg.msg_type));
1580
1581 ret = -EFAULT;
1582 if(copy_to_user(buf, msg, sizeof(pfm_msg_t)) == 0) ret = sizeof(pfm_msg_t);
1583
1584abort_locked:
1585 UNPROTECT_CTX(ctx, flags);
1586abort:
1587 return ret;
1588}
1589
1590static ssize_t
1591pfm_write(struct file *file, const char __user *ubuf,
1592 size_t size, loff_t *ppos)
1593{
1594 DPRINT(("pfm_write called\n"));
1595 return -EINVAL;
1596}
1597
1598static unsigned int
1599pfm_poll(struct file *filp, poll_table * wait)
1600{
1601 pfm_context_t *ctx;
1602 unsigned long flags;
1603 unsigned int mask = 0;
1604
1605 if (PFM_IS_FILE(filp) == 0) {
1606 printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid);
1607 return 0;
1608 }
1609
1610 ctx = (pfm_context_t *)filp->private_data;
1611 if (ctx == NULL) {
1612 printk(KERN_ERR "perfmon: pfm_poll: NULL ctx [%d]\n", current->pid);
1613 return 0;
1614 }
1615
1616
1617 DPRINT(("pfm_poll ctx_fd=%d before poll_wait\n", ctx->ctx_fd));
1618
1619 poll_wait(filp, &ctx->ctx_msgq_wait, wait);
1620
1621 PROTECT_CTX(ctx, flags);
1622
1623 if (PFM_CTXQ_EMPTY(ctx) == 0)
1624 mask = POLLIN | POLLRDNORM;
1625
1626 UNPROTECT_CTX(ctx, flags);
1627
1628 DPRINT(("pfm_poll ctx_fd=%d mask=0x%x\n", ctx->ctx_fd, mask));
1629
1630 return mask;
1631}
1632
1633static int
1634pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg)
1635{
1636 DPRINT(("pfm_ioctl called\n"));
1637 return -EINVAL;
1638}
1639
1640/*
1641 * interrupts cannot be masked when coming here
1642 */
1643static inline int
1644pfm_do_fasync(int fd, struct file *filp, pfm_context_t *ctx, int on)
1645{
1646 int ret;
1647
1648 ret = fasync_helper (fd, filp, on, &ctx->ctx_async_queue);
1649
1650 DPRINT(("pfm_fasync called by [%d] on ctx_fd=%d on=%d async_queue=%p ret=%d\n",
1651 current->pid,
1652 fd,
1653 on,
1654 ctx->ctx_async_queue, ret));
1655
1656 return ret;
1657}
1658
1659static int
1660pfm_fasync(int fd, struct file *filp, int on)
1661{
1662 pfm_context_t *ctx;
1663 int ret;
1664
1665 if (PFM_IS_FILE(filp) == 0) {
1666 printk(KERN_ERR "perfmon: pfm_fasync bad magic [%d]\n", current->pid);
1667 return -EBADF;
1668 }
1669
1670 ctx = (pfm_context_t *)filp->private_data;
1671 if (ctx == NULL) {
1672 printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", current->pid);
1673 return -EBADF;
1674 }
1675 /*
1676	 * we cannot mask interrupts during this call because this
1677	 * may go to sleep if memory is not readily available.
1678	 *
1679	 * We are protected from the context disappearing by the get_fd()/put_fd()
1680 * done in caller. Serialization of this function is ensured by caller.
1681 */
1682 ret = pfm_do_fasync(fd, filp, ctx, on);
1683
1684
1685 DPRINT(("pfm_fasync called on ctx_fd=%d on=%d async_queue=%p ret=%d\n",
1686 fd,
1687 on,
1688 ctx->ctx_async_queue, ret));
1689
1690 return ret;
1691}
1692
1693#ifdef CONFIG_SMP
1694/*
1695 * this function is exclusively called from pfm_close().
1696 * The context is not protected at that time, nor are interrupts
1697 * on the remote CPU. That's necessary to avoid deadlocks.
1698 */
1699static void
1700pfm_syswide_force_stop(void *info)
1701{
1702 pfm_context_t *ctx = (pfm_context_t *)info;
1703 struct pt_regs *regs = ia64_task_regs(current);
1704 struct task_struct *owner;
1705 unsigned long flags;
1706 int ret;
1707
1708 if (ctx->ctx_cpu != smp_processor_id()) {
1709 printk(KERN_ERR "perfmon: pfm_syswide_force_stop for CPU%d but on CPU%d\n",
1710 ctx->ctx_cpu,
1711 smp_processor_id());
1712 return;
1713 }
1714 owner = GET_PMU_OWNER();
1715 if (owner != ctx->ctx_task) {
1716 printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected owner [%d] instead of [%d]\n",
1717 smp_processor_id(),
1718 owner->pid, ctx->ctx_task->pid);
1719 return;
1720 }
1721 if (GET_PMU_CTX() != ctx) {
1722 printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected ctx %p instead of %p\n",
1723 smp_processor_id(),
1724 GET_PMU_CTX(), ctx);
1725 return;
1726 }
1727
1728 DPRINT(("on CPU%d forcing system wide stop for [%d]\n", smp_processor_id(), ctx->ctx_task->pid));
1729 /*
1730 * the context is already protected in pfm_close(), we simply
1731 * need to mask interrupts to avoid a PMU interrupt race on
1732 * this CPU
1733 */
1734 local_irq_save(flags);
1735
1736 ret = pfm_context_unload(ctx, NULL, 0, regs);
1737 if (ret) {
1738 DPRINT(("context_unload returned %d\n", ret));
1739 }
1740
1741 /*
1742 * unmask interrupts, PMU interrupts are now spurious here
1743 */
1744 local_irq_restore(flags);
1745}
1746
1747static void
1748pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx)
1749{
1750 int ret;
1751
1752 DPRINT(("calling CPU%d for cleanup\n", ctx->ctx_cpu));
1753 ret = smp_call_function_single(ctx->ctx_cpu, pfm_syswide_force_stop, ctx, 0, 1);
1754 DPRINT(("called CPU%d for cleanup ret=%d\n", ctx->ctx_cpu, ret));
1755}
1756#endif /* CONFIG_SMP */
1757
1758/*
1759 * called for each close(). Partially free resources.
1760 * When caller is self-monitoring, the context is unloaded.
1761 */
1762static int
1763pfm_flush(struct file *filp)
1764{
1765 pfm_context_t *ctx;
1766 struct task_struct *task;
1767 struct pt_regs *regs;
1768 unsigned long flags;
1769 unsigned long smpl_buf_size = 0UL;
1770 void *smpl_buf_vaddr = NULL;
1771 int state, is_system;
1772
1773 if (PFM_IS_FILE(filp) == 0) {
1774		DPRINT(("bad magic\n"));
1775 return -EBADF;
1776 }
1777
1778 ctx = (pfm_context_t *)filp->private_data;
1779 if (ctx == NULL) {
1780 printk(KERN_ERR "perfmon: pfm_flush: NULL ctx [%d]\n", current->pid);
1781 return -EBADF;
1782 }
1783
1784 /*
1785 * remove our file from the async queue, if we use this mode.
1786 * This can be done without the context being protected. We come
1787	 * here when the context has become unreachable by other tasks.
1788 *
1789 * We may still have active monitoring at this point and we may
1790 * end up in pfm_overflow_handler(). However, fasync_helper()
1791 * operates with interrupts disabled and it cleans up the
1792 * queue. If the PMU handler is called prior to entering
1793 * fasync_helper() then it will send a signal. If it is
1794 * invoked after, it will find an empty queue and no
1795	 * signal will be sent. In both cases, we are safe
1796 */
1797 if (filp->f_flags & FASYNC) {
1798 DPRINT(("cleaning up async_queue=%p\n", ctx->ctx_async_queue));
1799 pfm_do_fasync (-1, filp, ctx, 0);
1800 }
1801
1802 PROTECT_CTX(ctx, flags);
1803
1804 state = ctx->ctx_state;
1805 is_system = ctx->ctx_fl_system;
1806
1807 task = PFM_CTX_TASK(ctx);
1808 regs = ia64_task_regs(task);
1809
1810 DPRINT(("ctx_state=%d is_current=%d\n",
1811 state,
1812 task == current ? 1 : 0));
1813
1814 /*
1815 * if state == UNLOADED, then task is NULL
1816 */
1817
1818 /*
1819 * we must stop and unload because we are losing access to the context.
1820 */
1821 if (task == current) {
1822#ifdef CONFIG_SMP
1823 /*
1824 * the task IS the owner but it migrated to another CPU: that's bad
1825 * but we must handle this cleanly. Unfortunately, the kernel does
1826 * not provide a mechanism to block migration (while the context is loaded).
1827 *
1828 * We need to release the resource on the ORIGINAL cpu.
1829 */
1830 if (is_system && ctx->ctx_cpu != smp_processor_id()) {
1831
1832 DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
1833 /*
1834 * keep context protected but unmask interrupt for IPI
1835 */
1836 local_irq_restore(flags);
1837
1838 pfm_syswide_cleanup_other_cpu(ctx);
1839
1840 /*
1841 * restore interrupt masking
1842 */
1843 local_irq_save(flags);
1844
1845 /*
1846 * context is unloaded at this point
1847 */
1848 } else
1849#endif /* CONFIG_SMP */
1850 {
1851
1852 DPRINT(("forcing unload\n"));
1853 /*
1854 * stop and unload, returning with state UNLOADED
1855 * and session unreserved.
1856 */
1857 pfm_context_unload(ctx, NULL, 0, regs);
1858
1859 DPRINT(("ctx_state=%d\n", ctx->ctx_state));
1860 }
1861 }
1862
1863 /*
1864 * remove virtual mapping, if any, for the calling task.
1865 * cannot reset ctx field until last user is calling close().
1866 *
1867 * ctx_smpl_vaddr must never be cleared because it is needed
1868 * by every task with access to the context
1869 *
1870 * When called from do_exit(), the mm context is gone already, therefore
1871 * mm is NULL, i.e., the VMA is already gone and we do not have to
1872 * do anything here
1873 */
1874 if (ctx->ctx_smpl_vaddr && current->mm) {
1875 smpl_buf_vaddr = ctx->ctx_smpl_vaddr;
1876 smpl_buf_size = ctx->ctx_smpl_size;
1877 }
1878
1879 UNPROTECT_CTX(ctx, flags);
1880
1881 /*
1882 * if there was a mapping, then we systematically remove it
1883 * at this point. Cannot be done inside critical section
1884 * because some VM function reenables interrupts.
1885 *
1886 */
1887 if (smpl_buf_vaddr) pfm_remove_smpl_mapping(current, smpl_buf_vaddr, smpl_buf_size);
1888
1889 return 0;
1890}
1891/*
1892 * called either on explicit close() or from exit_files().
1893 * Only the LAST user of the file gets to this point, i.e., it is
1894 * called only ONCE.
1895 *
1896 * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero
1897 * (fput()), i.e., last task to access the file. Nobody else can access the
1898 * file at this point.
1899 *
1900 * When called from exit_files(), the VMA has been freed because exit_mm()
1901 * is executed before exit_files().
1902 *
1903 * When called from exit_files(), the current task is not yet ZOMBIE but we
1904 * flush the PMU state to the context.
1905 */
1906static int
1907pfm_close(struct inode *inode, struct file *filp)
1908{
1909 pfm_context_t *ctx;
1910 struct task_struct *task;
1911 struct pt_regs *regs;
1912 DECLARE_WAITQUEUE(wait, current);
1913 unsigned long flags;
1914 unsigned long smpl_buf_size = 0UL;
1915 void *smpl_buf_addr = NULL;
1916 int free_possible = 1;
1917 int state, is_system;
1918
1919 DPRINT(("pfm_close called private=%p\n", filp->private_data));
1920
1921 if (PFM_IS_FILE(filp) == 0) {
1922 DPRINT(("bad magic\n"));
1923 return -EBADF;
1924 }
1925
1926 ctx = (pfm_context_t *)filp->private_data;
1927 if (ctx == NULL) {
1928 printk(KERN_ERR "perfmon: pfm_close: NULL ctx [%d]\n", current->pid);
1929 return -EBADF;
1930 }
1931
1932 PROTECT_CTX(ctx, flags);
1933
1934 state = ctx->ctx_state;
1935 is_system = ctx->ctx_fl_system;
1936
1937 task = PFM_CTX_TASK(ctx);
1938 regs = ia64_task_regs(task);
1939
1940 DPRINT(("ctx_state=%d is_current=%d\n",
1941 state,
1942 task == current ? 1 : 0));
1943
1944 /*
1945 * if task == current, then pfm_flush() unloaded the context
1946 */
1947 if (state == PFM_CTX_UNLOADED) goto doit;
1948
1949 /*
1950 * context is loaded/masked and task != current, we need to
1951 * either force an unload or go zombie
1952 */
1953
1954 /*
1955 * The task is currently blocked or will block after an overflow.
1956 * we must force it to wakeup to get out of the
1957 * MASKED state and transition to the unloaded state by itself.
1958 *
1959 * This situation is only possible for per-task mode
1960 */
1961 if (state == PFM_CTX_MASKED && CTX_OVFL_NOBLOCK(ctx) == 0) {
1962
1963 /*
1964 * set a "partial" zombie state to be checked
1965 * upon return from down() in pfm_handle_work().
1966 *
1967 * We cannot use the ZOMBIE state, because it is checked
1968 * by pfm_load_regs() which is called upon wakeup from down().
1969 * In such case, it would free the context and then we would
1970 * return to pfm_handle_work() which would access the
1971 * stale context. Instead, we set a flag invisible to pfm_load_regs()
1972 * but visible to pfm_handle_work().
1973 *
1974 * For some window of time, we have a zombie context with
1975 * ctx_state = MASKED and not ZOMBIE
1976 */
1977 ctx->ctx_fl_going_zombie = 1;
1978
1979 /*
1980 * force task to wake up from MASKED state
1981 */
1982 up(&ctx->ctx_restart_sem);
1983
1984 DPRINT(("waking up ctx_state=%d\n", state));
1985
1986 /*
1987 * put ourself to sleep waiting for the other
1988 * task to report completion
1989 *
1990 * the context is protected by mutex, therefore there
1991 * is no risk of being notified of completion before
1992	 * being actually on the waitq.
1993 */
1994 set_current_state(TASK_INTERRUPTIBLE);
1995 add_wait_queue(&ctx->ctx_zombieq, &wait);
1996
1997 UNPROTECT_CTX(ctx, flags);
1998
1999 /*
2000 * XXX: check for signals :
2001 * - ok for explicit close
2002 * - not ok when coming from exit_files()
2003 */
2004 schedule();
2005
2006
2007 PROTECT_CTX(ctx, flags);
2008
2009
2010 remove_wait_queue(&ctx->ctx_zombieq, &wait);
2011 set_current_state(TASK_RUNNING);
2012
2013 /*
2014 * context is unloaded at this point
2015 */
2016	DPRINT(("after zombie wakeup ctx_state=%d\n", state));
2017 }
2018 else if (task != current) {
2019#ifdef CONFIG_SMP
2020 /*
2021 * switch context to zombie state
2022 */
2023 ctx->ctx_state = PFM_CTX_ZOMBIE;
2024
2025 DPRINT(("zombie ctx for [%d]\n", task->pid));
2026 /*
2027 * cannot free the context on the spot. deferred until
2028 * the task notices the ZOMBIE state
2029 */
2030 free_possible = 0;
2031#else
2032 pfm_context_unload(ctx, NULL, 0, regs);
2033#endif
2034 }
2035
2036doit:
2037 /* reload state, may have changed during opening of critical section */
2038 state = ctx->ctx_state;
2039
2040 /*
2041 * the context is still attached to a task (possibly current)
2042 * we cannot destroy it right now
2043 */
2044
2045 /*
2046 * we must free the sampling buffer right here because
2047 * we cannot rely on it being cleaned up later by the
2048 * monitored task. It is not possible to free vmalloc'ed
2049 * memory in pfm_load_regs(). Instead, we remove the buffer
2050	 * now. Should there be subsequent PMU overflows originally
2051	 * meant for sampling, they will be converted to spurious
2052	 * and that's fine because the monitoring tool is gone anyway.
2053 */
2054 if (ctx->ctx_smpl_hdr) {
2055 smpl_buf_addr = ctx->ctx_smpl_hdr;
2056 smpl_buf_size = ctx->ctx_smpl_size;
2057 /* no more sampling */
2058 ctx->ctx_smpl_hdr = NULL;
2059 ctx->ctx_fl_is_sampling = 0;
2060 }
2061
2062 DPRINT(("ctx_state=%d free_possible=%d addr=%p size=%lu\n",
2063 state,
2064 free_possible,
2065 smpl_buf_addr,
2066 smpl_buf_size));
2067
2068 if (smpl_buf_addr) pfm_exit_smpl_buffer(ctx->ctx_buf_fmt);
2069
2070 /*
2071	 * UNLOADED means that the session has already been unreserved.
2072 */
2073 if (state == PFM_CTX_ZOMBIE) {
2074 pfm_unreserve_session(ctx, ctx->ctx_fl_system , ctx->ctx_cpu);
2075 }
2076
2077 /*
2078 * disconnect file descriptor from context must be done
2079 * before we unlock.
2080 */
2081 filp->private_data = NULL;
2082
2083 /*
2084	 * if we free on the spot, the context is now completely unreachable
2085 * from the callers side. The monitored task side is also cut, so we
2086 * can freely cut.
2087 *
2088 * If we have a deferred free, only the caller side is disconnected.
2089 */
2090 UNPROTECT_CTX(ctx, flags);
2091
2092 /*
2093 * All memory free operations (especially for vmalloc'ed memory)
2094 * MUST be done with interrupts ENABLED.
2095 */
2096 if (smpl_buf_addr) pfm_rvfree(smpl_buf_addr, smpl_buf_size);
2097
2098 /*
2099 * return the memory used by the context
2100 */
2101 if (free_possible) pfm_context_free(ctx);
2102
2103 return 0;
2104}
2105
2106static int
2107pfm_no_open(struct inode *irrelevant, struct file *dontcare)
2108{
2109 DPRINT(("pfm_no_open called\n"));
2110 return -ENXIO;
2111}
2112
2113
2114
2115static struct file_operations pfm_file_ops = {
2116 .llseek = no_llseek,
2117 .read = pfm_read,
2118 .write = pfm_write,
2119 .poll = pfm_poll,
2120 .ioctl = pfm_ioctl,
2121 .open = pfm_no_open, /* special open code to disallow open via /proc */
2122 .fasync = pfm_fasync,
2123 .release = pfm_close,
2124 .flush = pfm_flush
2125};
2126
2127static int
2128pfmfs_delete_dentry(struct dentry *dentry)
2129{
2130 return 1;
2131}
2132
2133static struct dentry_operations pfmfs_dentry_operations = {
2134 .d_delete = pfmfs_delete_dentry,
2135};
2136
2137
2138static int
2139pfm_alloc_fd(struct file **cfile)
2140{
2141 int fd, ret = 0;
2142 struct file *file = NULL;
2143 struct inode * inode;
2144 char name[32];
2145 struct qstr this;
2146
2147 fd = get_unused_fd();
2148 if (fd < 0) return -ENFILE;
2149
2150 ret = -ENFILE;
2151
2152 file = get_empty_filp();
2153 if (!file) goto out;
2154
2155 /*
2156 * allocate a new inode
2157 */
2158 inode = new_inode(pfmfs_mnt->mnt_sb);
2159 if (!inode) goto out;
2160
2161 DPRINT(("new inode ino=%ld @%p\n", inode->i_ino, inode));
2162
2163 inode->i_mode = S_IFCHR|S_IRUGO;
2164 inode->i_uid = current->fsuid;
2165 inode->i_gid = current->fsgid;
2166
2167 sprintf(name, "[%lu]", inode->i_ino);
2168 this.name = name;
2169 this.len = strlen(name);
2170 this.hash = inode->i_ino;
2171
2172 ret = -ENOMEM;
2173
2174 /*
2175 * allocate a new dcache entry
2176 */
2177 file->f_dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this);
2178 if (!file->f_dentry) goto out;
2179
2180 file->f_dentry->d_op = &pfmfs_dentry_operations;
2181
2182 d_add(file->f_dentry, inode);
2183 file->f_vfsmnt = mntget(pfmfs_mnt);
2184 file->f_mapping = inode->i_mapping;
2185
2186 file->f_op = &pfm_file_ops;
2187 file->f_mode = FMODE_READ;
2188 file->f_flags = O_RDONLY;
2189 file->f_pos = 0;
2190
2191 /*
2192 * may have to delay until context is attached?
2193 */
2194 fd_install(fd, file);
2195
2196 /*
2197 * the file structure we will use
2198 */
2199 *cfile = file;
2200
2201 return fd;
2202out:
2203 if (file) put_filp(file);
2204 put_unused_fd(fd);
2205 return ret;
2206}
2207
2208static void
2209pfm_free_fd(int fd, struct file *file)
2210{
2211 struct files_struct *files = current->files;
2212
2213 /*
2214	 * there is no fd_uninstall(), so we do it here
2215 */
2216 spin_lock(&files->file_lock);
2217 files->fd[fd] = NULL;
2218 spin_unlock(&files->file_lock);
2219
2220 if (file) put_filp(file);
2221 put_unused_fd(fd);
2222}
2223
2224static int
2225pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long addr, unsigned long size)
2226{
2227 DPRINT(("CPU%d buf=0x%lx addr=0x%lx size=%ld\n", smp_processor_id(), buf, addr, size));
2228
2229 while (size > 0) {
2230 unsigned long pfn = ia64_tpa(buf) >> PAGE_SHIFT;
2231
2232
2233 if (remap_pfn_range(vma, addr, pfn, PAGE_SIZE, PAGE_READONLY))
2234 return -ENOMEM;
2235
2236 addr += PAGE_SIZE;
2237 buf += PAGE_SIZE;
2238 size -= PAGE_SIZE;
2239 }
2240 return 0;
2241}
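
/*
 * Because the buffer comes from vmalloc(), consecutive virtual pages are
 * not physically contiguous: pfm_remap_buffer() therefore translates each
 * kernel page individually with ia64_tpa() and maps it read-only into the
 * user VMA, one PAGE_SIZE chunk at a time.
 */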
2242
2243/*
2244 * allocates a sampling buffer and remaps it into the user address space of the task
2245 */
2246static int
2247pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned long rsize, void **user_vaddr)
2248{
2249 struct mm_struct *mm = task->mm;
2250 struct vm_area_struct *vma = NULL;
2251 unsigned long size;
2252 void *smpl_buf;
2253
2254
2255 /*
2256 * the fixed header + requested size and align to page boundary
2257 */
2258 size = PAGE_ALIGN(rsize);
2259
2260 DPRINT(("sampling buffer rsize=%lu size=%lu bytes\n", rsize, size));
2261
2262 /*
2263 * check requested size to avoid Denial-of-service attacks
2264 * XXX: may have to refine this test
2265 * Check against address space limit.
2266 *
2267 * if ((mm->total_vm << PAGE_SHIFT) + len> task->rlim[RLIMIT_AS].rlim_cur)
2268 * return -ENOMEM;
2269 */
2270 if (size > task->signal->rlim[RLIMIT_MEMLOCK].rlim_cur)
2271 return -ENOMEM;
2272
2273 /*
2274 * We do the easy to undo allocations first.
2275 *
2276 * pfm_rvmalloc(), clears the buffer, so there is no leak
2277 */
2278 smpl_buf = pfm_rvmalloc(size);
2279 if (smpl_buf == NULL) {
2280 DPRINT(("Can't allocate sampling buffer\n"));
2281 return -ENOMEM;
2282 }
2283
2284 DPRINT(("smpl_buf @%p\n", smpl_buf));
2285
2286 /* allocate vma */
2287 vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2288 if (!vma) {
2289 DPRINT(("Cannot allocate vma\n"));
2290 goto error_kmem;
2291 }
2292 memset(vma, 0, sizeof(*vma));
2293
2294 /*
2295 * partially initialize the vma for the sampling buffer
2296 */
2297 vma->vm_mm = mm;
2298 vma->vm_flags = VM_READ| VM_MAYREAD |VM_RESERVED;
2299 vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */
2300
2301 /*
2302 * Now we have everything we need and we can initialize
2303 * and connect all the data structures
2304 */
2305
2306 ctx->ctx_smpl_hdr = smpl_buf;
2307 ctx->ctx_smpl_size = size; /* aligned size */
2308
2309 /*
2310 * Let's do the difficult operations next.
2311 *
2312 * now we atomically find some area in the address space and
2313 * remap the buffer in it.
2314 */
2315 down_write(&task->mm->mmap_sem);
2316
2317 /* find some free area in address space, must have mmap sem held */
2318 vma->vm_start = pfm_get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS, 0);
2319 if (vma->vm_start == 0UL) {
2320 DPRINT(("Cannot find unmapped area for size %ld\n", size));
2321 up_write(&task->mm->mmap_sem);
2322 goto error;
2323 }
2324 vma->vm_end = vma->vm_start + size;
2325 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
2326
2327 DPRINT(("aligned size=%ld, hdr=%p mapped @0x%lx\n", size, ctx->ctx_smpl_hdr, vma->vm_start));
2328
2329 /* can only be applied to current task, need to have the mm semaphore held when called */
2330 if (pfm_remap_buffer(vma, (unsigned long)smpl_buf, vma->vm_start, size)) {
2331 DPRINT(("Can't remap buffer\n"));
2332 up_write(&task->mm->mmap_sem);
2333 goto error;
2334 }
2335
2336 /*
2337 * now insert the vma in the vm list for the process, must be
2338 * done with mmap lock held
2339 */
2340 insert_vm_struct(mm, vma);
2341
2342 mm->total_vm += size >> PAGE_SHIFT;
2343 vm_stat_account(vma);
2344 up_write(&task->mm->mmap_sem);
2345
2346 /*
2347 * keep track of user level virtual address
2348 */
2349 ctx->ctx_smpl_vaddr = (void *)vma->vm_start;
2350 *(unsigned long *)user_vaddr = vma->vm_start;
2351
2352 return 0;
2353
2354error:
2355 kmem_cache_free(vm_area_cachep, vma);
2356error_kmem:
2357 pfm_rvfree(smpl_buf, size);
2358
2359 return -ENOMEM;
2360}
2361
2362/*
2363 * XXX: do something better here
2364 */
2365static int
2366pfm_bad_permissions(struct task_struct *task)
2367{
2368 /* inspired by ptrace_attach() */
2369 DPRINT(("cur: uid=%d gid=%d task: euid=%d suid=%d uid=%d egid=%d sgid=%d\n",
2370 current->uid,
2371 current->gid,
2372 task->euid,
2373 task->suid,
2374 task->uid,
2375 task->egid,
2376 task->sgid));
2377
2378 return ((current->uid != task->euid)
2379 || (current->uid != task->suid)
2380 || (current->uid != task->uid)
2381 || (current->gid != task->egid)
2382 || (current->gid != task->sgid)
2383 || (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE);
2384}
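
/*
 * As with ptrace_attach(), attaching to another task is refused unless
 * the caller's uid matches the target's real, effective and saved uid
 * (and likewise for the gid), or the caller has CAP_SYS_PTRACE.
 */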
2385
2386static int
2387pfarg_is_sane(struct task_struct *task, pfarg_context_t *pfx)
2388{
2389 int ctx_flags;
2390
2391 /* valid signal */
2392
2393 ctx_flags = pfx->ctx_flags;
2394
2395 if (ctx_flags & PFM_FL_SYSTEM_WIDE) {
2396
2397 /*
2398 * cannot block in this mode
2399 */
2400 if (ctx_flags & PFM_FL_NOTIFY_BLOCK) {
2401 DPRINT(("cannot use blocking mode when in system wide monitoring\n"));
2402 return -EINVAL;
2403 }
2404 } else {
2405 }
2406 /* probably more to add here */
2407
2408 return 0;
2409}
2410
2411static int
2412pfm_setup_buffer_fmt(struct task_struct *task, pfm_context_t *ctx, unsigned int ctx_flags,
2413 unsigned int cpu, pfarg_context_t *arg)
2414{
2415 pfm_buffer_fmt_t *fmt = NULL;
2416 unsigned long size = 0UL;
2417 void *uaddr = NULL;
2418 void *fmt_arg = NULL;
2419 int ret = 0;
2420#define PFM_CTXARG_BUF_ARG(a) (pfm_buffer_fmt_t *)(a+1)
2421
2422 /* invoke and lock buffer format, if found */
2423 fmt = pfm_find_buffer_fmt(arg->ctx_smpl_buf_id);
2424 if (fmt == NULL) {
2425 DPRINT(("[%d] cannot find buffer format\n", task->pid));
2426 return -EINVAL;
2427 }
2428
2429 /*
2430 * buffer argument MUST be contiguous to pfarg_context_t
2431 */
2432 if (fmt->fmt_arg_size) fmt_arg = PFM_CTXARG_BUF_ARG(arg);
2433
2434 ret = pfm_buf_fmt_validate(fmt, task, ctx_flags, cpu, fmt_arg);
2435
2436 DPRINT(("[%d] after validate(0x%x,%d,%p)=%d\n", task->pid, ctx_flags, cpu, fmt_arg, ret));
2437
2438 if (ret) goto error;
2439
2440 /* link buffer format and context */
2441 ctx->ctx_buf_fmt = fmt;
2442
2443 /*
2444 * check if buffer format wants to use perfmon buffer allocation/mapping service
2445 */
2446 ret = pfm_buf_fmt_getsize(fmt, task, ctx_flags, cpu, fmt_arg, &size);
2447 if (ret) goto error;
2448
2449 if (size) {
2450 /*
2451 * buffer is always remapped into the caller's address space
2452 */
2453 ret = pfm_smpl_buffer_alloc(current, ctx, size, &uaddr);
2454 if (ret) goto error;
2455
2456 /* keep track of user address of buffer */
2457 arg->ctx_smpl_vaddr = uaddr;
2458 }
2459 ret = pfm_buf_fmt_init(fmt, task, ctx->ctx_smpl_hdr, ctx_flags, cpu, fmt_arg);
2460
2461error:
2462 return ret;
2463}
2464
2465static void
2466pfm_reset_pmu_state(pfm_context_t *ctx)
2467{
2468 int i;
2469
2470 /*
2471 * install reset values for PMC.
2472 */
2473 for (i=1; PMC_IS_LAST(i) == 0; i++) {
2474 if (PMC_IS_IMPL(i) == 0) continue;
2475 ctx->ctx_pmcs[i] = PMC_DFL_VAL(i);
2476 DPRINT(("pmc[%d]=0x%lx\n", i, ctx->ctx_pmcs[i]));
2477 }
2478 /*
2479	 * PMD registers are set to 0UL when the context is memset()
2480 */
2481
2482 /*
2483	 * On context switch restore, we must restore ALL pmc and ALL pmd even
2484 * when they are not actively used by the task. In UP, the incoming process
2485 * may otherwise pick up left over PMC, PMD state from the previous process.
2486 * As opposed to PMD, stale PMC can cause harm to the incoming
2487 * process because they may change what is being measured.
2488 * Therefore, we must systematically reinstall the entire
2489 * PMC state. In SMP, the same thing is possible on the
2490	 * same CPU but also between 2 CPUs.
2491 *
2492 * The problem with PMD is information leaking especially
2493 * to user level when psr.sp=0
2494 *
2495 * There is unfortunately no easy way to avoid this problem
2496 * on either UP or SMP. This definitively slows down the
2497 * pfm_load_regs() function.
2498 */
2499
2500 /*
2501 * bitmask of all PMCs accessible to this context
2502 *
2503 * PMC0 is treated differently.
2504 */
2505 ctx->ctx_all_pmcs[0] = pmu_conf->impl_pmcs[0] & ~0x1;
2506
2507 /*
2508	 * bitmask of all PMDs that are accessible to this context
2509 */
2510 ctx->ctx_all_pmds[0] = pmu_conf->impl_pmds[0];
2511
2512 DPRINT(("<%d> all_pmcs=0x%lx all_pmds=0x%lx\n", ctx->ctx_fd, ctx->ctx_all_pmcs[0],ctx->ctx_all_pmds[0]));
2513
2514 /*
2515 * useful in case of re-enable after disable
2516 */
2517 ctx->ctx_used_ibrs[0] = 0UL;
2518 ctx->ctx_used_dbrs[0] = 0UL;
2519}
2520
2521static int
2522pfm_ctx_getsize(void *arg, size_t *sz)
2523{
2524 pfarg_context_t *req = (pfarg_context_t *)arg;
2525 pfm_buffer_fmt_t *fmt;
2526
2527 *sz = 0;
2528
2529 if (!pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) return 0;
2530
2531 fmt = pfm_find_buffer_fmt(req->ctx_smpl_buf_id);
2532 if (fmt == NULL) {
2533 DPRINT(("cannot find buffer format\n"));
2534 return -EINVAL;
2535 }
2536 /* get just enough to copy in user parameters */
2537 *sz = fmt->fmt_arg_size;
2538 DPRINT(("arg_size=%lu\n", *sz));
2539
2540 return 0;
2541}
2542
2543
2544
2545/*
2546 * cannot attach if :
2547 * - kernel task
2548 * - task not owned by caller
2549 * - task incompatible with context mode
2550 */
2551static int
2552pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task)
2553{
2554 /*
2555	 * no kernel task or task not owned by caller
2556 */
2557 if (task->mm == NULL) {
2558		DPRINT(("task [%d] has no memory context (kernel thread)\n", task->pid));
2559 return -EPERM;
2560 }
2561 if (pfm_bad_permissions(task)) {
2562 DPRINT(("no permission to attach to [%d]\n", task->pid));
2563 return -EPERM;
2564 }
2565 /*
2566 * cannot block in self-monitoring mode
2567 */
2568 if (CTX_OVFL_NOBLOCK(ctx) == 0 && task == current) {
2569 DPRINT(("cannot load a blocking context on self for [%d]\n", task->pid));
2570 return -EINVAL;
2571 }
2572
2573 if (task->exit_state == EXIT_ZOMBIE) {
2574 DPRINT(("cannot attach to zombie task [%d]\n", task->pid));
2575 return -EBUSY;
2576 }
2577
2578 /*
2579 * always ok for self
2580 */
2581 if (task == current) return 0;
2582
2583 if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) {
2584 DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task->pid, task->state));
2585 return -EBUSY;
2586 }
2587 /*
2588 * make sure the task is off any CPU
2589 */
2590 wait_task_inactive(task);
2591
2592 /* more to come... */
2593
2594 return 0;
2595}
2596
2597static int
2598pfm_get_task(pfm_context_t *ctx, pid_t pid, struct task_struct **task)
2599{
2600 struct task_struct *p = current;
2601 int ret;
2602
2603 /* XXX: need to add more checks here */
2604 if (pid < 2) return -EPERM;
2605
2606 if (pid != current->pid) {
2607
2608 read_lock(&tasklist_lock);
2609
2610 p = find_task_by_pid(pid);
2611
2612 /* make sure task cannot go away while we operate on it */
2613 if (p) get_task_struct(p);
2614
2615 read_unlock(&tasklist_lock);
2616
2617 if (p == NULL) return -ESRCH;
2618 }
2619
2620 ret = pfm_task_incompatible(ctx, p);
2621 if (ret == 0) {
2622 *task = p;
2623 } else if (p != current) {
2624 pfm_put_task(p);
2625 }
2626 return ret;
2627}
2628
2629
2630
2631static int
2632pfm_context_create(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
2633{
2634 pfarg_context_t *req = (pfarg_context_t *)arg;
2635 struct file *filp;
2636 int ctx_flags;
2637 int ret;
2638
2639 /* let's check the arguments first */
2640 ret = pfarg_is_sane(current, req);
2641 if (ret < 0) return ret;
2642
2643 ctx_flags = req->ctx_flags;
2644
2645 ret = -ENOMEM;
2646
2647 ctx = pfm_context_alloc();
2648 if (!ctx) goto error;
2649
2650 ret = pfm_alloc_fd(&filp);
2651 if (ret < 0) goto error_file;
2652
2653 req->ctx_fd = ctx->ctx_fd = ret;
2654
2655 /*
2656 * attach context to file
2657 */
2658 filp->private_data = ctx;
2659
2660 /*
2661 * does the user want to sample?
2662 */
2663 if (pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) {
2664 ret = pfm_setup_buffer_fmt(current, ctx, ctx_flags, 0, req);
2665 if (ret) goto buffer_error;
2666 }
2667
2668 /*
2669 * init context protection lock
2670 */
2671 spin_lock_init(&ctx->ctx_lock);
2672
2673 /*
2674 * context is unloaded
2675 */
2676 ctx->ctx_state = PFM_CTX_UNLOADED;
2677
2678 /*
2679 * initialization of context's flags
2680 */
2681 ctx->ctx_fl_block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0;
2682 ctx->ctx_fl_system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0;
2683 ctx->ctx_fl_is_sampling = ctx->ctx_buf_fmt ? 1 : 0; /* assume record() is defined */
2684 ctx->ctx_fl_no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0;
2685 /*
2686 * will move to set properties
2687 * ctx->ctx_fl_excl_idle = (ctx_flags & PFM_FL_EXCL_IDLE) ? 1: 0;
2688 */
2689
2690 /*
2691 * init restart semaphore to locked
2692 */
2693 sema_init(&ctx->ctx_restart_sem, 0);
2694
2695 /*
2696 * activation is used in SMP only
2697 */
2698 ctx->ctx_last_activation = PFM_INVALID_ACTIVATION;
2699 SET_LAST_CPU(ctx, -1);
2700
2701 /*
2702 * initialize notification message queue
2703 */
2704 ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0;
2705 init_waitqueue_head(&ctx->ctx_msgq_wait);
2706 init_waitqueue_head(&ctx->ctx_zombieq);
2707
2708 DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d no_msg=%d ctx_fd=%d \n",
2709 ctx,
2710 ctx_flags,
2711 ctx->ctx_fl_system,
2712 ctx->ctx_fl_block,
2713 ctx->ctx_fl_excl_idle,
2714 ctx->ctx_fl_no_msg,
2715 ctx->ctx_fd));
2716
2717 /*
2718 * initialize soft PMU state
2719 */
2720 pfm_reset_pmu_state(ctx);
2721
2722 return 0;
2723
2724buffer_error:
2725 pfm_free_fd(ctx->ctx_fd, filp);
2726
2727 if (ctx->ctx_buf_fmt) {
2728 pfm_buf_fmt_exit(ctx->ctx_buf_fmt, current, NULL, regs);
2729 }
2730error_file:
2731 pfm_context_free(ctx);
2732
2733error:
2734 return ret;
2735}
2736
2737static inline unsigned long
2738pfm_new_counter_value (pfm_counter_t *reg, int is_long_reset)
2739{
2740 unsigned long val = is_long_reset ? reg->long_reset : reg->short_reset;
2741 unsigned long new_seed, old_seed = reg->seed, mask = reg->mask;
2742 extern unsigned long carta_random32 (unsigned long seed);
2743
2744 if (reg->flags & PFM_REGFL_RANDOM) {
2745 new_seed = carta_random32(old_seed);
2746 val -= (old_seed & mask); /* counter values are negative numbers! */
2747 if ((mask >> 32) != 0)
2748 /* construct a full 64-bit random value: */
2749 new_seed |= carta_random32(old_seed >> 32) << 32;
2750 reg->seed = new_seed;
2751 }
2752 reg->lval = val;
2753 return val;
2754}
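
/*
 * The reset value starts from the long or short reset chosen by the
 * caller. With PFM_REGFL_RANDOM, the previous seed masked by reg->mask is
 * subtracted; since counter reset values are negative (they count up
 * toward overflow), this lengthens the next sampling period by up to
 * reg->mask events, which is what randomizes the sampling intervals.
 */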
2755
2756static void
2757pfm_reset_regs_masked(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset)
2758{
2759 unsigned long mask = ovfl_regs[0];
2760 unsigned long reset_others = 0UL;
2761 unsigned long val;
2762 int i;
2763
2764 /*
2765 * now restore reset value on sampling overflowed counters
2766 */
2767 mask >>= PMU_FIRST_COUNTER;
2768 for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) {
2769
2770 if ((mask & 0x1UL) == 0UL) continue;
2771
2772 ctx->ctx_pmds[i].val = val = pfm_new_counter_value(ctx->ctx_pmds+ i, is_long_reset);
2773 reset_others |= ctx->ctx_pmds[i].reset_pmds[0];
2774
2775 DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", i, val));
2776 }
2777
2778 /*
2779 * Now take care of resetting the other registers
2780 */
2781 for(i = 0; reset_others; i++, reset_others >>= 1) {
2782
2783 if ((reset_others & 0x1) == 0) continue;
2784
2785 ctx->ctx_pmds[i].val = val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset);
2786
2787 DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n",
2788 is_long_reset ? "long" : "short", i, val));
2789 }
2790}
2791
2792static void
2793pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset)
2794{
2795 unsigned long mask = ovfl_regs[0];
2796 unsigned long reset_others = 0UL;
2797 unsigned long val;
2798 int i;
2799
2800 DPRINT_ovfl(("ovfl_regs=0x%lx is_long_reset=%d\n", ovfl_regs[0], is_long_reset));
2801
2802 if (ctx->ctx_state == PFM_CTX_MASKED) {
2803 pfm_reset_regs_masked(ctx, ovfl_regs, is_long_reset);
2804 return;
2805 }
2806
2807 /*
2808 * now restore reset value on sampling overflowed counters
2809 */
2810 mask >>= PMU_FIRST_COUNTER;
2811 for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) {
2812
2813 if ((mask & 0x1UL) == 0UL) continue;
2814
2815 val = pfm_new_counter_value(ctx->ctx_pmds+ i, is_long_reset);
2816 reset_others |= ctx->ctx_pmds[i].reset_pmds[0];
2817
2818 DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", i, val));
2819
2820 pfm_write_soft_counter(ctx, i, val);
2821 }
2822
2823 /*
2824 * Now take care of resetting the other registers
2825 */
2826 for(i = 0; reset_others; i++, reset_others >>= 1) {
2827
2828 if ((reset_others & 0x1) == 0) continue;
2829
2830 val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset);
2831
2832 if (PMD_IS_COUNTING(i)) {
2833 pfm_write_soft_counter(ctx, i, val);
2834 } else {
2835 ia64_set_pmd(i, val);
2836 }
2837 DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n",
2838 is_long_reset ? "long" : "short", i, val));
2839 }
2840 ia64_srlz_d();
2841}
2842
2843static int
2844pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
2845{
2846 struct thread_struct *thread = NULL;
2847 struct task_struct *task;
2848 pfarg_reg_t *req = (pfarg_reg_t *)arg;
2849 unsigned long value, pmc_pm;
2850 unsigned long smpl_pmds, reset_pmds, impl_pmds;
2851 unsigned int cnum, reg_flags, flags, pmc_type;
2852 int i, can_access_pmu = 0, is_loaded, is_system, expert_mode;
2853 int is_monitor, is_counting, state;
2854 int ret = -EINVAL;
2855 pfm_reg_check_t wr_func;
2856#define PFM_CHECK_PMC_PM(x, y, z) ((x)->ctx_fl_system ^ PMC_PM(y, z))
2857
2858 state = ctx->ctx_state;
2859 is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
2860 is_system = ctx->ctx_fl_system;
2861 task = ctx->ctx_task;
2862 impl_pmds = pmu_conf->impl_pmds[0];
2863
2864 if (state == PFM_CTX_ZOMBIE) return -EINVAL;
2865
2866 if (is_loaded) {
2867 thread = &task->thread;
2868 /*
2869 * In system wide and when the context is loaded, access can only happen
2870 * when the caller is running on the CPU being monitored by the session.
2871 * It does not have to be the owner (ctx_task) of the context per se.
2872 */
2873 if (is_system && ctx->ctx_cpu != smp_processor_id()) {
2874 DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
2875 return -EBUSY;
2876 }
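		/*
		 * we can touch the PMU registers directly only when the caller
		 * currently owns the PMU (self-monitoring, or the lazy ctxsw has
		 * not yet evicted the state on UP) or in system-wide mode where
		 * the session is pinned to this CPU.  Otherwise the new values
		 * are only recorded in the context and reloaded later.
		 */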
2877 can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0;
2878 }
2879 expert_mode = pfm_sysctl.expert_mode;
2880
2881 for (i = 0; i < count; i++, req++) {
2882
2883 cnum = req->reg_num;
2884 reg_flags = req->reg_flags;
2885 value = req->reg_value;
2886 smpl_pmds = req->reg_smpl_pmds[0];
2887 reset_pmds = req->reg_reset_pmds[0];
2888 flags = 0;
2889
2890
2891 if (cnum >= PMU_MAX_PMCS) {
2892 DPRINT(("pmc%u is invalid\n", cnum));
2893 goto error;
2894 }
2895
2896 pmc_type = pmu_conf->pmc_desc[cnum].type;
2897 pmc_pm = (value >> pmu_conf->pmc_desc[cnum].pm_pos) & 0x1;
2898 is_counting = (pmc_type & PFM_REG_COUNTING) == PFM_REG_COUNTING ? 1 : 0;
2899 is_monitor = (pmc_type & PFM_REG_MONITOR) == PFM_REG_MONITOR ? 1 : 0;
2900
2901 /*
2902		 * we reject all non-implemented PMCs as well
2903		 * as attempts to modify PMC[0-3], which are used
2904		 * as status registers by the PMU
2905 */
2906 if ((pmc_type & PFM_REG_IMPL) == 0 || (pmc_type & PFM_REG_CONTROL) == PFM_REG_CONTROL) {
2907 DPRINT(("pmc%u is unimplemented or no-access pmc_type=%x\n", cnum, pmc_type));
2908 goto error;
2909 }
2910 wr_func = pmu_conf->pmc_desc[cnum].write_check;
2911 /*
2912 * If the PMC is a monitor, then if the value is not the default:
2913 * - system-wide session: PMCx.pm=1 (privileged monitor)
2914 * - per-task : PMCx.pm=0 (user monitor)
2915 */
2916 if (is_monitor && value != PMC_DFL_VAL(cnum) && is_system ^ pmc_pm) {
2917 DPRINT(("pmc%u pmc_pm=%lu is_system=%d\n",
2918 cnum,
2919 pmc_pm,
2920 is_system));
2921 goto error;
2922 }
2923
2924 if (is_counting) {
2925 /*
2926 * enforce generation of overflow interrupt. Necessary on all
2927 * CPUs.
2928 */
2929 value |= 1 << PMU_PMC_OI;
2930
2931 if (reg_flags & PFM_REGFL_OVFL_NOTIFY) {
2932 flags |= PFM_REGFL_OVFL_NOTIFY;
2933 }
2934
2935 if (reg_flags & PFM_REGFL_RANDOM) flags |= PFM_REGFL_RANDOM;
2936
2937 /* verify validity of smpl_pmds */
2938 if ((smpl_pmds & impl_pmds) != smpl_pmds) {
2939 DPRINT(("invalid smpl_pmds 0x%lx for pmc%u\n", smpl_pmds, cnum));
2940 goto error;
2941 }
2942
2943 /* verify validity of reset_pmds */
2944 if ((reset_pmds & impl_pmds) != reset_pmds) {
2945 DPRINT(("invalid reset_pmds 0x%lx for pmc%u\n", reset_pmds, cnum));
2946 goto error;
2947 }
2948 } else {
2949 if (reg_flags & (PFM_REGFL_OVFL_NOTIFY|PFM_REGFL_RANDOM)) {
2950 DPRINT(("cannot set ovfl_notify or random on pmc%u\n", cnum));
2951 goto error;
2952 }
2953			/* eventid on non-counting monitors is ignored */
2954 }
2955
2956 /*
2957 * execute write checker, if any
2958 */
2959 if (likely(expert_mode == 0 && wr_func)) {
2960 ret = (*wr_func)(task, ctx, cnum, &value, regs);
2961 if (ret) goto error;
2962 ret = -EINVAL;
2963 }
2964
2965 /*
2966 * no error on this register
2967 */
2968 PFM_REG_RETFLAG_SET(req->reg_flags, 0);
2969
2970 /*
2971 * Now we commit the changes to the software state
2972 */
2973
2974 /*
2975 * update overflow information
2976 */
2977 if (is_counting) {
2978 /*
2979 * full flag update each time a register is programmed
2980 */
2981 ctx->ctx_pmds[cnum].flags = flags;
2982
2983 ctx->ctx_pmds[cnum].reset_pmds[0] = reset_pmds;
2984 ctx->ctx_pmds[cnum].smpl_pmds[0] = smpl_pmds;
2985 ctx->ctx_pmds[cnum].eventid = req->reg_smpl_eventid;
2986
2987 /*
2988 * Mark all PMDS to be accessed as used.
2989 *
2990 * We do not keep track of PMC because we have to
2991 * systematically restore ALL of them.
2992 *
2993 * We do not update the used_monitors mask, because
2994		 * if we have not programmed them, they will be in
2995		 * a quiescent state and therefore we will not need to
2996		 * mask/restore them when the context is MASKED.
2997 */
2998 CTX_USED_PMD(ctx, reset_pmds);
2999 CTX_USED_PMD(ctx, smpl_pmds);
3000 /*
3001 * make sure we do not try to reset on
3002 * restart because we have established new values
3003 */
3004 if (state == PFM_CTX_MASKED) ctx->ctx_ovfl_regs[0] &= ~1UL << cnum;
3005 }
3006 /*
3007 * Needed in case the user does not initialize the equivalent
3008 * PMD. Clearing is done indirectly via pfm_reset_pmu_state() so there is no
3009 * possible leak here.
3010 */
3011 CTX_USED_PMD(ctx, pmu_conf->pmc_desc[cnum].dep_pmd[0]);
3012
3013 /*
3014 * keep track of the monitor PMC that we are using.
3015 * we save the value of the pmc in ctx_pmcs[] and if
3016 * the monitoring is not stopped for the context we also
3017 * place it in the saved state area so that it will be
3018 * picked up later by the context switch code.
3019 *
3020 * The value in ctx_pmcs[] can only be changed in pfm_write_pmcs().
3021 *
3022 * The value in thread->pmcs[] may be modified on overflow, i.e., when
3023 * monitoring needs to be stopped.
3024 */
3025 if (is_monitor) CTX_USED_MONITOR(ctx, 1UL << cnum);
3026
3027 /*
3028 * update context state
3029 */
3030 ctx->ctx_pmcs[cnum] = value;
3031
3032 if (is_loaded) {
3033 /*
3034 * write thread state
3035 */
3036 if (is_system == 0) thread->pmcs[cnum] = value;
3037
3038 /*
3039 * write hardware register if we can
3040 */
3041 if (can_access_pmu) {
3042 ia64_set_pmc(cnum, value);
3043 }
3044#ifdef CONFIG_SMP
3045 else {
3046 /*
3047 * per-task SMP only here
3048 *
3049 * we are guaranteed that the task is not running on the other CPU,
3050 * we indicate that this PMD will need to be reloaded if the task
3051 * is rescheduled on the CPU it ran last on.
3052 */
3053 ctx->ctx_reload_pmcs[0] |= 1UL << cnum;
3054 }
3055#endif
3056 }
3057
3058 DPRINT(("pmc[%u]=0x%lx ld=%d apmu=%d flags=0x%x all_pmcs=0x%lx used_pmds=0x%lx eventid=%ld smpl_pmds=0x%lx reset_pmds=0x%lx reloads_pmcs=0x%lx used_monitors=0x%lx ovfl_regs=0x%lx\n",
3059 cnum,
3060 value,
3061 is_loaded,
3062 can_access_pmu,
3063 flags,
3064 ctx->ctx_all_pmcs[0],
3065 ctx->ctx_used_pmds[0],
3066 ctx->ctx_pmds[cnum].eventid,
3067 smpl_pmds,
3068 reset_pmds,
3069 ctx->ctx_reload_pmcs[0],
3070 ctx->ctx_used_monitors[0],
3071 ctx->ctx_ovfl_regs[0]));
3072 }
3073
3074 /*
3075 * make sure the changes are visible
3076 */
3077 if (can_access_pmu) ia64_srlz_d();
3078
3079 return 0;
3080error:
3081 PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL);
3082 return ret;
3083}
3084
3085static int
3086pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
3087{
3088 struct thread_struct *thread = NULL;
3089 struct task_struct *task;
3090 pfarg_reg_t *req = (pfarg_reg_t *)arg;
3091 unsigned long value, hw_value, ovfl_mask;
3092 unsigned int cnum;
3093 int i, can_access_pmu = 0, state;
3094 int is_counting, is_loaded, is_system, expert_mode;
3095 int ret = -EINVAL;
3096 pfm_reg_check_t wr_func;
3097
3098
3099 state = ctx->ctx_state;
3100 is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
3101 is_system = ctx->ctx_fl_system;
3102 ovfl_mask = pmu_conf->ovfl_val;
3103 task = ctx->ctx_task;
3104
3105 if (unlikely(state == PFM_CTX_ZOMBIE)) return -EINVAL;
3106
3107 /*
3108 * on both UP and SMP, we can only write to the PMC when the task is
3109 * the owner of the local PMU.
3110 */
3111 if (likely(is_loaded)) {
3112 thread = &task->thread;
3113 /*
3114 * In system wide and when the context is loaded, access can only happen
3115 * when the caller is running on the CPU being monitored by the session.
3116 * It does not have to be the owner (ctx_task) of the context per se.
3117 */
3118 if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) {
3119 DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
3120 return -EBUSY;
3121 }
3122 can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0;
3123 }
3124 expert_mode = pfm_sysctl.expert_mode;
3125
3126 for (i = 0; i < count; i++, req++) {
3127
3128 cnum = req->reg_num;
3129 value = req->reg_value;
3130
3131 if (!PMD_IS_IMPL(cnum)) {
3132 DPRINT(("pmd[%u] is unimplemented or invalid\n", cnum));
3133 goto abort_mission;
3134 }
3135 is_counting = PMD_IS_COUNTING(cnum);
3136 wr_func = pmu_conf->pmd_desc[cnum].write_check;
3137
3138 /*
3139 * execute write checker, if any
3140 */
3141 if (unlikely(expert_mode == 0 && wr_func)) {
3142 unsigned long v = value;
3143
3144 ret = (*wr_func)(task, ctx, cnum, &v, regs);
3145 if (ret) goto abort_mission;
3146
3147 value = v;
3148 ret = -EINVAL;
3149 }
3150
3151 /*
3152 * no error on this register
3153 */
3154 PFM_REG_RETFLAG_SET(req->reg_flags, 0);
3155
3156 /*
3157 * now commit changes to software state
3158 */
3159 hw_value = value;
3160
3161 /*
3162 * update virtualized (64bits) counter
3163 */
3164 if (is_counting) {
3165 /*
3166 * write context state
3167 */
3168 ctx->ctx_pmds[cnum].lval = value;
3169
3170 /*
3171		 * when the context is loaded we use the split value
3172 */
3173 if (is_loaded) {
3174 hw_value = value & ovfl_mask;
3175 value = value & ~ovfl_mask;
3176 }
3177 }
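		/*
		 * Illustrative sketch (counter width assumed, not taken from a
		 * specific PMU model): with 47-bit hardware counters, ovfl_mask
		 * would be 0x00007fffffffffff.  Writing a 64-bit virtual value V
		 * splits it as:
		 *
		 *	hw_value = V & ovfl_mask;	// programmed into the PMD
		 *	soft     = V & ~ovfl_mask;	// kept in ctx_pmds[cnum].val
		 *
		 * The full 64-bit counter is later reconstructed in
		 * pfm_read_pmds() as (pmd & ovfl_mask) + ctx_pmds[cnum].val.
		 */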
3178 /*
3179 * update reset values (not just for counters)
3180 */
3181 ctx->ctx_pmds[cnum].long_reset = req->reg_long_reset;
3182 ctx->ctx_pmds[cnum].short_reset = req->reg_short_reset;
3183
3184 /*
3185 * update randomization parameters (not just for counters)
3186 */
3187 ctx->ctx_pmds[cnum].seed = req->reg_random_seed;
3188 ctx->ctx_pmds[cnum].mask = req->reg_random_mask;
3189
3190 /*
3191 * update context value
3192 */
3193 ctx->ctx_pmds[cnum].val = value;
3194
3195 /*
3196 * Keep track of what we use
3197 *
3198 * We do not keep track of PMC because we have to
3199 * systematically restore ALL of them.
3200 */
3201 CTX_USED_PMD(ctx, PMD_PMD_DEP(cnum));
3202
3203 /*
3204 * mark this PMD register used as well
3205 */
3206 CTX_USED_PMD(ctx, RDEP(cnum));
3207
3208 /*
3209 * make sure we do not try to reset on
3210 * restart because we have established new values
3211 */
3212 if (is_counting && state == PFM_CTX_MASKED) {
3213 ctx->ctx_ovfl_regs[0] &= ~1UL << cnum;
3214 }
3215
3216 if (is_loaded) {
3217 /*
3218 * write thread state
3219 */
3220 if (is_system == 0) thread->pmds[cnum] = hw_value;
3221
3222 /*
3223 * write hardware register if we can
3224 */
3225 if (can_access_pmu) {
3226 ia64_set_pmd(cnum, hw_value);
3227 } else {
3228#ifdef CONFIG_SMP
3229 /*
3230 * we are guaranteed that the task is not running on the other CPU,
3231 * we indicate that this PMD will need to be reloaded if the task
3232 * is rescheduled on the CPU it ran last on.
3233 */
3234 ctx->ctx_reload_pmds[0] |= 1UL << cnum;
3235#endif
3236 }
3237 }
3238
3239 DPRINT(("pmd[%u]=0x%lx ld=%d apmu=%d, hw_value=0x%lx ctx_pmd=0x%lx short_reset=0x%lx "
3240 "long_reset=0x%lx notify=%c seed=0x%lx mask=0x%lx used_pmds=0x%lx reset_pmds=0x%lx reload_pmds=0x%lx all_pmds=0x%lx ovfl_regs=0x%lx\n",
3241 cnum,
3242 value,
3243 is_loaded,
3244 can_access_pmu,
3245 hw_value,
3246 ctx->ctx_pmds[cnum].val,
3247 ctx->ctx_pmds[cnum].short_reset,
3248 ctx->ctx_pmds[cnum].long_reset,
3249 PMC_OVFL_NOTIFY(ctx, cnum) ? 'Y':'N',
3250 ctx->ctx_pmds[cnum].seed,
3251 ctx->ctx_pmds[cnum].mask,
3252 ctx->ctx_used_pmds[0],
3253 ctx->ctx_pmds[cnum].reset_pmds[0],
3254 ctx->ctx_reload_pmds[0],
3255 ctx->ctx_all_pmds[0],
3256 ctx->ctx_ovfl_regs[0]));
3257 }
3258
3259 /*
3260 * make changes visible
3261 */
3262 if (can_access_pmu) ia64_srlz_d();
3263
3264 return 0;
3265
3266abort_mission:
3267 /*
3268 * for now, we have only one possibility for error
3269 */
3270 PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL);
3271 return ret;
3272}
3273
3274/*
3275 * By way of PROTECT_CONTEXT(), interrupts are masked while we are in this function.
3276 * Therefore we know we do not have to worry about the PMU overflow interrupt. If an
3277 * interrupt is delivered during the call, it will be kept pending until we leave, making
3278 * it appear as if it had been generated at the UNPROTECT_CONTEXT(). At least we are
3279 * guaranteed to return consistent data to the user; it may simply be old. It is not
3280 * trivial to handle the overflow while inside the call because you may end up in
3281 * some module sampling buffer code, causing deadlocks.
3282 */
3283static int
3284pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
3285{
3286 struct thread_struct *thread = NULL;
3287 struct task_struct *task;
3288 unsigned long val = 0UL, lval, ovfl_mask, sval;
3289 pfarg_reg_t *req = (pfarg_reg_t *)arg;
3290 unsigned int cnum, reg_flags = 0;
3291 int i, can_access_pmu = 0, state;
3292 int is_loaded, is_system, is_counting, expert_mode;
3293 int ret = -EINVAL;
3294 pfm_reg_check_t rd_func;
3295
3296 /*
3297 * access is possible when loaded only for
3298 * self-monitoring tasks or in UP mode
3299 */
3300
3301 state = ctx->ctx_state;
3302 is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
3303 is_system = ctx->ctx_fl_system;
3304 ovfl_mask = pmu_conf->ovfl_val;
3305 task = ctx->ctx_task;
3306
3307 if (state == PFM_CTX_ZOMBIE) return -EINVAL;
3308
3309 if (likely(is_loaded)) {
3310 thread = &task->thread;
3311 /*
3312 * In system wide and when the context is loaded, access can only happen
3313 * when the caller is running on the CPU being monitored by the session.
3314 * It does not have to be the owner (ctx_task) of the context per se.
3315 */
3316 if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) {
3317 DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
3318 return -EBUSY;
3319 }
3320 /*
3321		 * this can be true even when not self-monitoring, but only on UP
3322 */
3323 can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0;
3324
3325 if (can_access_pmu) ia64_srlz_d();
3326 }
3327 expert_mode = pfm_sysctl.expert_mode;
3328
3329 DPRINT(("ld=%d apmu=%d ctx_state=%d\n",
3330 is_loaded,
3331 can_access_pmu,
3332 state));
3333
3334 /*
3335 * on both UP and SMP, we can only read the PMD from the hardware register when
3336 * the task is the owner of the local PMU.
3337 */
3338
3339 for (i = 0; i < count; i++, req++) {
3340
3341 cnum = req->reg_num;
3342 reg_flags = req->reg_flags;
3343
3344 if (unlikely(!PMD_IS_IMPL(cnum))) goto error;
3345 /*
3346		 * we can only read the registers that we use. That includes
3347		 * the ones we explicitly initialize AND the ones we want included
3348 * in the sampling buffer (smpl_regs).
3349 *
3350 * Having this restriction allows optimization in the ctxsw routine
3351 * without compromising security (leaks)
3352 */
3353 if (unlikely(!CTX_IS_USED_PMD(ctx, cnum))) goto error;
3354
3355 sval = ctx->ctx_pmds[cnum].val;
3356 lval = ctx->ctx_pmds[cnum].lval;
3357 is_counting = PMD_IS_COUNTING(cnum);
3358
3359 /*
3360 * If the task is not the current one, then we check if the
3361 * PMU state is still in the local live register due to lazy ctxsw.
3362 * If true, then we read directly from the registers.
3363 */
3364 if (can_access_pmu){
3365 val = ia64_get_pmd(cnum);
3366 } else {
3367 /*
3368 * context has been saved
3369 * if context is zombie, then task does not exist anymore.
3370 * In this case, we use the full value saved in the context (pfm_flush_regs()).
3371 */
3372 val = is_loaded ? thread->pmds[cnum] : 0UL;
3373 }
3374 rd_func = pmu_conf->pmd_desc[cnum].read_check;
3375
3376 if (is_counting) {
3377 /*
3378 * XXX: need to check for overflow when loaded
3379 */
3380 val &= ovfl_mask;
3381 val += sval;
3382 }
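		/*
		 * at this point val holds the full 64-bit virtual counter:
		 * the hardware low bits (masked by ovfl_mask) plus the
		 * software-accumulated upper part kept in ctx_pmds[cnum].val
		 */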
3383
3384 /*
3385 * execute read checker, if any
3386 */
3387 if (unlikely(expert_mode == 0 && rd_func)) {
3388 unsigned long v = val;
3389 ret = (*rd_func)(ctx->ctx_task, ctx, cnum, &v, regs);
3390 if (ret) goto error;
3391 val = v;
3392 ret = -EINVAL;
3393 }
3394
3395 PFM_REG_RETFLAG_SET(reg_flags, 0);
3396
3397 DPRINT(("pmd[%u]=0x%lx\n", cnum, val));
3398
3399 /*
3400 * update register return value, abort all if problem during copy.
3401	 * we only modify the reg_flags field. Skipping the access check here is fine because
3402 * access has been verified upfront in sys_perfmonctl().
3403 */
3404 req->reg_value = val;
3405 req->reg_flags = reg_flags;
3406 req->reg_last_reset_val = lval;
3407 }
3408
3409 return 0;
3410
3411error:
3412 PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL);
3413 return ret;
3414}
3415
3416int
3417pfm_mod_write_pmcs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs)
3418{
3419 pfm_context_t *ctx;
3420
3421 if (req == NULL) return -EINVAL;
3422
3423 ctx = GET_PMU_CTX();
3424
3425 if (ctx == NULL) return -EINVAL;
3426
3427 /*
3428 * for now limit to current task, which is enough when calling
3429 * from overflow handler
3430 */
3431 if (task != current && ctx->ctx_fl_system == 0) return -EBUSY;
3432
3433 return pfm_write_pmcs(ctx, req, nreq, regs);
3434}
3435EXPORT_SYMBOL(pfm_mod_write_pmcs);
3436
3437int
3438pfm_mod_read_pmds(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs)
3439{
3440 pfm_context_t *ctx;
3441
3442 if (req == NULL) return -EINVAL;
3443
3444 ctx = GET_PMU_CTX();
3445
3446 if (ctx == NULL) return -EINVAL;
3447
3448 /*
3449 * for now limit to current task, which is enough when calling
3450 * from overflow handler
3451 */
3452 if (task != current && ctx->ctx_fl_system == 0) return -EBUSY;
3453
3454 return pfm_read_pmds(ctx, req, nreq, regs);
3455}
3456EXPORT_SYMBOL(pfm_mod_read_pmds);
3457
3458/*
3459 * Only call this function when a process is trying to
3460 * write the debug registers (reading is always allowed)
3461 */
3462int
3463pfm_use_debug_registers(struct task_struct *task)
3464{
3465 pfm_context_t *ctx = task->thread.pfm_context;
3466 unsigned long flags;
3467 int ret = 0;
3468
3469 if (pmu_conf->use_rr_dbregs == 0) return 0;
3470
3471 DPRINT(("called for [%d]\n", task->pid));
3472
3473 /*
3474 * do it only once
3475 */
3476 if (task->thread.flags & IA64_THREAD_DBG_VALID) return 0;
3477
3478 /*
3479 * Even on SMP, we do not need to use an atomic here because
3480 * the only way in is via ptrace() and this is possible only when the
3481 * process is stopped. Even in the case where the ctxsw out is not totally
3482 * completed by the time we come here, there is no way the 'stopped' process
3483 * could be in the middle of fiddling with the pfm_write_ibr_dbr() routine.
3484 * So this is always safe.
3485 */
3486 if (ctx && ctx->ctx_fl_using_dbreg == 1) return -1;
3487
3488 LOCK_PFS(flags);
3489
3490 /*
3491 * We cannot allow setting breakpoints when system wide monitoring
3492 * sessions are using the debug registers.
3493 */
3494	if (pfm_sessions.pfs_sys_use_dbregs > 0)
3495 ret = -1;
3496 else
3497 pfm_sessions.pfs_ptrace_use_dbregs++;
3498
3499 DPRINT(("ptrace_use_dbregs=%u sys_use_dbregs=%u by [%d] ret = %d\n",
3500 pfm_sessions.pfs_ptrace_use_dbregs,
3501 pfm_sessions.pfs_sys_use_dbregs,
3502 task->pid, ret));
3503
3504 UNLOCK_PFS(flags);
3505
3506 return ret;
3507}
3508
3509/*
3510 * This function is called for every task that exits with the
3511 * IA64_THREAD_DBG_VALID set. This indicates a task which was
3512 * able to use the debug registers for debugging purposes via
3513 * ptrace(). Therefore we know it was not using them for
3514 * performance monitoring, so we only decrement the number
3515 * of "ptraced" debug register users to keep the count up to date.
3516 */
3517int
3518pfm_release_debug_registers(struct task_struct *task)
3519{
3520 unsigned long flags;
3521 int ret;
3522
3523 if (pmu_conf->use_rr_dbregs == 0) return 0;
3524
3525 LOCK_PFS(flags);
3526 if (pfm_sessions.pfs_ptrace_use_dbregs == 0) {
3527 printk(KERN_ERR "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n", task->pid);
3528 ret = -1;
3529 } else {
3530 pfm_sessions.pfs_ptrace_use_dbregs--;
3531 ret = 0;
3532 }
3533 UNLOCK_PFS(flags);
3534
3535 return ret;
3536}
3537
3538static int
3539pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
3540{
3541 struct task_struct *task;
3542 pfm_buffer_fmt_t *fmt;
3543 pfm_ovfl_ctrl_t rst_ctrl;
3544 int state, is_system;
3545 int ret = 0;
3546
3547 state = ctx->ctx_state;
3548 fmt = ctx->ctx_buf_fmt;
3549 is_system = ctx->ctx_fl_system;
3550 task = PFM_CTX_TASK(ctx);
3551
3552 switch(state) {
3553 case PFM_CTX_MASKED:
3554 break;
3555 case PFM_CTX_LOADED:
3556 if (CTX_HAS_SMPL(ctx) && fmt->fmt_restart_active) break;
3557 /* fall through */
3558 case PFM_CTX_UNLOADED:
3559 case PFM_CTX_ZOMBIE:
3560 DPRINT(("invalid state=%d\n", state));
3561 return -EBUSY;
3562 default:
3563 DPRINT(("state=%d, cannot operate (no active_restart handler)\n", state));
3564 return -EINVAL;
3565 }
3566
3567 /*
3568 * In system wide and when the context is loaded, access can only happen
3569 * when the caller is running on the CPU being monitored by the session.
3570 * It does not have to be the owner (ctx_task) of the context per se.
3571 */
3572 if (is_system && ctx->ctx_cpu != smp_processor_id()) {
3573 DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
3574 return -EBUSY;
3575 }
3576
3577 /* sanity check */
3578 if (unlikely(task == NULL)) {
3579 printk(KERN_ERR "perfmon: [%d] pfm_restart no task\n", current->pid);
3580 return -EINVAL;
3581 }
3582
3583 if (task == current || is_system) {
3584
3585 fmt = ctx->ctx_buf_fmt;
3586
3587 DPRINT(("restarting self %d ovfl=0x%lx\n",
3588 task->pid,
3589 ctx->ctx_ovfl_regs[0]));
3590
3591 if (CTX_HAS_SMPL(ctx)) {
3592
3593 prefetch(ctx->ctx_smpl_hdr);
3594
3595 rst_ctrl.bits.mask_monitoring = 0;
3596 rst_ctrl.bits.reset_ovfl_pmds = 0;
3597
3598 if (state == PFM_CTX_LOADED)
3599 ret = pfm_buf_fmt_restart_active(fmt, task, &rst_ctrl, ctx->ctx_smpl_hdr, regs);
3600 else
3601 ret = pfm_buf_fmt_restart(fmt, task, &rst_ctrl, ctx->ctx_smpl_hdr, regs);
3602 } else {
3603 rst_ctrl.bits.mask_monitoring = 0;
3604 rst_ctrl.bits.reset_ovfl_pmds = 1;
3605 }
3606
3607 if (ret == 0) {
3608 if (rst_ctrl.bits.reset_ovfl_pmds)
3609 pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_PMD_LONG_RESET);
3610
3611 if (rst_ctrl.bits.mask_monitoring == 0) {
3612 DPRINT(("resuming monitoring for [%d]\n", task->pid));
3613
3614 if (state == PFM_CTX_MASKED) pfm_restore_monitoring(task);
3615 } else {
3616 DPRINT(("keeping monitoring stopped for [%d]\n", task->pid));
3617
3618 // cannot use pfm_stop_monitoring(task, regs);
3619 }
3620 }
3621 /*
3622 * clear overflowed PMD mask to remove any stale information
3623 */
3624 ctx->ctx_ovfl_regs[0] = 0UL;
3625
3626 /*
3627 * back to LOADED state
3628 */
3629 ctx->ctx_state = PFM_CTX_LOADED;
3630
3631 /*
3632 * XXX: not really useful for self monitoring
3633 */
3634 ctx->ctx_fl_can_restart = 0;
3635
3636 return 0;
3637 }
3638
3639 /*
3640 * restart another task
3641 */
3642
3643 /*
3644 * When PFM_CTX_MASKED, we cannot issue a restart before the previous
3645 * one is seen by the task.
3646 */
3647 if (state == PFM_CTX_MASKED) {
3648 if (ctx->ctx_fl_can_restart == 0) return -EINVAL;
3649 /*
3650 * will prevent subsequent restart before this one is
3651 * seen by other task
3652 */
3653 ctx->ctx_fl_can_restart = 0;
3654 }
3655
3656 /*
3657	 * if blocking, then post the semaphore when in PFM_CTX_MASKED state, i.e.
3658 * the task is blocked or on its way to block. That's the normal
3659 * restart path. If the monitoring is not masked, then the task
3660 * can be actively monitoring and we cannot directly intervene.
3661 * Therefore we use the trap mechanism to catch the task and
3662 * force it to reset the buffer/reset PMDs.
3663 *
3664 * if non-blocking, then we ensure that the task will go into
3665 * pfm_handle_work() before returning to user mode.
3666 *
3667	 * We cannot explicitly reset another task; it MUST always
3668 * be done by the task itself. This works for system wide because
3669 * the tool that is controlling the session is logically doing
3670 * "self-monitoring".
3671 */
3672 if (CTX_OVFL_NOBLOCK(ctx) == 0 && state == PFM_CTX_MASKED) {
3673 DPRINT(("unblocking [%d] \n", task->pid));
3674 up(&ctx->ctx_restart_sem);
3675 } else {
3676 DPRINT(("[%d] armed exit trap\n", task->pid));
3677
3678 ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_RESET;
3679
3680 PFM_SET_WORK_PENDING(task, 1);
3681
3682 pfm_set_task_notify(task);
3683
3684 /*
3685 * XXX: send reschedule if task runs on another CPU
3686 */
3687 }
3688 return 0;
3689}
3690
3691static int
3692pfm_debug(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
3693{
3694 unsigned int m = *(unsigned int *)arg;
3695
3696 pfm_sysctl.debug = m == 0 ? 0 : 1;
3697
3698 pfm_debug_var = pfm_sysctl.debug;
3699
3700 printk(KERN_INFO "perfmon debugging %s (timing reset)\n", pfm_sysctl.debug ? "on" : "off");
3701
3702 if (m == 0) {
3703 memset(pfm_stats, 0, sizeof(pfm_stats));
3704 for(m=0; m < NR_CPUS; m++) pfm_stats[m].pfm_ovfl_intr_cycles_min = ~0UL;
3705 }
3706 return 0;
3707}
3708
3709/*
3710 * arg can be NULL and count can be zero for this function
3711 */
3712static int
3713pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
3714{
3715 struct thread_struct *thread = NULL;
3716 struct task_struct *task;
3717 pfarg_dbreg_t *req = (pfarg_dbreg_t *)arg;
3718 unsigned long flags;
3719 dbreg_t dbreg;
3720 unsigned int rnum;
3721 int first_time;
3722 int ret = 0, state;
3723 int i, can_access_pmu = 0;
3724 int is_system, is_loaded;
3725
3726 if (pmu_conf->use_rr_dbregs == 0) return -EINVAL;
3727
3728 state = ctx->ctx_state;
3729 is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
3730 is_system = ctx->ctx_fl_system;
3731 task = ctx->ctx_task;
3732
3733 if (state == PFM_CTX_ZOMBIE) return -EINVAL;
3734
3735 /*
3736 * on both UP and SMP, we can only write to the PMC when the task is
3737 * the owner of the local PMU.
3738 */
3739 if (is_loaded) {
3740 thread = &task->thread;
3741 /*
3742 * In system wide and when the context is loaded, access can only happen
3743 * when the caller is running on the CPU being monitored by the session.
3744 * It does not have to be the owner (ctx_task) of the context per se.
3745 */
3746 if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) {
3747 DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
3748 return -EBUSY;
3749 }
3750 can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0;
3751 }
3752
3753 /*
3754	 * we do not need to check for ipsr.db because we clear ibr.x, dbr.r, and dbr.w,
3755 * ensuring that no real breakpoint can be installed via this call.
3756 *
3757 * IMPORTANT: regs can be NULL in this function
3758 */
3759
3760 first_time = ctx->ctx_fl_using_dbreg == 0;
3761
3762 /*
3763 * don't bother if we are loaded and task is being debugged
3764 */
3765 if (is_loaded && (thread->flags & IA64_THREAD_DBG_VALID) != 0) {
3766 DPRINT(("debug registers already in use for [%d]\n", task->pid));
3767 return -EBUSY;
3768 }
3769
3770 /*
3771 * check for debug registers in system wide mode
3772 *
3773	 * Even though a check is done in pfm_context_load(),
3774 * we must repeat it here, in case the registers are
3775 * written after the context is loaded
3776 */
3777 if (is_loaded) {
3778 LOCK_PFS(flags);
3779
3780 if (first_time && is_system) {
3781 if (pfm_sessions.pfs_ptrace_use_dbregs)
3782 ret = -EBUSY;
3783 else
3784 pfm_sessions.pfs_sys_use_dbregs++;
3785 }
3786 UNLOCK_PFS(flags);
3787 }
3788
3789 if (ret != 0) return ret;
3790
3791 /*
3792 * mark ourself as user of the debug registers for
3793 * perfmon purposes.
3794 */
3795 ctx->ctx_fl_using_dbreg = 1;
3796
3797 /*
3798 * clear hardware registers to make sure we don't
3799 * pick up stale state.
3800 *
3801 * for a system wide session, we do not use
3802 * thread.dbr, thread.ibr because this process
3803 * never leaves the current CPU and the state
3804 * is shared by all processes running on it
3805 */
3806 if (first_time && can_access_pmu) {
3807 DPRINT(("[%d] clearing ibrs, dbrs\n", task->pid));
3808 for (i=0; i < pmu_conf->num_ibrs; i++) {
3809 ia64_set_ibr(i, 0UL);
3810 ia64_dv_serialize_instruction();
3811 }
3812 ia64_srlz_i();
3813 for (i=0; i < pmu_conf->num_dbrs; i++) {
3814 ia64_set_dbr(i, 0UL);
3815 ia64_dv_serialize_data();
3816 }
3817 ia64_srlz_d();
3818 }
3819
3820 /*
3821 * Now install the values into the registers
3822 */
3823 for (i = 0; i < count; i++, req++) {
3824
3825 rnum = req->dbreg_num;
3826 dbreg.val = req->dbreg_value;
3827
3828 ret = -EINVAL;
3829
3830 if ((mode == PFM_CODE_RR && rnum >= PFM_NUM_IBRS) || ((mode == PFM_DATA_RR) && rnum >= PFM_NUM_DBRS)) {
3831 DPRINT(("invalid register %u val=0x%lx mode=%d i=%d count=%d\n",
3832 rnum, dbreg.val, mode, i, count));
3833
3834 goto abort_mission;
3835 }
3836
3837 /*
3838 * make sure we do not install enabled breakpoint
3839 */
3840 if (rnum & 0x1) {
3841 if (mode == PFM_CODE_RR)
3842 dbreg.ibr.ibr_x = 0;
3843 else
3844 dbreg.dbr.dbr_r = dbreg.dbr.dbr_w = 0;
3845 }
3846
3847 PFM_REG_RETFLAG_SET(req->dbreg_flags, 0);
3848
3849 /*
3850 * Debug registers, just like PMC, can only be modified
3851 * by a kernel call. Moreover, perfmon() access to those
3852		 * registers is centralized in this routine. The hardware
3853 * does not modify the value of these registers, therefore,
3854 * if we save them as they are written, we can avoid having
3855 * to save them on context switch out. This is made possible
3856 * by the fact that when perfmon uses debug registers, ptrace()
3857 * won't be able to modify them concurrently.
3858 */
3859 if (mode == PFM_CODE_RR) {
3860 CTX_USED_IBR(ctx, rnum);
3861
3862 if (can_access_pmu) {
3863 ia64_set_ibr(rnum, dbreg.val);
3864 ia64_dv_serialize_instruction();
3865 }
3866
3867 ctx->ctx_ibrs[rnum] = dbreg.val;
3868
3869 DPRINT(("write ibr%u=0x%lx used_ibrs=0x%x ld=%d apmu=%d\n",
3870 rnum, dbreg.val, ctx->ctx_used_ibrs[0], is_loaded, can_access_pmu));
3871 } else {
3872 CTX_USED_DBR(ctx, rnum);
3873
3874 if (can_access_pmu) {
3875 ia64_set_dbr(rnum, dbreg.val);
3876 ia64_dv_serialize_data();
3877 }
3878 ctx->ctx_dbrs[rnum] = dbreg.val;
3879
3880 DPRINT(("write dbr%u=0x%lx used_dbrs=0x%x ld=%d apmu=%d\n",
3881 rnum, dbreg.val, ctx->ctx_used_dbrs[0], is_loaded, can_access_pmu));
3882 }
3883 }
3884
3885 return 0;
3886
3887abort_mission:
3888 /*
3889 * in case it was our first attempt, we undo the global modifications
3890 */
3891 if (first_time) {
3892 LOCK_PFS(flags);
3893 if (ctx->ctx_fl_system) {
3894 pfm_sessions.pfs_sys_use_dbregs--;
3895 }
3896 UNLOCK_PFS(flags);
3897 ctx->ctx_fl_using_dbreg = 0;
3898 }
3899 /*
3900 * install error return flag
3901 */
3902 PFM_REG_RETFLAG_SET(req->dbreg_flags, PFM_REG_RETFL_EINVAL);
3903
3904 return ret;
3905}
3906
3907static int
3908pfm_write_ibrs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
3909{
3910 return pfm_write_ibr_dbr(PFM_CODE_RR, ctx, arg, count, regs);
3911}
3912
3913static int
3914pfm_write_dbrs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
3915{
3916 return pfm_write_ibr_dbr(PFM_DATA_RR, ctx, arg, count, regs);
3917}
3918
3919int
3920pfm_mod_write_ibrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs)
3921{
3922 pfm_context_t *ctx;
3923
3924 if (req == NULL) return -EINVAL;
3925
3926 ctx = GET_PMU_CTX();
3927
3928 if (ctx == NULL) return -EINVAL;
3929
3930 /*
3931 * for now limit to current task, which is enough when calling
3932 * from overflow handler
3933 */
3934 if (task != current && ctx->ctx_fl_system == 0) return -EBUSY;
3935
3936 return pfm_write_ibrs(ctx, req, nreq, regs);
3937}
3938EXPORT_SYMBOL(pfm_mod_write_ibrs);
3939
3940int
3941pfm_mod_write_dbrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs)
3942{
3943 pfm_context_t *ctx;
3944
3945 if (req == NULL) return -EINVAL;
3946
3947 ctx = GET_PMU_CTX();
3948
3949 if (ctx == NULL) return -EINVAL;
3950
3951 /*
3952 * for now limit to current task, which is enough when calling
3953 * from overflow handler
3954 */
3955 if (task != current && ctx->ctx_fl_system == 0) return -EBUSY;
3956
3957 return pfm_write_dbrs(ctx, req, nreq, regs);
3958}
3959EXPORT_SYMBOL(pfm_mod_write_dbrs);
3960
3961
3962static int
3963pfm_get_features(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
3964{
3965 pfarg_features_t *req = (pfarg_features_t *)arg;
3966
3967 req->ft_version = PFM_VERSION;
3968 return 0;
3969}
3970
3971static int
3972pfm_stop(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
3973{
3974 struct pt_regs *tregs;
3975 struct task_struct *task = PFM_CTX_TASK(ctx);
3976 int state, is_system;
3977
3978 state = ctx->ctx_state;
3979 is_system = ctx->ctx_fl_system;
3980
3981 /*
3982	 * context must be attached to issue the stop command (includes LOADED, MASKED, ZOMBIE)
3983 */
3984 if (state == PFM_CTX_UNLOADED) return -EINVAL;
3985
3986 /*
3987 * In system wide and when the context is loaded, access can only happen
3988 * when the caller is running on the CPU being monitored by the session.
3989 * It does not have to be the owner (ctx_task) of the context per se.
3990 */
3991 if (is_system && ctx->ctx_cpu != smp_processor_id()) {
3992 DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
3993 return -EBUSY;
3994 }
3995 DPRINT(("task [%d] ctx_state=%d is_system=%d\n",
3996 PFM_CTX_TASK(ctx)->pid,
3997 state,
3998 is_system));
3999 /*
4000 * in system mode, we need to update the PMU directly
4001 * and the user level state of the caller, which may not
4002 * necessarily be the creator of the context.
4003 */
4004 if (is_system) {
4005 /*
4006 * Update local PMU first
4007 *
4008 * disable dcr pp
4009 */
4010 ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) & ~IA64_DCR_PP);
4011 ia64_srlz_i();
4012
4013 /*
4014 * update local cpuinfo
4015 */
4016 PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP);
4017
4018 /*
4019 * stop monitoring, does srlz.i
4020 */
4021 pfm_clear_psr_pp();
4022
4023 /*
4024 * stop monitoring in the caller
4025 */
4026 ia64_psr(regs)->pp = 0;
4027
4028 return 0;
4029 }
4030 /*
4031 * per-task mode
4032 */
4033
4034 if (task == current) {
4035 /* stop monitoring at kernel level */
4036 pfm_clear_psr_up();
4037
4038 /*
4039 * stop monitoring at the user level
4040 */
4041 ia64_psr(regs)->up = 0;
4042 } else {
4043 tregs = ia64_task_regs(task);
4044
4045 /*
4046 * stop monitoring at the user level
4047 */
4048 ia64_psr(tregs)->up = 0;
4049
4050 /*
4051 * monitoring disabled in kernel at next reschedule
4052 */
4053 ctx->ctx_saved_psr_up = 0;
4054 DPRINT(("task=[%d]\n", task->pid));
4055 }
4056 return 0;
4057}
4058
4059
4060static int
4061pfm_start(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
4062{
4063 struct pt_regs *tregs;
4064 int state, is_system;
4065
4066 state = ctx->ctx_state;
4067 is_system = ctx->ctx_fl_system;
4068
4069 if (state != PFM_CTX_LOADED) return -EINVAL;
4070
4071 /*
4072 * In system wide and when the context is loaded, access can only happen
4073 * when the caller is running on the CPU being monitored by the session.
4074 * It does not have to be the owner (ctx_task) of the context per se.
4075 */
4076 if (is_system && ctx->ctx_cpu != smp_processor_id()) {
4077 DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu));
4078 return -EBUSY;
4079 }
4080
4081 /*
4082 * in system mode, we need to update the PMU directly
4083 * and the user level state of the caller, which may not
4084 * necessarily be the creator of the context.
4085 */
4086 if (is_system) {
4087
4088 /*
4089 * set user level psr.pp for the caller
4090 */
4091 ia64_psr(regs)->pp = 1;
4092
4093 /*
4094 * now update the local PMU and cpuinfo
4095 */
4096 PFM_CPUINFO_SET(PFM_CPUINFO_DCR_PP);
4097
4098 /*
4099 * start monitoring at kernel level
4100 */
4101 pfm_set_psr_pp();
4102
4103 /* enable dcr pp */
4104 ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) | IA64_DCR_PP);
4105 ia64_srlz_i();
4106
4107 return 0;
4108 }
4109
4110 /*
4111 * per-process mode
4112 */
4113
4114 if (ctx->ctx_task == current) {
4115
4116 /* start monitoring at kernel level */
4117 pfm_set_psr_up();
4118
4119 /*
4120 * activate monitoring at user level
4121 */
4122 ia64_psr(regs)->up = 1;
4123
4124 } else {
4125 tregs = ia64_task_regs(ctx->ctx_task);
4126
4127 /*
4128 * start monitoring at the kernel level the next
4129 * time the task is scheduled
4130 */
4131 ctx->ctx_saved_psr_up = IA64_PSR_UP;
4132
4133 /*
4134 * activate monitoring at user level
4135 */
4136 ia64_psr(tregs)->up = 1;
4137 }
4138 return 0;
4139}
4140
4141static int
4142pfm_get_pmc_reset(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
4143{
4144 pfarg_reg_t *req = (pfarg_reg_t *)arg;
4145 unsigned int cnum;
4146 int i;
4147 int ret = -EINVAL;
4148
4149 for (i = 0; i < count; i++, req++) {
4150
4151 cnum = req->reg_num;
4152
4153 if (!PMC_IS_IMPL(cnum)) goto abort_mission;
4154
4155 req->reg_value = PMC_DFL_VAL(cnum);
4156
4157 PFM_REG_RETFLAG_SET(req->reg_flags, 0);
4158
4159 DPRINT(("pmc_reset_val pmc[%u]=0x%lx\n", cnum, req->reg_value));
4160 }
4161 return 0;
4162
4163abort_mission:
4164 PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL);
4165 return ret;
4166}
4167
4168static int
4169pfm_check_task_exist(pfm_context_t *ctx)
4170{
4171 struct task_struct *g, *t;
4172 int ret = -ESRCH;
4173
4174 read_lock(&tasklist_lock);
4175
4176 do_each_thread (g, t) {
4177 if (t->thread.pfm_context == ctx) {
4178 ret = 0;
4179 break;
4180 }
4181 } while_each_thread (g, t);
4182
4183 read_unlock(&tasklist_lock);
4184
4185 DPRINT(("pfm_check_task_exist: ret=%d ctx=%p\n", ret, ctx));
4186
4187 return ret;
4188}
4189
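/*
 * Attach (load) a context onto a task or, for system-wide sessions, onto the
 * current CPU.  The main steps below: validate the target and the debug
 * register constraints, reserve the session, atomically link the context to
 * the task's thread, propagate the soft PMC/PMD state into the thread, and,
 * if the target is the current task, reload the live PMU registers and take
 * PMU ownership.
 */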
4190static int
4191pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
4192{
4193 struct task_struct *task;
4194 struct thread_struct *thread;
4195 struct pfm_context_t *old;
4196 unsigned long flags;
4197#ifndef CONFIG_SMP
4198 struct task_struct *owner_task = NULL;
4199#endif
4200 pfarg_load_t *req = (pfarg_load_t *)arg;
4201 unsigned long *pmcs_source, *pmds_source;
4202 int the_cpu;
4203 int ret = 0;
4204 int state, is_system, set_dbregs = 0;
4205
4206 state = ctx->ctx_state;
4207 is_system = ctx->ctx_fl_system;
4208 /*
4209 * can only load from unloaded or terminated state
4210 */
4211 if (state != PFM_CTX_UNLOADED) {
4212 DPRINT(("cannot load to [%d], invalid ctx_state=%d\n",
4213 req->load_pid,
4214 ctx->ctx_state));
4215 return -EINVAL;
4216 }
4217
4218 DPRINT(("load_pid [%d] using_dbreg=%d\n", req->load_pid, ctx->ctx_fl_using_dbreg));
4219
4220 if (CTX_OVFL_NOBLOCK(ctx) == 0 && req->load_pid == current->pid) {
4221 DPRINT(("cannot use blocking mode on self\n"));
4222 return -EINVAL;
4223 }
4224
4225 ret = pfm_get_task(ctx, req->load_pid, &task);
4226 if (ret) {
4227 DPRINT(("load_pid [%d] get_task=%d\n", req->load_pid, ret));
4228 return ret;
4229 }
4230
4231 ret = -EINVAL;
4232
4233 /*
4234 * system wide is self monitoring only
4235 */
4236 if (is_system && task != current) {
4237 DPRINT(("system wide is self monitoring only load_pid=%d\n",
4238 req->load_pid));
4239 goto error;
4240 }
4241
4242 thread = &task->thread;
4243
4244 ret = 0;
4245 /*
4246 * cannot load a context which is using range restrictions,
4247 * into a task that is being debugged.
4248 */
4249 if (ctx->ctx_fl_using_dbreg) {
4250 if (thread->flags & IA64_THREAD_DBG_VALID) {
4251 ret = -EBUSY;
4252 DPRINT(("load_pid [%d] task is debugged, cannot load range restrictions\n", req->load_pid));
4253 goto error;
4254 }
4255 LOCK_PFS(flags);
4256
4257 if (is_system) {
4258 if (pfm_sessions.pfs_ptrace_use_dbregs) {
4259 DPRINT(("cannot load [%d] dbregs in use\n", task->pid));
4260 ret = -EBUSY;
4261 } else {
4262 pfm_sessions.pfs_sys_use_dbregs++;
4263 DPRINT(("load [%d] increased sys_use_dbreg=%u\n", task->pid, pfm_sessions.pfs_sys_use_dbregs));
4264 set_dbregs = 1;
4265 }
4266 }
4267
4268 UNLOCK_PFS(flags);
4269
4270 if (ret) goto error;
4271 }
4272
4273 /*
4274 * SMP system-wide monitoring implies self-monitoring.
4275 *
4276 * The programming model expects the task to
4277 * be pinned on a CPU throughout the session.
4278 * Here we take note of the current CPU at the
4279 * time the context is loaded. No call from
4280 * another CPU will be allowed.
4281 *
4282	 * The pinning via sched_setaffinity()
4283 * must be done by the calling task prior
4284 * to this call.
4285 *
4286 * systemwide: keep track of CPU this session is supposed to run on
4287 */
4288 the_cpu = ctx->ctx_cpu = smp_processor_id();
4289
4290 ret = -EBUSY;
4291 /*
4292 * now reserve the session
4293 */
4294 ret = pfm_reserve_session(current, is_system, the_cpu);
4295 if (ret) goto error;
4296
4297 /*
4298 * task is necessarily stopped at this point.
4299 *
4300 * If the previous context was zombie, then it got removed in
4301 * pfm_save_regs(). Therefore we should not see it here.
4302 * If we see a context, then this is an active context
4303 *
4304 * XXX: needs to be atomic
4305 */
4306 DPRINT(("before cmpxchg() old_ctx=%p new_ctx=%p\n",
4307 thread->pfm_context, ctx));
4308
4309 old = ia64_cmpxchg(acq, &thread->pfm_context, NULL, ctx, sizeof(pfm_context_t *));
4310 if (old != NULL) {
4311 DPRINT(("load_pid [%d] already has a context\n", req->load_pid));
4312 goto error_unres;
4313 }
4314
4315 pfm_reset_msgq(ctx);
4316
4317 ctx->ctx_state = PFM_CTX_LOADED;
4318
4319 /*
4320 * link context to task
4321 */
4322 ctx->ctx_task = task;
4323
4324 if (is_system) {
4325 /*
4326 * we load as stopped
4327 */
4328 PFM_CPUINFO_SET(PFM_CPUINFO_SYST_WIDE);
4329 PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP);
4330
4331 if (ctx->ctx_fl_excl_idle) PFM_CPUINFO_SET(PFM_CPUINFO_EXCL_IDLE);
4332 } else {
4333 thread->flags |= IA64_THREAD_PM_VALID;
4334 }
4335
4336 /*
4337 * propagate into thread-state
4338 */
4339 pfm_copy_pmds(task, ctx);
4340 pfm_copy_pmcs(task, ctx);
4341
4342 pmcs_source = thread->pmcs;
4343 pmds_source = thread->pmds;
4344
4345 /*
4346 * always the case for system-wide
4347 */
4348 if (task == current) {
4349
4350 if (is_system == 0) {
4351
4352 /* allow user level control */
4353 ia64_psr(regs)->sp = 0;
4354 DPRINT(("clearing psr.sp for [%d]\n", task->pid));
4355
4356 SET_LAST_CPU(ctx, smp_processor_id());
4357 INC_ACTIVATION();
4358 SET_ACTIVATION(ctx);
4359#ifndef CONFIG_SMP
4360 /*
4361 * push the other task out, if any
4362 */
4363 owner_task = GET_PMU_OWNER();
4364 if (owner_task) pfm_lazy_save_regs(owner_task);
4365#endif
4366 }
4367 /*
4368 * load all PMD from ctx to PMU (as opposed to thread state)
4369 * restore all PMC from ctx to PMU
4370 */
4371 pfm_restore_pmds(pmds_source, ctx->ctx_all_pmds[0]);
4372 pfm_restore_pmcs(pmcs_source, ctx->ctx_all_pmcs[0]);
4373
4374 ctx->ctx_reload_pmcs[0] = 0UL;
4375 ctx->ctx_reload_pmds[0] = 0UL;
4376
4377 /*
4378 * guaranteed safe by earlier check against DBG_VALID
4379 */
4380 if (ctx->ctx_fl_using_dbreg) {
4381 pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
4382 pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
4383 }
4384 /*
4385 * set new ownership
4386 */
4387 SET_PMU_OWNER(task, ctx);
4388
4389 DPRINT(("context loaded on PMU for [%d]\n", task->pid));
4390 } else {
4391 /*
4392 * when not current, task MUST be stopped, so this is safe
4393 */
4394 regs = ia64_task_regs(task);
4395
4396 /* force a full reload */
4397 ctx->ctx_last_activation = PFM_INVALID_ACTIVATION;
4398 SET_LAST_CPU(ctx, -1);
4399
4400 /* initial saved psr (stopped) */
4401 ctx->ctx_saved_psr_up = 0UL;
4402 ia64_psr(regs)->up = ia64_psr(regs)->pp = 0;
4403 }
4404
4405 ret = 0;
4406
4407error_unres:
4408 if (ret) pfm_unreserve_session(ctx, ctx->ctx_fl_system, the_cpu);
4409error:
4410 /*
4411 * we must undo the dbregs setting (for system-wide)
4412 */
4413 if (ret && set_dbregs) {
4414 LOCK_PFS(flags);
4415 pfm_sessions.pfs_sys_use_dbregs--;
4416 UNLOCK_PFS(flags);
4417 }
4418 /*
4419 * release task, there is now a link with the context
4420 */
4421 if (is_system == 0 && task != current) {
4422 pfm_put_task(task);
4423
4424 if (ret == 0) {
4425 ret = pfm_check_task_exist(ctx);
4426 if (ret) {
4427 ctx->ctx_state = PFM_CTX_UNLOADED;
4428 ctx->ctx_task = NULL;
4429 }
4430 }
4431 }
4432 return ret;
4433}
4434
4435/*
4436 * in this function, we do not need to increase the use count
4437 * for the task via get_task_struct(), because we hold the
4438 * context lock. If the task were to disappear while having
4439 * a context attached, it would go through pfm_exit_thread()
4440 * which also grabs the context lock and would therefore be blocked
4441 * until we are done here.
4442 */
4443static void pfm_flush_pmds(struct task_struct *, pfm_context_t *ctx);
4444
4445static int
4446pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
4447{
4448 struct task_struct *task = PFM_CTX_TASK(ctx);
4449 struct pt_regs *tregs;
4450 int prev_state, is_system;
4451 int ret;
4452
4453 DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task->pid : -1));
4454
4455 prev_state = ctx->ctx_state;
4456 is_system = ctx->ctx_fl_system;
4457
4458 /*
4459 * unload only when necessary
4460 */
4461 if (prev_state == PFM_CTX_UNLOADED) {
4462 DPRINT(("ctx_state=%d, nothing to do\n", prev_state));
4463 return 0;
4464 }
4465
4466 /*
4467 * clear psr and dcr bits
4468 */
4469 ret = pfm_stop(ctx, NULL, 0, regs);
4470 if (ret) return ret;
4471
4472 ctx->ctx_state = PFM_CTX_UNLOADED;
4473
4474 /*
4475 * in system mode, we need to update the PMU directly
4476 * and the user level state of the caller, which may not
4477 * necessarily be the creator of the context.
4478 */
4479 if (is_system) {
4480
4481 /*
4482 * Update cpuinfo
4483 *
4484 * local PMU is taken care of in pfm_stop()
4485 */
4486 PFM_CPUINFO_CLEAR(PFM_CPUINFO_SYST_WIDE);
4487 PFM_CPUINFO_CLEAR(PFM_CPUINFO_EXCL_IDLE);
4488
4489 /*
4490 * save PMDs in context
4491 * release ownership
4492 */
4493 pfm_flush_pmds(current, ctx);
4494
4495 /*
4496 * at this point we are done with the PMU
4497 * so we can unreserve the resource.
4498 */
4499 if (prev_state != PFM_CTX_ZOMBIE)
4500 pfm_unreserve_session(ctx, 1 , ctx->ctx_cpu);
4501
4502 /*
4503 * disconnect context from task
4504 */
4505 task->thread.pfm_context = NULL;
4506 /*
4507 * disconnect task from context
4508 */
4509 ctx->ctx_task = NULL;
4510
4511 /*
4512 * There is nothing more to cleanup here.
4513 */
4514 return 0;
4515 }
4516
4517 /*
4518 * per-task mode
4519 */
4520 tregs = task == current ? regs : ia64_task_regs(task);
4521
4522 if (task == current) {
4523 /*
4524 * cancel user level control
4525 */
4526 ia64_psr(regs)->sp = 1;
4527
4528 DPRINT(("setting psr.sp for [%d]\n", task->pid));
4529 }
4530 /*
4531 * save PMDs to context
4532 * release ownership
4533 */
4534 pfm_flush_pmds(task, ctx);
4535
4536 /*
4537 * at this point we are done with the PMU
4538 * so we can unreserve the resource.
4539 *
4540 * when state was ZOMBIE, we have already unreserved.
4541 */
4542 if (prev_state != PFM_CTX_ZOMBIE)
4543 pfm_unreserve_session(ctx, 0 , ctx->ctx_cpu);
4544
4545 /*
4546 * reset activation counter and psr
4547 */
4548 ctx->ctx_last_activation = PFM_INVALID_ACTIVATION;
4549 SET_LAST_CPU(ctx, -1);
4550
4551 /*
4552 * PMU state will not be restored
4553 */
4554 task->thread.flags &= ~IA64_THREAD_PM_VALID;
4555
4556 /*
4557 * break links between context and task
4558 */
4559 task->thread.pfm_context = NULL;
4560 ctx->ctx_task = NULL;
4561
4562 PFM_SET_WORK_PENDING(task, 0);
4563
4564 ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
4565 ctx->ctx_fl_can_restart = 0;
4566 ctx->ctx_fl_going_zombie = 0;
4567
4568 DPRINT(("disconnected [%d] from context\n", task->pid));
4569
4570 return 0;
4571}
4572
4573
4574/*
4575 * called only from exit_thread(): task == current
4576 * we come here only if current has a context attached (loaded or masked)
4577 */
4578void
4579pfm_exit_thread(struct task_struct *task)
4580{
4581 pfm_context_t *ctx;
4582 unsigned long flags;
4583 struct pt_regs *regs = ia64_task_regs(task);
4584 int ret, state;
4585 int free_ok = 0;
4586
4587 ctx = PFM_GET_CTX(task);
4588
4589 PROTECT_CTX(ctx, flags);
4590
4591 DPRINT(("state=%d task [%d]\n", ctx->ctx_state, task->pid));
4592
4593 state = ctx->ctx_state;
4594 switch(state) {
4595 case PFM_CTX_UNLOADED:
4596 /*
4597		 * only comes to this function if pfm_context is not NULL, i.e., cannot
4598 * be in unloaded state
4599 */
4600 printk(KERN_ERR "perfmon: pfm_exit_thread [%d] ctx unloaded\n", task->pid);
4601 break;
4602 case PFM_CTX_LOADED:
4603 case PFM_CTX_MASKED:
4604 ret = pfm_context_unload(ctx, NULL, 0, regs);
4605 if (ret) {
4606 printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret);
4607 }
4608 DPRINT(("ctx unloaded for current state was %d\n", state));
4609
4610 pfm_end_notify_user(ctx);
4611 break;
4612 case PFM_CTX_ZOMBIE:
4613 ret = pfm_context_unload(ctx, NULL, 0, regs);
4614 if (ret) {
4615 printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret);
4616 }
4617 free_ok = 1;
4618 break;
4619 default:
4620 printk(KERN_ERR "perfmon: pfm_exit_thread [%d] unexpected state=%d\n", task->pid, state);
4621 break;
4622 }
4623 UNPROTECT_CTX(ctx, flags);
4624
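	/*
	 * sanity checks: by now monitoring must be fully stopped for this
	 * task: psr.up/psr.pp cleared both in the live psr and in the exit
	 * pt_regs, and nobody may still own the PMU on this CPU.
	 */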
4625 { u64 psr = pfm_get_psr();
4626 BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP));
4627 BUG_ON(GET_PMU_OWNER());
4628 BUG_ON(ia64_psr(regs)->up);
4629 BUG_ON(ia64_psr(regs)->pp);
4630 }
4631
4632 /*
4633 * All memory free operations (especially for vmalloc'ed memory)
4634 * MUST be done with interrupts ENABLED.
4635 */
4636 if (free_ok) pfm_context_free(ctx);
4637}
4638
4639/*
4640 * functions MUST be listed in the increasing order of their index (see perfmon.h)
4641 */
4642#define PFM_CMD(name, flags, arg_count, arg_type, getsz) { name, #name, flags, arg_count, sizeof(arg_type), getsz }
4643#define PFM_CMD_S(name, flags) { name, #name, flags, 0, 0, NULL }
4644#define PFM_CMD_PCLRWS (PFM_CMD_FD|PFM_CMD_ARG_RW|PFM_CMD_STOP)
4645#define PFM_CMD_PCLRW (PFM_CMD_FD|PFM_CMD_ARG_RW)
4646#define PFM_CMD_NONE { NULL, "no-cmd", 0, 0, 0, NULL}
4647
4648static pfm_cmd_desc_t pfm_cmd_tab[]={
4649/* 0 */PFM_CMD_NONE,
4650/* 1 */PFM_CMD(pfm_write_pmcs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL),
4651/* 2 */PFM_CMD(pfm_write_pmds, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL),
4652/* 3 */PFM_CMD(pfm_read_pmds, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL),
4653/* 4 */PFM_CMD_S(pfm_stop, PFM_CMD_PCLRWS),
4654/* 5 */PFM_CMD_S(pfm_start, PFM_CMD_PCLRWS),
4655/* 6 */PFM_CMD_NONE,
4656/* 7 */PFM_CMD_NONE,
4657/* 8 */PFM_CMD(pfm_context_create, PFM_CMD_ARG_RW, 1, pfarg_context_t, pfm_ctx_getsize),
4658/* 9 */PFM_CMD_NONE,
4659/* 10 */PFM_CMD_S(pfm_restart, PFM_CMD_PCLRW),
4660/* 11 */PFM_CMD_NONE,
4661/* 12 */PFM_CMD(pfm_get_features, PFM_CMD_ARG_RW, 1, pfarg_features_t, NULL),
4662/* 13 */PFM_CMD(pfm_debug, 0, 1, unsigned int, NULL),
4663/* 14 */PFM_CMD_NONE,
4664/* 15 */PFM_CMD(pfm_get_pmc_reset, PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL),
4665/* 16 */PFM_CMD(pfm_context_load, PFM_CMD_PCLRWS, 1, pfarg_load_t, NULL),
4666/* 17 */PFM_CMD_S(pfm_context_unload, PFM_CMD_PCLRWS),
4667/* 18 */PFM_CMD_NONE,
4668/* 19 */PFM_CMD_NONE,
4669/* 20 */PFM_CMD_NONE,
4670/* 21 */PFM_CMD_NONE,
4671/* 22 */PFM_CMD_NONE,
4672/* 23 */PFM_CMD_NONE,
4673/* 24 */PFM_CMD_NONE,
4674/* 25 */PFM_CMD_NONE,
4675/* 26 */PFM_CMD_NONE,
4676/* 27 */PFM_CMD_NONE,
4677/* 28 */PFM_CMD_NONE,
4678/* 29 */PFM_CMD_NONE,
4679/* 30 */PFM_CMD_NONE,
4680/* 31 */PFM_CMD_NONE,
4681/* 32 */PFM_CMD(pfm_write_ibrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_dbreg_t, NULL),
4682/* 33 */PFM_CMD(pfm_write_dbrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_dbreg_t, NULL)
4683};
4684#define PFM_CMD_COUNT (sizeof(pfm_cmd_tab)/sizeof(pfm_cmd_desc_t))
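/*
 * Command dispatch notes: the perfmonctl() cmd value indexes directly into
 * pfm_cmd_tab[].  PFM_CMD_FD means the command operates on a perfmon file
 * descriptor (sys_perfmonctl() looks the context up via fget()),
 * PFM_CMD_ARG_RW means the argument buffer is copied back to user space on
 * return, and PFM_CMD_STOP means the monitored task must be stopped before
 * the command may run (enforced in pfm_check_task_state() below, except for
 * self-monitoring and system-wide sessions).
 */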
4685
4686static int
4687pfm_check_task_state(pfm_context_t *ctx, int cmd, unsigned long flags)
4688{
4689 struct task_struct *task;
4690 int state, old_state;
4691
4692recheck:
4693 state = ctx->ctx_state;
4694 task = ctx->ctx_task;
4695
4696 if (task == NULL) {
4697 DPRINT(("context %d no task, state=%d\n", ctx->ctx_fd, state));
4698 return 0;
4699 }
4700
4701 DPRINT(("context %d state=%d [%d] task_state=%ld must_stop=%d\n",
4702 ctx->ctx_fd,
4703 state,
4704 task->pid,
4705 task->state, PFM_CMD_STOPPED(cmd)));
4706
4707 /*
4708 * self-monitoring always ok.
4709 *
4710 * for system-wide the caller can either be the creator of the
4711	 * context (the one to which the context is attached) OR
4712 * a task running on the same CPU as the session.
4713 */
4714 if (task == current || ctx->ctx_fl_system) return 0;
4715
4716 /*
4717 * if context is UNLOADED we are safe to go
4718 */
4719 if (state == PFM_CTX_UNLOADED) return 0;
4720
4721 /*
4722 * no command can operate on a zombie context
4723 */
4724 if (state == PFM_CTX_ZOMBIE) {
4725 DPRINT(("cmd %d state zombie cannot operate on context\n", cmd));
4726 return -EINVAL;
4727 }
4728
4729 /*
4730 * context is LOADED or MASKED. Some commands may need to have
4731 * the task stopped.
4732 *
4733 * We could lift this restriction for UP but it would mean that
4734 * the user has no guarantee the task would not run between
4735 * two successive calls to perfmonctl(). That's probably OK.
4736 * If this user wants to ensure the task does not run, then
4737 * the task must be stopped.
4738 */
4739 if (PFM_CMD_STOPPED(cmd)) {
4740 if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) {
4741 DPRINT(("[%d] task not in stopped state\n", task->pid));
4742 return -EBUSY;
4743 }
4744 /*
4745 * task is now stopped, wait for ctxsw out
4746 *
4747 * This is an interesting point in the code.
4748 * We need to unprotect the context because
4749		 * the pfm_save_regs() routine needs to grab
4750		 * the same lock. There is danger in doing
4751 * this because it leaves a window open for
4752 * another task to get access to the context
4753 * and possibly change its state. The one thing
4754 * that is not possible is for the context to disappear
4755 * because we are protected by the VFS layer, i.e.,
4756 * get_fd()/put_fd().
4757 */
4758 old_state = state;
4759
4760 UNPROTECT_CTX(ctx, flags);
4761
4762 wait_task_inactive(task);
4763
4764 PROTECT_CTX(ctx, flags);
4765
4766 /*
4767 * we must recheck to verify if state has changed
4768 */
4769 if (ctx->ctx_state != old_state) {
4770 DPRINT(("old_state=%d new_state=%d\n", old_state, ctx->ctx_state));
4771 goto recheck;
4772 }
4773 }
4774 return 0;
4775}
4776
4777/*
4778 * system-call entry point (must return long)
4779 */
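/*
 * Illustrative user-level call sequence (sketch only, assuming the PFM_*
 * command names and pfarg_* layouts from perfmon.h of this era; a real tool
 * would normally go through libpfm):
 *
 *	pfarg_context_t c = { };
 *	perfmonctl(0, PFM_CREATE_CONTEXT, &c, 1);	// fd arg ignored here,
 *							// c.ctx_fd set on return
 *	pfarg_reg_t pc = { .reg_num = 4, .reg_value = ... };
 *	perfmonctl(c.ctx_fd, PFM_WRITE_PMCS, &pc, 1);
 *
 *	pfarg_load_t ld = { .load_pid = getpid() };
 *	perfmonctl(c.ctx_fd, PFM_LOAD_CONTEXT, &ld, 1);
 *	perfmonctl(c.ctx_fd, PFM_START, NULL, 0);
 *	...
 *	perfmonctl(c.ctx_fd, PFM_STOP, NULL, 0);
 */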
4780asmlinkage long
4781sys_perfmonctl (int fd, int cmd, void __user *arg, int count)
4782{
4783 struct file *file = NULL;
4784 pfm_context_t *ctx = NULL;
4785 unsigned long flags = 0UL;
4786 void *args_k = NULL;
4787 long ret; /* will expand int return types */
4788 size_t base_sz, sz, xtra_sz = 0;
4789 int narg, completed_args = 0, call_made = 0, cmd_flags;
4790 int (*func)(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);
4791 int (*getsize)(void *arg, size_t *sz);
4792#define PFM_MAX_ARGSIZE 4096
4793
4794 /*
4795 * reject any call if perfmon was disabled at initialization
4796 */
4797 if (unlikely(pmu_conf == NULL)) return -ENOSYS;
4798
4799 if (unlikely(cmd < 0 || cmd >= PFM_CMD_COUNT)) {
4800 DPRINT(("invalid cmd=%d\n", cmd));
4801 return -EINVAL;
4802 }
4803
4804 func = pfm_cmd_tab[cmd].cmd_func;
4805 narg = pfm_cmd_tab[cmd].cmd_narg;
4806 base_sz = pfm_cmd_tab[cmd].cmd_argsize;
4807 getsize = pfm_cmd_tab[cmd].cmd_getsize;
4808 cmd_flags = pfm_cmd_tab[cmd].cmd_flags;
4809
4810 if (unlikely(func == NULL)) {
4811 DPRINT(("invalid cmd=%d\n", cmd));
4812 return -EINVAL;
4813 }
4814
4815 DPRINT(("cmd=%s idx=%d narg=0x%x argsz=%lu count=%d\n",
4816 PFM_CMD_NAME(cmd),
4817 cmd,
4818 narg,
4819 base_sz,
4820 count));
4821
4822 /*
4823 * check if number of arguments matches what the command expects
4824 */
4825 if (unlikely((narg == PFM_CMD_ARG_MANY && count <= 0) || (narg > 0 && narg != count)))
4826 return -EINVAL;
4827
4828restart_args:
4829 sz = xtra_sz + base_sz*count;
4830 /*
4831 * limit abuse to min page size
4832 */
4833 if (unlikely(sz > PFM_MAX_ARGSIZE)) {
4834 printk(KERN_ERR "perfmon: [%d] argument too big %lu\n", current->pid, sz);
4835 return -E2BIG;
4836 }
4837
4838 /*
4839 * allocate default-sized argument buffer
4840 */
4841 if (likely(count && args_k == NULL)) {
4842 args_k = kmalloc(PFM_MAX_ARGSIZE, GFP_KERNEL);
4843 if (args_k == NULL) return -ENOMEM;
4844 }
4845
4846 ret = -EFAULT;
4847
4848 /*
4849 * copy arguments
4850 *
4851 * assume sz = 0 for command without parameters
4852 */
4853 if (sz && copy_from_user(args_k, arg, sz)) {
4854 DPRINT(("cannot copy_from_user %lu bytes @%p\n", sz, arg));
4855 goto error_args;
4856 }
4857
4858 /*
4859 * check if command supports extra parameters
4860 */
4861 if (completed_args == 0 && getsize) {
4862 /*
4863 * get extra parameters size (based on main argument)
4864 */
4865 ret = (*getsize)(args_k, &xtra_sz);
4866 if (ret) goto error_args;
4867
4868 completed_args = 1;
4869
4870 DPRINT(("restart_args sz=%lu xtra_sz=%lu\n", sz, xtra_sz));
4871
4872 /* retry if necessary */
4873 if (likely(xtra_sz)) goto restart_args;
4874 }
4875
4876 if (unlikely((cmd_flags & PFM_CMD_FD) == 0)) goto skip_fd;
4877
4878 ret = -EBADF;
4879
4880 file = fget(fd);
4881 if (unlikely(file == NULL)) {
4882 DPRINT(("invalid fd %d\n", fd));
4883 goto error_args;
4884 }
4885 if (unlikely(PFM_IS_FILE(file) == 0)) {
4886 DPRINT(("fd %d not related to perfmon\n", fd));
4887 goto error_args;
4888 }
4889
4890 ctx = (pfm_context_t *)file->private_data;
4891 if (unlikely(ctx == NULL)) {
4892 DPRINT(("no context for fd %d\n", fd));
4893 goto error_args;
4894 }
4895 prefetch(&ctx->ctx_state);
4896
4897 PROTECT_CTX(ctx, flags);
4898
4899 /*
4900 * check task is stopped
4901 */
4902 ret = pfm_check_task_state(ctx, cmd, flags);
4903 if (unlikely(ret)) goto abort_locked;
4904
4905skip_fd:
4906 ret = (*func)(ctx, args_k, count, ia64_task_regs(current));
4907
4908 call_made = 1;
4909
4910abort_locked:
4911 if (likely(ctx)) {
4912 DPRINT(("context unlocked\n"));
4913 UNPROTECT_CTX(ctx, flags);
4914 fput(file);
4915 }
4916
4917 /* copy argument back to user, if needed */
4918 if (call_made && PFM_CMD_RW_ARG(cmd) && copy_to_user(arg, args_k, base_sz*count)) ret = -EFAULT;
4919
4920error_args:
4921 if (args_k) kfree(args_k);
4922
4923 DPRINT(("cmd=%s ret=%ld\n", PFM_CMD_NAME(cmd), ret));
4924
4925 return ret;
4926}
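
sys_perfmonctl() sizes its kernel copy of the argument in two passes: it first copies base_sz*count bytes, asks the command-specific getsize() callback how many extra bytes trail the fixed part, then jumps back to restart_args to copy the whole thing again, capped at PFM_MAX_ARGSIZE. A minimal user-space sketch of that pattern follows; the message layout and the read_bytes() helper are invented for illustration, only the control flow mirrors the code above.

/* Sketch of the two-pass "copy, ask for extra size, re-copy" pattern.
 * The msg_hdr layout and read_bytes() are hypothetical. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_ARGSIZE 4096        /* mirrors PFM_MAX_ARGSIZE */

struct msg_hdr {                /* fixed part, size known in advance */
	size_t extra_bytes;     /* variable part, only known after 1st copy */
};

/* stand-in for copy_from_user(): here the "user buffer" is plain memory */
static int read_bytes(void *dst, const void *src, size_t n)
{
	memcpy(dst, src, n);
	return 0;
}

static void *fetch_args(const void *user_buf, size_t base_sz)
{
	size_t xtra_sz = 0, sz;
	void *buf = malloc(MAX_ARGSIZE);

	if (!buf) return NULL;
restart_args:
	sz = base_sz + xtra_sz;
	if (sz > MAX_ARGSIZE) { free(buf); return NULL; }
	if (read_bytes(buf, user_buf, sz)) { free(buf); return NULL; }

	if (xtra_sz == 0) {
		/* first pass: learn how much trailing data there is */
		xtra_sz = ((struct msg_hdr *)buf)->extra_bytes;
		if (xtra_sz) goto restart_args;   /* second, full copy */
	}
	return buf;
}

int main(void)
{
	struct { struct msg_hdr h; char data[8]; } m = { { 8 }, "payload" };
	char *got = fetch_args(&m, sizeof(struct msg_hdr));

	printf("payload=%s\n", got ? got + sizeof(struct msg_hdr) : "(none)");
	free(got);
	return 0;
}
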
4927
4928static void
4929pfm_resume_after_ovfl(pfm_context_t *ctx, unsigned long ovfl_regs, struct pt_regs *regs)
4930{
4931 pfm_buffer_fmt_t *fmt = ctx->ctx_buf_fmt;
4932 pfm_ovfl_ctrl_t rst_ctrl;
4933 int state;
4934 int ret = 0;
4935
4936 state = ctx->ctx_state;
4937 /*
4938 * Unlock sampling buffer and reset index atomically
4939 * XXX: not really needed when blocking
4940 */
4941 if (CTX_HAS_SMPL(ctx)) {
4942
4943 rst_ctrl.bits.mask_monitoring = 0;
4944 rst_ctrl.bits.reset_ovfl_pmds = 0;
4945
4946 if (state == PFM_CTX_LOADED)
4947 ret = pfm_buf_fmt_restart_active(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs);
4948 else
4949 ret = pfm_buf_fmt_restart(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs);
4950 } else {
4951 rst_ctrl.bits.mask_monitoring = 0;
4952 rst_ctrl.bits.reset_ovfl_pmds = 1;
4953 }
4954
4955 if (ret == 0) {
4956 if (rst_ctrl.bits.reset_ovfl_pmds) {
4957 pfm_reset_regs(ctx, &ovfl_regs, PFM_PMD_LONG_RESET);
4958 }
4959 if (rst_ctrl.bits.mask_monitoring == 0) {
4960 DPRINT(("resuming monitoring\n"));
4961 if (ctx->ctx_state == PFM_CTX_MASKED) pfm_restore_monitoring(current);
4962 } else {
4963 DPRINT(("stopping monitoring\n"));
4964 //pfm_stop_monitoring(current, regs);
4965 }
4966 ctx->ctx_state = PFM_CTX_LOADED;
4967 }
4968}
4969
4970/*
4971 * context MUST BE LOCKED when calling
4972 * can only be called for current
4973 */
4974static void
4975pfm_context_force_terminate(pfm_context_t *ctx, struct pt_regs *regs)
4976{
4977 int ret;
4978
4979 DPRINT(("entering for [%d]\n", current->pid));
4980
4981 ret = pfm_context_unload(ctx, NULL, 0, regs);
4982 if (ret) {
4983		printk(KERN_ERR "pfm_context_force_terminate: [%d] unload failed with %d\n", current->pid, ret);
4984 }
4985
4986 /*
4987 * and wakeup controlling task, indicating we are now disconnected
4988 */
4989 wake_up_interruptible(&ctx->ctx_zombieq);
4990
4991 /*
4992 * given that context is still locked, the controlling
4993 * task will only get access when we return from
4994 * pfm_handle_work().
4995 */
4996}
4997
4998static int pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds);
4999
5000void
5001pfm_handle_work(void)
5002{
5003 pfm_context_t *ctx;
5004 struct pt_regs *regs;
5005 unsigned long flags;
5006 unsigned long ovfl_regs;
5007 unsigned int reason;
5008 int ret;
5009
5010 ctx = PFM_GET_CTX(current);
5011 if (ctx == NULL) {
5012 printk(KERN_ERR "perfmon: [%d] has no PFM context\n", current->pid);
5013 return;
5014 }
5015
5016 PROTECT_CTX(ctx, flags);
5017
5018 PFM_SET_WORK_PENDING(current, 0);
5019
5020 pfm_clear_task_notify();
5021
5022 regs = ia64_task_regs(current);
5023
5024 /*
5025 * extract reason for being here and clear
5026 */
5027 reason = ctx->ctx_fl_trap_reason;
5028 ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
5029 ovfl_regs = ctx->ctx_ovfl_regs[0];
5030
5031 DPRINT(("reason=%d state=%d\n", reason, ctx->ctx_state));
5032
5033 /*
5034 * must be done before we check for simple-reset mode
5035 */
5036 if (ctx->ctx_fl_going_zombie || ctx->ctx_state == PFM_CTX_ZOMBIE) goto do_zombie;
5037
5038
5039 //if (CTX_OVFL_NOBLOCK(ctx)) goto skip_blocking;
5040 if (reason == PFM_TRAP_REASON_RESET) goto skip_blocking;
5041
5042 UNPROTECT_CTX(ctx, flags);
5043
5044 /*
5045 * pfm_handle_work() is currently called with interrupts disabled.
5046 * The down_interruptible call may sleep, therefore we
5047 * must re-enable interrupts to avoid deadlocks. It is
5048 * safe to do so because this function is called ONLY
5049 * when returning to user level (PUStk=1), in which case
5050 * there is no risk of kernel stack overflow due to deep
5051 * interrupt nesting.
5052 */
5053 BUG_ON(flags & IA64_PSR_I);
5054 local_irq_enable();
5055
5056 DPRINT(("before block sleeping\n"));
5057
5058 /*
5059 * may go through without blocking on SMP systems
5060 * if restart has been received already by the time we call down()
5061 */
5062 ret = down_interruptible(&ctx->ctx_restart_sem);
5063
5064 DPRINT(("after block sleeping ret=%d\n", ret));
5065
5066 /*
5067 * disable interrupts to restore state we had upon entering
5068 * this function
5069 */
5070 local_irq_disable();
5071
5072 PROTECT_CTX(ctx, flags);
5073
5074 /*
5075 * we need to read the ovfl_regs only after wake-up
5076 * because we may have had pfm_write_pmds() in between
5077 * and that can have changed PMD values, in which case
5078 * ovfl_regs was reset for the new PMD values.
5079 */
5080 ovfl_regs = ctx->ctx_ovfl_regs[0];
5081
5082 if (ctx->ctx_fl_going_zombie) {
5083do_zombie:
5084 DPRINT(("context is zombie, bailing out\n"));
5085 pfm_context_force_terminate(ctx, regs);
5086 goto nothing_to_do;
5087 }
5088 /*
5089 * in case of interruption of down() we don't restart anything
5090 */
5091 if (ret < 0) goto nothing_to_do;
5092
5093skip_blocking:
5094 pfm_resume_after_ovfl(ctx, ovfl_regs, regs);
5095 ctx->ctx_ovfl_regs[0] = 0UL;
5096
5097nothing_to_do:
5098
5099 UNPROTECT_CTX(ctx, flags);
5100}
5101
5102static int
5103pfm_notify_user(pfm_context_t *ctx, pfm_msg_t *msg)
5104{
5105 if (ctx->ctx_state == PFM_CTX_ZOMBIE) {
5106 DPRINT(("ignoring overflow notification, owner is zombie\n"));
5107 return 0;
5108 }
5109
5110 DPRINT(("waking up somebody\n"));
5111
5112 if (msg) wake_up_interruptible(&ctx->ctx_msgq_wait);
5113
5114 /*
5115 * safe, we are not in intr handler, nor in ctxsw when
5116 * we come here
5117 */
5118 kill_fasync (&ctx->ctx_async_queue, SIGIO, POLL_IN);
5119
5120 return 0;
5121}
5122
5123static int
5124pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds)
5125{
5126 pfm_msg_t *msg = NULL;
5127
5128 if (ctx->ctx_fl_no_msg == 0) {
5129 msg = pfm_get_new_msg(ctx);
5130 if (msg == NULL) {
5131 printk(KERN_ERR "perfmon: pfm_ovfl_notify_user no more notification msgs\n");
5132 return -1;
5133 }
5134
5135 msg->pfm_ovfl_msg.msg_type = PFM_MSG_OVFL;
5136 msg->pfm_ovfl_msg.msg_ctx_fd = ctx->ctx_fd;
5137 msg->pfm_ovfl_msg.msg_active_set = 0;
5138 msg->pfm_ovfl_msg.msg_ovfl_pmds[0] = ovfl_pmds;
5139 msg->pfm_ovfl_msg.msg_ovfl_pmds[1] = 0UL;
5140 msg->pfm_ovfl_msg.msg_ovfl_pmds[2] = 0UL;
5141 msg->pfm_ovfl_msg.msg_ovfl_pmds[3] = 0UL;
5142 msg->pfm_ovfl_msg.msg_tstamp = 0UL;
5143 }
5144
5145 DPRINT(("ovfl msg: msg=%p no_msg=%d fd=%d ovfl_pmds=0x%lx\n",
5146 msg,
5147 ctx->ctx_fl_no_msg,
5148 ctx->ctx_fd,
5149 ovfl_pmds));
5150
5151 return pfm_notify_user(ctx, msg);
5152}
5153
5154static int
5155pfm_end_notify_user(pfm_context_t *ctx)
5156{
5157 pfm_msg_t *msg;
5158
5159 msg = pfm_get_new_msg(ctx);
5160 if (msg == NULL) {
5161 printk(KERN_ERR "perfmon: pfm_end_notify_user no more notification msgs\n");
5162 return -1;
5163 }
5164	/* zero the message so we do not leak uninitialized kernel memory */
5165 memset(msg, 0, sizeof(*msg));
5166
5167 msg->pfm_end_msg.msg_type = PFM_MSG_END;
5168 msg->pfm_end_msg.msg_ctx_fd = ctx->ctx_fd;
5169 msg->pfm_ovfl_msg.msg_tstamp = 0UL;
5170
5171 DPRINT(("end msg: msg=%p no_msg=%d ctx_fd=%d\n",
5172 msg,
5173 ctx->ctx_fl_no_msg,
5174 ctx->ctx_fd));
5175
5176 return pfm_notify_user(ctx, msg);
5177}
5178
5179/*
5180 * main overflow processing routine.
5181 * it can be called from the interrupt path or explicitly during the context switch code
5182 */
5183static void
5184pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, struct pt_regs *regs)
5185{
5186 pfm_ovfl_arg_t *ovfl_arg;
5187 unsigned long mask;
5188 unsigned long old_val, ovfl_val, new_val;
5189 unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL, smpl_pmds = 0UL, reset_pmds;
5190 unsigned long tstamp;
5191 pfm_ovfl_ctrl_t ovfl_ctrl;
5192 unsigned int i, has_smpl;
5193 int must_notify = 0;
5194
5195 if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) goto stop_monitoring;
5196
5197 /*
5198 * sanity test. Should never happen
5199 */
5200 if (unlikely((pmc0 & 0x1) == 0)) goto sanity_check;
5201
5202 tstamp = ia64_get_itc();
5203 mask = pmc0 >> PMU_FIRST_COUNTER;
5204 ovfl_val = pmu_conf->ovfl_val;
5205 has_smpl = CTX_HAS_SMPL(ctx);
5206
5207 DPRINT_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s "
5208 "used_pmds=0x%lx\n",
5209 pmc0,
5210 task ? task->pid: -1,
5211 (regs ? regs->cr_iip : 0),
5212 CTX_OVFL_NOBLOCK(ctx) ? "nonblocking" : "blocking",
5213 ctx->ctx_used_pmds[0]));
5214
5215
5216 /*
5217 * first we update the virtual counters
5218 * assume there was a prior ia64_srlz_d() issued
5219 */
5220 for (i = PMU_FIRST_COUNTER; mask ; i++, mask >>= 1) {
5221
5222 /* skip pmd which did not overflow */
5223 if ((mask & 0x1) == 0) continue;
5224
5225 /*
5226 * Note that the pmd is not necessarily 0 at this point as qualified events
5227 * may have happened before the PMU was frozen. The residual count is not
5228 * taken into consideration here but will be with any read of the pmd via
5229 * pfm_read_pmds().
5230 */
5231 old_val = new_val = ctx->ctx_pmds[i].val;
5232 new_val += 1 + ovfl_val;
5233 ctx->ctx_pmds[i].val = new_val;
5234
5235 /*
5236 * check for overflow condition
5237 */
5238 if (likely(old_val > new_val)) {
5239 ovfl_pmds |= 1UL << i;
5240 if (PMC_OVFL_NOTIFY(ctx, i)) ovfl_notify |= 1UL << i;
5241 }
5242
5243 DPRINT_ovfl(("ctx_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx ovfl_pmds=0x%lx ovfl_notify=0x%lx\n",
5244 i,
5245 new_val,
5246 old_val,
5247 ia64_get_pmd(i) & ovfl_val,
5248 ovfl_pmds,
5249 ovfl_notify));
5250 }
5251
5252 /*
5253 * there was no 64-bit overflow, nothing else to do
5254 */
5255 if (ovfl_pmds == 0UL) return;
5256
5257 /*
5258 * reset all control bits
5259 */
5260 ovfl_ctrl.val = 0;
5261 reset_pmds = 0UL;
5262
5263 /*
5264 * if a sampling format module exists, then we "cache" the overflow by
5265 * calling the module's handler() routine.
5266 */
5267 if (has_smpl) {
5268 unsigned long start_cycles, end_cycles;
5269 unsigned long pmd_mask;
5270 int j, k, ret = 0;
5271 int this_cpu = smp_processor_id();
5272
5273 pmd_mask = ovfl_pmds >> PMU_FIRST_COUNTER;
5274 ovfl_arg = &ctx->ctx_ovfl_arg;
5275
5276 prefetch(ctx->ctx_smpl_hdr);
5277
5278 for(i=PMU_FIRST_COUNTER; pmd_mask && ret == 0; i++, pmd_mask >>=1) {
5279
5280 mask = 1UL << i;
5281
5282 if ((pmd_mask & 0x1) == 0) continue;
5283
5284 ovfl_arg->ovfl_pmd = (unsigned char )i;
5285 ovfl_arg->ovfl_notify = ovfl_notify & mask ? 1 : 0;
5286 ovfl_arg->active_set = 0;
5287 ovfl_arg->ovfl_ctrl.val = 0; /* module must fill in all fields */
5288 ovfl_arg->smpl_pmds[0] = smpl_pmds = ctx->ctx_pmds[i].smpl_pmds[0];
5289
5290 ovfl_arg->pmd_value = ctx->ctx_pmds[i].val;
5291 ovfl_arg->pmd_last_reset = ctx->ctx_pmds[i].lval;
5292 ovfl_arg->pmd_eventid = ctx->ctx_pmds[i].eventid;
5293
5294 /*
5295 * copy values of pmds of interest. Sampling format may copy them
5296 * into sampling buffer.
5297 */
5298 if (smpl_pmds) {
5299 for(j=0, k=0; smpl_pmds; j++, smpl_pmds >>=1) {
5300 if ((smpl_pmds & 0x1) == 0) continue;
5301 ovfl_arg->smpl_pmds_values[k++] = PMD_IS_COUNTING(j) ? pfm_read_soft_counter(ctx, j) : ia64_get_pmd(j);
5302 DPRINT_ovfl(("smpl_pmd[%d]=pmd%u=0x%lx\n", k-1, j, ovfl_arg->smpl_pmds_values[k-1]));
5303 }
5304 }
5305
5306 pfm_stats[this_cpu].pfm_smpl_handler_calls++;
5307
5308 start_cycles = ia64_get_itc();
5309
5310 /*
5311 * call custom buffer format record (handler) routine
5312 */
5313 ret = (*ctx->ctx_buf_fmt->fmt_handler)(task, ctx->ctx_smpl_hdr, ovfl_arg, regs, tstamp);
5314
5315 end_cycles = ia64_get_itc();
5316
5317 /*
5318 * For those controls, we take the union because they have
5319 * an all or nothing behavior.
5320 */
5321 ovfl_ctrl.bits.notify_user |= ovfl_arg->ovfl_ctrl.bits.notify_user;
5322 ovfl_ctrl.bits.block_task |= ovfl_arg->ovfl_ctrl.bits.block_task;
5323 ovfl_ctrl.bits.mask_monitoring |= ovfl_arg->ovfl_ctrl.bits.mask_monitoring;
5324 /*
5325 * build the bitmask of pmds to reset now
5326 */
5327 if (ovfl_arg->ovfl_ctrl.bits.reset_ovfl_pmds) reset_pmds |= mask;
5328
5329 pfm_stats[this_cpu].pfm_smpl_handler_cycles += end_cycles - start_cycles;
5330 }
5331 /*
5332 * when the module cannot handle the rest of the overflows, we abort right here
5333 */
5334 if (ret && pmd_mask) {
5335 DPRINT(("handler aborts leftover ovfl_pmds=0x%lx\n",
5336 pmd_mask<<PMU_FIRST_COUNTER));
5337 }
5338 /*
5339 * remove the pmds we reset now from the set of pmds to reset in pfm_restart()
5340 */
5341 ovfl_pmds &= ~reset_pmds;
5342 } else {
5343 /*
5344 * when no sampling module is used, then the default
5345 * is to notify on overflow if requested by user
5346 */
5347 ovfl_ctrl.bits.notify_user = ovfl_notify ? 1 : 0;
5348 ovfl_ctrl.bits.block_task = ovfl_notify ? 1 : 0;
5349 ovfl_ctrl.bits.mask_monitoring = ovfl_notify ? 1 : 0; /* XXX: change for saturation */
5350 ovfl_ctrl.bits.reset_ovfl_pmds = ovfl_notify ? 0 : 1;
5351 /*
5352 * if needed, we reset all overflowed pmds
5353 */
5354 if (ovfl_notify == 0) reset_pmds = ovfl_pmds;
5355 }
5356
5357 DPRINT_ovfl(("ovfl_pmds=0x%lx reset_pmds=0x%lx\n", ovfl_pmds, reset_pmds));
5358
5359 /*
5360 * reset the requested PMD registers using the short reset values
5361 */
5362 if (reset_pmds) {
5363 unsigned long bm = reset_pmds;
5364 pfm_reset_regs(ctx, &bm, PFM_PMD_SHORT_RESET);
5365 }
5366
5367 if (ovfl_notify && ovfl_ctrl.bits.notify_user) {
5368 /*
5369 * keep track of what to reset when unblocking
5370 */
5371 ctx->ctx_ovfl_regs[0] = ovfl_pmds;
5372
5373 /*
5374 * check for blocking context
5375 */
5376 if (CTX_OVFL_NOBLOCK(ctx) == 0 && ovfl_ctrl.bits.block_task) {
5377
5378 ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_BLOCK;
5379
5380 /*
5381 * set the perfmon-specific pending work flag for the task
5382 */
5383 PFM_SET_WORK_PENDING(task, 1);
5384
5385 /*
5386 * when coming from ctxsw, current still points to the
5387 * previous task, therefore we must work with task and not current.
5388 */
5389 pfm_set_task_notify(task);
5390 }
5391 /*
5392 * defer until state is changed (shorten spin window). the context is locked
5393 * anyway, so the signal receiver would just spin for nothing.
5394 */
5395 must_notify = 1;
5396 }
5397
5398 DPRINT_ovfl(("owner [%d] pending=%ld reason=%u ovfl_pmds=0x%lx ovfl_notify=0x%lx masked=%d\n",
5399 GET_PMU_OWNER() ? GET_PMU_OWNER()->pid : -1,
5400 PFM_GET_WORK_PENDING(task),
5401 ctx->ctx_fl_trap_reason,
5402 ovfl_pmds,
5403 ovfl_notify,
5404 ovfl_ctrl.bits.mask_monitoring ? 1 : 0));
5405 /*
5406 * in case monitoring must be stopped, we toggle the psr bits
5407 */
5408 if (ovfl_ctrl.bits.mask_monitoring) {
5409 pfm_mask_monitoring(task);
5410 ctx->ctx_state = PFM_CTX_MASKED;
5411 ctx->ctx_fl_can_restart = 1;
5412 }
5413
5414 /*
5415 * send notification now
5416 */
5417 if (must_notify) pfm_ovfl_notify_user(ctx, ovfl_notify);
5418
5419 return;
5420
5421sanity_check:
5422 printk(KERN_ERR "perfmon: CPU%d overflow handler [%d] pmc0=0x%lx\n",
5423 smp_processor_id(),
5424 task ? task->pid : -1,
5425 pmc0);
5426 return;
5427
5428stop_monitoring:
5429 /*
5430 * in SMP, zombie context is never restored but reclaimed in pfm_load_regs().
5431 * Moreover, zombies are also reclaimed in pfm_save_regs(). Therefore we can
5432 * come here as zombie only if the task is the current task, in which case we
5433 * can access the PMU hardware directly.
5434 *
5435 * Note that zombies do have PM_VALID set. So here we do the minimal.
5436 *
5437 * In case the context was zombified it could not be reclaimed at the time
5438 * the monitoring program exited. At this point, the PMU reservation has been
5439 * returned, the sampling buffer has been freed. We must convert this call
5440 * into a spurious interrupt. However, we must also avoid infinite overflows
5441 * by stopping monitoring for this task. We can only come here for a per-task
5442 * context. All we need to do is to stop monitoring using the psr bits which
5443 * are always task private. By re-enabling secure monitoring, we ensure that
5444 * the monitored task will not be able to re-activate monitoring.
5445 * The task will eventually be context switched out, at which point the context
5446 * will be reclaimed (that includes releasing ownership of the PMU).
5447 *
5448 * So there might be a window of time where the number of per-task sessions is zero
5449 * yet one PMU might have an owner and get at most one overflow interrupt for a zombie
5450 * context. This is safe because if a per-task session comes in, it will push this one
5451 * out and, by virtue of pfm_save_regs(), this one will disappear. If a system-wide
5452 * session is forced on that CPU, given that we use task pinning, pfm_save_regs() will
5453 * also push our zombie context out.
5454 *
5455 * Overall pretty hairy stuff....
5456 */
5457 DPRINT(("ctx is zombie for [%d], converted to spurious\n", task ? task->pid: -1));
5458 pfm_clear_psr_up();
5459 ia64_psr(regs)->up = 0;
5460 ia64_psr(regs)->sp = 1;
5461 return;
5462}
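
The update loop in the overflow handler above virtualizes the hardware counters to 64 bits: the context keeps a software value in ctx_pmds[i].val, the hardware implements only the low bits (masked by ovfl_val), and each hardware overflow adds 1 + ovfl_val to the software value; a 64-bit overflow is then detected when the new value wraps below the old one. A stand-alone sketch of that arithmetic, assuming a hypothetical 47-bit counter width and a 64-bit unsigned long (as on ia64):

/* Sketch of the 64-bit counter virtualization used above.
 * The 47-bit width is an assumption for illustration; ovfl_val plays
 * the same role as pmu_conf->ovfl_val (mask of bits the hardware keeps). */
#include <stdio.h>

int main(void)
{
	unsigned long ovfl_val = (1UL << 47) - 1;        /* low 47 bits in hardware */
	unsigned long soft_val = 0xffffffffffff0000UL;   /* full 64-bit software value */
	unsigned long old_val, new_val;
	int i;

	/* simulate a few hardware overflow interrupts */
	for (i = 0; i < 3; i++) {
		old_val  = soft_val;
		new_val  = soft_val + 1 + ovfl_val;      /* one full wrap of the hw counter */
		soft_val = new_val;

		/* a 64-bit overflow happened iff the value wrapped around */
		printf("ovfl %d: old=0x%lx new=0x%lx 64-bit overflow=%s\n",
		       i, old_val, new_val, old_val > new_val ? "yes" : "no");
	}
	return 0;
}
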
5463
5464static int
5465pfm_do_interrupt_handler(int irq, void *arg, struct pt_regs *regs)
5466{
5467 struct task_struct *task;
5468 pfm_context_t *ctx;
5469 unsigned long flags;
5470 u64 pmc0;
5471 int this_cpu = smp_processor_id();
5472 int retval = 0;
5473
5474 pfm_stats[this_cpu].pfm_ovfl_intr_count++;
5475
5476 /*
5477 * srlz.d done before arriving here
5478 */
5479 pmc0 = ia64_get_pmc(0);
5480
5481 task = GET_PMU_OWNER();
5482 ctx = GET_PMU_CTX();
5483
5484 /*
5485 * if we have some pending bits set
5486 * assumes : if any PMC0.bit[63-1] is set, then PMC0.fr = 1
5487 */
5488 if (PMC0_HAS_OVFL(pmc0) && task) {
5489 /*
5490 * we assume that pmc0.fr is always set here
5491 */
5492
5493 /* sanity check */
5494 if (!ctx) goto report_spurious1;
5495
5496 if (ctx->ctx_fl_system == 0 && (task->thread.flags & IA64_THREAD_PM_VALID) == 0)
5497 goto report_spurious2;
5498
5499 PROTECT_CTX_NOPRINT(ctx, flags);
5500
5501 pfm_overflow_handler(task, ctx, pmc0, regs);
5502
5503 UNPROTECT_CTX_NOPRINT(ctx, flags);
5504
5505 } else {
5506 pfm_stats[this_cpu].pfm_spurious_ovfl_intr_count++;
5507 retval = -1;
5508 }
5509 /*
5510 * keep it unfrozen at all times
5511 */
5512 pfm_unfreeze_pmu();
5513
5514 return retval;
5515
5516report_spurious1:
5517 printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d has no PFM context\n",
5518 this_cpu, task->pid);
5519 pfm_unfreeze_pmu();
5520 return -1;
5521report_spurious2:
5522 printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d, invalid flag\n",
5523 this_cpu,
5524 task->pid);
5525 pfm_unfreeze_pmu();
5526 return -1;
5527}
5528
5529static irqreturn_t
5530pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs)
5531{
5532 unsigned long start_cycles, total_cycles;
5533 unsigned long min, max;
5534 int this_cpu;
5535 int ret;
5536
5537 this_cpu = get_cpu();
5538 min = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min;
5539 max = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max;
5540
5541 start_cycles = ia64_get_itc();
5542
5543 ret = pfm_do_interrupt_handler(irq, arg, regs);
5544
5545 total_cycles = ia64_get_itc();
5546
5547 /*
5548 * don't measure spurious interrupts
5549 */
5550 if (likely(ret == 0)) {
5551 total_cycles -= start_cycles;
5552
5553 if (total_cycles < min) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min = total_cycles;
5554 if (total_cycles > max) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max = total_cycles;
5555
5556 pfm_stats[this_cpu].pfm_ovfl_intr_cycles += total_cycles;
5557 }
5558 put_cpu_no_resched();
5559 return IRQ_HANDLED;
5560}
5561
5562/*
5563 * /proc/perfmon interface, for debug only
5564 */
5565
5566#define PFM_PROC_SHOW_HEADER ((void *)NR_CPUS+1)
5567
5568static void *
5569pfm_proc_start(struct seq_file *m, loff_t *pos)
5570{
5571 if (*pos == 0) {
5572 return PFM_PROC_SHOW_HEADER;
5573 }
5574
5575 while (*pos <= NR_CPUS) {
5576 if (cpu_online(*pos - 1)) {
5577 return (void *)*pos;
5578 }
5579 ++*pos;
5580 }
5581 return NULL;
5582}
5583
5584static void *
5585pfm_proc_next(struct seq_file *m, void *v, loff_t *pos)
5586{
5587 ++*pos;
5588 return pfm_proc_start(m, pos);
5589}
5590
5591static void
5592pfm_proc_stop(struct seq_file *m, void *v)
5593{
5594}
5595
5596static void
5597pfm_proc_show_header(struct seq_file *m)
5598{
5599 struct list_head * pos;
5600 pfm_buffer_fmt_t * entry;
5601 unsigned long flags;
5602
5603 seq_printf(m,
5604 "perfmon version : %u.%u\n"
5605 "model : %s\n"
5606 "fastctxsw : %s\n"
5607 "expert mode : %s\n"
5608 "ovfl_mask : 0x%lx\n"
5609 "PMU flags : 0x%x\n",
5610 PFM_VERSION_MAJ, PFM_VERSION_MIN,
5611 pmu_conf->pmu_name,
5612 pfm_sysctl.fastctxsw > 0 ? "Yes": "No",
5613 pfm_sysctl.expert_mode > 0 ? "Yes": "No",
5614 pmu_conf->ovfl_val,
5615 pmu_conf->flags);
5616
5617 LOCK_PFS(flags);
5618
5619 seq_printf(m,
5620 "proc_sessions : %u\n"
5621 "sys_sessions : %u\n"
5622 "sys_use_dbregs : %u\n"
5623 "ptrace_use_dbregs : %u\n",
5624 pfm_sessions.pfs_task_sessions,
5625 pfm_sessions.pfs_sys_sessions,
5626 pfm_sessions.pfs_sys_use_dbregs,
5627 pfm_sessions.pfs_ptrace_use_dbregs);
5628
5629 UNLOCK_PFS(flags);
5630
5631 spin_lock(&pfm_buffer_fmt_lock);
5632
5633 list_for_each(pos, &pfm_buffer_fmt_list) {
5634 entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list);
5635 seq_printf(m, "format : %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x %s\n",
5636 entry->fmt_uuid[0],
5637 entry->fmt_uuid[1],
5638 entry->fmt_uuid[2],
5639 entry->fmt_uuid[3],
5640 entry->fmt_uuid[4],
5641 entry->fmt_uuid[5],
5642 entry->fmt_uuid[6],
5643 entry->fmt_uuid[7],
5644 entry->fmt_uuid[8],
5645 entry->fmt_uuid[9],
5646 entry->fmt_uuid[10],
5647 entry->fmt_uuid[11],
5648 entry->fmt_uuid[12],
5649 entry->fmt_uuid[13],
5650 entry->fmt_uuid[14],
5651 entry->fmt_uuid[15],
5652 entry->fmt_name);
5653 }
5654 spin_unlock(&pfm_buffer_fmt_lock);
5655
5656}
5657
5658static int
5659pfm_proc_show(struct seq_file *m, void *v)
5660{
5661 unsigned long psr;
5662 unsigned int i;
5663 int cpu;
5664
5665 if (v == PFM_PROC_SHOW_HEADER) {
5666 pfm_proc_show_header(m);
5667 return 0;
5668 }
5669
5670 /* show info for CPU (v - 1) */
5671
5672 cpu = (long)v - 1;
5673 seq_printf(m,
5674 "CPU%-2d overflow intrs : %lu\n"
5675 "CPU%-2d overflow cycles : %lu\n"
5676 "CPU%-2d overflow min : %lu\n"
5677 "CPU%-2d overflow max : %lu\n"
5678 "CPU%-2d smpl handler calls : %lu\n"
5679 "CPU%-2d smpl handler cycles : %lu\n"
5680 "CPU%-2d spurious intrs : %lu\n"
5681 "CPU%-2d replay intrs : %lu\n"
5682 "CPU%-2d syst_wide : %d\n"
5683 "CPU%-2d dcr_pp : %d\n"
5684 "CPU%-2d exclude idle : %d\n"
5685 "CPU%-2d owner : %d\n"
5686 "CPU%-2d context : %p\n"
5687 "CPU%-2d activations : %lu\n",
5688 cpu, pfm_stats[cpu].pfm_ovfl_intr_count,
5689 cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles,
5690 cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_min,
5691 cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_max,
5692 cpu, pfm_stats[cpu].pfm_smpl_handler_calls,
5693 cpu, pfm_stats[cpu].pfm_smpl_handler_cycles,
5694 cpu, pfm_stats[cpu].pfm_spurious_ovfl_intr_count,
5695 cpu, pfm_stats[cpu].pfm_replay_ovfl_intr_count,
5696 cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_SYST_WIDE ? 1 : 0,
5697 cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_DCR_PP ? 1 : 0,
5698 cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_EXCL_IDLE ? 1 : 0,
5699 cpu, pfm_get_cpu_data(pmu_owner, cpu) ? pfm_get_cpu_data(pmu_owner, cpu)->pid: -1,
5700 cpu, pfm_get_cpu_data(pmu_ctx, cpu),
5701 cpu, pfm_get_cpu_data(pmu_activation_number, cpu));
5702
5703 if (num_online_cpus() == 1 && pfm_sysctl.debug > 0) {
5704
5705 psr = pfm_get_psr();
5706
5707 ia64_srlz_d();
5708
5709 seq_printf(m,
5710 "CPU%-2d psr : 0x%lx\n"
5711 "CPU%-2d pmc0 : 0x%lx\n",
5712 cpu, psr,
5713 cpu, ia64_get_pmc(0));
5714
5715 for (i=0; PMC_IS_LAST(i) == 0; i++) {
5716 if (PMC_IS_COUNTING(i) == 0) continue;
5717 seq_printf(m,
5718 "CPU%-2d pmc%u : 0x%lx\n"
5719 "CPU%-2d pmd%u : 0x%lx\n",
5720 cpu, i, ia64_get_pmc(i),
5721 cpu, i, ia64_get_pmd(i));
5722 }
5723 }
5724 return 0;
5725}
5726
5727struct seq_operations pfm_seq_ops = {
5728 .start = pfm_proc_start,
5729 .next = pfm_proc_next,
5730 .stop = pfm_proc_stop,
5731 .show = pfm_proc_show
5732};
5733
5734static int
5735pfm_proc_open(struct inode *inode, struct file *file)
5736{
5737 return seq_open(file, &pfm_seq_ops);
5738}
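
The /proc/perfmon iterator above encodes two kinds of items in the seq_file position: position 0 yields the PFM_PROC_SHOW_HEADER sentinel and positions 1..NR_CPUS map to CPU (pos - 1), skipping offline CPUs. Below is a small user-space sketch of that position-encoding idea; NCPUS, SHOW_HEADER and cpu_is_online() are made-up stand-ins, and the loop only approximates the seq_file start/next contract.

/* Sketch of the "position 0 = header, position N = CPU N-1" iterator. */
#include <stdio.h>

#define NCPUS 4
#define SHOW_HEADER ((long)NCPUS + 1)     /* sentinel, like PFM_PROC_SHOW_HEADER */

static int cpu_is_online(long cpu)
{
	return cpu != 2;                  /* pretend CPU2 is offline */
}

static long iter_start(long pos)
{
	if (pos == 0)
		return SHOW_HEADER;
	while (pos <= NCPUS) {
		if (cpu_is_online(pos - 1))
			return pos;       /* non-zero cookie encodes CPU pos-1 */
		pos++;
	}
	return 0;                         /* end of sequence */
}

int main(void)
{
	long pos = 0, v;

	while ((v = iter_start(pos)) != 0) {
		if (v == SHOW_HEADER)
			printf("header\n");
		else
			printf("stats for CPU%ld\n", v - 1);
		pos = (v == SHOW_HEADER) ? 1 : v + 1;   /* analogue of ++*pos in next() */
	}
	return 0;
}
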
5739
5740
5741/*
5742 * we come here as soon as local_cpu_data->pfm_syst_wide is set. this happens
5743 * during pfm_enable() hence before pfm_start(). We cannot assume monitoring
5744 * is active or inactive based on mode. We must rely on the value in
5745 * local_cpu_data->pfm_syst_info
5746 */
5747void
5748pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_ctxswin)
5749{
5750 struct pt_regs *regs;
5751 unsigned long dcr;
5752 unsigned long dcr_pp;
5753
5754 dcr_pp = info & PFM_CPUINFO_DCR_PP ? 1 : 0;
5755
5756 /*
5757 * pid 0 is guaranteed to be the idle task. There is one such task with pid 0
5758 * on every CPU, so we can rely on the pid to identify the idle task.
5759 */
5760 if ((info & PFM_CPUINFO_EXCL_IDLE) == 0 || task->pid) {
5761 regs = ia64_task_regs(task);
5762 ia64_psr(regs)->pp = is_ctxswin ? dcr_pp : 0;
5763 return;
5764 }
5765 /*
5766 * if monitoring has started
5767 */
5768 if (dcr_pp) {
5769 dcr = ia64_getreg(_IA64_REG_CR_DCR);
5770 /*
5771 * context switching in?
5772 */
5773 if (is_ctxswin) {
5774 /* mask monitoring for the idle task */
5775 ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP);
5776 pfm_clear_psr_pp();
5777 ia64_srlz_i();
5778 return;
5779 }
5780 /*
5781 * context switching out
5782 * restore monitoring for next task
5783 *
5784 * Due to inlining, this odd if-then-else construction generates
5785 * better code.
5786 */
5787 ia64_setreg(_IA64_REG_CR_DCR, dcr |IA64_DCR_PP);
5788 pfm_set_psr_pp();
5789 ia64_srlz_i();
5790 }
5791}
5792
5793#ifdef CONFIG_SMP
5794
5795static void
5796pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs)
5797{
5798 struct task_struct *task = ctx->ctx_task;
5799
5800 ia64_psr(regs)->up = 0;
5801 ia64_psr(regs)->sp = 1;
5802
5803 if (GET_PMU_OWNER() == task) {
5804 DPRINT(("cleared ownership for [%d]\n", ctx->ctx_task->pid));
5805 SET_PMU_OWNER(NULL, NULL);
5806 }
5807
5808 /*
5809 * disconnect the task from the context and vice-versa
5810 */
5811 PFM_SET_WORK_PENDING(task, 0);
5812
5813 task->thread.pfm_context = NULL;
5814 task->thread.flags &= ~IA64_THREAD_PM_VALID;
5815
5816 DPRINT(("force cleanup for [%d]\n", task->pid));
5817}
5818
5819
5820/*
5821 * in 2.6, interrupts are masked when we come here and the runqueue lock is held
5822 */
5823void
5824pfm_save_regs(struct task_struct *task)
5825{
5826 pfm_context_t *ctx;
5827 struct thread_struct *t;
5828 unsigned long flags;
5829 u64 psr;
5830
5831
5832 ctx = PFM_GET_CTX(task);
5833 if (ctx == NULL) return;
5834 t = &task->thread;
5835
5836 /*
5837 * we always come here with interrupts ALREADY disabled by
5838 * the scheduler. So we simply need to protect against concurrent
5839 * access, not CPU concurrency.
5840 */
5841 flags = pfm_protect_ctx_ctxsw(ctx);
5842
5843 if (ctx->ctx_state == PFM_CTX_ZOMBIE) {
5844 struct pt_regs *regs = ia64_task_regs(task);
5845
5846 pfm_clear_psr_up();
5847
5848 pfm_force_cleanup(ctx, regs);
5849
5850 BUG_ON(ctx->ctx_smpl_hdr);
5851
5852 pfm_unprotect_ctx_ctxsw(ctx, flags);
5853
5854 pfm_context_free(ctx);
5855 return;
5856 }
5857
5858 /*
5859 * save current PSR: needed because we modify it
5860 */
5861 ia64_srlz_d();
5862 psr = pfm_get_psr();
5863
5864 BUG_ON(psr & (IA64_PSR_I));
5865
5866 /*
5867 * stop monitoring:
5868 * This is the last instruction which may generate an overflow
5869 *
5870 * We do not need to set psr.sp because it is irrelevant in the kernel.
5871 * It will be restored from ipsr when going back to user level
5872 */
5873 pfm_clear_psr_up();
5874
5875 /*
5876 * keep a copy of psr.up (for reload)
5877 */
5878 ctx->ctx_saved_psr_up = psr & IA64_PSR_UP;
5879
5880 /*
5881 * release ownership of this PMU.
5882 * PM interrupts are masked, so nothing
5883 * can happen.
5884 */
5885 SET_PMU_OWNER(NULL, NULL);
5886
5887 /*
5888 * we systematically save the PMDs as we have no
5889 * guarantee we will be scheduled on that same
5890 * CPU again.
5891 */
5892 pfm_save_pmds(t->pmds, ctx->ctx_used_pmds[0]);
5893
5894 /*
5895 * save pmc0; ia64_srlz_d() was done in pfm_save_pmds().
5896 * we will need it on the restore path to check
5897 * for pending overflow.
5898 */
5899 t->pmcs[0] = ia64_get_pmc(0);
5900
5901 /*
5902 * unfreeze PMU if it had pending overflows
5903 */
5904 if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu();
5905
5906 /*
5907 * finally, allow context access.
5908 * interrupts will still be masked after this call.
5909 */
5910 pfm_unprotect_ctx_ctxsw(ctx, flags);
5911}
5912
5913#else /* !CONFIG_SMP */
5914void
5915pfm_save_regs(struct task_struct *task)
5916{
5917 pfm_context_t *ctx;
5918 u64 psr;
5919
5920 ctx = PFM_GET_CTX(task);
5921 if (ctx == NULL) return;
5922
5923 /*
5924 * save current PSR: needed because we modify it
5925 */
5926 psr = pfm_get_psr();
5927
5928 BUG_ON(psr & (IA64_PSR_I));
5929
5930 /*
5931 * stop monitoring:
5932 * This is the last instruction which may generate an overflow
5933 *
5934 * We do not need to set psr.sp because it is irrelevant in the kernel.
5935 * It will be restored from ipsr when going back to user level
5936 */
5937 pfm_clear_psr_up();
5938
5939 /*
5940 * keep a copy of psr.up (for reload)
5941 */
5942 ctx->ctx_saved_psr_up = psr & IA64_PSR_UP;
5943}
5944
5945static void
5946pfm_lazy_save_regs (struct task_struct *task)
5947{
5948 pfm_context_t *ctx;
5949 struct thread_struct *t;
5950 unsigned long flags;
5951
5952 { u64 psr = pfm_get_psr();
5953 BUG_ON(psr & IA64_PSR_UP);
5954 }
5955
5956 ctx = PFM_GET_CTX(task);
5957 t = &task->thread;
5958
5959 /*
5960 * we need to mask PMU overflow here to
5961 * make sure that we maintain pmc0 until
5962 * we save it. overflow interrupts are
5963 * treated as spurious if there is no
5964 * owner.
5965 *
5966 * XXX: I don't think this is necessary
5967 */
5968 PROTECT_CTX(ctx,flags);
5969
5970 /*
5971 * release ownership of this PMU.
5972 * must be done before we save the registers.
5973 *
5974 * after this call any PMU interrupt is treated
5975 * as spurious.
5976 */
5977 SET_PMU_OWNER(NULL, NULL);
5978
5979 /*
5980 * save all the pmds we use
5981 */
5982 pfm_save_pmds(t->pmds, ctx->ctx_used_pmds[0]);
5983
5984 /*
5985 * save pmc0; ia64_srlz_d() was done in pfm_save_pmds().
5986 * it is needed to check for pending overflow
5987 * on the restore path
5988 */
5989 t->pmcs[0] = ia64_get_pmc(0);
5990
5991 /*
5992 * unfreeze PMU if it had pending overflows
5993 */
5994 if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu();
5995
5996 /*
5997 * now we can unmask PMU interrupts; they will
5998 * be treated as purely spurious and we will not
5999 * lose any information
6000 */
6001 UNPROTECT_CTX(ctx,flags);
6002}
6003#endif /* CONFIG_SMP */
6004
6005#ifdef CONFIG_SMP
6006/*
6007 * in 2.6, interrupts are masked when we come here and the runqueue lock is held
6008 */
6009void
6010pfm_load_regs (struct task_struct *task)
6011{
6012 pfm_context_t *ctx;
6013 struct thread_struct *t;
6014 unsigned long pmc_mask = 0UL, pmd_mask = 0UL;
6015 unsigned long flags;
6016 u64 psr, psr_up;
6017 int need_irq_resend;
6018
6019 ctx = PFM_GET_CTX(task);
6020 if (unlikely(ctx == NULL)) return;
6021
6022 BUG_ON(GET_PMU_OWNER());
6023
6024 t = &task->thread;
6025 /*
6026 * possible on unload
6027 */
6028 if (unlikely((t->flags & IA64_THREAD_PM_VALID) == 0)) return;
6029
6030 /*
6031 * we always come here with interrupts ALREADY disabled by
6032 * the scheduler. So we simply need to protect against concurrent
6033 * access, not CPU concurrency.
6034 */
6035 flags = pfm_protect_ctx_ctxsw(ctx);
6036 psr = pfm_get_psr();
6037
6038 need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND;
6039
6040 BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP));
6041 BUG_ON(psr & IA64_PSR_I);
6042
6043 if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) {
6044 struct pt_regs *regs = ia64_task_regs(task);
6045
6046 BUG_ON(ctx->ctx_smpl_hdr);
6047
6048 pfm_force_cleanup(ctx, regs);
6049
6050 pfm_unprotect_ctx_ctxsw(ctx, flags);
6051
6052 /*
6053 * this one (kmalloc'ed) is fine with interrupts disabled
6054 */
6055 pfm_context_free(ctx);
6056
6057 return;
6058 }
6059
6060 /*
6061 * we restore ALL the debug registers to avoid picking up
6062 * stale state.
6063 */
6064 if (ctx->ctx_fl_using_dbreg) {
6065 pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
6066 pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
6067 }
6068 /*
6069 * retrieve saved psr.up
6070 */
6071 psr_up = ctx->ctx_saved_psr_up;
6072
6073 /*
6074 * if we were the last user of the PMU on that CPU,
6075 * then nothing to do except restore psr
6076 */
6077 if (GET_LAST_CPU(ctx) == smp_processor_id() && ctx->ctx_last_activation == GET_ACTIVATION()) {
6078
6079 /*
6080 * retrieve partial reload masks (due to user modifications)
6081 */
6082 pmc_mask = ctx->ctx_reload_pmcs[0];
6083 pmd_mask = ctx->ctx_reload_pmds[0];
6084
6085 } else {
6086 /*
6087 * To avoid leaking information to the user level when psr.sp=0,
6088 * we must reload ALL implemented pmds (even the ones we don't use).
6089 * In the kernel we only allow PFM_READ_PMDS on registers which
6090 * we initialized or requested (sampling) so there is no risk there.
6091 */
6092 pmd_mask = pfm_sysctl.fastctxsw ? ctx->ctx_used_pmds[0] : ctx->ctx_all_pmds[0];
6093
6094 /*
6095 * ALL accessible PMCs are systematically reloaded, unused registers
6096 * get their default (from pfm_reset_pmu_state()) values to avoid picking
6097 * up stale configuration.
6098 *
6099 * PMC0 is never in the mask. It is always restored separately.
6100 */
6101 pmc_mask = ctx->ctx_all_pmcs[0];
6102 }
6103 /*
6104 * when context is MASKED, we will restore PMC with plm=0
6105 * and PMD with stale information, but that's ok, nothing
6106 * will be captured.
6107 *
6108 * XXX: optimize here
6109 */
6110 if (pmd_mask) pfm_restore_pmds(t->pmds, pmd_mask);
6111 if (pmc_mask) pfm_restore_pmcs(t->pmcs, pmc_mask);
6112
6113 /*
6114 * check for pending overflow at the time the state
6115 * was saved.
6116 */
6117 if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) {
6118 /*
6119 * reload pmc0 with the overflow information
6120 * On McKinley PMU, this will trigger a PMU interrupt
6121 */
6122 ia64_set_pmc(0, t->pmcs[0]);
6123 ia64_srlz_d();
6124 t->pmcs[0] = 0UL;
6125
6126 /*
6127 * will replay the PMU interrupt
6128 */
6129 if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR);
6130
6131 pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++;
6132 }
6133
6134 /*
6135 * we just did a reload, so we reset the partial reload fields
6136 */
6137 ctx->ctx_reload_pmcs[0] = 0UL;
6138 ctx->ctx_reload_pmds[0] = 0UL;
6139
6140 SET_LAST_CPU(ctx, smp_processor_id());
6141
6142 /*
6143 * bump the activation value for this PMU
6144 */
6145 INC_ACTIVATION();
6146 /*
6147 * record current activation for this context
6148 */
6149 SET_ACTIVATION(ctx);
6150
6151 /*
6152 * establish new ownership.
6153 */
6154 SET_PMU_OWNER(task, ctx);
6155
6156 /*
6157 * restore the psr.up bit. measurement
6158 * is active again.
6159 * no PMU interrupt can happen at this point
6160 * because we still have interrupts disabled.
6161 */
6162 if (likely(psr_up)) pfm_set_psr_up();
6163
6164 /*
6165 * allow concurrent access to context
6166 */
6167 pfm_unprotect_ctx_ctxsw(ctx, flags);
6168}
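
The SMP reload path above avoids touching the PMU when nothing can have clobbered it: each CPU keeps a monotonically increasing activation number, every context records the CPU and activation at which it last held the PMU, and the full PMC/PMD restore is skipped when both still match. A compact sketch of that bookkeeping, with made-up names standing in for GET/SET_ACTIVATION(), INC_ACTIVATION() and GET/SET_LAST_CPU():

/* Sketch of the per-CPU activation-number trick used by pfm_load_regs().
 * Names and the single-CPU setup are illustrative only. */
#include <stdio.h>

struct cpu_pmu {
	unsigned long activation;        /* bumped each time a context is loaded */
};

struct context {
	int           last_cpu;          /* CPU that last held our PMU state */
	unsigned long last_activation;
	int           loaded_full;       /* demo flag: did we do a full restore? */
};

static void load_context(struct context *ctx, struct cpu_pmu *cpu, int cpu_id)
{
	if (ctx->last_cpu == cpu_id && ctx->last_activation == cpu->activation) {
		/* our registers are still live in the PMU: restore psr only */
		ctx->loaded_full = 0;
	} else {
		/* someone else used the PMU since: full PMC/PMD reload */
		ctx->loaded_full = 1;
	}
	cpu->activation++;                        /* INC_ACTIVATION() */
	ctx->last_activation = cpu->activation;   /* SET_ACTIVATION() */
	ctx->last_cpu = cpu_id;                   /* SET_LAST_CPU() */
}

int main(void)
{
	struct cpu_pmu cpu0 = { 0 };
	struct context a = { -1, 0, 0 }, b = { -1, 0, 0 };

	load_context(&a, &cpu0, 0);   /* first load: full reload */
	printf("A full=%d\n", a.loaded_full);     /* 1 */

	load_context(&a, &cpu0, 0);   /* reloaded with nothing in between: short path */
	printf("A full=%d\n", a.loaded_full);     /* 0 */

	load_context(&b, &cpu0, 0);   /* another context uses the PMU */
	load_context(&a, &cpu0, 0);   /* activation moved on: full reload again */
	printf("A full=%d\n", a.loaded_full);     /* 1 */
	return 0;
}
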
6169#else /* !CONFIG_SMP */
6170/*
6171 * reload PMU state for UP kernels
6172 * in 2.5 we come here with interrupts disabled
6173 */
6174void
6175pfm_load_regs (struct task_struct *task)
6176{
6177 struct thread_struct *t;
6178 pfm_context_t *ctx;
6179 struct task_struct *owner;
6180 unsigned long pmd_mask, pmc_mask;
6181 u64 psr, psr_up;
6182 int need_irq_resend;
6183
6184 owner = GET_PMU_OWNER();
6185 ctx = PFM_GET_CTX(task);
6186 t = &task->thread;
6187 psr = pfm_get_psr();
6188
6189 BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP));
6190 BUG_ON(psr & IA64_PSR_I);
6191
6192 /*
6193 * we restore ALL the debug registers to avoid picking up
6194 * stale state.
6195 *
6196 * This must be done even when the task is still the owner
6197 * as the registers may have been modified via ptrace()
6198 * (not perfmon) by the previous task.
6199 */
6200 if (ctx->ctx_fl_using_dbreg) {
6201 pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs);
6202 pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs);
6203 }
6204
6205 /*
6206 * retrieve saved psr.up
6207 */
6208 psr_up = ctx->ctx_saved_psr_up;
6209 need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND;
6210
6211 /*
6212 * short path, our state is still there, just
6213 * need to restore psr and we go
6214 *
6215 * we touch neither PMC nor PMD. The psr is not touched
6216 * by the overflow handler, so we are safe w.r.t. interrupt
6217 * concurrency even without interrupt masking.
6218 */
6219 if (likely(owner == task)) {
6220 if (likely(psr_up)) pfm_set_psr_up();
6221 return;
6222 }
6223
6224 /*
6225 * someone else is still using the PMU, first push it out and
6226 * then we'll be able to install our stuff !
6227 *
6228 * Upon return, there will be no owner for the current PMU
6229 */
6230 if (owner) pfm_lazy_save_regs(owner);
6231
6232 /*
6233 * To avoid leaking information to the user level when psr.sp=0,
6234 * we must reload ALL implemented pmds (even the ones we don't use).
6235 * In the kernel we only allow PFM_READ_PMDS on registers which
6236 * we initialized or requested (sampling) so there is no risk there.
6237 */
6238 pmd_mask = pfm_sysctl.fastctxsw ? ctx->ctx_used_pmds[0] : ctx->ctx_all_pmds[0];
6239
6240 /*
6241 * ALL accessible PMCs are systematically reloaded, unused registers
6242 * get their default (from pfm_reset_pmu_state()) values to avoid picking
6243 * up stale configuration.
6244 *
6245 * PMC0 is never in the mask. It is always restored separately
6246 */
6247 pmc_mask = ctx->ctx_all_pmcs[0];
6248
6249 pfm_restore_pmds(t->pmds, pmd_mask);
6250 pfm_restore_pmcs(t->pmcs, pmc_mask);
6251
6252 /*
6253 * check for pending overflow at the time the state
6254 * was saved.
6255 */
6256 if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) {
6257 /*
6258 * reload pmc0 with the overflow information
6259 * On McKinley PMU, this will trigger a PMU interrupt
6260 */
6261 ia64_set_pmc(0, t->pmcs[0]);
6262 ia64_srlz_d();
6263
6264 t->pmcs[0] = 0UL;
6265
6266 /*
6267 * will replay the PMU interrupt
6268 */
6269 if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR);
6270
6271 pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++;
6272 }
6273
6274 /*
6275 * establish new ownership.
6276 */
6277 SET_PMU_OWNER(task, ctx);
6278
6279 /*
6280 * restore the psr.up bit. measurement
6281 * is active again.
6282 * no PMU interrupt can happen at this point
6283 * because we still have interrupts disabled.
6284 */
6285 if (likely(psr_up)) pfm_set_psr_up();
6286}
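
On UP the state of a descheduled monitored task is left live in the PMU and only pushed out lazily: pfm_load_regs() first checks whether the incoming task still owns the PMU (short path: restore psr.up only), and otherwise calls pfm_lazy_save_regs() on the previous owner before installing its own registers. A small sketch of that owner-handoff logic, with hypothetical names and a single counter standing in for the whole register file:

/* Sketch of the UP lazy-save ownership handoff shown above. */
#include <stdio.h>

struct task {
	const char   *name;
	unsigned long saved_pmd;         /* thread copy, like task->thread.pmds[] */
};

static struct task *pmu_owner;           /* like GET_PMU_OWNER()/SET_PMU_OWNER() */
static unsigned long hw_pmd;             /* the live hardware counter */

static void lazy_save(struct task *t)
{
	t->saved_pmd = hw_pmd;           /* push the previous owner's state out */
	pmu_owner = NULL;
	printf("lazy save of %s (pmd=%lu)\n", t->name, t->saved_pmd);
}

static void load_regs(struct task *t)
{
	if (pmu_owner == t) {
		/* short path: our state is still in the hardware */
		printf("%s: short path, nothing to reload\n", t->name);
		return;
	}
	if (pmu_owner)
		lazy_save(pmu_owner);
	hw_pmd = t->saved_pmd;           /* full reload of our own state */
	pmu_owner = t;
	printf("%s: full reload (pmd=%lu)\n", t->name, hw_pmd);
}

int main(void)
{
	struct task a = { "A", 100 }, b = { "B", 200 };

	load_regs(&a);       /* full reload */
	hw_pmd += 5;         /* A runs, the counter advances */
	load_regs(&a);       /* A rescheduled, still owner: short path */
	load_regs(&b);       /* B needs the PMU: A is lazily saved first */
	load_regs(&a);       /* A returns: full reload of pmd=105 */
	return 0;
}
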
6287#endif /* CONFIG_SMP */
6288
6289/*
6290 * this function assumes monitoring is stopped
6291 */
6292static void
6293pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx)
6294{
6295 u64 pmc0;
6296 unsigned long mask2, val, pmd_val, ovfl_val;
6297 int i, can_access_pmu = 0;
6298 int is_self;
6299
6300 /*
6301 * is the caller the task being monitored (or which initiated the
6302 * session for system wide measurements)
6303 */
6304 is_self = ctx->ctx_task == task ? 1 : 0;
6305
6306 /*
6307 * can access PMU if task is the owner of the PMU state on the current CPU
6308 * or if we are running on the CPU bound to the context in system-wide mode
6309 * (that is not necessarily the task the context is attached to in this mode).
6310 * In system-wide we always have can_access_pmu true because a task running on an
6311 * invalid processor is flagged earlier in the call stack (see pfm_stop).
6312 */
6313 can_access_pmu = (GET_PMU_OWNER() == task) || (ctx->ctx_fl_system && ctx->ctx_cpu == smp_processor_id());
6314 if (can_access_pmu) {
6315 /*
6316 * Mark the PMU as not owned
6317 * This will cause the interrupt handler to do nothing in case an overflow
6318 * interrupt was in-flight
6319 * This also guarantees that pmc0 will contain the final state
6320 * It virtually gives us full control on overflow processing from that point
6321 * on.
6322 */
6323 SET_PMU_OWNER(NULL, NULL);
6324 DPRINT(("releasing ownership\n"));
6325
6326 /*
6327 * read current overflow status:
6328 *
6329 * we are guaranteed to read the final stable state
6330 */
6331 ia64_srlz_d();
6332 pmc0 = ia64_get_pmc(0); /* slow */
6333
6334 /*
6335 * reset freeze bit, overflow status information destroyed
6336 */
6337 pfm_unfreeze_pmu();
6338 } else {
6339 pmc0 = task->thread.pmcs[0];
6340 /*
6341 * clear whatever overflow status bits there were
6342 */
6343 task->thread.pmcs[0] = 0;
6344 }
6345 ovfl_val = pmu_conf->ovfl_val;
6346 /*
6347 * we save all the used pmds
6348 * we take care of overflows for counting PMDs
6349 *
6350 * XXX: sampling situation is not taken into account here
6351 */
6352 mask2 = ctx->ctx_used_pmds[0];
6353
6354 DPRINT(("is_self=%d ovfl_val=0x%lx mask2=0x%lx\n", is_self, ovfl_val, mask2));
6355
6356 for (i = 0; mask2; i++, mask2>>=1) {
6357
6358 /* skip non used pmds */
6359 if ((mask2 & 0x1) == 0) continue;
6360
6361 /*
6362 * can access PMU always true in system wide mode
6363 */
6364 val = pmd_val = can_access_pmu ? ia64_get_pmd(i) : task->thread.pmds[i];
6365
6366 if (PMD_IS_COUNTING(i)) {
6367 DPRINT(("[%d] pmd[%d] ctx_pmd=0x%lx hw_pmd=0x%lx\n",
6368 task->pid,
6369 i,
6370 ctx->ctx_pmds[i].val,
6371 val & ovfl_val));
6372
6373 /*
6374 * we rebuild the full 64 bit value of the counter
6375 */
6376 val = ctx->ctx_pmds[i].val + (val & ovfl_val);
6377
6378 /*
6379 * now everything is in ctx_pmds[] and we need
6380 * to clear the saved context from save_regs() such that
6381 * pfm_read_pmds() gets the correct value
6382 */
6383 pmd_val = 0UL;
6384
6385 /*
6386 * take care of overflow inline
6387 */
6388 if (pmc0 & (1UL << i)) {
6389 val += 1 + ovfl_val;
6390 DPRINT(("[%d] pmd[%d] overflowed\n", task->pid, i));
6391 }
6392 }
6393
6394 DPRINT(("[%d] ctx_pmd[%d]=0x%lx pmd_val=0x%lx\n", task->pid, i, val, pmd_val));
6395
6396 if (is_self) task->thread.pmds[i] = pmd_val;
6397
6398 ctx->ctx_pmds[i].val = val;
6399 }
6400}
6401
6402static struct irqaction perfmon_irqaction = {
6403 .handler = pfm_interrupt_handler,
6404 .flags = SA_INTERRUPT,
6405 .name = "perfmon"
6406};
6407
6408/*
6409 * perfmon initialization routine, called from the initcall() table
6410 */
6411static int init_pfm_fs(void);
6412
6413static int __init
6414pfm_probe_pmu(void)
6415{
6416 pmu_config_t **p;
6417 int family;
6418
6419 family = local_cpu_data->family;
6420 p = pmu_confs;
6421
6422 while(*p) {
6423 if ((*p)->probe) {
6424 if ((*p)->probe() == 0) goto found;
6425 } else if ((*p)->pmu_family == family || (*p)->pmu_family == 0xff) {
6426 goto found;
6427 }
6428 p++;
6429 }
6430 return -1;
6431found:
6432 pmu_conf = *p;
6433 return 0;
6434}
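
pfm_probe_pmu() walks a NULL-terminated table of PMU descriptions and picks the first entry whose probe() callback succeeds or, when no probe() is provided, whose pmu_family matches the CPU (0xff acting as a wildcard). The stand-alone sketch below reproduces that selection logic; the table entries and family numbers are invented examples.

/* Sketch of the NULL-terminated, probe-or-family-match table walk. */
#include <stdio.h>

struct pmu_desc {
	const char  *name;
	unsigned int family;            /* 0xff acts as a wildcard */
	int        (*probe)(void);      /* optional, overrides family match */
};

static int probe_fails(void) { return -1; }

static struct pmu_desc itanium  = { "itanium-like",  0x07, NULL };
static struct pmu_desc mckinley = { "mckinley-like", 0x1f, probe_fails };
static struct pmu_desc generic  = { "generic",       0xff, NULL };

static struct pmu_desc *pmu_table[] = { &mckinley, &itanium, &generic, NULL };

static struct pmu_desc *probe_pmu(unsigned int cpu_family)
{
	struct pmu_desc **p;

	for (p = pmu_table; *p; p++) {
		if ((*p)->probe) {
			if ((*p)->probe() == 0)
				return *p;              /* probe wins outright */
		} else if ((*p)->family == cpu_family || (*p)->family == 0xff) {
			return *p;                      /* family match or wildcard */
		}
	}
	return NULL;
}

int main(void)
{
	printf("family 0x07 -> %s\n", probe_pmu(0x07)->name);
	printf("family 0x20 -> %s\n", probe_pmu(0x20)->name);   /* falls back to generic */
	return 0;
}
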
6435
6436static struct file_operations pfm_proc_fops = {
6437 .open = pfm_proc_open,
6438 .read = seq_read,
6439 .llseek = seq_lseek,
6440 .release = seq_release,
6441};
6442
6443int __init
6444pfm_init(void)
6445{
6446 unsigned int n, n_counters, i;
6447
6448 printk("perfmon: version %u.%u IRQ %u\n",
6449 PFM_VERSION_MAJ,
6450 PFM_VERSION_MIN,
6451 IA64_PERFMON_VECTOR);
6452
6453 if (pfm_probe_pmu()) {
6454 printk(KERN_INFO "perfmon: disabled, there is no support for processor family %d\n",
6455 local_cpu_data->family);
6456 return -ENODEV;
6457 }
6458
6459 /*
6460 * compute the number of implemented PMD/PMC from the
6461 * description tables
6462 */
6463 n = 0;
6464 for (i=0; PMC_IS_LAST(i) == 0; i++) {
6465 if (PMC_IS_IMPL(i) == 0) continue;
6466 pmu_conf->impl_pmcs[i>>6] |= 1UL << (i&63);
6467 n++;
6468 }
6469 pmu_conf->num_pmcs = n;
6470
6471 n = 0; n_counters = 0;
6472 for (i=0; PMD_IS_LAST(i) == 0; i++) {
6473 if (PMD_IS_IMPL(i) == 0) continue;
6474 pmu_conf->impl_pmds[i>>6] |= 1UL << (i&63);
6475 n++;
6476 if (PMD_IS_COUNTING(i)) n_counters++;
6477 }
6478 pmu_conf->num_pmds = n;
6479 pmu_conf->num_counters = n_counters;
6480
6481 /*
6482 * sanity checks on the number of debug registers
6483 */
6484 if (pmu_conf->use_rr_dbregs) {
6485 if (pmu_conf->num_ibrs > IA64_NUM_DBG_REGS) {
6486 printk(KERN_INFO "perfmon: unsupported number of code debug registers (%u)\n", pmu_conf->num_ibrs);
6487 pmu_conf = NULL;
6488 return -1;
6489 }
6490 if (pmu_conf->num_dbrs > IA64_NUM_DBG_REGS) {
6491			printk(KERN_INFO "perfmon: unsupported number of data debug registers (%u)\n", pmu_conf->num_dbrs);
6492 pmu_conf = NULL;
6493 return -1;
6494 }
6495 }
6496
6497 printk("perfmon: %s PMU detected, %u PMCs, %u PMDs, %u counters (%lu bits)\n",
6498 pmu_conf->pmu_name,
6499 pmu_conf->num_pmcs,
6500 pmu_conf->num_pmds,
6501 pmu_conf->num_counters,
6502 ffz(pmu_conf->ovfl_val));
6503
6504 /* sanity check */
6505 if (pmu_conf->num_pmds >= IA64_NUM_PMD_REGS || pmu_conf->num_pmcs >= IA64_NUM_PMC_REGS) {
6506 printk(KERN_ERR "perfmon: not enough pmc/pmd, perfmon disabled\n");
6507 pmu_conf = NULL;
6508 return -1;
6509 }
6510
6511 /*
6512 * create /proc/perfmon (mostly for debugging purposes)
6513 */
6514 perfmon_dir = create_proc_entry("perfmon", S_IRUGO, NULL);
6515 if (perfmon_dir == NULL) {
6516 printk(KERN_ERR "perfmon: cannot create /proc entry, perfmon disabled\n");
6517 pmu_conf = NULL;
6518 return -1;
6519 }
6520 /*
6521 * install customized file operations for /proc/perfmon entry
6522 */
6523 perfmon_dir->proc_fops = &pfm_proc_fops;
6524
6525 /*
6526 * create /proc/sys/kernel/perfmon (for debugging purposes)
6527 */
6528 pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root, 0);
6529
6530 /*
6531 * initialize all our spinlocks
6532 */
6533 spin_lock_init(&pfm_sessions.pfs_lock);
6534 spin_lock_init(&pfm_buffer_fmt_lock);
6535
6536 init_pfm_fs();
6537
6538 for(i=0; i < NR_CPUS; i++) pfm_stats[i].pfm_ovfl_intr_cycles_min = ~0UL;
6539
6540 return 0;
6541}
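
pfm_init() records which PMCs and PMDs are implemented in multi-word bitmaps using the usual 64-bit indexing idiom: word i>>6, bit i&63. A tiny self-contained example of setting and testing bits that way; the 256-bit size is arbitrary.

/* The word/bit split used for impl_pmcs[]/impl_pmds[] above:
 * word index = i >> 6, bit index = i & 63. */
#include <stdio.h>

#define NBITS 256

static unsigned long bitmap[NBITS / 64];

static void set_bit64(unsigned int i)  { bitmap[i >> 6] |= 1UL << (i & 63); }
static int  test_bit64(unsigned int i) { return (bitmap[i >> 6] >> (i & 63)) & 1; }

int main(void)
{
	set_bit64(4);
	set_bit64(70);     /* lands in the second 64-bit word */

	printf("bit 4=%d bit 70=%d bit 71=%d\n",
	       test_bit64(4), test_bit64(70), test_bit64(71));
	printf("word0=0x%lx word1=0x%lx\n", bitmap[0], bitmap[1]);
	return 0;
}
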
6542
6543__initcall(pfm_init);
6544
6545/*
6546 * this function is called before pfm_init()
6547 */
6548void
6549pfm_init_percpu (void)
6550{
6551 /*
6552 * make sure no measurement is active
6553 * (may inherit programmed PMCs from EFI).
6554 */
6555 pfm_clear_psr_pp();
6556 pfm_clear_psr_up();
6557
6558 /*
6559 * we run with the PMU not frozen at all times
6560 */
6561 pfm_unfreeze_pmu();
6562
6563 if (smp_processor_id() == 0)
6564 register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction);
6565
6566 ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR);
6567 ia64_srlz_d();
6568}
6569
6570/*
6571 * used for debug purposes only
6572 */
6573void
6574dump_pmu_state(const char *from)
6575{
6576 struct task_struct *task;
6577 struct thread_struct *t;
6578 struct pt_regs *regs;
6579 pfm_context_t *ctx;
6580 unsigned long psr, dcr, info, flags;
6581 int i, this_cpu;
6582
6583 local_irq_save(flags);
6584
6585 this_cpu = smp_processor_id();
6586 regs = ia64_task_regs(current);
6587 info = PFM_CPUINFO_GET();
6588 dcr = ia64_getreg(_IA64_REG_CR_DCR);
6589
6590 if (info == 0 && ia64_psr(regs)->pp == 0 && (dcr & IA64_DCR_PP) == 0) {
6591 local_irq_restore(flags);
6592 return;
6593 }
6594
6595 printk("CPU%d from %s() current [%d] iip=0x%lx %s\n",
6596 this_cpu,
6597 from,
6598 current->pid,
6599 regs->cr_iip,
6600 current->comm);
6601
6602 task = GET_PMU_OWNER();
6603 ctx = GET_PMU_CTX();
6604
6605 printk("->CPU%d owner [%d] ctx=%p\n", this_cpu, task ? task->pid : -1, ctx);
6606
6607 psr = pfm_get_psr();
6608
6609 printk("->CPU%d pmc0=0x%lx psr.pp=%d psr.up=%d dcr.pp=%d syst_info=0x%lx user_psr.up=%d user_psr.pp=%d\n",
6610 this_cpu,
6611 ia64_get_pmc(0),
6612 psr & IA64_PSR_PP ? 1 : 0,
6613 psr & IA64_PSR_UP ? 1 : 0,
6614 dcr & IA64_DCR_PP ? 1 : 0,
6615 info,
6616 ia64_psr(regs)->up,
6617 ia64_psr(regs)->pp);
6618
6619 ia64_psr(regs)->up = 0;
6620 ia64_psr(regs)->pp = 0;
6621
6622 t = &current->thread;
6623
6624 for (i=1; PMC_IS_LAST(i) == 0; i++) {
6625 if (PMC_IS_IMPL(i) == 0) continue;
6626 printk("->CPU%d pmc[%d]=0x%lx thread_pmc[%d]=0x%lx\n", this_cpu, i, ia64_get_pmc(i), i, t->pmcs[i]);
6627 }
6628
6629 for (i=1; PMD_IS_LAST(i) == 0; i++) {
6630 if (PMD_IS_IMPL(i) == 0) continue;
6631 printk("->CPU%d pmd[%d]=0x%lx thread_pmd[%d]=0x%lx\n", this_cpu, i, ia64_get_pmd(i), i, t->pmds[i]);
6632 }
6633
6634 if (ctx) {
6635		printk("->CPU%d ctx_state=%d vaddr=%p addr=%p msgq_head=%d msgq_tail=%d saved_psr_up=0x%lx\n",
6636 this_cpu,
6637 ctx->ctx_state,
6638 ctx->ctx_smpl_vaddr,
6639 ctx->ctx_smpl_hdr,
6640 ctx->ctx_msgq_head,
6641 ctx->ctx_msgq_tail,
6642 ctx->ctx_saved_psr_up);
6643 }
6644 local_irq_restore(flags);
6645}
6646
6647/*
6648 * called from process.c:copy_thread(). task is new child.
6649 */
6650void
6651pfm_inherit(struct task_struct *task, struct pt_regs *regs)
6652{
6653 struct thread_struct *thread;
6654
6655 DPRINT(("perfmon: pfm_inherit clearing state for [%d]\n", task->pid));
6656
6657 thread = &task->thread;
6658
6659 /*
6660 * cut links inherited from parent (current)
6661 */
6662 thread->pfm_context = NULL;
6663
6664 PFM_SET_WORK_PENDING(task, 0);
6665
6666 /*
6667 * the psr bits are already set properly in copy_thread()
6668 */
6669}
6670#else /* !CONFIG_PERFMON */
6671asmlinkage long
6672sys_perfmonctl (int fd, int cmd, void *arg, int count)
6673{
6674 return -ENOSYS;
6675}
6676#endif /* CONFIG_PERFMON */
diff --git a/arch/ia64/kernel/perfmon_default_smpl.c b/arch/ia64/kernel/perfmon_default_smpl.c
new file mode 100644
index 000000000000..965d29004555
--- /dev/null
+++ b/arch/ia64/kernel/perfmon_default_smpl.c
@@ -0,0 +1,306 @@
1/*
2 * Copyright (C) 2002-2003 Hewlett-Packard Co
3 * Stephane Eranian <eranian@hpl.hp.com>
4 *
5 * This file implements the default sampling buffer format
6 * for the Linux/ia64 perfmon-2 subsystem.
7 */
8#include <linux/kernel.h>
9#include <linux/types.h>
10#include <linux/module.h>
11#include <linux/config.h>
12#include <linux/init.h>
13#include <asm/delay.h>
14#include <linux/smp.h>
15
16#include <asm/perfmon.h>
17#include <asm/perfmon_default_smpl.h>
18
19MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
20MODULE_DESCRIPTION("perfmon default sampling format");
21MODULE_LICENSE("GPL");
22
23MODULE_PARM(debug, "i");
24MODULE_PARM_DESC(debug, "debug");
25
26MODULE_PARM(debug_ovfl, "i");
27MODULE_PARM_DESC(debug_ovfl, "debug ovfl");
28
29
30#define DEFAULT_DEBUG 1
31
32#ifdef DEFAULT_DEBUG
33#define DPRINT(a) \
34 do { \
35 if (unlikely(debug >0)) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \
36 } while (0)
37
38#define DPRINT_ovfl(a) \
39 do { \
40 if (unlikely(debug_ovfl >0)) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \
41 } while (0)
42
43#else
44#define DPRINT(a)
45#define DPRINT_ovfl(a)
46#endif
47
48static int debug, debug_ovfl;
49
50static int
51default_validate(struct task_struct *task, unsigned int flags, int cpu, void *data)
52{
53 pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t*)data;
54 int ret = 0;
55
56 if (data == NULL) {
57 DPRINT(("[%d] no argument passed\n", task->pid));
58 return -EINVAL;
59 }
60
61 DPRINT(("[%d] validate flags=0x%x CPU%d\n", task->pid, flags, cpu));
62
63 /*
64 * must hold at least the buffer header + one minimally sized entry
65 */
66 if (arg->buf_size < PFM_DEFAULT_SMPL_MIN_BUF_SIZE) return -EINVAL;
67
68 DPRINT(("buf_size=%lu\n", arg->buf_size));
69
70 return ret;
71}
72
73static int
74default_get_size(struct task_struct *task, unsigned int flags, int cpu, void *data, unsigned long *size)
75{
76 pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t *)data;
77
78 /*
79 * size has been validated in default_validate
80 */
81 *size = arg->buf_size;
82
83 return 0;
84}
85
86static int
87default_init(struct task_struct *task, void *buf, unsigned int flags, int cpu, void *data)
88{
89 pfm_default_smpl_hdr_t *hdr;
90 pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t *)data;
91
92 hdr = (pfm_default_smpl_hdr_t *)buf;
93
94 hdr->hdr_version = PFM_DEFAULT_SMPL_VERSION;
95 hdr->hdr_buf_size = arg->buf_size;
96 hdr->hdr_cur_offs = sizeof(*hdr);
97 hdr->hdr_overflows = 0UL;
98 hdr->hdr_count = 0UL;
99
100 DPRINT(("[%d] buffer=%p buf_size=%lu hdr_size=%lu hdr_version=%u cur_offs=%lu\n",
101 task->pid,
102 buf,
103 hdr->hdr_buf_size,
104 sizeof(*hdr),
105 hdr->hdr_version,
106 hdr->hdr_cur_offs));
107
108 return 0;
109}
110
111static int
112default_handler(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, struct pt_regs *regs, unsigned long stamp)
113{
114 pfm_default_smpl_hdr_t *hdr;
115 pfm_default_smpl_entry_t *ent;
116 void *cur, *last;
117 unsigned long *e, entry_size;
118 unsigned int npmds, i;
119 unsigned char ovfl_pmd;
120 unsigned char ovfl_notify;
121
122 if (unlikely(buf == NULL || arg == NULL|| regs == NULL || task == NULL)) {
123 DPRINT(("[%d] invalid arguments buf=%p arg=%p\n", task->pid, buf, arg));
124 return -EINVAL;
125 }
126
127 hdr = (pfm_default_smpl_hdr_t *)buf;
128 cur = buf+hdr->hdr_cur_offs;
129 last = buf+hdr->hdr_buf_size;
130 ovfl_pmd = arg->ovfl_pmd;
131 ovfl_notify = arg->ovfl_notify;
132
133 /*
134 * precheck for sanity
135 */
136 if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full;
137
138 npmds = hweight64(arg->smpl_pmds[0]);
139
140 ent = (pfm_default_smpl_entry_t *)cur;
141
142 prefetch(arg->smpl_pmds_values);
143
144 entry_size = sizeof(*ent) + (npmds << 3);
145
146 /* position for first pmd */
147 e = (unsigned long *)(ent+1);
148
149 hdr->hdr_count++;
150
151 DPRINT_ovfl(("[%d] count=%lu cur=%p last=%p free_bytes=%lu ovfl_pmd=%d ovfl_notify=%d npmds=%u\n",
152 task->pid,
153 hdr->hdr_count,
154 cur, last,
155 last-cur,
156 ovfl_pmd,
157 ovfl_notify, npmds));
158
159 /*
160 * current = task running at the time of the overflow.
161 *
162 * per-task mode:
163 * - this is usually the task being monitored.
164 * Under certain conditions, it might be a different task
165 *
166 * system-wide:
167 * - this is not necessarily the task controlling the session
168 */
169 ent->pid = current->pid;
170 ent->ovfl_pmd = ovfl_pmd;
171 ent->last_reset_val = arg->pmd_last_reset; //pmd[0].reg_last_reset_val;
172
173 /*
174 * where did the fault happen (includes slot number)
175 */
176 ent->ip = regs->cr_iip | ((regs->cr_ipsr >> 41) & 0x3);
177
178 ent->tstamp = stamp;
179 ent->cpu = smp_processor_id();
180 ent->set = arg->active_set;
181 ent->tgid = current->tgid;
182
183 /*
184 * selectively store PMDs in increasing index number
185 */
186 if (npmds) {
187 unsigned long *val = arg->smpl_pmds_values;
188 for(i=0; i < npmds; i++) {
189 *e++ = *val++;
190 }
191 }
192
193 /*
194 * update position for next entry
195 */
196 hdr->hdr_cur_offs += entry_size;
197 cur += entry_size;
198
199 /*
200 * post check to avoid losing the last sample
201 */
202 if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full;
203
204 /*
205 * keep same ovfl_pmds, ovfl_notify
206 */
207 arg->ovfl_ctrl.bits.notify_user = 0;
208 arg->ovfl_ctrl.bits.block_task = 0;
209 arg->ovfl_ctrl.bits.mask_monitoring = 0;
210 arg->ovfl_ctrl.bits.reset_ovfl_pmds = 1; /* reset before returning from interrupt handler */
211
212 return 0;
213full:
214 DPRINT_ovfl(("sampling buffer full free=%lu, count=%lu, ovfl_notify=%d\n", last-cur, hdr->hdr_count, ovfl_notify));
215
216 /*
217 * increment the number of buffer overflows.
218 * important to detect duplicate sets of samples.
219 */
220 hdr->hdr_overflows++;
221
222 /*
223 * if no notification requested, then we saturate the buffer
224 */
225 if (ovfl_notify == 0) {
226 arg->ovfl_ctrl.bits.notify_user = 0;
227 arg->ovfl_ctrl.bits.block_task = 0;
228 arg->ovfl_ctrl.bits.mask_monitoring = 1;
229 arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0;
230 } else {
231 arg->ovfl_ctrl.bits.notify_user = 1;
232 arg->ovfl_ctrl.bits.block_task = 1; /* ignored for non-blocking context */
233 arg->ovfl_ctrl.bits.mask_monitoring = 1;
234 arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0; /* no reset now */
235 }
236 return -1; /* we are full, sorry */
237}
238
239static int
240default_restart(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs)
241{
242 pfm_default_smpl_hdr_t *hdr;
243
244 hdr = (pfm_default_smpl_hdr_t *)buf;
245
246 hdr->hdr_count = 0UL;
247 hdr->hdr_cur_offs = sizeof(*hdr);
248
249 ctrl->bits.mask_monitoring = 0;
250 ctrl->bits.reset_ovfl_pmds = 1; /* uses long-reset values */
251
252 return 0;
253}
254
255static int
256default_exit(struct task_struct *task, void *buf, struct pt_regs *regs)
257{
258 DPRINT(("[%d] exit(%p)\n", task->pid, buf));
259 return 0;
260}
261
262static pfm_buffer_fmt_t default_fmt={
263 .fmt_name = "default_format",
264 .fmt_uuid = PFM_DEFAULT_SMPL_UUID,
265 .fmt_arg_size = sizeof(pfm_default_smpl_arg_t),
266 .fmt_validate = default_validate,
267 .fmt_getsize = default_get_size,
268 .fmt_init = default_init,
269 .fmt_handler = default_handler,
270 .fmt_restart = default_restart,
271 .fmt_restart_active = default_restart,
272 .fmt_exit = default_exit,
273};
274
275static int __init
276pfm_default_smpl_init_module(void)
277{
278 int ret;
279
280 ret = pfm_register_buffer_fmt(&default_fmt);
281 if (ret == 0) {
282 printk("perfmon_default_smpl: %s v%u.%u registered\n",
283 default_fmt.fmt_name,
284 PFM_DEFAULT_SMPL_VERSION_MAJ,
285 PFM_DEFAULT_SMPL_VERSION_MIN);
286 } else {
287 printk("perfmon_default_smpl: %s cannot register ret=%d\n",
288 default_fmt.fmt_name,
289 ret);
290 }
291
292 return ret;
293}
294
295static void __exit
296pfm_default_smpl_cleanup_module(void)
297{
298 int ret;
299 ret = pfm_unregister_buffer_fmt(default_fmt.fmt_uuid);
300
301 printk("perfmon_default_smpl: unregister %s=%d\n", default_fmt.fmt_name, ret);
302}
303
304module_init(pfm_default_smpl_init_module);
305module_exit(pfm_default_smpl_cleanup_module);
306
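As a reading aid for default_handler() above: each sample is a pfm_default_smpl_entry_t followed by one 8-byte value per PMD selected in smpl_pmds[0], and the buffer is treated as full as soon as fewer than PFM_DEFAULT_MAX_ENTRY_SIZE bytes remain past hdr_cur_offs. The sketch below only mirrors that bookkeeping with stand-in types and names; it is not part of the patch.

#include <stddef.h>

struct sketch_hdr { unsigned long buf_size, cur_offs, count, overflows; };

/* bytes consumed by one sample carrying 'npmds' extra 8-byte PMD values */
static size_t sketch_entry_size(size_t fixed_entry_size, unsigned int npmds)
{
	return fixed_entry_size + ((size_t)npmds << 3);	/* npmds * 8 */
}

/* mirrors the pre/post check in default_handler(): room left for a worst-case entry? */
static int sketch_has_room(const struct sketch_hdr *hdr, size_t max_entry_size)
{
	return (hdr->buf_size - hdr->cur_offs) >= max_entry_size;
}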
diff --git a/arch/ia64/kernel/perfmon_generic.h b/arch/ia64/kernel/perfmon_generic.h
new file mode 100644
index 000000000000..67489478041e
--- /dev/null
+++ b/arch/ia64/kernel/perfmon_generic.h
@@ -0,0 +1,45 @@
1/*
2 * This file contains the generic PMU register description tables
3 * and pmc checker used by perfmon.c.
4 *
5 * Copyright (C) 2002-2003 Hewlett Packard Co
6 * Stephane Eranian <eranian@hpl.hp.com>
7 */
8
9static pfm_reg_desc_t pfm_gen_pmc_desc[PMU_MAX_PMCS]={
10/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
11/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
12/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
13/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
14/* pmc4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
15/* pmc5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
16/* pmc6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
17/* pmc7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
18 { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
19};
20
21static pfm_reg_desc_t pfm_gen_pmd_desc[PMU_MAX_PMDS]={
22/* pmd0 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}},
23/* pmd1 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}},
24/* pmd2 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}},
25/* pmd3 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}},
26/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}},
27/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}},
28/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}},
29/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}},
30 { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
31};
32
33/*
34 * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
35 */
36static pmu_config_t pmu_conf_gen={
37 .pmu_name = "Generic",
38 .pmu_family = 0xff, /* any */
39 .ovfl_val = (1UL << 32) - 1,
40	.num_ibrs          = 0, /* not used */
41	.num_dbrs          = 0, /* not used */
42 .pmd_desc = pfm_gen_pmd_desc,
43 .pmc_desc = pfm_gen_pmc_desc
44};
45
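The dependency words in these register tables are built by OR-ing RDEP() bits: in the generic table above, pmc4 lists pmd4 as its dependent register and pmd4 lists pmc4 in return. RDEP() itself is defined elsewhere in perfmon; the snippet below assumes it is a one-bit mask per register index (roughly (1UL << n)), which is an assumption made here for illustration only.

#include <stdio.h>

#define RDEP(x) (1UL << (x))	/* assumed definition, for illustration only */

int main(void)
{
	/* generic table above: pmc4 depends on pmd4, and vice versa */
	unsigned long pmc4_deps = RDEP(4);		/* 0x10 */
	unsigned long multi     = RDEP(8) | RDEP(9);	/* wider words just OR several RDEPs */

	printf("0x%lx 0x%lx\n", pmc4_deps, multi);	/* prints 0x10 0x300 */
	return 0;
}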
diff --git a/arch/ia64/kernel/perfmon_itanium.h b/arch/ia64/kernel/perfmon_itanium.h
new file mode 100644
index 000000000000..d1d508a0fbd3
--- /dev/null
+++ b/arch/ia64/kernel/perfmon_itanium.h
@@ -0,0 +1,115 @@
1/*
2 * This file contains the Itanium PMU register description tables
3 * and pmc checker used by perfmon.c.
4 *
5 * Copyright (C) 2002-2003 Hewlett Packard Co
6 * Stephane Eranian <eranian@hpl.hp.com>
7 */
8static int pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
9
10static pfm_reg_desc_t pfm_ita_pmc_desc[PMU_MAX_PMCS]={
11/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
12/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
13/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
14/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
15/* pmc4 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
16/* pmc5 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
17/* pmc6 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
18/* pmc7 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
19/* pmc8 */ { PFM_REG_CONFIG , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
20/* pmc9 */ { PFM_REG_CONFIG , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
21/* pmc10 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
22/* pmc11 */ { PFM_REG_MONITOR , 6, 0x0000000010000000UL, -1UL, NULL, pfm_ita_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
23/* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
24/* pmc13 */ { PFM_REG_CONFIG , 0, 0x0003ffff00000001UL, -1UL, NULL, pfm_ita_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
25 { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
26};
27
28static pfm_reg_desc_t pfm_ita_pmd_desc[PMU_MAX_PMDS]={
29/* pmd0 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}},
30/* pmd1 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}},
31/* pmd2 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
32/* pmd3 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
33/* pmd4 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}},
34/* pmd5 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}},
35/* pmd6 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}},
36/* pmd7 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}},
37/* pmd8 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
38/* pmd9 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
39/* pmd10 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
40/* pmd11 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
41/* pmd12 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
42/* pmd13 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
43/* pmd14 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
44/* pmd15 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
45/* pmd16 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
46/* pmd17 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
47 { PFM_REG_END , 0, 0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
48};
49
50static int
51pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs)
52{
53 int ret;
54 int is_loaded;
55
56	/* sanity check */
57 if (ctx == NULL) return -EINVAL;
58
59 is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED;
60
61 /*
62 * we must clear the (instruction) debug registers if pmc13.ta bit is cleared
63 * before they are written (fl_using_dbreg==0) to avoid picking up stale information.
64 */
65 if (cnum == 13 && is_loaded && ((*val & 0x1) == 0UL) && ctx->ctx_fl_using_dbreg == 0) {
66
67 DPRINT(("pmc[%d]=0x%lx has active pmc13.ta cleared, clearing ibr\n", cnum, *val));
68
69 /* don't mix debug with perfmon */
70 if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
71
72 /*
73 * a count of 0 will mark the debug registers as in use and also
74 * ensure that they are properly cleared.
75 */
76 ret = pfm_write_ibr_dbr(1, ctx, NULL, 0, regs);
77 if (ret) return ret;
78 }
79
80 /*
81 * we must clear the (data) debug registers if pmc11.pt bit is cleared
82 * before they are written (fl_using_dbreg==0) to avoid picking up stale information.
83 */
84	if (cnum == 11 && is_loaded && ((*val >> 28) & 0x1) == 0 && ctx->ctx_fl_using_dbreg == 0) {
85
86 DPRINT(("pmc[%d]=0x%lx has active pmc11.pt cleared, clearing dbr\n", cnum, *val));
87
88 /* don't mix debug with perfmon */
89 if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
90
91 /*
92 * a count of 0 will mark the debug registers as in use and also
93 * ensure that they are properly cleared.
94 */
95 ret = pfm_write_ibr_dbr(0, ctx, NULL, 0, regs);
96 if (ret) return ret;
97 }
98 return 0;
99}
100
101/*
102 * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
103 */
104static pmu_config_t pmu_conf_ita={
105 .pmu_name = "Itanium",
106 .pmu_family = 0x7,
107 .ovfl_val = (1UL << 32) - 1,
108 .pmd_desc = pfm_ita_pmd_desc,
109 .pmc_desc = pfm_ita_pmc_desc,
110 .num_ibrs = 8,
111 .num_dbrs = 8,
112	.use_rr_dbregs = 1, /* debug registers are used for range restrictions */
113};
114
115
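pfm_ita_pmc_check() above keys its debug-register cleanup off two single-bit fields: pmc13.ta in bit 0 (*val & 0x1) and pmc11.pt in bit 28 ((*val >> 28) & 0x1). The helpers below merely name those tests, mirroring the expressions in the checker; they are hypothetical and not part of the patch.

static inline int ita_pmc13_ta(unsigned long pmc13) { return (int)(pmc13 & 0x1UL); }
static inline int ita_pmc11_pt(unsigned long pmc11) { return (int)((pmc11 >> 28) & 0x1UL); }

/*
 * e.g. the pmc13 branch in the checker fires when the context is loaded,
 * ita_pmc13_ta(*val) == 0 and ctx_fl_using_dbreg == 0.
 */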
diff --git a/arch/ia64/kernel/perfmon_mckinley.h b/arch/ia64/kernel/perfmon_mckinley.h
new file mode 100644
index 000000000000..9becccda2897
--- /dev/null
+++ b/arch/ia64/kernel/perfmon_mckinley.h
@@ -0,0 +1,187 @@
1/*
2 * This file contains the McKinley PMU register description tables
3 * and pmc checker used by perfmon.c.
4 *
5 * Copyright (C) 2002-2003 Hewlett Packard Co
6 * Stephane Eranian <eranian@hpl.hp.com>
7 */
8static int pfm_mck_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
9
10static pfm_reg_desc_t pfm_mck_pmc_desc[PMU_MAX_PMCS]={
11/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
12/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
13/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
14/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
15/* pmc4 */ { PFM_REG_COUNTING, 6, 0x0000000000800000UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
16/* pmc5 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
17/* pmc6 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
18/* pmc7 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
19/* pmc8 */ { PFM_REG_CONFIG , 0, 0xffffffff3fffffffUL, 0xffffffff3ffffffbUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
20/* pmc9 */ { PFM_REG_CONFIG , 0, 0xffffffff3ffffffcUL, 0xffffffff3ffffffbUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
21/* pmc10 */ { PFM_REG_MONITOR , 4, 0x0UL, 0xffffUL, NULL, pfm_mck_pmc_check, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
22/* pmc11 */ { PFM_REG_MONITOR , 6, 0x0UL, 0x30f01cf, NULL, pfm_mck_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
23/* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, 0xffffUL, NULL, pfm_mck_pmc_check, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
24/* pmc13 */ { PFM_REG_CONFIG , 0, 0x00002078fefefefeUL, 0x1e00018181818UL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
25/* pmc14 */ { PFM_REG_CONFIG , 0, 0x0db60db60db60db6UL, 0x2492UL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
26/* pmc15 */ { PFM_REG_CONFIG , 0, 0x00000000fffffff0UL, 0xfUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
27 { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
28};
29
30static pfm_reg_desc_t pfm_mck_pmd_desc[PMU_MAX_PMDS]={
31/* pmd0 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}},
32/* pmd1 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}},
33/* pmd2 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
34/* pmd3 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
35/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}},
36/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}},
37/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}},
38/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}},
39/* pmd8 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
40/* pmd9 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
41/* pmd10 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
42/* pmd11 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
43/* pmd12 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
44/* pmd13 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
45/* pmd14 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
46/* pmd15 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
47/* pmd16 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
48/* pmd17 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
49 { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
50};
51
52/*
53 * PMC reserved fields must have their power-up values preserved
54 */
55static int
56pfm_mck_reserved(unsigned int cnum, unsigned long *val, struct pt_regs *regs)
57{
58 unsigned long tmp1, tmp2, ival = *val;
59
60 /* remove reserved areas from user value */
61 tmp1 = ival & PMC_RSVD_MASK(cnum);
62
63 /* get reserved fields values */
64 tmp2 = PMC_DFL_VAL(cnum) & ~PMC_RSVD_MASK(cnum);
65
66 *val = tmp1 | tmp2;
67
68 DPRINT(("pmc[%d]=0x%lx, mask=0x%lx, reset=0x%lx, val=0x%lx\n",
69 cnum, ival, PMC_RSVD_MASK(cnum), PMC_DFL_VAL(cnum), *val));
70 return 0;
71}
72
73/*
74 * task can be NULL if the context is unloaded
75 */
76static int
77pfm_mck_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs)
78{
79 int ret = 0, check_case1 = 0;
80 unsigned long val8 = 0, val14 = 0, val13 = 0;
81 int is_loaded;
82
83 /* first preserve the reserved fields */
84 pfm_mck_reserved(cnum, val, regs);
85
86	/* sanity check */
87 if (ctx == NULL) return -EINVAL;
88
89 is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED;
90
91 /*
92 * we must clear the debug registers if pmc13 has a value which enables
93 * memory pipeline event constraints. In this case we need to clear
94 * the debug registers if they have not yet been accessed. This is required
95 * to avoid picking up stale state.
96 * PMC13 is "active" if:
97 * one of the pmc13.cfg_dbrpXX fields is different from 0x3
98 * AND
99 * the corresponding pmc13.ena_dbrpXX is set.
100 */
101 DPRINT(("cnum=%u val=0x%lx, using_dbreg=%d loaded=%d\n", cnum, *val, ctx->ctx_fl_using_dbreg, is_loaded));
102
103 if (cnum == 13 && is_loaded
104 && (*val & 0x1e00000000000UL) && (*val & 0x18181818UL) != 0x18181818UL && ctx->ctx_fl_using_dbreg == 0) {
105
106 DPRINT(("pmc[%d]=0x%lx has active pmc13 settings, clearing dbr\n", cnum, *val));
107
108 /* don't mix debug with perfmon */
109 if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
110
111 /*
112 * a count of 0 will mark the debug registers as in use and also
113 * ensure that they are properly cleared.
114 */
115 ret = pfm_write_ibr_dbr(PFM_DATA_RR, ctx, NULL, 0, regs);
116 if (ret) return ret;
117 }
118 /*
119 * we must clear the (instruction) debug registers if any pmc14.ibrpX bit is enabled
120 * before they have been written (fl_using_dbreg==0) to avoid picking up stale information.
121 */
122 if (cnum == 14 && is_loaded && ((*val & 0x2222UL) != 0x2222UL) && ctx->ctx_fl_using_dbreg == 0) {
123
124 DPRINT(("pmc[%d]=0x%lx has active pmc14 settings, clearing ibr\n", cnum, *val));
125
126 /* don't mix debug with perfmon */
127 if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
128
129 /*
130 * a count of 0 will mark the debug registers as in use and also
131 * ensure that they are properly cleared.
132 */
133 ret = pfm_write_ibr_dbr(PFM_CODE_RR, ctx, NULL, 0, regs);
134 if (ret) return ret;
135
136 }
137
138 switch(cnum) {
139 case 4: *val |= 1UL << 23; /* force power enable bit */
140 break;
141 case 8: val8 = *val;
142 val13 = ctx->ctx_pmcs[13];
143 val14 = ctx->ctx_pmcs[14];
144 check_case1 = 1;
145 break;
146 case 13: val8 = ctx->ctx_pmcs[8];
147 val13 = *val;
148 val14 = ctx->ctx_pmcs[14];
149 check_case1 = 1;
150 break;
151 case 14: val8 = ctx->ctx_pmcs[8];
152 val13 = ctx->ctx_pmcs[13];
153 val14 = *val;
154 check_case1 = 1;
155 break;
156 }
157 /* check illegal configuration which can produce inconsistencies in tagging
158 * i-side events in L1D and L2 caches
159 */
160 if (check_case1) {
161 ret = ((val13 >> 45) & 0xf) == 0
162 && ((val8 & 0x1) == 0)
163 && ((((val14>>1) & 0x3) == 0x2 || ((val14>>1) & 0x3) == 0x0)
164 ||(((val14>>4) & 0x3) == 0x2 || ((val14>>4) & 0x3) == 0x0));
165
166 if (ret) DPRINT((KERN_DEBUG "perfmon: failure check_case1\n"));
167 }
168
169 return ret ? -EINVAL : 0;
170}
171
172/*
173 * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
174 */
175static pmu_config_t pmu_conf_mck={
176 .pmu_name = "Itanium 2",
177 .pmu_family = 0x1f,
178 .flags = PFM_PMU_IRQ_RESEND,
179 .ovfl_val = (1UL << 47) - 1,
180 .pmd_desc = pfm_mck_pmd_desc,
181 .pmc_desc = pfm_mck_pmc_desc,
182 .num_ibrs = 8,
183 .num_dbrs = 8,
184	.use_rr_dbregs = 1 /* debug registers are used for range restrictions */
185};
186
187
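pfm_mck_reserved() above merges a user-supplied PMC value with the register's power-up default: bits set in PMC_RSVD_MASK() are taken from the caller, the remaining (reserved) bits keep PMC_DFL_VAL(). A standalone restatement of that arithmetic, with illustrative parameter names and constants rather than values from the table:

static unsigned long merge_reserved(unsigned long user_val,
				    unsigned long writable_mask, /* plays the role of PMC_RSVD_MASK() */
				    unsigned long default_val)   /* plays the role of PMC_DFL_VAL()  */
{
	return (user_val & writable_mask) | (default_val & ~writable_mask);
}

/* e.g. merge_reserved(0xffff, 0x0fff, 0xa000) == 0xafff */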
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
new file mode 100644
index 000000000000..91293388dd29
--- /dev/null
+++ b/arch/ia64/kernel/process.c
@@ -0,0 +1,800 @@
1/*
2 * Architecture-specific setup.
3 *
4 * Copyright (C) 1998-2003 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com>
6 */
7#define __KERNEL_SYSCALLS__ /* see <asm/unistd.h> */
8#include <linux/config.h>
9
10#include <linux/cpu.h>
11#include <linux/pm.h>
12#include <linux/elf.h>
13#include <linux/errno.h>
14#include <linux/kallsyms.h>
15#include <linux/kernel.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <linux/notifier.h>
19#include <linux/personality.h>
20#include <linux/sched.h>
21#include <linux/slab.h>
22#include <linux/smp_lock.h>
23#include <linux/stddef.h>
24#include <linux/thread_info.h>
25#include <linux/unistd.h>
26#include <linux/efi.h>
27#include <linux/interrupt.h>
28#include <linux/delay.h>
29
30#include <asm/cpu.h>
31#include <asm/delay.h>
32#include <asm/elf.h>
33#include <asm/ia32.h>
34#include <asm/irq.h>
35#include <asm/pgalloc.h>
36#include <asm/processor.h>
37#include <asm/sal.h>
38#include <asm/tlbflush.h>
39#include <asm/uaccess.h>
40#include <asm/unwind.h>
41#include <asm/user.h>
42
43#include "entry.h"
44
45#ifdef CONFIG_PERFMON
46# include <asm/perfmon.h>
47#endif
48
49#include "sigframe.h"
50
51void (*ia64_mark_idle)(int);
52static cpumask_t cpu_idle_map;
53
54unsigned long boot_option_idle_override = 0;
55EXPORT_SYMBOL(boot_option_idle_override);
56
57void
58ia64_do_show_stack (struct unw_frame_info *info, void *arg)
59{
60 unsigned long ip, sp, bsp;
61 char buf[128]; /* don't make it so big that it overflows the stack! */
62
63 printk("\nCall Trace:\n");
64 do {
65 unw_get_ip(info, &ip);
66 if (ip == 0)
67 break;
68
69 unw_get_sp(info, &sp);
70 unw_get_bsp(info, &bsp);
71 snprintf(buf, sizeof(buf),
72 " [<%016lx>] %%s\n"
73 " sp=%016lx bsp=%016lx\n",
74 ip, sp, bsp);
75 print_symbol(buf, ip);
76 } while (unw_unwind(info) >= 0);
77}
78
79void
80show_stack (struct task_struct *task, unsigned long *sp)
81{
82 if (!task)
83 unw_init_running(ia64_do_show_stack, NULL);
84 else {
85 struct unw_frame_info info;
86
87 unw_init_from_blocked_task(&info, task);
88 ia64_do_show_stack(&info, NULL);
89 }
90}
91
92void
93dump_stack (void)
94{
95 show_stack(NULL, NULL);
96}
97
98EXPORT_SYMBOL(dump_stack);
99
100void
101show_regs (struct pt_regs *regs)
102{
103 unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri;
104
105 print_modules();
106 printk("\nPid: %d, CPU %d, comm: %20s\n", current->pid, smp_processor_id(), current->comm);
107 printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n",
108 regs->cr_ipsr, regs->cr_ifs, ip, print_tainted());
109 print_symbol("ip is at %s\n", ip);
110 printk("unat: %016lx pfs : %016lx rsc : %016lx\n",
111 regs->ar_unat, regs->ar_pfs, regs->ar_rsc);
112 printk("rnat: %016lx bsps: %016lx pr : %016lx\n",
113 regs->ar_rnat, regs->ar_bspstore, regs->pr);
114 printk("ldrs: %016lx ccv : %016lx fpsr: %016lx\n",
115 regs->loadrs, regs->ar_ccv, regs->ar_fpsr);
116 printk("csd : %016lx ssd : %016lx\n", regs->ar_csd, regs->ar_ssd);
117 printk("b0 : %016lx b6 : %016lx b7 : %016lx\n", regs->b0, regs->b6, regs->b7);
118 printk("f6 : %05lx%016lx f7 : %05lx%016lx\n",
119 regs->f6.u.bits[1], regs->f6.u.bits[0],
120 regs->f7.u.bits[1], regs->f7.u.bits[0]);
121 printk("f8 : %05lx%016lx f9 : %05lx%016lx\n",
122 regs->f8.u.bits[1], regs->f8.u.bits[0],
123 regs->f9.u.bits[1], regs->f9.u.bits[0]);
124 printk("f10 : %05lx%016lx f11 : %05lx%016lx\n",
125 regs->f10.u.bits[1], regs->f10.u.bits[0],
126 regs->f11.u.bits[1], regs->f11.u.bits[0]);
127
128 printk("r1 : %016lx r2 : %016lx r3 : %016lx\n", regs->r1, regs->r2, regs->r3);
129 printk("r8 : %016lx r9 : %016lx r10 : %016lx\n", regs->r8, regs->r9, regs->r10);
130 printk("r11 : %016lx r12 : %016lx r13 : %016lx\n", regs->r11, regs->r12, regs->r13);
131 printk("r14 : %016lx r15 : %016lx r16 : %016lx\n", regs->r14, regs->r15, regs->r16);
132 printk("r17 : %016lx r18 : %016lx r19 : %016lx\n", regs->r17, regs->r18, regs->r19);
133 printk("r20 : %016lx r21 : %016lx r22 : %016lx\n", regs->r20, regs->r21, regs->r22);
134 printk("r23 : %016lx r24 : %016lx r25 : %016lx\n", regs->r23, regs->r24, regs->r25);
135 printk("r26 : %016lx r27 : %016lx r28 : %016lx\n", regs->r26, regs->r27, regs->r28);
136 printk("r29 : %016lx r30 : %016lx r31 : %016lx\n", regs->r29, regs->r30, regs->r31);
137
138 if (user_mode(regs)) {
139 /* print the stacked registers */
140 unsigned long val, *bsp, ndirty;
141 int i, sof, is_nat = 0;
142
143 sof = regs->cr_ifs & 0x7f; /* size of frame */
144 ndirty = (regs->loadrs >> 19);
145 bsp = ia64_rse_skip_regs((unsigned long *) regs->ar_bspstore, ndirty);
146 for (i = 0; i < sof; ++i) {
147 get_user(val, (unsigned long __user *) ia64_rse_skip_regs(bsp, i));
148 printk("r%-3u:%c%016lx%s", 32 + i, is_nat ? '*' : ' ', val,
149 ((i == sof - 1) || (i % 3) == 2) ? "\n" : " ");
150 }
151 } else
152 show_stack(NULL, NULL);
153}
154
155void
156do_notify_resume_user (sigset_t *oldset, struct sigscratch *scr, long in_syscall)
157{
158 if (fsys_mode(current, &scr->pt)) {
159 /* defer signal-handling etc. until we return to privilege-level 0. */
160 if (!ia64_psr(&scr->pt)->lp)
161 ia64_psr(&scr->pt)->lp = 1;
162 return;
163 }
164
165#ifdef CONFIG_PERFMON
166 if (current->thread.pfm_needs_checking)
167 pfm_handle_work();
168#endif
169
170 /* deal with pending signal delivery */
171 if (test_thread_flag(TIF_SIGPENDING))
172 ia64_do_signal(oldset, scr, in_syscall);
173}
174
175static int pal_halt = 1;
176static int __init nohalt_setup(char * str)
177{
178 pal_halt = 0;
179 return 1;
180}
181__setup("nohalt", nohalt_setup);
182
183/*
184 * We use this if we don't have any better idle routine.
185 */
186void
187default_idle (void)
188{
189 unsigned long pmu_active = ia64_getreg(_IA64_REG_PSR) & (IA64_PSR_PP | IA64_PSR_UP);
190
191 while (!need_resched())
192 if (pal_halt && !pmu_active)
193 safe_halt();
194 else
195 cpu_relax();
196}
197
198#ifdef CONFIG_HOTPLUG_CPU
199/* We don't actually take the CPU down, just spin without interrupts. */
200static inline void play_dead(void)
201{
202 extern void ia64_cpu_local_tick (void);
203 /* Ack it */
204 __get_cpu_var(cpu_state) = CPU_DEAD;
205
206 /* We shouldn't have to disable interrupts while dead, but
207 * some interrupts just don't seem to go away, and this makes
208 * it "work" for testing purposes. */
209 max_xtp();
210 local_irq_disable();
211 /* Death loop */
212 while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
213 cpu_relax();
214
215 /*
216 * Enable timer interrupts from now on
217 * Not required if we put processor in SAL_BOOT_RENDEZ mode.
218 */
219 local_flush_tlb_all();
220 cpu_set(smp_processor_id(), cpu_online_map);
221 wmb();
222 ia64_cpu_local_tick ();
223 local_irq_enable();
224}
225#else
226static inline void play_dead(void)
227{
228 BUG();
229}
230#endif /* CONFIG_HOTPLUG_CPU */
231
232
233void cpu_idle_wait(void)
234{
235 int cpu;
236 cpumask_t map;
237
238 for_each_online_cpu(cpu)
239 cpu_set(cpu, cpu_idle_map);
240
241 wmb();
242 do {
243 ssleep(1);
244 cpus_and(map, cpu_idle_map, cpu_online_map);
245 } while (!cpus_empty(map));
246}
247EXPORT_SYMBOL_GPL(cpu_idle_wait);
248
249void __attribute__((noreturn))
250cpu_idle (void)
251{
252 void (*mark_idle)(int) = ia64_mark_idle;
253 int cpu = smp_processor_id();
254
255 /* endless idle loop with no priority at all */
256 while (1) {
257#ifdef CONFIG_SMP
258 if (!need_resched())
259 min_xtp();
260#endif
261 while (!need_resched()) {
262 void (*idle)(void);
263
264 if (mark_idle)
265 (*mark_idle)(1);
266
267 if (cpu_isset(cpu, cpu_idle_map))
268 cpu_clear(cpu, cpu_idle_map);
269 rmb();
270 idle = pm_idle;
271 if (!idle)
272 idle = default_idle;
273 (*idle)();
274 }
275
276 if (mark_idle)
277 (*mark_idle)(0);
278
279#ifdef CONFIG_SMP
280 normal_xtp();
281#endif
282 schedule();
283 check_pgt_cache();
284 if (cpu_is_offline(smp_processor_id()))
285 play_dead();
286 }
287}
288
289void
290ia64_save_extra (struct task_struct *task)
291{
292#ifdef CONFIG_PERFMON
293 unsigned long info;
294#endif
295
296 if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
297 ia64_save_debug_regs(&task->thread.dbr[0]);
298
299#ifdef CONFIG_PERFMON
300 if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0)
301 pfm_save_regs(task);
302
303 info = __get_cpu_var(pfm_syst_info);
304 if (info & PFM_CPUINFO_SYST_WIDE)
305 pfm_syst_wide_update_task(task, info, 0);
306#endif
307
308#ifdef CONFIG_IA32_SUPPORT
309 if (IS_IA32_PROCESS(ia64_task_regs(task)))
310 ia32_save_state(task);
311#endif
312}
313
314void
315ia64_load_extra (struct task_struct *task)
316{
317#ifdef CONFIG_PERFMON
318 unsigned long info;
319#endif
320
321 if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
322 ia64_load_debug_regs(&task->thread.dbr[0]);
323
324#ifdef CONFIG_PERFMON
325 if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0)
326 pfm_load_regs(task);
327
328 info = __get_cpu_var(pfm_syst_info);
329 if (info & PFM_CPUINFO_SYST_WIDE)
330 pfm_syst_wide_update_task(task, info, 1);
331#endif
332
333#ifdef CONFIG_IA32_SUPPORT
334 if (IS_IA32_PROCESS(ia64_task_regs(task)))
335 ia32_load_state(task);
336#endif
337}
338
339/*
340 * Copy the state of an ia-64 thread.
341 *
342 * We get here through the following call chain:
343 *
344 * from user-level: from kernel:
345 *
346 * <clone syscall> <some kernel call frames>
347 * sys_clone :
348 * do_fork do_fork
349 * copy_thread copy_thread
350 *
351 * This means that the stack layout is as follows:
352 *
353 * +---------------------+ (highest addr)
354 * | struct pt_regs |
355 * +---------------------+
356 * | struct switch_stack |
357 * +---------------------+
358 * | |
359 * | memory stack |
360 * | | <-- sp (lowest addr)
361 * +---------------------+
362 *
363 * Observe that we copy the unat values that are in pt_regs and switch_stack. Spilling an
364 * integer to address X causes bit N in ar.unat to be set to the NaT bit of the register,
365 * with N=(X & 0x1ff)/8. Thus, copying the unat value preserves the NaT bits ONLY if the
366 * pt_regs structure in the parent is congruent to that of the child, modulo 512. Since
367 * the stack is page aligned and the page size is at least 4KB, this is always the case,
368 * so there is nothing to worry about.
369 */
370int
371copy_thread (int nr, unsigned long clone_flags,
372 unsigned long user_stack_base, unsigned long user_stack_size,
373 struct task_struct *p, struct pt_regs *regs)
374{
375 extern char ia64_ret_from_clone, ia32_ret_from_clone;
376 struct switch_stack *child_stack, *stack;
377 unsigned long rbs, child_rbs, rbs_size;
378 struct pt_regs *child_ptregs;
379 int retval = 0;
380
381#ifdef CONFIG_SMP
382 /*
383 * For SMP idle threads, fork_by_hand() calls do_fork with
384 * NULL regs.
385 */
386 if (!regs)
387 return 0;
388#endif
389
390 stack = ((struct switch_stack *) regs) - 1;
391
392 child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1;
393 child_stack = (struct switch_stack *) child_ptregs - 1;
394
395 /* copy parent's switch_stack & pt_regs to child: */
396 memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack));
397
398 rbs = (unsigned long) current + IA64_RBS_OFFSET;
399 child_rbs = (unsigned long) p + IA64_RBS_OFFSET;
400 rbs_size = stack->ar_bspstore - rbs;
401
402 /* copy the parent's register backing store to the child: */
403 memcpy((void *) child_rbs, (void *) rbs, rbs_size);
404
405 if (likely(user_mode(child_ptregs))) {
406 if ((clone_flags & CLONE_SETTLS) && !IS_IA32_PROCESS(regs))
407 child_ptregs->r13 = regs->r16; /* see sys_clone2() in entry.S */
408 if (user_stack_base) {
409 child_ptregs->r12 = user_stack_base + user_stack_size - 16;
410 child_ptregs->ar_bspstore = user_stack_base;
411 child_ptregs->ar_rnat = 0;
412 child_ptregs->loadrs = 0;
413 }
414 } else {
415 /*
416 * Note: we simply preserve the relative position of
417 * the stack pointer here. There is no need to
418 * allocate a scratch area here, since that will have
419 * been taken care of by the caller of sys_clone()
420 * already.
421 */
422 child_ptregs->r12 = (unsigned long) child_ptregs - 16; /* kernel sp */
423 child_ptregs->r13 = (unsigned long) p; /* set `current' pointer */
424 }
425 child_stack->ar_bspstore = child_rbs + rbs_size;
426 if (IS_IA32_PROCESS(regs))
427 child_stack->b0 = (unsigned long) &ia32_ret_from_clone;
428 else
429 child_stack->b0 = (unsigned long) &ia64_ret_from_clone;
430
431 /* copy parts of thread_struct: */
432 p->thread.ksp = (unsigned long) child_stack - 16;
433
434	/* Stop some PSR bits from being inherited.
435	 * The psr.up/psr.pp bits must be cleared on fork but inherited on execve();
436	 * therefore we must specify them explicitly here and not include them in
437 * IA64_PSR_BITS_TO_CLEAR.
438 */
439 child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET)
440 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP));
441
442 /*
443 * NOTE: The calling convention considers all floating point
444 * registers in the high partition (fph) to be scratch. Since
445 * the only way to get to this point is through a system call,
446 * we know that the values in fph are all dead. Hence, there
447 * is no need to inherit the fph state from the parent to the
448 * child and all we have to do is to make sure that
449 * IA64_THREAD_FPH_VALID is cleared in the child.
450 *
451 * XXX We could push this optimization a bit further by
452 * clearing IA64_THREAD_FPH_VALID on ANY system call.
453 * However, it's not clear this is worth doing. Also, it
454 * would be a slight deviation from the normal Linux system
455 * call behavior where scratch registers are preserved across
456 * system calls (unless used by the system call itself).
457 */
458# define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID \
459 | IA64_THREAD_PM_VALID)
460# define THREAD_FLAGS_TO_SET 0
461 p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR)
462 | THREAD_FLAGS_TO_SET);
463 ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */
464#ifdef CONFIG_IA32_SUPPORT
465 /*
466 * If we're cloning an IA32 task then save the IA32 extra
467 * state from the current task to the new task
468 */
469 if (IS_IA32_PROCESS(ia64_task_regs(current))) {
470 ia32_save_state(p);
471 if (clone_flags & CLONE_SETTLS)
472 retval = ia32_clone_tls(p, child_ptregs);
473
474 /* Copy partially mapped page list */
475 if (!retval)
476 retval = ia32_copy_partial_page_list(p, clone_flags);
477 }
478#endif
479
480#ifdef CONFIG_PERFMON
481 if (current->thread.pfm_context)
482 pfm_inherit(p, child_ptregs);
483#endif
484 return retval;
485}
486
487static void
488do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg)
489{
490 unsigned long mask, sp, nat_bits = 0, ip, ar_rnat, urbs_end, cfm;
491 elf_greg_t *dst = arg;
492 struct pt_regs *pt;
493 char nat;
494 int i;
495
496 memset(dst, 0, sizeof(elf_gregset_t)); /* don't leak any kernel bits to user-level */
497
498 if (unw_unwind_to_user(info) < 0)
499 return;
500
501 unw_get_sp(info, &sp);
502 pt = (struct pt_regs *) (sp + 16);
503
504 urbs_end = ia64_get_user_rbs_end(task, pt, &cfm);
505
506 if (ia64_sync_user_rbs(task, info->sw, pt->ar_bspstore, urbs_end) < 0)
507 return;
508
509 ia64_peek(task, info->sw, urbs_end, (long) ia64_rse_rnat_addr((long *) urbs_end),
510 &ar_rnat);
511
512 /*
513 * coredump format:
514 * r0-r31
515 * NaT bits (for r0-r31; bit N == 1 iff rN is a NaT)
516 * predicate registers (p0-p63)
517 * b0-b7
518 * ip cfm user-mask
519 * ar.rsc ar.bsp ar.bspstore ar.rnat
520 * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec
521 */
522
523 /* r0 is zero */
524 for (i = 1, mask = (1UL << i); i < 32; ++i) {
525 unw_get_gr(info, i, &dst[i], &nat);
526 if (nat)
527 nat_bits |= mask;
528 mask <<= 1;
529 }
530 dst[32] = nat_bits;
531 unw_get_pr(info, &dst[33]);
532
533 for (i = 0; i < 8; ++i)
534 unw_get_br(info, i, &dst[34 + i]);
535
536 unw_get_rp(info, &ip);
537 dst[42] = ip + ia64_psr(pt)->ri;
538 dst[43] = cfm;
539 dst[44] = pt->cr_ipsr & IA64_PSR_UM;
540
541 unw_get_ar(info, UNW_AR_RSC, &dst[45]);
542 /*
543 * For bsp and bspstore, unw_get_ar() would return the kernel
544 * addresses, but we need the user-level addresses instead:
545 */
546 dst[46] = urbs_end; /* note: by convention PT_AR_BSP points to the end of the urbs! */
547 dst[47] = pt->ar_bspstore;
548 dst[48] = ar_rnat;
549 unw_get_ar(info, UNW_AR_CCV, &dst[49]);
550 unw_get_ar(info, UNW_AR_UNAT, &dst[50]);
551 unw_get_ar(info, UNW_AR_FPSR, &dst[51]);
552 dst[52] = pt->ar_pfs; /* UNW_AR_PFS is == to pt->cr_ifs for interrupt frames */
553 unw_get_ar(info, UNW_AR_LC, &dst[53]);
554 unw_get_ar(info, UNW_AR_EC, &dst[54]);
555 unw_get_ar(info, UNW_AR_CSD, &dst[55]);
556 unw_get_ar(info, UNW_AR_SSD, &dst[56]);
557}
558
559void
560do_dump_task_fpu (struct task_struct *task, struct unw_frame_info *info, void *arg)
561{
562 elf_fpreg_t *dst = arg;
563 int i;
564
565 memset(dst, 0, sizeof(elf_fpregset_t)); /* don't leak any "random" bits */
566
567 if (unw_unwind_to_user(info) < 0)
568 return;
569
570 /* f0 is 0.0, f1 is 1.0 */
571
572 for (i = 2; i < 32; ++i)
573 unw_get_fr(info, i, dst + i);
574
575 ia64_flush_fph(task);
576 if ((task->thread.flags & IA64_THREAD_FPH_VALID) != 0)
577 memcpy(dst + 32, task->thread.fph, 96*16);
578}
579
580void
581do_copy_regs (struct unw_frame_info *info, void *arg)
582{
583 do_copy_task_regs(current, info, arg);
584}
585
586void
587do_dump_fpu (struct unw_frame_info *info, void *arg)
588{
589 do_dump_task_fpu(current, info, arg);
590}
591
592int
593dump_task_regs(struct task_struct *task, elf_gregset_t *regs)
594{
595 struct unw_frame_info tcore_info;
596
597 if (current == task) {
598 unw_init_running(do_copy_regs, regs);
599 } else {
600 memset(&tcore_info, 0, sizeof(tcore_info));
601 unw_init_from_blocked_task(&tcore_info, task);
602 do_copy_task_regs(task, &tcore_info, regs);
603 }
604 return 1;
605}
606
607void
608ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst)
609{
610 unw_init_running(do_copy_regs, dst);
611}
612
613int
614dump_task_fpu (struct task_struct *task, elf_fpregset_t *dst)
615{
616 struct unw_frame_info tcore_info;
617
618 if (current == task) {
619 unw_init_running(do_dump_fpu, dst);
620 } else {
621 memset(&tcore_info, 0, sizeof(tcore_info));
622 unw_init_from_blocked_task(&tcore_info, task);
623 do_dump_task_fpu(task, &tcore_info, dst);
624 }
625 return 1;
626}
627
628int
629dump_fpu (struct pt_regs *pt, elf_fpregset_t dst)
630{
631 unw_init_running(do_dump_fpu, dst);
632 return 1; /* f0-f31 are always valid so we always return 1 */
633}
634
635long
636sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp,
637 struct pt_regs *regs)
638{
639 char *fname;
640 int error;
641
642 fname = getname(filename);
643 error = PTR_ERR(fname);
644 if (IS_ERR(fname))
645 goto out;
646 error = do_execve(fname, argv, envp, regs);
647 putname(fname);
648out:
649 return error;
650}
651
652pid_t
653kernel_thread (int (*fn)(void *), void *arg, unsigned long flags)
654{
655 extern void start_kernel_thread (void);
656 unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread;
657 struct {
658 struct switch_stack sw;
659 struct pt_regs pt;
660 } regs;
661
662 memset(&regs, 0, sizeof(regs));
663 regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */
664 regs.pt.r1 = helper_fptr[1]; /* set GP */
665 regs.pt.r9 = (unsigned long) fn; /* 1st argument */
666 regs.pt.r11 = (unsigned long) arg; /* 2nd argument */
667 /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */
668 regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN;
669 regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */
670 regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR);
671 regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET;
672 regs.sw.pr = (1 << PRED_KERNEL_STACK);
673 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs.pt, 0, NULL, NULL);
674}
675EXPORT_SYMBOL(kernel_thread);
676
677/* This gets called from kernel_thread() via ia64_invoke_thread_helper(). */
678int
679kernel_thread_helper (int (*fn)(void *), void *arg)
680{
681#ifdef CONFIG_IA32_SUPPORT
682 if (IS_IA32_PROCESS(ia64_task_regs(current))) {
683 /* A kernel thread is always a 64-bit process. */
684 current->thread.map_base = DEFAULT_MAP_BASE;
685 current->thread.task_size = DEFAULT_TASK_SIZE;
686 ia64_set_kr(IA64_KR_IO_BASE, current->thread.old_iob);
687 ia64_set_kr(IA64_KR_TSSD, current->thread.old_k1);
688 }
689#endif
690 return (*fn)(arg);
691}
692
693/*
694 * Flush thread state. This is called when a thread does an execve().
695 */
696void
697flush_thread (void)
698{
699 /* drop floating-point and debug-register state if it exists: */
700 current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID);
701 ia64_drop_fpu(current);
702 if (IS_IA32_PROCESS(ia64_task_regs(current)))
703 ia32_drop_partial_page_list(current);
704}
705
706/*
707 * Clean up state associated with current thread. This is called when
708 * the thread calls exit().
709 */
710void
711exit_thread (void)
712{
713 ia64_drop_fpu(current);
714#ifdef CONFIG_PERFMON
715 /* if needed, stop monitoring and flush state to perfmon context */
716 if (current->thread.pfm_context)
717 pfm_exit_thread(current);
718
719 /* free debug register resources */
720 if (current->thread.flags & IA64_THREAD_DBG_VALID)
721 pfm_release_debug_registers(current);
722#endif
723 if (IS_IA32_PROCESS(ia64_task_regs(current)))
724 ia32_drop_partial_page_list(current);
725}
726
727unsigned long
728get_wchan (struct task_struct *p)
729{
730 struct unw_frame_info info;
731 unsigned long ip;
732 int count = 0;
733
734 /*
735 * Note: p may not be a blocked task (it could be current or
736 * another process running on some other CPU). Rather than
737 * trying to determine if p is really blocked, we just assume
738 * it's blocked and rely on the unwind routines to fail
739 * gracefully if the process wasn't really blocked after all.
740 * --davidm 99/12/15
741 */
742 unw_init_from_blocked_task(&info, p);
743 do {
744 if (unw_unwind(&info) < 0)
745 return 0;
746 unw_get_ip(&info, &ip);
747 if (!in_sched_functions(ip))
748 return ip;
749 } while (count++ < 16);
750 return 0;
751}
752
753void
754cpu_halt (void)
755{
756 pal_power_mgmt_info_u_t power_info[8];
757 unsigned long min_power;
758 int i, min_power_state;
759
760 if (ia64_pal_halt_info(power_info) != 0)
761 return;
762
763 min_power_state = 0;
764 min_power = power_info[0].pal_power_mgmt_info_s.power_consumption;
765 for (i = 1; i < 8; ++i)
766 if (power_info[i].pal_power_mgmt_info_s.im
767 && power_info[i].pal_power_mgmt_info_s.power_consumption < min_power) {
768 min_power = power_info[i].pal_power_mgmt_info_s.power_consumption;
769 min_power_state = i;
770 }
771
772 while (1)
773 ia64_pal_halt(min_power_state);
774}
775
776void
777machine_restart (char *restart_cmd)
778{
779 (*efi.reset_system)(EFI_RESET_WARM, 0, 0, NULL);
780}
781
782EXPORT_SYMBOL(machine_restart);
783
784void
785machine_halt (void)
786{
787 cpu_halt();
788}
789
790EXPORT_SYMBOL(machine_halt);
791
792void
793machine_power_off (void)
794{
795 if (pm_power_off)
796 pm_power_off();
797 machine_halt();
798}
799
800EXPORT_SYMBOL(machine_power_off);
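To make the congruence argument in the copy_thread() comment above concrete: spilling a register to address X records its NaT bit at position (X & 0x1ff)/8 of ar.unat, so two pt_regs areas map NaT bits identically exactly when their addresses agree modulo 512 (which the page-aligned kernel stacks guarantee). A small illustrative helper, not kernel code:

/* unat bit used when spilling a register to 'spill_addr' */
static inline unsigned int unat_bit_for(unsigned long spill_addr)
{
	return (unsigned int)((spill_addr & 0x1ffUL) >> 3);	/* (X & 0x1ff) / 8 */
}

/* e.g. unat_bit_for(0x1008) == unat_bit_for(0x1208) == 1, since 0x1208 - 0x1008 == 512 */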
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
new file mode 100644
index 000000000000..55789fcd7210
--- /dev/null
+++ b/arch/ia64/kernel/ptrace.c
@@ -0,0 +1,1627 @@
1/*
2 * Kernel support for the ptrace() and syscall tracing interfaces.
3 *
4 * Copyright (C) 1999-2005 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com>
6 *
7 * Derived from the x86 and Alpha versions.
8 */
9#include <linux/config.h>
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/slab.h>
13#include <linux/mm.h>
14#include <linux/errno.h>
15#include <linux/ptrace.h>
16#include <linux/smp_lock.h>
17#include <linux/user.h>
18#include <linux/security.h>
19#include <linux/audit.h>
20
21#include <asm/pgtable.h>
22#include <asm/processor.h>
23#include <asm/ptrace_offsets.h>
24#include <asm/rse.h>
25#include <asm/system.h>
26#include <asm/uaccess.h>
27#include <asm/unwind.h>
28#ifdef CONFIG_PERFMON
29#include <asm/perfmon.h>
30#endif
31
32#include "entry.h"
33
34/*
35 * Bits in the PSR that we allow ptrace() to change:
36 * be, up, ac, mfl, mfh (the user mask; five bits total)
37 * db (debug breakpoint fault; one bit)
38 * id (instruction debug fault disable; one bit)
39 * dd (data debug fault disable; one bit)
40 * ri (restart instruction; two bits)
41 * is (instruction set; one bit)
42 */
43#define IPSR_MASK (IA64_PSR_UM | IA64_PSR_DB | IA64_PSR_IS \
44 | IA64_PSR_ID | IA64_PSR_DD | IA64_PSR_RI)
45
46#define MASK(nbits) ((1UL << (nbits)) - 1) /* mask with NBITS bits set */
47#define PFM_MASK MASK(38)
48
49#define PTRACE_DEBUG 0
50
51#if PTRACE_DEBUG
52# define dprintk(format...) printk(format)
53# define inline
54#else
55# define dprintk(format...)
56#endif
57
58/* Return TRUE if PT was created due to kernel-entry via a system-call. */
59
60static inline int
61in_syscall (struct pt_regs *pt)
62{
63 return (long) pt->cr_ifs >= 0;
64}
65
66/*
67 * Collect the NaT bits for r1-r31 from scratch_unat and return a NaT
68 * bitset where bit i is set iff the NaT bit of register i is set.
69 */
70unsigned long
71ia64_get_scratch_nat_bits (struct pt_regs *pt, unsigned long scratch_unat)
72{
73# define GET_BITS(first, last, unat) \
74 ({ \
75 unsigned long bit = ia64_unat_pos(&pt->r##first); \
76 unsigned long nbits = (last - first + 1); \
77 unsigned long mask = MASK(nbits) << first; \
78 unsigned long dist; \
79 if (bit < first) \
80 dist = 64 + bit - first; \
81 else \
82 dist = bit - first; \
83 ia64_rotr(unat, dist) & mask; \
84 })
85 unsigned long val;
86
87 /*
88 * Registers that are stored consecutively in struct pt_regs
89 * can be handled in parallel. If the register order in
90 * struct pt_regs changes, this code MUST be updated.
91 */
92 val = GET_BITS( 1, 1, scratch_unat);
93 val |= GET_BITS( 2, 3, scratch_unat);
94 val |= GET_BITS(12, 13, scratch_unat);
95 val |= GET_BITS(14, 14, scratch_unat);
96 val |= GET_BITS(15, 15, scratch_unat);
97 val |= GET_BITS( 8, 11, scratch_unat);
98 val |= GET_BITS(16, 31, scratch_unat);
99 return val;
100
101# undef GET_BITS
102}
103
104/*
105 * Set the NaT bits for the scratch registers according to NAT and
106 * return the resulting unat (assuming the scratch registers are
107 * stored in PT).
108 */
109unsigned long
110ia64_put_scratch_nat_bits (struct pt_regs *pt, unsigned long nat)
111{
112# define PUT_BITS(first, last, nat) \
113 ({ \
114 unsigned long bit = ia64_unat_pos(&pt->r##first); \
115 unsigned long nbits = (last - first + 1); \
116 unsigned long mask = MASK(nbits) << first; \
117 long dist; \
118 if (bit < first) \
119 dist = 64 + bit - first; \
120 else \
121 dist = bit - first; \
122 ia64_rotl(nat & mask, dist); \
123 })
124 unsigned long scratch_unat;
125
126 /*
127 * Registers that are stored consecutively in struct pt_regs
128 * can be handled in parallel. If the register order in
129 * struct pt_regs changes, this code MUST be updated.
130 */
131 scratch_unat = PUT_BITS( 1, 1, nat);
132 scratch_unat |= PUT_BITS( 2, 3, nat);
133 scratch_unat |= PUT_BITS(12, 13, nat);
134 scratch_unat |= PUT_BITS(14, 14, nat);
135 scratch_unat |= PUT_BITS(15, 15, nat);
136 scratch_unat |= PUT_BITS( 8, 11, nat);
137 scratch_unat |= PUT_BITS(16, 31, nat);
138
139 return scratch_unat;
140
141# undef PUT_BITS
142}
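The GET_BITS()/PUT_BITS() macros above work by rotating the unat word so that the NaT bit of r<first> (whose position depends on where that register is spilled within struct pt_regs, cf. ia64_unat_pos()) lands at bit <first> of the result, then masking. Below is a standalone sketch of the GET direction, with a portable rotate standing in for ia64_rotr(); the names are illustrative only.

#include <stdint.h>

static inline uint64_t rotr64(uint64_t x, unsigned int n)	/* stand-in for ia64_rotr() */
{
	return n ? (x >> n) | (x << (64 - n)) : x;
}

/* NaT bits for registers first..first+nbits-1, given r<first>'s unat position 'bit' */
static uint64_t get_bits_sketch(uint64_t unat, unsigned int bit,
				unsigned int first, unsigned int nbits)
{
	uint64_t mask = (((uint64_t)1 << nbits) - 1) << first;
	unsigned int dist = (bit < first) ? 64 + bit - first : bit - first;

	return rotr64(unat, dist) & mask;
}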
143
144#define IA64_MLX_TEMPLATE 0x2
145#define IA64_MOVL_OPCODE 6
146
147void
148ia64_increment_ip (struct pt_regs *regs)
149{
150 unsigned long w0, ri = ia64_psr(regs)->ri + 1;
151
152 if (ri > 2) {
153 ri = 0;
154 regs->cr_iip += 16;
155 } else if (ri == 2) {
156 get_user(w0, (char __user *) regs->cr_iip + 0);
157 if (((w0 >> 1) & 0xf) == IA64_MLX_TEMPLATE) {
158 /*
159 * rfi'ing to slot 2 of an MLX bundle causes
160 * an illegal operation fault. We don't want
161 * that to happen...
162 */
163 ri = 0;
164 regs->cr_iip += 16;
165 }
166 }
167 ia64_psr(regs)->ri = ri;
168}
169
170void
171ia64_decrement_ip (struct pt_regs *regs)
172{
173 unsigned long w0, ri = ia64_psr(regs)->ri - 1;
174
175 if (ia64_psr(regs)->ri == 0) {
176 regs->cr_iip -= 16;
177 ri = 2;
178 get_user(w0, (char __user *) regs->cr_iip + 0);
179 if (((w0 >> 1) & 0xf) == IA64_MLX_TEMPLATE) {
180 /*
181 * rfi'ing to slot 2 of an MLX bundle causes
182 * an illegal operation fault. We don't want
183 * that to happen...
184 */
185 ri = 1;
186 }
187 }
188 ia64_psr(regs)->ri = ri;
189}
190
191/*
192 * This routine is used to read an rnat bits that are stored on the
193 * kernel backing store. Since, in general, the alignment of the user
194 * and kernel are different, this is not completely trivial. In
195 * essence, we need to construct the user RNAT based on up to two
196 * kernel RNAT values and/or the RNAT value saved in the child's
197 * pt_regs.
198 *
199 * user rbs
200 *
201 * +--------+ <-- lowest address
202 * | slot62 |
203 * +--------+
204 * | rnat | 0x....1f8
205 * +--------+
206 * | slot00 | \
207 * +--------+ |
208 * | slot01 | > child_regs->ar_rnat
209 * +--------+ |
210 * | slot02 | / kernel rbs
211 * +--------+ +--------+
212 * <- child_regs->ar_bspstore | slot61 | <-- krbs
213 * +- - - - + +--------+
214 * | slot62 |
215 * +- - - - + +--------+
216 * | rnat |
217 * +- - - - + +--------+
218 * vrnat | slot00 |
219 * +- - - - + +--------+
220 * = =
221 * +--------+
222 * | slot00 | \
223 * +--------+ |
224 * | slot01 | > child_stack->ar_rnat
225 * +--------+ |
226 * | slot02 | /
227 * +--------+
228 * <--- child_stack->ar_bspstore
229 *
230 * The way to think of this code is as follows: bit 0 in the user rnat
231 * corresponds to some bit N (0 <= N <= 62) in one of the kernel rnat
232 * values. The kernel rnat value holding this bit is stored in
233 * variable rnat0. rnat1 is loaded with the kernel rnat value that
234 * forms the upper bits of the user rnat value.
235 *
236 * Boundary cases:
237 *
238 * o when reading the rnat "below" the first rnat slot on the kernel
239 * backing store, rnat0/rnat1 are set to 0 and the low order bits are
240 * merged in from pt->ar_rnat.
241 *
242 * o when reading the rnat "above" the last rnat slot on the kernel
243 * backing store, rnat0/rnat1 get their values from sw->ar_rnat.
244 */
245static unsigned long
246get_rnat (struct task_struct *task, struct switch_stack *sw,
247 unsigned long *krbs, unsigned long *urnat_addr,
248 unsigned long *urbs_end)
249{
250 unsigned long rnat0 = 0, rnat1 = 0, urnat = 0, *slot0_kaddr;
251 unsigned long umask = 0, mask, m;
252 unsigned long *kbsp, *ubspstore, *rnat0_kaddr, *rnat1_kaddr, shift;
253 long num_regs, nbits;
254 struct pt_regs *pt;
255
256 pt = ia64_task_regs(task);
257 kbsp = (unsigned long *) sw->ar_bspstore;
258 ubspstore = (unsigned long *) pt->ar_bspstore;
259
260 if (urbs_end < urnat_addr)
261 nbits = ia64_rse_num_regs(urnat_addr - 63, urbs_end);
262 else
263 nbits = 63;
264 mask = MASK(nbits);
265 /*
266 * First, figure out which bit number slot 0 in user-land maps
267 * to in the kernel rnat. Do this by figuring out how many
268 * register slots we're beyond the user's backingstore and
269 * then computing the equivalent address in kernel space.
270 */
271 num_regs = ia64_rse_num_regs(ubspstore, urnat_addr + 1);
272 slot0_kaddr = ia64_rse_skip_regs(krbs, num_regs);
273 shift = ia64_rse_slot_num(slot0_kaddr);
274 rnat1_kaddr = ia64_rse_rnat_addr(slot0_kaddr);
275 rnat0_kaddr = rnat1_kaddr - 64;
276
277 if (ubspstore + 63 > urnat_addr) {
278 /* some bits need to be merged in from pt->ar_rnat */
279 umask = MASK(ia64_rse_slot_num(ubspstore)) & mask;
280 urnat = (pt->ar_rnat & umask);
281 mask &= ~umask;
282 if (!mask)
283 return urnat;
284 }
285
286 m = mask << shift;
287 if (rnat0_kaddr >= kbsp)
288 rnat0 = sw->ar_rnat;
289 else if (rnat0_kaddr > krbs)
290 rnat0 = *rnat0_kaddr;
291 urnat |= (rnat0 & m) >> shift;
292
293 m = mask >> (63 - shift);
294 if (rnat1_kaddr >= kbsp)
295 rnat1 = sw->ar_rnat;
296 else if (rnat1_kaddr > krbs)
297 rnat1 = *rnat1_kaddr;
298 urnat |= (rnat1 & m) << (63 - shift);
299 return urnat;
300}
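For reference, the merge at the end of get_rnat() above can be read as follows: the user rnat word is stitched together from two kernel rnat words, the lower piece shifted down by 'shift' and the upper piece shifted up by 63 - shift. The sketch below restates only that step; the boundary handling via pt->ar_rnat and sw->ar_rnat is omitted, and shift is assumed to lie in 0..62 as in the caller.

static unsigned long merge_user_rnat(unsigned long rnat0, unsigned long rnat1,
				     unsigned long mask, unsigned int shift)
{
	unsigned long urnat = 0;

	urnat |= (rnat0 & (mask << shift)) >> shift;		   /* low part  */
	urnat |= (rnat1 & (mask >> (63 - shift))) << (63 - shift); /* high part */
	return urnat;
}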
301
302/*
303 * The reverse of get_rnat.
304 */
305static void
306put_rnat (struct task_struct *task, struct switch_stack *sw,
307 unsigned long *krbs, unsigned long *urnat_addr, unsigned long urnat,
308 unsigned long *urbs_end)
309{
310 unsigned long rnat0 = 0, rnat1 = 0, *slot0_kaddr, umask = 0, mask, m;
311 unsigned long *kbsp, *ubspstore, *rnat0_kaddr, *rnat1_kaddr, shift;
312 long num_regs, nbits;
313 struct pt_regs *pt;
314 unsigned long cfm, *urbs_kargs;
315
316 pt = ia64_task_regs(task);
317 kbsp = (unsigned long *) sw->ar_bspstore;
318 ubspstore = (unsigned long *) pt->ar_bspstore;
319
320 urbs_kargs = urbs_end;
321 if (in_syscall(pt)) {
322 /*
323 * If entered via syscall, don't allow user to set rnat bits
324 * for syscall args.
325 */
326 cfm = pt->cr_ifs;
327 urbs_kargs = ia64_rse_skip_regs(urbs_end, -(cfm & 0x7f));
328 }
329
330 if (urbs_kargs >= urnat_addr)
331 nbits = 63;
332 else {
333 if ((urnat_addr - 63) >= urbs_kargs)
334 return;
335 nbits = ia64_rse_num_regs(urnat_addr - 63, urbs_kargs);
336 }
337 mask = MASK(nbits);
338
339 /*
340 * First, figure out which bit number slot 0 in user-land maps
341 * to in the kernel rnat. Do this by figuring out how many
342 * register slots we're beyond the user's backingstore and
343 * then computing the equivalent address in kernel space.
344 */
345 num_regs = ia64_rse_num_regs(ubspstore, urnat_addr + 1);
346 slot0_kaddr = ia64_rse_skip_regs(krbs, num_regs);
347 shift = ia64_rse_slot_num(slot0_kaddr);
348 rnat1_kaddr = ia64_rse_rnat_addr(slot0_kaddr);
349 rnat0_kaddr = rnat1_kaddr - 64;
350
351 if (ubspstore + 63 > urnat_addr) {
352		/* some bits need to be placed in pt->ar_rnat: */
353 umask = MASK(ia64_rse_slot_num(ubspstore)) & mask;
354 pt->ar_rnat = (pt->ar_rnat & ~umask) | (urnat & umask);
355 mask &= ~umask;
356 if (!mask)
357 return;
358 }
359 /*
360 * Note: Section 11.1 of the EAS guarantees that bit 63 of an
361 * rnat slot is ignored. so we don't have to clear it here.
362 */
363 rnat0 = (urnat << shift);
364 m = mask << shift;
365 if (rnat0_kaddr >= kbsp)
366 sw->ar_rnat = (sw->ar_rnat & ~m) | (rnat0 & m);
367 else if (rnat0_kaddr > krbs)
368 *rnat0_kaddr = ((*rnat0_kaddr & ~m) | (rnat0 & m));
369
370 rnat1 = (urnat >> (63 - shift));
371 m = mask >> (63 - shift);
372 if (rnat1_kaddr >= kbsp)
373 sw->ar_rnat = (sw->ar_rnat & ~m) | (rnat1 & m);
374 else if (rnat1_kaddr > krbs)
375 *rnat1_kaddr = ((*rnat1_kaddr & ~m) | (rnat1 & m));
376}
377
378static inline int
379on_kernel_rbs (unsigned long addr, unsigned long bspstore,
380 unsigned long urbs_end)
381{
382 unsigned long *rnat_addr = ia64_rse_rnat_addr((unsigned long *)
383 urbs_end);
384 return (addr >= bspstore && addr <= (unsigned long) rnat_addr);
385}
386
387/*
388 * Read a word from the user-level backing store of task CHILD. ADDR
389 * is the user-level address to read the word from, VAL a pointer to
390 * the return value, and USER_RBS_END gives the end of the user-level
391 * backing store (i.e., it's the address that would be in ar.bsp after
392 * the user executed a "cover" instruction).
393 *
394 * This routine takes care of accessing the kernel register backing
395 * store for those registers that got spilled there. It also takes
396 * care of calculating the appropriate RNaT collection words.
397 */
398long
399ia64_peek (struct task_struct *child, struct switch_stack *child_stack,
400 unsigned long user_rbs_end, unsigned long addr, long *val)
401{
402 unsigned long *bspstore, *krbs, regnum, *laddr, *urbs_end, *rnat_addr;
403 struct pt_regs *child_regs;
404 size_t copied;
405 long ret;
406
407 urbs_end = (long *) user_rbs_end;
408 laddr = (unsigned long *) addr;
409 child_regs = ia64_task_regs(child);
410 bspstore = (unsigned long *) child_regs->ar_bspstore;
411 krbs = (unsigned long *) child + IA64_RBS_OFFSET/8;
412 if (on_kernel_rbs(addr, (unsigned long) bspstore,
413 (unsigned long) urbs_end))
414 {
415 /*
416 * Attempt to read the RBS in an area that's actually
417 * on the kernel RBS => read the corresponding bits in
418 * the kernel RBS.
419 */
420 rnat_addr = ia64_rse_rnat_addr(laddr);
421 ret = get_rnat(child, child_stack, krbs, rnat_addr, urbs_end);
422
423 if (laddr == rnat_addr) {
424 /* return NaT collection word itself */
425 *val = ret;
426 return 0;
427 }
428
429 if (((1UL << ia64_rse_slot_num(laddr)) & ret) != 0) {
430 /*
431 * It is implementation dependent whether the
432 * data portion of a NaT value gets saved on a
433 * st8.spill or RSE spill (e.g., see EAS 2.6,
434 * 4.4.4.6 Register Spill and Fill). To get
435 * consistent behavior across all possible
436 * IA-64 implementations, we return zero in
437 * this case.
438 */
439 *val = 0;
440 return 0;
441 }
442
443 if (laddr < urbs_end) {
444 /*
445 * The desired word is on the kernel RBS and
446 * is not a NaT.
447 */
448 regnum = ia64_rse_num_regs(bspstore, laddr);
449 *val = *ia64_rse_skip_regs(krbs, regnum);
450 return 0;
451 }
452 }
453 copied = access_process_vm(child, addr, &ret, sizeof(ret), 0);
454 if (copied != sizeof(ret))
455 return -EIO;
456 *val = ret;
457 return 0;
458}
459
460long
461ia64_poke (struct task_struct *child, struct switch_stack *child_stack,
462 unsigned long user_rbs_end, unsigned long addr, long val)
463{
464 unsigned long *bspstore, *krbs, regnum, *laddr;
465 unsigned long *urbs_end = (long *) user_rbs_end;
466 struct pt_regs *child_regs;
467
468 laddr = (unsigned long *) addr;
469 child_regs = ia64_task_regs(child);
470 bspstore = (unsigned long *) child_regs->ar_bspstore;
471 krbs = (unsigned long *) child + IA64_RBS_OFFSET/8;
472 if (on_kernel_rbs(addr, (unsigned long) bspstore,
473 (unsigned long) urbs_end))
474 {
475 /*
476 * Attempt to write the RBS in an area that's actually
477 * on the kernel RBS => write the corresponding bits
478 * in the kernel RBS.
479 */
480 if (ia64_rse_is_rnat_slot(laddr))
481 put_rnat(child, child_stack, krbs, laddr, val,
482 urbs_end);
483 else {
484 if (laddr < urbs_end) {
485 regnum = ia64_rse_num_regs(bspstore, laddr);
486 *ia64_rse_skip_regs(krbs, regnum) = val;
487 }
488 }
489 } else if (access_process_vm(child, addr, &val, sizeof(val), 1)
490 != sizeof(val))
491 return -EIO;
492 return 0;
493}
494
495/*
496 * Calculate the address of the end of the user-level register backing
497 * store. This is the address that would have been stored in ar.bsp
498 * if the user had executed a "cover" instruction right before
499 * entering the kernel. If CFMP is not NULL, it is used to return the
500 * "current frame mask" that was active at the time the kernel was
501 * entered.
502 */
503unsigned long
504ia64_get_user_rbs_end (struct task_struct *child, struct pt_regs *pt,
505 unsigned long *cfmp)
506{
507 unsigned long *krbs, *bspstore, cfm = pt->cr_ifs;
508 long ndirty;
509
510 krbs = (unsigned long *) child + IA64_RBS_OFFSET/8;
511 bspstore = (unsigned long *) pt->ar_bspstore;
512 ndirty = ia64_rse_num_regs(krbs, krbs + (pt->loadrs >> 19));
513
514 if (in_syscall(pt))
515 ndirty += (cfm & 0x7f);
516 else
517 cfm &= ~(1UL << 63); /* clear valid bit */
518
519 if (cfmp)
520 *cfmp = cfm;
521 return (unsigned long) ia64_rse_skip_regs(bspstore, ndirty);
522}
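/*
 * Illustrative sketch (not part of the original patch): the calling
 * convention implied by the comments on ia64_peek() and
 * ia64_get_user_rbs_end() above.  A caller first computes the end of the
 * user-level backing store and then hands it to ia64_peek(), just as the
 * PTRACE_PEEKTEXT/PTRACE_PEEKDATA cases in sys_ptrace() below do.
 * peek_user_word() is a name local to this sketch.
 */
static long
peek_user_word (struct task_struct *child, struct switch_stack *sw,
		unsigned long addr, long *val)
{
	struct pt_regs *pt = ia64_task_regs(child);
	unsigned long urbs_end = ia64_get_user_rbs_end(child, pt, NULL);

	return ia64_peek(child, sw, urbs_end, addr, val);
}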
523
524/*
525 * Synchronize (i.e., write) the RSE backing store living in kernel
526 * space to the VM of the CHILD task.  SW is a pointer to the task's
527 * switch_stack structure.  USER_RBS_START and USER_RBS_END delimit
528 * the portion of the user-level backing store that is written back
529 * to user space.
530 */
531long
532ia64_sync_user_rbs (struct task_struct *child, struct switch_stack *sw,
533 unsigned long user_rbs_start, unsigned long user_rbs_end)
534{
535 unsigned long addr, val;
536 long ret;
537
538 /* now copy word for word from kernel rbs to user rbs: */
539 for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) {
540 ret = ia64_peek(child, sw, user_rbs_end, addr, &val);
541 if (ret < 0)
542 return ret;
543 if (access_process_vm(child, addr, &val, sizeof(val), 1)
544 != sizeof(val))
545 return -EIO;
546 }
547 return 0;
548}
549
550static inline int
551thread_matches (struct task_struct *thread, unsigned long addr)
552{
553 unsigned long thread_rbs_end;
554 struct pt_regs *thread_regs;
555
556 if (ptrace_check_attach(thread, 0) < 0)
557 /*
558 * If the thread is not in an attachable state, we'll
559 * ignore it. The net effect is that if ADDR happens
560 * to overlap with the portion of the thread's
561 * register backing store that is currently residing
562 * on the thread's kernel stack, then ptrace() may end
563 * up accessing a stale value. But if the thread
564 * isn't stopped, that's a problem anyhow, so we're
565 * doing as well as we can...
566 */
567 return 0;
568
569 thread_regs = ia64_task_regs(thread);
570 thread_rbs_end = ia64_get_user_rbs_end(thread, thread_regs, NULL);
571 if (!on_kernel_rbs(addr, thread_regs->ar_bspstore, thread_rbs_end))
572 return 0;
573
574 return 1; /* looks like we've got a winner */
575}
576
577/*
578 * GDB apparently wants to be able to read the register-backing store
579 * of any thread when attached to a given process. If we are peeking
580 * or poking an address that happens to reside in the kernel-backing
581 * store of another thread, we need to attach to that thread, because
582 * otherwise we end up accessing stale data.
583 *
584 * task_list_lock must be read-locked before calling this routine!
585 */
586static struct task_struct *
587find_thread_for_addr (struct task_struct *child, unsigned long addr)
588{
589 struct task_struct *g, *p;
590 struct mm_struct *mm;
591 int mm_users;
592
593 if (!(mm = get_task_mm(child)))
594 return child;
595
596 /* -1 because of our get_task_mm(): */
597 mm_users = atomic_read(&mm->mm_users) - 1;
598 if (mm_users <= 1)
599 goto out; /* not multi-threaded */
600
601 /*
602 * First, traverse the child's thread-list. Good for scalability with
603 * NPTL-threads.
604 */
605 p = child;
606 do {
607 if (thread_matches(p, addr)) {
608 child = p;
609 goto out;
610 }
611 if (mm_users-- <= 1)
612 goto out;
613 } while ((p = next_thread(p)) != child);
614
615 do_each_thread(g, p) {
616 if (p->mm != mm)
617 continue;
618
619 if (thread_matches(p, addr)) {
620 child = p;
621 goto out;
622 }
623 } while_each_thread(g, p);
624 out:
625 mmput(mm);
626 return child;
627}
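/*
 * Illustrative sketch (not part of the original patch): the locking
 * protocol described above.  find_thread_for_addr() must be called with
 * tasklist_lock read-locked, and the chosen task must be pinned with
 * get_task_struct() before the lock is dropped; this mirrors what
 * sys_ptrace() below does for the PEEK/POKE requests.
 * lookup_rbs_thread() is a name local to this sketch.
 */
static struct task_struct *
lookup_rbs_thread (pid_t pid, unsigned long addr)
{
	struct task_struct *child;

	read_lock(&tasklist_lock);
	child = find_task_by_pid(pid);
	if (child) {
		child = find_thread_for_addr(child, addr);
		get_task_struct(child);
	}
	read_unlock(&tasklist_lock);
	return child;
}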
628
629/*
630 * Write f32-f127 back to task->thread.fph if it has been modified.
631 */
632inline void
633ia64_flush_fph (struct task_struct *task)
634{
635 struct ia64_psr *psr = ia64_psr(ia64_task_regs(task));
636
637 if (ia64_is_local_fpu_owner(task) && psr->mfh) {
638 psr->mfh = 0;
639 task->thread.flags |= IA64_THREAD_FPH_VALID;
640 ia64_save_fpu(&task->thread.fph[0]);
641 }
642}
643
644/*
645 * Sync the fph state of the task so that it can be manipulated
646 * through thread.fph. If necessary, f32-f127 are written back to
647 * thread.fph or, if the fph state hasn't been used before, thread.fph
648 * is cleared to zeroes. Also, access to f32-f127 is disabled to
649 * ensure that the task picks up the state from thread.fph when it
650 * executes again.
651 */
652void
653ia64_sync_fph (struct task_struct *task)
654{
655 struct ia64_psr *psr = ia64_psr(ia64_task_regs(task));
656
657 ia64_flush_fph(task);
658 if (!(task->thread.flags & IA64_THREAD_FPH_VALID)) {
659 task->thread.flags |= IA64_THREAD_FPH_VALID;
660 memset(&task->thread.fph, 0, sizeof(task->thread.fph));
661 }
662 ia64_drop_fpu(task);
663 psr->dfh = 1;
664}
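/*
 * Illustrative sketch (not part of the original patch): how the two helpers
 * above are meant to bracket accesses to thread.fph, mirroring the fph
 * handling in access_uarea(), ptrace_getregs() and ptrace_setregs() below.
 * read_fph_reg()/write_fph_reg() are names local to this sketch; regnum is
 * assumed to be in the range 32..127.
 */
static void
read_fph_reg (struct task_struct *task, int regnum, struct ia64_fpreg *val)
{
	ia64_flush_fph(task);		/* spill live f32-f127 into thread.fph */
	*val = task->thread.fph[regnum - 32];
}

static void
write_fph_reg (struct task_struct *task, int regnum, struct ia64_fpreg *val)
{
	ia64_sync_fph(task);		/* make thread.fph the authoritative copy */
	task->thread.fph[regnum - 32] = *val;
}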
665
666static int
667access_fr (struct unw_frame_info *info, int regnum, int hi,
668 unsigned long *data, int write_access)
669{
670 struct ia64_fpreg fpval;
671 int ret;
672
673 ret = unw_get_fr(info, regnum, &fpval);
674 if (ret < 0)
675 return ret;
676
677 if (write_access) {
678 fpval.u.bits[hi] = *data;
679 ret = unw_set_fr(info, regnum, fpval);
680 } else
681 *data = fpval.u.bits[hi];
682 return ret;
683}
684
685/*
686 * Change the machine-state of CHILD such that it will return via the normal
687 * kernel exit-path, rather than the syscall-exit path.
688 */
689static void
690convert_to_non_syscall (struct task_struct *child, struct pt_regs *pt,
691 unsigned long cfm)
692{
693 struct unw_frame_info info, prev_info;
694 unsigned long ip, pr;
695
696 unw_init_from_blocked_task(&info, child);
697 while (1) {
698 prev_info = info;
699 if (unw_unwind(&info) < 0)
700 return;
701 if (unw_get_rp(&info, &ip) < 0)
702 return;
703 if (ip < FIXADDR_USER_END)
704 break;
705 }
706
707 unw_get_pr(&prev_info, &pr);
708 pr &= ~(1UL << PRED_SYSCALL);
709 pr |= (1UL << PRED_NON_SYSCALL);
710 unw_set_pr(&prev_info, pr);
711
712 pt->cr_ifs = (1UL << 63) | cfm;
713}
714
715static int
716access_nat_bits (struct task_struct *child, struct pt_regs *pt,
717 struct unw_frame_info *info,
718 unsigned long *data, int write_access)
719{
720 unsigned long regnum, nat_bits, scratch_unat, dummy = 0;
721 char nat = 0;
722
723 if (write_access) {
724 nat_bits = *data;
725 scratch_unat = ia64_put_scratch_nat_bits(pt, nat_bits);
726 if (unw_set_ar(info, UNW_AR_UNAT, scratch_unat) < 0) {
727 dprintk("ptrace: failed to set ar.unat\n");
728 return -1;
729 }
730 for (regnum = 4; regnum <= 7; ++regnum) {
731 unw_get_gr(info, regnum, &dummy, &nat);
732 unw_set_gr(info, regnum, dummy,
733 (nat_bits >> regnum) & 1);
734 }
735 } else {
736 if (unw_get_ar(info, UNW_AR_UNAT, &scratch_unat) < 0) {
737 dprintk("ptrace: failed to read ar.unat\n");
738 return -1;
739 }
740 nat_bits = ia64_get_scratch_nat_bits(pt, scratch_unat);
741 for (regnum = 4; regnum <= 7; ++regnum) {
742 unw_get_gr(info, regnum, &dummy, &nat);
743 nat_bits |= (nat != 0) << regnum;
744 }
745 *data = nat_bits;
746 }
747 return 0;
748}
749
750static int
751access_uarea (struct task_struct *child, unsigned long addr,
752 unsigned long *data, int write_access)
753{
754 unsigned long *ptr, regnum, urbs_end, rnat_addr, cfm;
755 struct switch_stack *sw;
756 struct pt_regs *pt;
757# define pt_reg_addr(pt, reg) ((void *) \
758 ((unsigned long) (pt) \
759 + offsetof(struct pt_regs, reg)))
760
761
762 pt = ia64_task_regs(child);
763 sw = (struct switch_stack *) (child->thread.ksp + 16);
764
765 if ((addr & 0x7) != 0) {
766 dprintk("ptrace: unaligned register address 0x%lx\n", addr);
767 return -1;
768 }
769
770 if (addr < PT_F127 + 16) {
771 /* accessing fph */
772 if (write_access)
773 ia64_sync_fph(child);
774 else
775 ia64_flush_fph(child);
776 ptr = (unsigned long *)
777 ((unsigned long) &child->thread.fph + addr);
778 } else if ((addr >= PT_F10) && (addr < PT_F11 + 16)) {
779 /* scratch registers untouched by kernel (saved in pt_regs) */
780 ptr = pt_reg_addr(pt, f10) + (addr - PT_F10);
781 } else if (addr >= PT_F12 && addr < PT_F15 + 16) {
782 /*
783 * Scratch registers untouched by kernel (saved in
784 * switch_stack).
785 */
786 ptr = (unsigned long *) ((long) sw
787 + (addr - PT_NAT_BITS - 32));
788 } else if (addr < PT_AR_LC + 8) {
789 /* preserved state: */
790 struct unw_frame_info info;
791 char nat = 0;
792 int ret;
793
794 unw_init_from_blocked_task(&info, child);
795 if (unw_unwind_to_user(&info) < 0)
796 return -1;
797
798 switch (addr) {
799 case PT_NAT_BITS:
800 return access_nat_bits(child, pt, &info,
801 data, write_access);
802
803 case PT_R4: case PT_R5: case PT_R6: case PT_R7:
804 if (write_access) {
805 /* read NaT bit first: */
806 unsigned long dummy;
807
808 ret = unw_get_gr(&info, (addr - PT_R4)/8 + 4,
809 &dummy, &nat);
810 if (ret < 0)
811 return ret;
812 }
813 return unw_access_gr(&info, (addr - PT_R4)/8 + 4, data,
814 &nat, write_access);
815
816 case PT_B1: case PT_B2: case PT_B3:
817 case PT_B4: case PT_B5:
818 return unw_access_br(&info, (addr - PT_B1)/8 + 1, data,
819 write_access);
820
821 case PT_AR_EC:
822 return unw_access_ar(&info, UNW_AR_EC, data,
823 write_access);
824
825 case PT_AR_LC:
826 return unw_access_ar(&info, UNW_AR_LC, data,
827 write_access);
828
829 default:
830 if (addr >= PT_F2 && addr < PT_F5 + 16)
831 return access_fr(&info, (addr - PT_F2)/16 + 2,
832 (addr & 8) != 0, data,
833 write_access);
834 else if (addr >= PT_F16 && addr < PT_F31 + 16)
835 return access_fr(&info,
836 (addr - PT_F16)/16 + 16,
837 (addr & 8) != 0,
838 data, write_access);
839 else {
840 dprintk("ptrace: rejecting access to register "
841 "address 0x%lx\n", addr);
842 return -1;
843 }
844 }
845 } else if (addr < PT_F9+16) {
846 /* scratch state */
847 switch (addr) {
848 case PT_AR_BSP:
849 /*
850 * By convention, we use PT_AR_BSP to refer to
851 * the end of the user-level backing store.
852 * Use ia64_rse_skip_regs(PT_AR_BSP, -CFM.sof)
853 * to get the real value of ar.bsp at the time
854 * the kernel was entered.
855 *
856 * Furthermore, when changing the contents of
857 * PT_AR_BSP (or PT_CFM) we MUST copy any
858 * user-level stacked registers that are
859 * stored on the kernel stack back to
860 * user-space because otherwise, we might end
861 * up clobbering kernel stacked registers.
862 * Also, if this happens while the task is
863 * blocked in a system call, we convert its
864 * state so that the non-system-call exit
865 * path is used. This ensures that the proper
866 * state will be picked up when resuming
867 * execution. However, it *also* means that
868 * once we write PT_AR_BSP/PT_CFM, it won't be
869 * possible to modify the syscall arguments of
870 * the pending system call any longer. This
871 * shouldn't be an issue because modifying
872 * PT_AR_BSP/PT_CFM generally implies that
873 * we're either abandoning the pending system
874 * call or that we defer its re-execution (e.g.,
875 * due to GDB doing an inferior function call);
876 * see the illustrative sketch after access_uarea().
877 */
878 urbs_end = ia64_get_user_rbs_end(child, pt, &cfm);
879 if (write_access) {
880 if (*data != urbs_end) {
881 if (ia64_sync_user_rbs(child, sw,
882 pt->ar_bspstore,
883 urbs_end) < 0)
884 return -1;
885 if (in_syscall(pt))
886 convert_to_non_syscall(child,
887 pt,
888 cfm);
889 /*
890 * Simulate user-level write
891 * of ar.bsp:
892 */
893 pt->loadrs = 0;
894 pt->ar_bspstore = *data;
895 }
896 } else
897 *data = urbs_end;
898 return 0;
899
900 case PT_CFM:
901 urbs_end = ia64_get_user_rbs_end(child, pt, &cfm);
902 if (write_access) {
903 if (((cfm ^ *data) & PFM_MASK) != 0) {
904 if (ia64_sync_user_rbs(child, sw,
905 pt->ar_bspstore,
906 urbs_end) < 0)
907 return -1;
908 if (in_syscall(pt))
909 convert_to_non_syscall(child,
910 pt,
911 cfm);
912 pt->cr_ifs = ((pt->cr_ifs & ~PFM_MASK)
913 | (*data & PFM_MASK));
914 }
915 } else
916 *data = cfm;
917 return 0;
918
919 case PT_CR_IPSR:
920 if (write_access)
921 pt->cr_ipsr = ((*data & IPSR_MASK)
922 | (pt->cr_ipsr & ~IPSR_MASK));
923 else
924 *data = (pt->cr_ipsr & IPSR_MASK);
925 return 0;
926
927 case PT_AR_RNAT:
928 urbs_end = ia64_get_user_rbs_end(child, pt, NULL);
929 rnat_addr = (long) ia64_rse_rnat_addr((long *)
930 urbs_end);
931 if (write_access)
932 return ia64_poke(child, sw, urbs_end,
933 rnat_addr, *data);
934 else
935 return ia64_peek(child, sw, urbs_end,
936 rnat_addr, data);
937
938 case PT_R1:
939 ptr = pt_reg_addr(pt, r1);
940 break;
941 case PT_R2: case PT_R3:
942 ptr = pt_reg_addr(pt, r2) + (addr - PT_R2);
943 break;
944 case PT_R8: case PT_R9: case PT_R10: case PT_R11:
945 ptr = pt_reg_addr(pt, r8) + (addr - PT_R8);
946 break;
947 case PT_R12: case PT_R13:
948 ptr = pt_reg_addr(pt, r12) + (addr - PT_R12);
949 break;
950 case PT_R14:
951 ptr = pt_reg_addr(pt, r14);
952 break;
953 case PT_R15:
954 ptr = pt_reg_addr(pt, r15);
955 break;
956 case PT_R16: case PT_R17: case PT_R18: case PT_R19:
957 case PT_R20: case PT_R21: case PT_R22: case PT_R23:
958 case PT_R24: case PT_R25: case PT_R26: case PT_R27:
959 case PT_R28: case PT_R29: case PT_R30: case PT_R31:
960 ptr = pt_reg_addr(pt, r16) + (addr - PT_R16);
961 break;
962 case PT_B0:
963 ptr = pt_reg_addr(pt, b0);
964 break;
965 case PT_B6:
966 ptr = pt_reg_addr(pt, b6);
967 break;
968 case PT_B7:
969 ptr = pt_reg_addr(pt, b7);
970 break;
971 case PT_F6: case PT_F6+8: case PT_F7: case PT_F7+8:
972 case PT_F8: case PT_F8+8: case PT_F9: case PT_F9+8:
973 ptr = pt_reg_addr(pt, f6) + (addr - PT_F6);
974 break;
975 case PT_AR_BSPSTORE:
976 ptr = pt_reg_addr(pt, ar_bspstore);
977 break;
978 case PT_AR_RSC:
979 ptr = pt_reg_addr(pt, ar_rsc);
980 break;
981 case PT_AR_UNAT:
982 ptr = pt_reg_addr(pt, ar_unat);
983 break;
984 case PT_AR_PFS:
985 ptr = pt_reg_addr(pt, ar_pfs);
986 break;
987 case PT_AR_CCV:
988 ptr = pt_reg_addr(pt, ar_ccv);
989 break;
990 case PT_AR_FPSR:
991 ptr = pt_reg_addr(pt, ar_fpsr);
992 break;
993 case PT_CR_IIP:
994 ptr = pt_reg_addr(pt, cr_iip);
995 break;
996 case PT_PR:
997 ptr = pt_reg_addr(pt, pr);
998 break;
999 /* scratch register */
1000
1001 default:
1002 /* disallow accessing anything else... */
1003 dprintk("ptrace: rejecting access to register "
1004 "address 0x%lx\n", addr);
1005 return -1;
1006 }
1007 } else if (addr <= PT_AR_SSD) {
1008 ptr = pt_reg_addr(pt, ar_csd) + (addr - PT_AR_CSD);
1009 } else {
1010 /* access debug registers */
1011
1012 if (addr >= PT_IBR) {
1013 regnum = (addr - PT_IBR) >> 3;
1014 ptr = &child->thread.ibr[0];
1015 } else {
1016 regnum = (addr - PT_DBR) >> 3;
1017 ptr = &child->thread.dbr[0];
1018 }
1019
1020 if (regnum >= 8) {
1021 dprintk("ptrace: rejecting access to register "
1022 "address 0x%lx\n", addr);
1023 return -1;
1024 }
1025#ifdef CONFIG_PERFMON
1026 /*
1027 * Check if debug registers are used by perfmon. This
1028 * test must be done once we know that we can do the
1029 * operation, i.e. the arguments are all valid, but
1030 * before we start modifying the state.
1031 *
1032 * Perfmon needs to keep a count of how many processes
1033 * are trying to modify the debug registers for
1034 * system-wide monitoring sessions.
1035 *
1036 * We also include read access here, because reads may
1037 * cause the PMU-installed debug register state
1038 * (dbr[], ibr[]) to be reset. The two arrays are also
1039 * used by perfmon, but we do not use
1040 * IA64_THREAD_DBG_VALID. The registers are restored
1041 * by the PMU context switch code.
1042 */
1043 if (pfm_use_debug_registers(child)) return -1;
1044#endif
1045
1046 if (!(child->thread.flags & IA64_THREAD_DBG_VALID)) {
1047 child->thread.flags |= IA64_THREAD_DBG_VALID;
1048 memset(child->thread.dbr, 0,
1049 sizeof(child->thread.dbr));
1050 memset(child->thread.ibr, 0,
1051 sizeof(child->thread.ibr));
1052 }
1053
1054 ptr += regnum;
1055
1056 if ((regnum & 1) && write_access) {
1057 /* don't let the user set kernel-level breakpoints: */
1058 *ptr = *data & ~(7UL << 56);
1059 return 0;
1060 }
1061 }
1062 if (write_access)
1063 *ptr = *data;
1064 else
1065 *data = *ptr;
1066 return 0;
1067}
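/*
 * Illustrative sketch (not part of the original patch), referenced by the
 * PT_AR_BSP comment inside access_uarea() above: PT_AR_BSP holds the end of
 * the user-level backing store, so the value ar.bsp had when the kernel was
 * entered is recovered by stepping back over the CFM.sof registers of the
 * current frame.  real_bsp_at_entry() is a name local to this sketch.
 */
static unsigned long
real_bsp_at_entry (unsigned long pt_ar_bsp, unsigned long cfm)
{
	long sof = cfm & 0x7f;	/* size of the current register frame */

	return (unsigned long)
		ia64_rse_skip_regs((unsigned long *) pt_ar_bsp, -sof);
}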
1068
1069static long
1070ptrace_getregs (struct task_struct *child, struct pt_all_user_regs __user *ppr)
1071{
1072 unsigned long psr, ec, lc, rnat, bsp, cfm, nat_bits, val;
1073 struct unw_frame_info info;
1074 struct ia64_fpreg fpval;
1075 struct switch_stack *sw;
1076 struct pt_regs *pt;
1077 long ret, retval = 0;
1078 char nat = 0;
1079 int i;
1080
1081 if (!access_ok(VERIFY_WRITE, ppr, sizeof(struct pt_all_user_regs)))
1082 return -EIO;
1083
1084 pt = ia64_task_regs(child);
1085 sw = (struct switch_stack *) (child->thread.ksp + 16);
1086 unw_init_from_blocked_task(&info, child);
1087 if (unw_unwind_to_user(&info) < 0) {
1088 return -EIO;
1089 }
1090
1091 if (((unsigned long) ppr & 0x7) != 0) {
1092 dprintk("ptrace: unaligned register address %p\n", ppr);
1093 return -EIO;
1094 }
1095
1096 if (access_uarea(child, PT_CR_IPSR, &psr, 0) < 0
1097 || access_uarea(child, PT_AR_EC, &ec, 0) < 0
1098 || access_uarea(child, PT_AR_LC, &lc, 0) < 0
1099 || access_uarea(child, PT_AR_RNAT, &rnat, 0) < 0
1100 || access_uarea(child, PT_AR_BSP, &bsp, 0) < 0
1101 || access_uarea(child, PT_CFM, &cfm, 0) < 0
1102 || access_uarea(child, PT_NAT_BITS, &nat_bits, 0) < 0)
1103 return -EIO;
1104
1105 /* control regs */
1106
1107 retval |= __put_user(pt->cr_iip, &ppr->cr_iip);
1108 retval |= __put_user(psr, &ppr->cr_ipsr);
1109
1110 /* app regs */
1111
1112 retval |= __put_user(pt->ar_pfs, &ppr->ar[PT_AUR_PFS]);
1113 retval |= __put_user(pt->ar_rsc, &ppr->ar[PT_AUR_RSC]);
1114 retval |= __put_user(pt->ar_bspstore, &ppr->ar[PT_AUR_BSPSTORE]);
1115 retval |= __put_user(pt->ar_unat, &ppr->ar[PT_AUR_UNAT]);
1116 retval |= __put_user(pt->ar_ccv, &ppr->ar[PT_AUR_CCV]);
1117 retval |= __put_user(pt->ar_fpsr, &ppr->ar[PT_AUR_FPSR]);
1118
1119 retval |= __put_user(ec, &ppr->ar[PT_AUR_EC]);
1120 retval |= __put_user(lc, &ppr->ar[PT_AUR_LC]);
1121 retval |= __put_user(rnat, &ppr->ar[PT_AUR_RNAT]);
1122 retval |= __put_user(bsp, &ppr->ar[PT_AUR_BSP]);
1123 retval |= __put_user(cfm, &ppr->cfm);
1124
1125 /* gr1-gr3 */
1126
1127 retval |= __copy_to_user(&ppr->gr[1], &pt->r1, sizeof(long));
1128 retval |= __copy_to_user(&ppr->gr[2], &pt->r2, sizeof(long) *2);
1129
1130 /* gr4-gr7 */
1131
1132 for (i = 4; i < 8; i++) {
1133 if (unw_access_gr(&info, i, &val, &nat, 0) < 0)
1134 return -EIO;
1135 retval |= __put_user(val, &ppr->gr[i]);
1136 }
1137
1138 /* gr8-gr11 */
1139
1140 retval |= __copy_to_user(&ppr->gr[8], &pt->r8, sizeof(long) * 4);
1141
1142 /* gr12-gr15 */
1143
1144 retval |= __copy_to_user(&ppr->gr[12], &pt->r12, sizeof(long) * 2);
1145 retval |= __copy_to_user(&ppr->gr[14], &pt->r14, sizeof(long));
1146 retval |= __copy_to_user(&ppr->gr[15], &pt->r15, sizeof(long));
1147
1148 /* gr16-gr31 */
1149
1150 retval |= __copy_to_user(&ppr->gr[16], &pt->r16, sizeof(long) * 16);
1151
1152 /* b0 */
1153
1154 retval |= __put_user(pt->b0, &ppr->br[0]);
1155
1156 /* b1-b5 */
1157
1158 for (i = 1; i < 6; i++) {
1159 if (unw_access_br(&info, i, &val, 0) < 0)
1160 return -EIO;
1161 retval |= __put_user(val, &ppr->br[i]);
1162 }
1163
1164 /* b6-b7 */
1165
1166 retval |= __put_user(pt->b6, &ppr->br[6]);
1167 retval |= __put_user(pt->b7, &ppr->br[7]);
1168
1169 /* fr2-fr5 */
1170
1171 for (i = 2; i < 6; i++) {
1172 if (unw_get_fr(&info, i, &fpval) < 0)
1173 return -EIO;
1174 retval |= __copy_to_user(&ppr->fr[i], &fpval, sizeof (fpval));
1175 }
1176
1177 /* fr6-fr11 */
1178
1179 retval |= __copy_to_user(&ppr->fr[6], &pt->f6,
1180 sizeof(struct ia64_fpreg) * 6);
1181
1182 /* fp scratch regs(12-15) */
1183
1184 retval |= __copy_to_user(&ppr->fr[12], &sw->f12,
1185 sizeof(struct ia64_fpreg) * 4);
1186
1187 /* fr16-fr31 */
1188
1189 for (i = 16; i < 32; i++) {
1190 if (unw_get_fr(&info, i, &fpval) < 0)
1191 return -EIO;
1192 retval |= __copy_to_user(&ppr->fr[i], &fpval, sizeof (fpval));
1193 }
1194
1195 /* fph */
1196
1197 ia64_flush_fph(child);
1198 retval |= __copy_to_user(&ppr->fr[32], &child->thread.fph,
1199 sizeof(ppr->fr[32]) * 96);
1200
1201 /* preds */
1202
1203 retval |= __put_user(pt->pr, &ppr->pr);
1204
1205 /* nat bits */
1206
1207 retval |= __put_user(nat_bits, &ppr->nat);
1208
1209 ret = retval ? -EIO : 0;
1210 return ret;
1211}
1212
1213static long
1214ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr)
1215{
1216 unsigned long psr, ec, lc, rnat, bsp, cfm, nat_bits, val = 0;
1217 struct unw_frame_info info;
1218 struct switch_stack *sw;
1219 struct ia64_fpreg fpval;
1220 struct pt_regs *pt;
1221 long ret, retval = 0;
1222 int i;
1223
1224 memset(&fpval, 0, sizeof(fpval));
1225
1226 if (!access_ok(VERIFY_READ, ppr, sizeof(struct pt_all_user_regs)))
1227 return -EIO;
1228
1229 pt = ia64_task_regs(child);
1230 sw = (struct switch_stack *) (child->thread.ksp + 16);
1231 unw_init_from_blocked_task(&info, child);
1232 if (unw_unwind_to_user(&info) < 0) {
1233 return -EIO;
1234 }
1235
1236 if (((unsigned long) ppr & 0x7) != 0) {
1237 dprintk("ptrace: unaligned register address %p\n", ppr);
1238 return -EIO;
1239 }
1240
1241 /* control regs */
1242
1243 retval |= __get_user(pt->cr_iip, &ppr->cr_iip);
1244 retval |= __get_user(psr, &ppr->cr_ipsr);
1245
1246 /* app regs */
1247
1248 retval |= __get_user(pt->ar_pfs, &ppr->ar[PT_AUR_PFS]);
1249 retval |= __get_user(pt->ar_rsc, &ppr->ar[PT_AUR_RSC]);
1250 retval |= __get_user(pt->ar_bspstore, &ppr->ar[PT_AUR_BSPSTORE]);
1251 retval |= __get_user(pt->ar_unat, &ppr->ar[PT_AUR_UNAT]);
1252 retval |= __get_user(pt->ar_ccv, &ppr->ar[PT_AUR_CCV]);
1253 retval |= __get_user(pt->ar_fpsr, &ppr->ar[PT_AUR_FPSR]);
1254
1255 retval |= __get_user(ec, &ppr->ar[PT_AUR_EC]);
1256 retval |= __get_user(lc, &ppr->ar[PT_AUR_LC]);
1257 retval |= __get_user(rnat, &ppr->ar[PT_AUR_RNAT]);
1258 retval |= __get_user(bsp, &ppr->ar[PT_AUR_BSP]);
1259 retval |= __get_user(cfm, &ppr->cfm);
1260
1261 /* gr1-gr3 */
1262
1263 retval |= __copy_from_user(&pt->r1, &ppr->gr[1], sizeof(long));
1264 retval |= __copy_from_user(&pt->r2, &ppr->gr[2], sizeof(long) * 2);
1265
1266 /* gr4-gr7 */
1267
1268 for (i = 4; i < 8; i++) {
1269 retval |= __get_user(val, &ppr->gr[i]);
1270 /* NaT bit will be set via PT_NAT_BITS: */
1271 if (unw_set_gr(&info, i, val, 0) < 0)
1272 return -EIO;
1273 }
1274
1275 /* gr8-gr11 */
1276
1277 retval |= __copy_from_user(&pt->r8, &ppr->gr[8], sizeof(long) * 4);
1278
1279 /* gr12-gr15 */
1280
1281 retval |= __copy_from_user(&pt->r12, &ppr->gr[12], sizeof(long) * 2);
1282 retval |= __copy_from_user(&pt->r14, &ppr->gr[14], sizeof(long));
1283 retval |= __copy_from_user(&pt->r15, &ppr->gr[15], sizeof(long));
1284
1285 /* gr16-gr31 */
1286
1287 retval |= __copy_from_user(&pt->r16, &ppr->gr[16], sizeof(long) * 16);
1288
1289 /* b0 */
1290
1291 retval |= __get_user(pt->b0, &ppr->br[0]);
1292
1293 /* b1-b5 */
1294
1295 for (i = 1; i < 6; i++) {
1296 retval |= __get_user(val, &ppr->br[i]);
1297 unw_set_br(&info, i, val);
1298 }
1299
1300 /* b6-b7 */
1301
1302 retval |= __get_user(pt->b6, &ppr->br[6]);
1303 retval |= __get_user(pt->b7, &ppr->br[7]);
1304
1305 /* fr2-fr5 */
1306
1307 for (i = 2; i < 6; i++) {
1308 retval |= __copy_from_user(&fpval, &ppr->fr[i], sizeof(fpval));
1309 if (unw_set_fr(&info, i, fpval) < 0)
1310 return -EIO;
1311 }
1312
1313 /* fr6-fr11 */
1314
1315 retval |= __copy_from_user(&pt->f6, &ppr->fr[6],
1316 sizeof(ppr->fr[6]) * 6);
1317
1318 /* fp scratch regs(12-15) */
1319
1320 retval |= __copy_from_user(&sw->f12, &ppr->fr[12],
1321 sizeof(ppr->fr[12]) * 4);
1322
1323 /* fr16-fr31 */
1324
1325 for (i = 16; i < 32; i++) {
1326 retval |= __copy_from_user(&fpval, &ppr->fr[i],
1327 sizeof(fpval));
1328 if (unw_set_fr(&info, i, fpval) < 0)
1329 return -EIO;
1330 }
1331
1332 /* fph */
1333
1334 ia64_sync_fph(child);
1335 retval |= __copy_from_user(&child->thread.fph, &ppr->fr[32],
1336 sizeof(ppr->fr[32]) * 96);
1337
1338 /* preds */
1339
1340 retval |= __get_user(pt->pr, &ppr->pr);
1341
1342 /* nat bits */
1343
1344 retval |= __get_user(nat_bits, &ppr->nat);
1345
1346 retval |= access_uarea(child, PT_CR_IPSR, &psr, 1);
1347 retval |= access_uarea(child, PT_AR_EC, &ec, 1);
1348 retval |= access_uarea(child, PT_AR_LC, &lc, 1);
1349 retval |= access_uarea(child, PT_AR_RNAT, &rnat, 1);
1350 retval |= access_uarea(child, PT_AR_BSP, &bsp, 1);
1351 retval |= access_uarea(child, PT_CFM, &cfm, 1);
1352 retval |= access_uarea(child, PT_NAT_BITS, &nat_bits, 1);
1353
1354 ret = retval ? -EIO : 0;
1355 return ret;
1356}
1357
1358/*
1359 * Called by kernel/ptrace.c when detaching..
1360 *
1361 * Make sure the single step bit is not set.
1362 */
1363void
1364ptrace_disable (struct task_struct *child)
1365{
1366 struct ia64_psr *child_psr = ia64_psr(ia64_task_regs(child));
1367
1368 /* make sure the single step/taken-branch trap bits are not set: */
1369 child_psr->ss = 0;
1370 child_psr->tb = 0;
1371}
1372
1373asmlinkage long
1374sys_ptrace (long request, pid_t pid, unsigned long addr, unsigned long data)
1375{
1376 struct pt_regs *pt;
1377 unsigned long urbs_end, peek_or_poke;
1378 struct task_struct *child;
1379 struct switch_stack *sw;
1380 long ret;
1381
1382 lock_kernel();
1383 ret = -EPERM;
1384 if (request == PTRACE_TRACEME) {
1385 /* are we already being traced? */
1386 if (current->ptrace & PT_PTRACED)
1387 goto out;
1388 ret = security_ptrace(current->parent, current);
1389 if (ret)
1390 goto out;
1391 current->ptrace |= PT_PTRACED;
1392 ret = 0;
1393 goto out;
1394 }
1395
1396 peek_or_poke = (request == PTRACE_PEEKTEXT
1397 || request == PTRACE_PEEKDATA
1398 || request == PTRACE_POKETEXT
1399 || request == PTRACE_POKEDATA);
1400 ret = -ESRCH;
1401 read_lock(&tasklist_lock);
1402 {
1403 child = find_task_by_pid(pid);
1404 if (child) {
1405 if (peek_or_poke)
1406 child = find_thread_for_addr(child, addr);
1407 get_task_struct(child);
1408 }
1409 }
1410 read_unlock(&tasklist_lock);
1411 if (!child)
1412 goto out;
1413 ret = -EPERM;
1414 if (pid == 1) /* no messing around with init! */
1415 goto out_tsk;
1416
1417 if (request == PTRACE_ATTACH) {
1418 ret = ptrace_attach(child);
1419 goto out_tsk;
1420 }
1421
1422 ret = ptrace_check_attach(child, request == PTRACE_KILL);
1423 if (ret < 0)
1424 goto out_tsk;
1425
1426 pt = ia64_task_regs(child);
1427 sw = (struct switch_stack *) (child->thread.ksp + 16);
1428
1429 switch (request) {
1430 case PTRACE_PEEKTEXT:
1431 case PTRACE_PEEKDATA:
1432 /* read word at location addr */
1433 urbs_end = ia64_get_user_rbs_end(child, pt, NULL);
1434 ret = ia64_peek(child, sw, urbs_end, addr, &data);
1435 if (ret == 0) {
1436 ret = data;
1437 /* ensure "ret" is not mistaken as an error code: */
1438 force_successful_syscall_return();
1439 }
1440 goto out_tsk;
1441
1442 case PTRACE_POKETEXT:
1443 case PTRACE_POKEDATA:
1444 /* write the word at location addr */
1445 urbs_end = ia64_get_user_rbs_end(child, pt, NULL);
1446 ret = ia64_poke(child, sw, urbs_end, addr, data);
1447 goto out_tsk;
1448
1449 case PTRACE_PEEKUSR:
1450 /* read the word at addr in the USER area */
1451 if (access_uarea(child, addr, &data, 0) < 0) {
1452 ret = -EIO;
1453 goto out_tsk;
1454 }
1455 ret = data;
1456 /* ensure "ret" is not mistaken as an error code */
1457 force_successful_syscall_return();
1458 goto out_tsk;
1459
1460 case PTRACE_POKEUSR:
1461 /* write the word at addr in the USER area */
1462 if (access_uarea(child, addr, &data, 1) < 0) {
1463 ret = -EIO;
1464 goto out_tsk;
1465 }
1466 ret = 0;
1467 goto out_tsk;
1468
1469 case PTRACE_OLD_GETSIGINFO:
1470 /* for backwards-compatibility */
1471 ret = ptrace_request(child, PTRACE_GETSIGINFO, addr, data);
1472 goto out_tsk;
1473
1474 case PTRACE_OLD_SETSIGINFO:
1475 /* for backwards-compatibility */
1476 ret = ptrace_request(child, PTRACE_SETSIGINFO, addr, data);
1477 goto out_tsk;
1478
1479 case PTRACE_SYSCALL:
1480 /* continue and stop at next (return from) syscall */
1481 case PTRACE_CONT:
1482 /* restart after signal. */
1483 ret = -EIO;
1484 if (data > _NSIG)
1485 goto out_tsk;
1486 if (request == PTRACE_SYSCALL)
1487 set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
1488 else
1489 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
1490 child->exit_code = data;
1491
1492 /*
1493 * Make sure the single step/taken-branch trap bits
1494 * are not set:
1495 */
1496 ia64_psr(pt)->ss = 0;
1497 ia64_psr(pt)->tb = 0;
1498
1499 wake_up_process(child);
1500 ret = 0;
1501 goto out_tsk;
1502
1503 case PTRACE_KILL:
1504 /*
1505 * Make the child exit. The best we can do is send
1506 * it a SIGKILL. Perhaps the fact that it wants to
1507 * exit should be recorded in its status.
1508 */
1509 if (child->exit_state == EXIT_ZOMBIE)
1510 /* already dead */
1511 goto out_tsk;
1512 child->exit_code = SIGKILL;
1513
1514 ptrace_disable(child);
1515 wake_up_process(child);
1516 ret = 0;
1517 goto out_tsk;
1518
1519 case PTRACE_SINGLESTEP:
1520 /* let child execute for one instruction */
1521 case PTRACE_SINGLEBLOCK:
1522 ret = -EIO;
1523 if (data > _NSIG)
1524 goto out_tsk;
1525
1526 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
1527 if (request == PTRACE_SINGLESTEP) {
1528 ia64_psr(pt)->ss = 1;
1529 } else {
1530 ia64_psr(pt)->tb = 1;
1531 }
1532 child->exit_code = data;
1533
1534 /* give it a chance to run. */
1535 wake_up_process(child);
1536 ret = 0;
1537 goto out_tsk;
1538
1539 case PTRACE_DETACH:
1540 /* detach a process that was attached. */
1541 ret = ptrace_detach(child, data);
1542 goto out_tsk;
1543
1544 case PTRACE_GETREGS:
1545 ret = ptrace_getregs(child,
1546 (struct pt_all_user_regs __user *) data);
1547 goto out_tsk;
1548
1549 case PTRACE_SETREGS:
1550 ret = ptrace_setregs(child,
1551 (struct pt_all_user_regs __user *) data);
1552 goto out_tsk;
1553
1554 default:
1555 ret = ptrace_request(child, request, addr, data);
1556 goto out_tsk;
1557 }
1558 out_tsk:
1559 put_task_struct(child);
1560 out:
1561 unlock_kernel();
1562 return ret;
1563}
1564
1565
1566void
1567syscall_trace (void)
1568{
1569 if (!test_thread_flag(TIF_SYSCALL_TRACE))
1570 return;
1571 if (!(current->ptrace & PT_PTRACED))
1572 return;
1573 /*
1574 * The 0x80 provides a way for the tracing parent to
1575 * distinguish between a syscall stop and SIGTRAP delivery.
1576 */
1577 ptrace_notify(SIGTRAP
1578 | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80 : 0));
1579
1580 /*
1581 * This isn't the same as continuing with a signal, but it
1582 * will do for normal use. strace only continues with a
1583 * signal if the stopping signal is not SIGTRAP. -brl
1584 */
1585 if (current->exit_code) {
1586 send_sig(current->exit_code, current, 1);
1587 current->exit_code = 0;
1588 }
1589}
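/*
 * Illustrative sketch (not part of the original patch): what the 0x80 above
 * looks like from the tracer's side.  A debugger that requested
 * PT_TRACESYSGOOD (PTRACE_O_TRACESYSGOOD from user space) can tell a syscall
 * stop from an ordinary SIGTRAP like this.  This is user-space code, shown
 * under #if 0 so it cannot be confused with the kernel sources around it.
 */
#if 0
#include <signal.h>
#include <sys/wait.h>

static int
stopped_at_syscall (int status)
{
	return WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80);
}
#endif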
1590
1591/* "asmlinkage" so the input arguments are preserved... */
1592
1593asmlinkage void
1594syscall_trace_enter (long arg0, long arg1, long arg2, long arg3,
1595 long arg4, long arg5, long arg6, long arg7,
1596 struct pt_regs regs)
1597{
1598 long syscall;
1599
1600 if (unlikely(current->audit_context)) {
1601 if (IS_IA32_PROCESS(&regs))
1602 syscall = regs.r1;
1603 else
1604 syscall = regs.r15;
1605
1606 audit_syscall_entry(current, syscall, arg0, arg1, arg2, arg3);
1607 }
1608
1609 if (test_thread_flag(TIF_SYSCALL_TRACE)
1610 && (current->ptrace & PT_PTRACED))
1611 syscall_trace();
1612}
1613
1614/* "asmlinkage" so the input arguments are preserved... */
1615
1616asmlinkage void
1617syscall_trace_leave (long arg0, long arg1, long arg2, long arg3,
1618 long arg4, long arg5, long arg6, long arg7,
1619 struct pt_regs regs)
1620{
1621 if (unlikely(current->audit_context))
1622 audit_syscall_exit(current, regs.r8);
1623
1624 if (test_thread_flag(TIF_SYSCALL_TRACE)
1625 && (current->ptrace & PT_PTRACED))
1626 syscall_trace();
1627}
diff --git a/arch/ia64/kernel/sal.c b/arch/ia64/kernel/sal.c
new file mode 100644
index 000000000000..acc0f132f86c
--- /dev/null
+++ b/arch/ia64/kernel/sal.c
@@ -0,0 +1,302 @@
1/*
2 * System Abstraction Layer (SAL) interface routines.
3 *
4 * Copyright (C) 1998, 1999, 2001, 2003 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com>
6 * Copyright (C) 1999 VA Linux Systems
7 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
8 */
9#include <linux/config.h>
10
11#include <linux/kernel.h>
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/spinlock.h>
15#include <linux/string.h>
16
17#include <asm/page.h>
18#include <asm/sal.h>
19#include <asm/pal.h>
20
21 __cacheline_aligned DEFINE_SPINLOCK(sal_lock);
22unsigned long sal_platform_features;
23
24unsigned short sal_revision;
25unsigned short sal_version;
26
27#define SAL_MAJOR(x) ((x) >> 8)
28#define SAL_MINOR(x) ((x) & 0xff)
29
30static struct {
31 void *addr; /* function entry point */
32 void *gpval; /* gp value to use */
33} pdesc;
34
35static long
36default_handler (void)
37{
38 return -1;
39}
40
41ia64_sal_handler ia64_sal = (ia64_sal_handler) default_handler;
42ia64_sal_desc_ptc_t *ia64_ptc_domain_info;
43
44const char *
45ia64_sal_strerror (long status)
46{
47 const char *str;
48 switch (status) {
49 case 0: str = "Call completed without error"; break;
50 case 1: str = "Effect a warm boot of the system to complete "
51 "the update"; break;
52 case -1: str = "Not implemented"; break;
53 case -2: str = "Invalid argument"; break;
54 case -3: str = "Call completed with error"; break;
55 case -4: str = "Virtual address not registered"; break;
56 case -5: str = "No information available"; break;
57 case -6: str = "Insufficient space to add the entry"; break;
58 case -7: str = "Invalid entry_addr value"; break;
59 case -8: str = "Invalid interrupt vector"; break;
60 case -9: str = "Requested memory not available"; break;
61 case -10: str = "Unable to write to the NVM device"; break;
62 case -11: str = "Invalid partition type specified"; break;
63 case -12: str = "Invalid NVM_Object id specified"; break;
64 case -13: str = "NVM_Object already has the maximum number "
65 "of partitions"; break;
66 case -14: str = "Insufficient space in partition for the "
67 "requested write sub-function"; break;
68 case -15: str = "Insufficient data buffer space for the "
69 "requested read record sub-function"; break;
70 case -16: str = "Scratch buffer required for the write/delete "
71 "sub-function"; break;
72 case -17: str = "Insufficient space in the NVM_Object for the "
73 "requested create sub-function"; break;
74 case -18: str = "Invalid value specified in the partition_rec "
75 "argument"; break;
76 case -19: str = "Record oriented I/O not supported for this "
77 "partition"; break;
78 case -20: str = "Bad format of record to be written or "
79 "required keyword variable not "
80 "specified"; break;
81 default: str = "Unknown SAL status code"; break;
82 }
83 return str;
84}
85
86void __init
87ia64_sal_handler_init (void *entry_point, void *gpval)
88{
89 /* fill in the SAL procedure descriptor and point ia64_sal to it: */
90 pdesc.addr = entry_point;
91 pdesc.gpval = gpval;
92 ia64_sal = (ia64_sal_handler) &pdesc;
93}
94
95static void __init
96check_versions (struct ia64_sal_systab *systab)
97{
98 sal_revision = (systab->sal_rev_major << 8) | systab->sal_rev_minor;
99 sal_version = (systab->sal_b_rev_major << 8) | systab->sal_b_rev_minor;
100
101 /* Check for broken firmware */
102 if ((sal_revision == SAL_VERSION_CODE(49, 29))
103 && (sal_version == SAL_VERSION_CODE(49, 29)))
104 {
105 /*
106 * Old firmware for zx2000 prototypes has this weird version
107 * number; reset it to something sane.
108 */
109 sal_revision = SAL_VERSION_CODE(2, 8);
110 sal_version = SAL_VERSION_CODE(0, 0);
111 }
112}
113
114static void __init
115sal_desc_entry_point (void *p)
116{
117 struct ia64_sal_desc_entry_point *ep = p;
118 ia64_pal_handler_init(__va(ep->pal_proc));
119 ia64_sal_handler_init(__va(ep->sal_proc), __va(ep->gp));
120}
121
122#ifdef CONFIG_SMP
123static void __init
124set_smp_redirect (int flag)
125{
126#ifndef CONFIG_HOTPLUG_CPU
127 if (no_int_routing)
128 smp_int_redirect &= ~flag;
129 else
130 smp_int_redirect |= flag;
131#else
132 /*
133 * For CPU hotplug we don't want to do any chipset-supported
134 * interrupt redirection. The reason is that this would require
135 * stopping all interrupts and hard-binding each irq to a cpu;
136 * later, when the interrupt fires, we would need to set the
137 * redir hint again in the vector. This is cumbersome for
138 * something that the user-mode irq balancer will solve anyway.
139 */
140 no_int_routing=1;
141 smp_int_redirect &= ~flag;
142#endif
143}
144#else
145#define set_smp_redirect(flag) do { } while (0)
146#endif
147
148static void __init
149sal_desc_platform_feature (void *p)
150{
151 struct ia64_sal_desc_platform_feature *pf = p;
152 sal_platform_features = pf->feature_mask;
153
154 printk(KERN_INFO "SAL Platform features:");
155 if (!sal_platform_features) {
156 printk(" None\n");
157 return;
158 }
159
160 if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_BUS_LOCK)
161 printk(" BusLock");
162 if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT) {
163 printk(" IRQ_Redirection");
164 set_smp_redirect(SMP_IRQ_REDIRECTION);
165 }
166 if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT) {
167 printk(" IPI_Redirection");
168 set_smp_redirect(SMP_IPI_REDIRECTION);
169 }
170 if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)
171 printk(" ITC_Drift");
172 printk("\n");
173}
174
175#ifdef CONFIG_SMP
176static void __init
177sal_desc_ap_wakeup (void *p)
178{
179 struct ia64_sal_desc_ap_wakeup *ap = p;
180
181 switch (ap->mechanism) {
182 case IA64_SAL_AP_EXTERNAL_INT:
183 ap_wakeup_vector = ap->vector;
184 printk(KERN_INFO "SAL: AP wakeup using external interrupt "
185 "vector 0x%lx\n", ap_wakeup_vector);
186 break;
187 default:
188 printk(KERN_ERR "SAL: AP wakeup mechanism unsupported!\n");
189 break;
190 }
191}
192
193static void __init
194chk_nointroute_opt(void)
195{
196 char *cp;
197 extern char saved_command_line[];
198
199 for (cp = saved_command_line; *cp; ) {
200 if (memcmp(cp, "nointroute", 10) == 0) {
201 no_int_routing = 1;
202 printk ("no_int_routing on\n");
203 break;
204 } else {
205 while (*cp != ' ' && *cp)
206 ++cp;
207 while (*cp == ' ')
208 ++cp;
209 }
210 }
211}
212
213#else
214static void __init sal_desc_ap_wakeup(void *p) { }
215#endif
216
217void __init
218ia64_sal_init (struct ia64_sal_systab *systab)
219{
220 char *p;
221 int i;
222
223 if (!systab) {
224 printk(KERN_WARNING "Hmm, no SAL System Table.\n");
225 return;
226 }
227
228 if (strncmp(systab->signature, "SST_", 4) != 0)
229 printk(KERN_ERR "bad signature in system table!\n");
230
231 check_versions(systab);
232#ifdef CONFIG_SMP
233 chk_nointroute_opt();
234#endif
235
236 /* revisions are coded in BCD, so %x does the job for us */
237 printk(KERN_INFO "SAL %x.%x: %.32s %.32s%sversion %x.%x\n",
238 SAL_MAJOR(sal_revision), SAL_MINOR(sal_revision),
239 systab->oem_id, systab->product_id,
240 systab->product_id[0] ? " " : "",
241 SAL_MAJOR(sal_version), SAL_MINOR(sal_version));
242
243 p = (char *) (systab + 1);
244 for (i = 0; i < systab->entry_count; i++) {
245 /*
246 * The first byte of each entry type contains the type
247 * descriptor.
248 */
249 switch (*p) {
250 case SAL_DESC_ENTRY_POINT:
251 sal_desc_entry_point(p);
252 break;
253 case SAL_DESC_PLATFORM_FEATURE:
254 sal_desc_platform_feature(p);
255 break;
256 case SAL_DESC_PTC:
257 ia64_ptc_domain_info = (ia64_sal_desc_ptc_t *)p;
258 break;
259 case SAL_DESC_AP_WAKEUP:
260 sal_desc_ap_wakeup(p);
261 break;
262 }
263 p += SAL_DESC_SIZE(*p);
264 }
265}
266
267int
268ia64_sal_oemcall(struct ia64_sal_retval *isrvp, u64 oemfunc, u64 arg1,
269 u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7)
270{
271 if (oemfunc < IA64_SAL_OEMFUNC_MIN || oemfunc > IA64_SAL_OEMFUNC_MAX)
272 return -1;
273 SAL_CALL(*isrvp, oemfunc, arg1, arg2, arg3, arg4, arg5, arg6, arg7);
274 return 0;
275}
276EXPORT_SYMBOL(ia64_sal_oemcall);
277
278int
279ia64_sal_oemcall_nolock(struct ia64_sal_retval *isrvp, u64 oemfunc, u64 arg1,
280 u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6,
281 u64 arg7)
282{
283 if (oemfunc < IA64_SAL_OEMFUNC_MIN || oemfunc > IA64_SAL_OEMFUNC_MAX)
284 return -1;
285 SAL_CALL_NOLOCK(*isrvp, oemfunc, arg1, arg2, arg3, arg4, arg5, arg6,
286 arg7);
287 return 0;
288}
289EXPORT_SYMBOL(ia64_sal_oemcall_nolock);
290
291int
292ia64_sal_oemcall_reentrant(struct ia64_sal_retval *isrvp, u64 oemfunc,
293 u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5,
294 u64 arg6, u64 arg7)
295{
296 if (oemfunc < IA64_SAL_OEMFUNC_MIN || oemfunc > IA64_SAL_OEMFUNC_MAX)
297 return -1;
298 SAL_CALL_REENTRANT(*isrvp, oemfunc, arg1, arg2, arg3, arg4, arg5, arg6,
299 arg7);
300 return 0;
301}
302EXPORT_SYMBOL(ia64_sal_oemcall_reentrant);
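/*
 * Illustrative sketch (not part of the original patch): how a platform
 * driver might use the OEM call wrappers above.  SAL_OEMFUNC_EXAMPLE is a
 * made-up function code; a real caller passes a vendor-defined value in the
 * [IA64_SAL_OEMFUNC_MIN, IA64_SAL_OEMFUNC_MAX] range and checks both the
 * wrapper's return value and isrv.status.
 */
#define SAL_OEMFUNC_EXAMPLE	(IA64_SAL_OEMFUNC_MIN + 1)	/* hypothetical */

static int
example_oem_query (u64 arg, u64 *result)
{
	struct ia64_sal_retval isrv;

	if (ia64_sal_oemcall(&isrv, SAL_OEMFUNC_EXAMPLE, arg,
			     0, 0, 0, 0, 0, 0) < 0 || isrv.status < 0)
		return -1;
	*result = isrv.v0;
	return 0;
}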
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
new file mode 100644
index 000000000000..d227fabecd02
--- /dev/null
+++ b/arch/ia64/kernel/salinfo.c
@@ -0,0 +1,629 @@
1/*
2 * salinfo.c
3 *
4 * Creates entries in /proc/sal for various system features.
5 *
6 * Copyright (c) 2003 Silicon Graphics, Inc. All rights reserved.
7 * Copyright (c) 2003 Hewlett-Packard Co
8 * Bjorn Helgaas <bjorn.helgaas@hp.com>
9 *
10 * 10/30/2001 jbarnes@sgi.com copied much of Stephane's palinfo
11 * code to create this file
12 * Oct 23 2003 kaos@sgi.com
13 * Replace IPI with set_cpus_allowed() to read a record from the required cpu.
14 * Redesign salinfo log processing to separate interrupt and user space
15 * contexts.
16 * Cache the record across multi-block reads from user space.
17 * Support > 64 cpus.
18 * Delete module_exit and MOD_INC/DEC_COUNT, salinfo cannot be a module.
19 *
20 * Jan 28 2004 kaos@sgi.com
21 * Periodically check for outstanding MCA or INIT records.
22 *
23 * Dec 5 2004 kaos@sgi.com
24 * Standardize which records are cleared automatically.
25 */
26
27#include <linux/types.h>
28#include <linux/proc_fs.h>
29#include <linux/module.h>
30#include <linux/smp.h>
31#include <linux/smp_lock.h>
32#include <linux/timer.h>
33#include <linux/vmalloc.h>
34
35#include <asm/semaphore.h>
36#include <asm/sal.h>
37#include <asm/uaccess.h>
38
39MODULE_AUTHOR("Jesse Barnes <jbarnes@sgi.com>");
40MODULE_DESCRIPTION("/proc interface to IA-64 SAL features");
41MODULE_LICENSE("GPL");
42
43static int salinfo_read(char *page, char **start, off_t off, int count, int *eof, void *data);
44
45typedef struct {
46 const char *name; /* name of the proc entry */
47 unsigned long feature; /* feature bit */
48 struct proc_dir_entry *entry; /* registered entry (removal) */
49} salinfo_entry_t;
50
51/*
52 * List {name,feature} pairs for every entry in /proc/sal/<feature>
53 * that this module exports
54 */
55static salinfo_entry_t salinfo_entries[]={
56 { "bus_lock", IA64_SAL_PLATFORM_FEATURE_BUS_LOCK, },
57 { "irq_redirection", IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT, },
58 { "ipi_redirection", IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT, },
59 { "itc_drift", IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT, },
60};
61
62#define NR_SALINFO_ENTRIES ARRAY_SIZE(salinfo_entries)
63
64static char *salinfo_log_name[] = {
65 "mca",
66 "init",
67 "cmc",
68 "cpe",
69};
70
71static struct proc_dir_entry *salinfo_proc_entries[
72 ARRAY_SIZE(salinfo_entries) + /* /proc/sal/bus_lock */
73 ARRAY_SIZE(salinfo_log_name) + /* /proc/sal/{mca,...} */
74 (2 * ARRAY_SIZE(salinfo_log_name)) + /* /proc/sal/mca/{event,data} */
75 1]; /* /proc/sal */
76
77/* Some records we get ourselves, some are accessed as saved data in buffers
78 * that are owned by mca.c.
79 */
80struct salinfo_data_saved {
81 u8* buffer;
82 u64 size;
83 u64 id;
84 int cpu;
85};
86
87/* State transitions. Actions are :-
88 * Write "read <cpunum>" to the data file.
89 * Write "clear <cpunum>" to the data file.
90 * Write "oemdata <cpunum> <offset>" to the data file.
91 * Read from the data file.
92 * Close the data file.
93 *
94 * Start state is NO_DATA.
95 *
96 * NO_DATA
97 * write "read <cpunum>" -> NO_DATA or LOG_RECORD.
98 * write "clear <cpunum>" -> NO_DATA or LOG_RECORD.
99 * write "oemdata <cpunum> <offset>" -> return -EINVAL.
100 * read data -> return EOF.
101 * close -> unchanged. Free record areas.
102 *
103 * LOG_RECORD
104 * write "read <cpunum>" -> NO_DATA or LOG_RECORD.
105 * write "clear <cpunum>" -> NO_DATA or LOG_RECORD.
106 * write "oemdata <cpunum> <offset>" -> format the oem data, goto OEMDATA.
107 * read data -> return the INIT/MCA/CMC/CPE record.
108 * close -> unchanged. Keep record areas.
109 *
110 * OEMDATA
111 * write "read <cpunum>" -> NO_DATA or LOG_RECORD.
112 * write "clear <cpunum>" -> NO_DATA or LOG_RECORD.
113 * write "oemdata <cpunum> <offset>" -> format the oem data, goto OEMDATA.
114 * read data -> return the formatted oemdata.
115 * close -> unchanged. Keep record areas.
116 *
117 * Closing the data file does not change the state. This allows shell scripts
118 * to manipulate salinfo data: each shell redirection opens the file, does one
119 * action, then closes it again. The record areas are only freed at close when
120 * the state is NO_DATA.
121 */
122enum salinfo_state {
123 STATE_NO_DATA,
124 STATE_LOG_RECORD,
125 STATE_OEMDATA,
126};
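/*
 * Illustrative sketch (not part of the original patch): the user-space side
 * of the state machine described above, reading one raw record for one cpu
 * from the "mca" feature (the same protocol applies to init, cmc and cpe).
 * Error handling and the actual record size are glossed over; shown under
 * #if 0 because it is not kernel code.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void
read_one_mca_record (void)
{
	char cmd[32], record[8192];
	ssize_t n;
	int event = open("/proc/sal/mca/event", O_RDONLY);
	int data = open("/proc/sal/mca/data", O_RDWR);

	n = read(event, cmd, sizeof(cmd));	/* blocks; yields "read <cpunum>\n" */
	write(data, cmd, n);			/* request the record for that cpu */
	n = read(data, record, sizeof(record));	/* the raw SAL error record */
	printf("got %zd bytes\n", n);
	close(event);
	close(data);
}
#endif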
127
128struct salinfo_data {
129 volatile cpumask_t cpu_event; /* which cpus have outstanding events */
130 struct semaphore sem; /* count of cpus with outstanding events (bits set in cpu_event) */
131 u8 *log_buffer;
132 u64 log_size;
133 u8 *oemdata; /* decoded oem data */
134 u64 oemdata_size;
135 int open; /* single-open to prevent races */
136 u8 type;
137 u8 saved_num; /* using a saved record? */
138 enum salinfo_state state :8; /* processing state */
139 u8 padding;
140 int cpu_check; /* next CPU to check */
141 struct salinfo_data_saved data_saved[5];/* save last 5 records from mca.c, must be < 255 */
142};
143
144static struct salinfo_data salinfo_data[ARRAY_SIZE(salinfo_log_name)];
145
146static spinlock_t data_lock, data_saved_lock;
147
148/** salinfo_platform_oemdata - optional callback to decode oemdata from an error
149 * record.
150 * @sect_header: pointer to the start of the section to decode.
151 * @oemdata: returns vmalloc area containing the decoded output.
152 * @oemdata_size: returns length of decoded output (strlen).
153 *
154 * Description: If user space asks for oem data to be decoded by the kernel
155 * and/or prom and the platform has set salinfo_platform_oemdata to the address
156 * of a platform specific routine then call that routine. salinfo_platform_oemdata
157 * vmalloc's and formats its output area, returning the address of the text
158 * and its strlen. Returns 0 for success, -ve for error. The callback is
159 * invoked on the cpu that generated the error record.
160 */
161int (*salinfo_platform_oemdata)(const u8 *sect_header, u8 **oemdata, u64 *oemdata_size);
162
163struct salinfo_platform_oemdata_parms {
164 const u8 *efi_guid;
165 u8 **oemdata;
166 u64 *oemdata_size;
167 int ret;
168};
169
170static void
171salinfo_platform_oemdata_cpu(void *context)
172{
173 struct salinfo_platform_oemdata_parms *parms = context;
174 parms->ret = salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size);
175}
176
177static void
178shift1_data_saved (struct salinfo_data *data, int shift)
179{
180 memcpy(data->data_saved+shift, data->data_saved+shift+1,
181 (ARRAY_SIZE(data->data_saved) - (shift+1)) * sizeof(data->data_saved[0]));
182 memset(data->data_saved + ARRAY_SIZE(data->data_saved) - 1, 0,
183 sizeof(data->data_saved[0]));
184}
185
186/* This routine is invoked in interrupt context. Note: mca.c enables
187 * interrupts before calling this code for CMC/CPE. MCA and INIT events are
188 * not irq safe; for those, do not call any routines that take spinlocks, as they may deadlock.
189 * MCA and INIT records are recorded, a timer event will look for any
190 * outstanding events and wake up the user space code.
191 *
192 * The buffer passed from mca.c points to the output from ia64_log_get. This is
193 * a persistent buffer but its contents can change between the interrupt and
194 * when user space processes the record. Save the record id to identify
195 * changes.
196 */
197void
198salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe)
199{
200 struct salinfo_data *data = salinfo_data + type;
201 struct salinfo_data_saved *data_saved;
202 unsigned long flags = 0;
203 int i;
204 int saved_size = ARRAY_SIZE(data->data_saved);
205
206 BUG_ON(type >= ARRAY_SIZE(salinfo_log_name));
207
208 if (irqsafe)
209 spin_lock_irqsave(&data_saved_lock, flags);
210 for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) {
211 if (!data_saved->buffer)
212 break;
213 }
214 if (i == saved_size) {
215 if (!data->saved_num) {
216 shift1_data_saved(data, 0);
217 data_saved = data->data_saved + saved_size - 1;
218 } else
219 data_saved = NULL;
220 }
221 if (data_saved) {
222 data_saved->cpu = smp_processor_id();
223 data_saved->id = ((sal_log_record_header_t *)buffer)->id;
224 data_saved->size = size;
225 data_saved->buffer = buffer;
226 }
227 if (irqsafe)
228 spin_unlock_irqrestore(&data_saved_lock, flags);
229
230 if (!test_and_set_bit(smp_processor_id(), &data->cpu_event)) {
231 if (irqsafe)
232 up(&data->sem);
233 }
234}
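/*
 * Illustrative sketch (not part of the original patch): how an
 * interrupt-time producer such as the CMC handler in mca.c is expected to
 * hand a record to salinfo_log_wakeup().  The buffer and size would come
 * from ia64_log_get(); example_cmc_notify() is a name local to this sketch,
 * not the actual mca.c code.
 */
static void
example_cmc_notify (u8 *buffer, u64 size)
{
	/* CMC/CPE handlers run with interrupts enabled, so irqsafe = 1 */
	salinfo_log_wakeup(SAL_INFO_TYPE_CMC, buffer, size, 1);
}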
235
236/* Check for outstanding MCA/INIT records every minute (arbitrary) */
237#define SALINFO_TIMER_DELAY (60*HZ)
238static struct timer_list salinfo_timer;
239
240static void
241salinfo_timeout_check(struct salinfo_data *data)
242{
243 int i;
244 if (!data->open)
245 return;
246 for (i = 0; i < NR_CPUS; ++i) {
247 if (test_bit(i, &data->cpu_event)) {
248 /* double up() is not a problem, user space will see no
249 * records for the additional "events".
250 */
251 up(&data->sem);
252 }
253 }
254}
255
256static void
257salinfo_timeout (unsigned long arg)
258{
259 salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_MCA);
260 salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_INIT);
261 salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY;
262 add_timer(&salinfo_timer);
263}
264
265static int
266salinfo_event_open(struct inode *inode, struct file *file)
267{
268 if (!capable(CAP_SYS_ADMIN))
269 return -EPERM;
270 return 0;
271}
272
273static ssize_t
274salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
275{
276 struct inode *inode = file->f_dentry->d_inode;
277 struct proc_dir_entry *entry = PDE(inode);
278 struct salinfo_data *data = entry->data;
279 char cmd[32];
280 size_t size;
281 int i, n, cpu = -1;
282
283retry:
284 if (down_trylock(&data->sem)) {
285 if (file->f_flags & O_NONBLOCK)
286 return -EAGAIN;
287 if (down_interruptible(&data->sem))
288 return -ERESTARTSYS;
289 }
290
291 n = data->cpu_check;
292 for (i = 0; i < NR_CPUS; i++) {
293 if (test_bit(n, &data->cpu_event)) {
294 cpu = n;
295 break;
296 }
297 if (++n == NR_CPUS)
298 n = 0;
299 }
300
301 if (cpu == -1)
302 goto retry;
303
304 /* events are sticky until the user says "clear" */
305 up(&data->sem);
306
307 /* for next read, start checking at next CPU */
308 data->cpu_check = cpu;
309 if (++data->cpu_check == NR_CPUS)
310 data->cpu_check = 0;
311
312 snprintf(cmd, sizeof(cmd), "read %d\n", cpu);
313
314 size = strlen(cmd);
315 if (size > count)
316 size = count;
317 if (copy_to_user(buffer, cmd, size))
318 return -EFAULT;
319
320 return size;
321}
322
323static struct file_operations salinfo_event_fops = {
324 .open = salinfo_event_open,
325 .read = salinfo_event_read,
326};
327
328static int
329salinfo_log_open(struct inode *inode, struct file *file)
330{
331 struct proc_dir_entry *entry = PDE(inode);
332 struct salinfo_data *data = entry->data;
333
334 if (!capable(CAP_SYS_ADMIN))
335 return -EPERM;
336
337 spin_lock(&data_lock);
338 if (data->open) {
339 spin_unlock(&data_lock);
340 return -EBUSY;
341 }
342 data->open = 1;
343 spin_unlock(&data_lock);
344
345 if (data->state == STATE_NO_DATA &&
346 !(data->log_buffer = vmalloc(ia64_sal_get_state_info_size(data->type)))) {
347 data->open = 0;
348 return -ENOMEM;
349 }
350
351 return 0;
352}
353
354static int
355salinfo_log_release(struct inode *inode, struct file *file)
356{
357 struct proc_dir_entry *entry = PDE(inode);
358 struct salinfo_data *data = entry->data;
359
360 if (data->state == STATE_NO_DATA) {
361 vfree(data->log_buffer);
362 vfree(data->oemdata);
363 data->log_buffer = NULL;
364 data->oemdata = NULL;
365 }
366 spin_lock(&data_lock);
367 data->open = 0;
368 spin_unlock(&data_lock);
369 return 0;
370}
371
372static void
373call_on_cpu(int cpu, void (*fn)(void *), void *arg)
374{
375 cpumask_t save_cpus_allowed, new_cpus_allowed;
376 memcpy(&save_cpus_allowed, &current->cpus_allowed, sizeof(save_cpus_allowed));
377 memset(&new_cpus_allowed, 0, sizeof(new_cpus_allowed));
378 set_bit(cpu, &new_cpus_allowed);
379 set_cpus_allowed(current, new_cpus_allowed);
380 (*fn)(arg);
381 set_cpus_allowed(current, save_cpus_allowed);
382}
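/*
 * Note: call_on_cpu() may only be used from process context, since
 * set_cpus_allowed() can sleep while the caller is migrated to the
 * target CPU and back again.
 */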
383
384static void
385salinfo_log_read_cpu(void *context)
386{
387 struct salinfo_data *data = context;
388 sal_log_record_header_t *rh;
389 data->log_size = ia64_sal_get_state_info(data->type, (u64 *) data->log_buffer);
390 rh = (sal_log_record_header_t *)(data->log_buffer);
391 /* Clear corrected errors as they are read from SAL */
392 if (rh->severity == sal_log_severity_corrected)
393 ia64_sal_clear_state_info(data->type);
394}
395
396static void
397salinfo_log_new_read(int cpu, struct salinfo_data *data)
398{
399 struct salinfo_data_saved *data_saved;
400 unsigned long flags;
401 int i;
402 int saved_size = ARRAY_SIZE(data->data_saved);
403
404 data->saved_num = 0;
405 spin_lock_irqsave(&data_saved_lock, flags);
406retry:
407 for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) {
408 if (data_saved->buffer && data_saved->cpu == cpu) {
409 sal_log_record_header_t *rh = (sal_log_record_header_t *)(data_saved->buffer);
410 data->log_size = data_saved->size;
411 memcpy(data->log_buffer, rh, data->log_size);
412 barrier(); /* id check must not be moved */
413 if (rh->id == data_saved->id) {
414 data->saved_num = i+1;
415 break;
416 }
417 /* saved record changed by mca.c since interrupt, discard it */
418 shift1_data_saved(data, i);
419 goto retry;
420 }
421 }
422 spin_unlock_irqrestore(&data_saved_lock, flags);
423
424 if (!data->saved_num)
425 call_on_cpu(cpu, salinfo_log_read_cpu, data);
426 if (!data->log_size) {
427 data->state = STATE_NO_DATA;
428 clear_bit(cpu, &data->cpu_event);
429 } else {
430 data->state = STATE_LOG_RECORD;
431 }
432}
433
434static ssize_t
435salinfo_log_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
436{
437 struct inode *inode = file->f_dentry->d_inode;
438 struct proc_dir_entry *entry = PDE(inode);
439 struct salinfo_data *data = entry->data;
440 u8 *buf;
441 u64 bufsize;
442
443 if (data->state == STATE_LOG_RECORD) {
444 buf = data->log_buffer;
445 bufsize = data->log_size;
446 } else if (data->state == STATE_OEMDATA) {
447 buf = data->oemdata;
448 bufsize = data->oemdata_size;
449 } else {
450 buf = NULL;
451 bufsize = 0;
452 }
453 return simple_read_from_buffer(buffer, count, ppos, buf, bufsize);
454}
455
456static void
457salinfo_log_clear_cpu(void *context)
458{
459 struct salinfo_data *data = context;
460 ia64_sal_clear_state_info(data->type);
461}
462
463static int
464salinfo_log_clear(struct salinfo_data *data, int cpu)
465{
466 sal_log_record_header_t *rh;
467 data->state = STATE_NO_DATA;
468 if (!test_bit(cpu, &data->cpu_event))
469 return 0;
470 down(&data->sem);
471 clear_bit(cpu, &data->cpu_event);
472 if (data->saved_num) {
473 unsigned long flags;
474 spin_lock_irqsave(&data_saved_lock, flags);
475			shift1_data_saved(data, data->saved_num - 1);
476 data->saved_num = 0;
477 spin_unlock_irqrestore(&data_saved_lock, flags);
478 }
479 rh = (sal_log_record_header_t *)(data->log_buffer);
480 /* Corrected errors have already been cleared from SAL */
481 if (rh->severity != sal_log_severity_corrected)
482 call_on_cpu(cpu, salinfo_log_clear_cpu, data);
483 /* clearing a record may make a new record visible */
484 salinfo_log_new_read(cpu, data);
485 if (data->state == STATE_LOG_RECORD &&
486 !test_and_set_bit(cpu, &data->cpu_event))
487 up(&data->sem);
488 return 0;
489}
490
491static ssize_t
492salinfo_log_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos)
493{
494 struct inode *inode = file->f_dentry->d_inode;
495 struct proc_dir_entry *entry = PDE(inode);
496 struct salinfo_data *data = entry->data;
497 char cmd[32];
498 size_t size;
499 u32 offset;
500 int cpu;
501
502 size = sizeof(cmd);
503 if (count < size)
504 size = count;
505 if (copy_from_user(cmd, buffer, size))
506 return -EFAULT;
507
508 if (sscanf(cmd, "read %d", &cpu) == 1) {
509 salinfo_log_new_read(cpu, data);
510 } else if (sscanf(cmd, "clear %d", &cpu) == 1) {
511 int ret;
512 if ((ret = salinfo_log_clear(data, cpu)))
513 count = ret;
514 } else if (sscanf(cmd, "oemdata %d %d", &cpu, &offset) == 2) {
515 if (data->state != STATE_LOG_RECORD && data->state != STATE_OEMDATA)
516 return -EINVAL;
517 if (offset > data->log_size - sizeof(efi_guid_t))
518 return -EINVAL;
519 data->state = STATE_OEMDATA;
520 if (salinfo_platform_oemdata) {
521 struct salinfo_platform_oemdata_parms parms = {
522 .efi_guid = data->log_buffer + offset,
523 .oemdata = &data->oemdata,
524 .oemdata_size = &data->oemdata_size
525 };
526 call_on_cpu(cpu, salinfo_platform_oemdata_cpu, &parms);
527 if (parms.ret)
528 count = parms.ret;
529 } else
530 data->oemdata_size = 0;
531 } else
532 return -EINVAL;
533
534 return count;
535}
536
537static struct file_operations salinfo_data_fops = {
538 .open = salinfo_log_open,
539 .release = salinfo_log_release,
540 .read = salinfo_log_read,
541 .write = salinfo_log_write,
542};
543
544static int __init
545salinfo_init(void)
546{
547 struct proc_dir_entry *salinfo_dir; /* /proc/sal dir entry */
548 struct proc_dir_entry **sdir = salinfo_proc_entries; /* keeps track of every entry */
549 struct proc_dir_entry *dir, *entry;
550 struct salinfo_data *data;
551 int i, j, online;
552
553 salinfo_dir = proc_mkdir("sal", NULL);
554 if (!salinfo_dir)
555 return 0;
556
557 for (i=0; i < NR_SALINFO_ENTRIES; i++) {
558 /* pass the feature bit in question as misc data */
559 *sdir++ = create_proc_read_entry (salinfo_entries[i].name, 0, salinfo_dir,
560 salinfo_read, (void *)salinfo_entries[i].feature);
561 }
562
563 for (i = 0; i < ARRAY_SIZE(salinfo_log_name); i++) {
564 data = salinfo_data + i;
565 data->type = i;
566 sema_init(&data->sem, 0);
567 dir = proc_mkdir(salinfo_log_name[i], salinfo_dir);
568 if (!dir)
569 continue;
570
571 entry = create_proc_entry("event", S_IRUSR, dir);
572 if (!entry)
573 continue;
574 entry->data = data;
575 entry->proc_fops = &salinfo_event_fops;
576 *sdir++ = entry;
577
578 entry = create_proc_entry("data", S_IRUSR | S_IWUSR, dir);
579 if (!entry)
580 continue;
581 entry->data = data;
582 entry->proc_fops = &salinfo_data_fops;
583 *sdir++ = entry;
584
585 /* we missed any events before now */
586 online = 0;
587 for (j = 0; j < NR_CPUS; j++)
588 if (cpu_online(j)) {
589 set_bit(j, &data->cpu_event);
590 ++online;
591 }
592 sema_init(&data->sem, online);
593
594 *sdir++ = dir;
595 }
596
597 *sdir++ = salinfo_dir;
598
599 init_timer(&salinfo_timer);
600 salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY;
601 salinfo_timer.function = &salinfo_timeout;
602 add_timer(&salinfo_timer);
603
604 return 0;
605}
606
607/*
608 * 'data' contains an integer that corresponds to the feature we're
609 * testing
610 */
611static int
612salinfo_read(char *page, char **start, off_t off, int count, int *eof, void *data)
613{
614 int len = 0;
615
616 len = sprintf(page, (sal_platform_features & (unsigned long)data) ? "1\n" : "0\n");
617
618 if (len <= off+count) *eof = 1;
619
620 *start = page + off;
621 len -= off;
622
623 if (len>count) len = count;
624 if (len<0) len = 0;
625
626 return len;
627}
628
629module_init(salinfo_init);
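The two file_operations above define a small command protocol over /proc/sal: a blocking read of the per-type "event" file returns a string of the form "read <cpu>", which a consumer writes back to the sibling "data" file before reading the raw SAL record from it, and then acknowledges with "clear <cpu>". A minimal user-space sketch of that loop follows; the directory name "mca" and the 1 MiB buffer size are illustrative assumptions, not values taken from this file, and error handling is abbreviated.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char cmd[32];
	int cpu, event_fd, data_fd;
	ssize_t n;
	char *buf = malloc(1 << 20);	/* assumed large enough for one SAL record */

	event_fd = open("/proc/sal/mca/event", O_RDONLY);	/* needs CAP_SYS_ADMIN; blocks until an event */
	data_fd = open("/proc/sal/mca/data", O_RDWR);
	if (event_fd < 0 || data_fd < 0 || !buf)
		return 1;

	n = read(event_fd, cmd, sizeof(cmd) - 1);	/* e.g. "read 3\n" */
	if (n <= 0)
		return 1;
	cmd[n] = '\0';
	if (sscanf(cmd, "read %d", &cpu) != 1)
		return 1;

	write(data_fd, cmd, strlen(cmd));		/* ask the kernel to fetch CPU <cpu>'s record */
	n = read(data_fd, buf, 1 << 20);		/* raw sal_log_record_header_t plus payload */
	printf("cpu %d: %zd bytes of SAL record\n", cpu, n);

	snprintf(cmd, sizeof(cmd), "clear %d", cpu);	/* acknowledge; may expose the next record */
	write(data_fd, cmd, strlen(cmd));

	free(buf);
	close(data_fd);
	close(event_fd);
	return 0;
}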
diff --git a/arch/ia64/kernel/semaphore.c b/arch/ia64/kernel/semaphore.c
new file mode 100644
index 000000000000..2724ef3fbae2
--- /dev/null
+++ b/arch/ia64/kernel/semaphore.c
@@ -0,0 +1,165 @@
1/*
2 * IA-64 semaphore implementation (derived from x86 version).
3 *
4 * Copyright (C) 1999-2000, 2002 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com>
6 */
7
8/*
9 * Semaphores are implemented using a two-way counter: The "count"
10 * variable is decremented for each process that tries to acquire the
11 * semaphore, while the "sleepers" variable is a count of such
12 * acquires.
13 *
14 * Notably, the inline "up()" and "down()" functions can efficiently
15 * test if they need to do any extra work (up needs to do something
16 * only if count was negative before the increment operation).
17 *
18 * "sleeping" and the contention routine ordering is protected
19 * by the spinlock in the semaphore's waitqueue head.
20 *
21 * Note that these functions are only called when there is contention
22 * on the lock, and as such all this is the "non-critical" part of the
23 * whole semaphore business. The critical part is the inline stuff in
24 * <asm/semaphore.h> where we want to avoid any extra jumps and calls.
25 */
26#include <linux/sched.h>
27#include <linux/init.h>
28
29#include <asm/errno.h>
30#include <asm/semaphore.h>
31
32/*
33 * Logic:
34 * - Only on a boundary condition do we need to care. When we go
35 * from a negative count to a non-negative, we wake people up.
36 * - When we go from a non-negative count to a negative do we
37 * (a) synchronize with the "sleepers" count and (b) make sure
38 * that we're on the wakeup list before we synchronize so that
39 * we cannot lose wakeup events.
40 */
41
42void
43__up (struct semaphore *sem)
44{
45 wake_up(&sem->wait);
46}
47
48void __sched __down (struct semaphore *sem)
49{
50 struct task_struct *tsk = current;
51 DECLARE_WAITQUEUE(wait, tsk);
52 unsigned long flags;
53
54 tsk->state = TASK_UNINTERRUPTIBLE;
55 spin_lock_irqsave(&sem->wait.lock, flags);
56 add_wait_queue_exclusive_locked(&sem->wait, &wait);
57
58 sem->sleepers++;
59 for (;;) {
60 int sleepers = sem->sleepers;
61
62 /*
63 * Add "everybody else" into it. They aren't
64 * playing, because we own the spinlock in
65 * the wait_queue_head.
66 */
67 if (!atomic_add_negative(sleepers - 1, &sem->count)) {
68 sem->sleepers = 0;
69 break;
70 }
71 sem->sleepers = 1; /* us - see -1 above */
72 spin_unlock_irqrestore(&sem->wait.lock, flags);
73
74 schedule();
75
76 spin_lock_irqsave(&sem->wait.lock, flags);
77 tsk->state = TASK_UNINTERRUPTIBLE;
78 }
79 remove_wait_queue_locked(&sem->wait, &wait);
80 wake_up_locked(&sem->wait);
81 spin_unlock_irqrestore(&sem->wait.lock, flags);
82 tsk->state = TASK_RUNNING;
83}
84
85int __sched __down_interruptible (struct semaphore * sem)
86{
87 int retval = 0;
88 struct task_struct *tsk = current;
89 DECLARE_WAITQUEUE(wait, tsk);
90 unsigned long flags;
91
92 tsk->state = TASK_INTERRUPTIBLE;
93 spin_lock_irqsave(&sem->wait.lock, flags);
94 add_wait_queue_exclusive_locked(&sem->wait, &wait);
95
96	sem->sleepers++;
97 for (;;) {
98 int sleepers = sem->sleepers;
99
100 /*
101 * With signals pending, this turns into
102 * the trylock failure case - we won't be
103		 * sleeping, and we can't get the lock as
104 * it has contention. Just correct the count
105 * and exit.
106 */
107 if (signal_pending(current)) {
108 retval = -EINTR;
109 sem->sleepers = 0;
110 atomic_add(sleepers, &sem->count);
111 break;
112 }
113
114 /*
115 * Add "everybody else" into it. They aren't
116 * playing, because we own the spinlock in
117 * wait_queue_head. The "-1" is because we're
118 * still hoping to get the semaphore.
119 */
120 if (!atomic_add_negative(sleepers - 1, &sem->count)) {
121 sem->sleepers = 0;
122 break;
123 }
124 sem->sleepers = 1; /* us - see -1 above */
125 spin_unlock_irqrestore(&sem->wait.lock, flags);
126
127 schedule();
128
129 spin_lock_irqsave(&sem->wait.lock, flags);
130 tsk->state = TASK_INTERRUPTIBLE;
131 }
132 remove_wait_queue_locked(&sem->wait, &wait);
133 wake_up_locked(&sem->wait);
134 spin_unlock_irqrestore(&sem->wait.lock, flags);
135
136 tsk->state = TASK_RUNNING;
137 return retval;
138}
139
140/*
141 * Trylock failed - make sure we correct for having decremented the
142 * count.
143 */
144int
145__down_trylock (struct semaphore *sem)
146{
147 unsigned long flags;
148 int sleepers;
149
150 spin_lock_irqsave(&sem->wait.lock, flags);
151 sleepers = sem->sleepers + 1;
152 sem->sleepers = 0;
153
154 /*
155 * Add "everybody else" and us into it. They aren't
156 * playing, because we own the spinlock in the
157 * wait_queue_head.
158 */
159 if (!atomic_add_negative(sleepers, &sem->count)) {
160 wake_up_locked(&sem->wait);
161 }
162
163 spin_unlock_irqrestore(&sem->wait.lock, flags);
164 return 1;
165}
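For context, the routines above are only the contended slow path; the uncontended fast path lives as hand-tuned inlines in <asm/semaphore.h> and is not part of this file. A rough sketch of the protocol those inlines implement is shown below; it is an illustration in kernel context, not the actual ia64 code, and atomic_dec_return()/atomic_inc_return() stand in for the real fetch-and-add intrinsics.

/* Sketch only: the real down()/up() are optimized inlines in <asm/semaphore.h>. */
static inline void down_sketch(struct semaphore *sem)
{
	/* count > 0 means the semaphore is free; going negative means contention */
	if (atomic_dec_return(&sem->count) < 0)
		__down(sem);			/* sleep until a matching up() */
}

static inline void up_sketch(struct semaphore *sem)
{
	/* a non-positive result means at least one task is (or was) sleeping */
	if (atomic_inc_return(&sem->count) <= 0)
		__up(sem);			/* wake one exclusive waiter */
}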
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
new file mode 100644
index 000000000000..f05650c801d2
--- /dev/null
+++ b/arch/ia64/kernel/setup.c
@@ -0,0 +1,723 @@
1/*
2 * Architecture-specific setup.
3 *
4 * Copyright (C) 1998-2001, 2003-2004 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com>
6 * Stephane Eranian <eranian@hpl.hp.com>
7 * Copyright (C) 2000, Rohit Seth <rohit.seth@intel.com>
8 * Copyright (C) 1999 VA Linux Systems
9 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
10 *
11 * 11/12/01 D.Mosberger Convert get_cpuinfo() to seq_file based show_cpuinfo().
12 * 04/04/00 D.Mosberger renamed cpu_initialized to cpu_online_map
13 * 03/31/00 R.Seth cpu_initialized and current->processor fixes
14 * 02/04/00 D.Mosberger some more get_cpuinfo fixes...
15 * 02/01/00 R.Seth fixed get_cpuinfo for SMP
16 * 01/07/99 S.Eranian added the support for command line argument
17 * 06/24/99 W.Drummond added boot_cpu_data.
18 */
19#include <linux/config.h>
20#include <linux/module.h>
21#include <linux/init.h>
22
23#include <linux/acpi.h>
24#include <linux/bootmem.h>
25#include <linux/console.h>
26#include <linux/delay.h>
27#include <linux/kernel.h>
28#include <linux/reboot.h>
29#include <linux/sched.h>
30#include <linux/seq_file.h>
31#include <linux/string.h>
32#include <linux/threads.h>
33#include <linux/tty.h>
34#include <linux/serial.h>
35#include <linux/serial_core.h>
36#include <linux/efi.h>
37#include <linux/initrd.h>
38
39#include <asm/ia32.h>
40#include <asm/machvec.h>
41#include <asm/mca.h>
42#include <asm/meminit.h>
43#include <asm/page.h>
44#include <asm/patch.h>
45#include <asm/pgtable.h>
46#include <asm/processor.h>
47#include <asm/sal.h>
48#include <asm/sections.h>
49#include <asm/serial.h>
50#include <asm/setup.h>
51#include <asm/smp.h>
52#include <asm/system.h>
53#include <asm/unistd.h>
54
55#if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
56# error "struct cpuinfo_ia64 too big!"
57#endif
58
59#ifdef CONFIG_SMP
60unsigned long __per_cpu_offset[NR_CPUS];
61EXPORT_SYMBOL(__per_cpu_offset);
62#endif
63
64DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info);
65DEFINE_PER_CPU(unsigned long, local_per_cpu_offset);
66DEFINE_PER_CPU(unsigned long, ia64_phys_stacked_size_p8);
67unsigned long ia64_cycles_per_usec;
68struct ia64_boot_param *ia64_boot_param;
69struct screen_info screen_info;
70
71unsigned long ia64_max_cacheline_size;
72unsigned long ia64_iobase; /* virtual address for I/O accesses */
73EXPORT_SYMBOL(ia64_iobase);
74struct io_space io_space[MAX_IO_SPACES];
75EXPORT_SYMBOL(io_space);
76unsigned int num_io_spaces;
77
78/*
79 * The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1). This
80 * mask specifies a mask of address bits that must be 0 in order for two buffers to be
81 * mergeable by the I/O MMU (i.e., the end address of the first buffer and the start
82 * address of the second buffer must be aligned to (merge_mask+1) in order to be
83 * mergeable). By default, we assume there is no I/O MMU which can merge physically
84 * discontiguous buffers, so we set the merge_mask to ~0UL, which corresponds to a iommu
85 * page-size of 2^64.
86 */
87unsigned long ia64_max_iommu_merge_mask = ~0UL;
88EXPORT_SYMBOL(ia64_max_iommu_merge_mask);
89
90/*
91 * We use a special marker for the end of memory and it uses the extra (+1) slot
92 */
93struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1];
94int num_rsvd_regions;
95
96
97/*
98 * Filter incoming memory segments based on the primitive map created from the boot
99 * parameters. Segments contained in the map are removed from the memory ranges. A
100 * caller-specified function is called with the memory ranges that remain after filtering.
101 * This routine does not assume the incoming segments are sorted.
102 */
103int
104filter_rsvd_memory (unsigned long start, unsigned long end, void *arg)
105{
106 unsigned long range_start, range_end, prev_start;
107 void (*func)(unsigned long, unsigned long, int);
108 int i;
109
110#if IGNORE_PFN0
111 if (start == PAGE_OFFSET) {
112 printk(KERN_WARNING "warning: skipping physical page 0\n");
113 start += PAGE_SIZE;
114 if (start >= end) return 0;
115 }
116#endif
117 /*
118	 * lowest possible address (the walker uses virtual addresses)
119 */
120 prev_start = PAGE_OFFSET;
121 func = arg;
122
123 for (i = 0; i < num_rsvd_regions; ++i) {
124 range_start = max(start, prev_start);
125 range_end = min(end, rsvd_region[i].start);
126
127 if (range_start < range_end)
128 call_pernode_memory(__pa(range_start), range_end - range_start, func);
129
130 /* nothing more available in this segment */
131 if (range_end == end) return 0;
132
133 prev_start = rsvd_region[i].end;
134 }
135 /* end of memory marker allows full processing inside loop body */
136 return 0;
137}
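/*
 * Example: with sorted reserved regions [r0,r1) and [r2,r3) inside an incoming
 * segment [start,end), the loop above hands func() the gaps [start,r0),
 * [r1,r2) and [r3,end), each clipped to the segment. The ~0UL end-of-memory
 * marker appended by reserve_memory() makes the final gap fall out of the
 * loop itself, with no special case needed afterwards.
 */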
138
139static void
140sort_regions (struct rsvd_region *rsvd_region, int max)
141{
142 int j;
143
144 /* simple bubble sorting */
145 while (max--) {
146 for (j = 0; j < max; ++j) {
147 if (rsvd_region[j].start > rsvd_region[j+1].start) {
148 struct rsvd_region tmp;
149 tmp = rsvd_region[j];
150 rsvd_region[j] = rsvd_region[j + 1];
151 rsvd_region[j + 1] = tmp;
152 }
153 }
154 }
155}
156
157/**
158 * reserve_memory - setup reserved memory areas
159 *
160 * Setup the reserved memory areas set aside for the boot parameters,
161 * initrd, etc. There are currently %IA64_MAX_RSVD_REGIONS defined,
162 * see include/asm-ia64/meminit.h if you need to define more.
163 */
164void
165reserve_memory (void)
166{
167 int n = 0;
168
169 /*
170 * none of the entries in this table overlap
171 */
172 rsvd_region[n].start = (unsigned long) ia64_boot_param;
173 rsvd_region[n].end = rsvd_region[n].start + sizeof(*ia64_boot_param);
174 n++;
175
176 rsvd_region[n].start = (unsigned long) __va(ia64_boot_param->efi_memmap);
177 rsvd_region[n].end = rsvd_region[n].start + ia64_boot_param->efi_memmap_size;
178 n++;
179
180 rsvd_region[n].start = (unsigned long) __va(ia64_boot_param->command_line);
181 rsvd_region[n].end = (rsvd_region[n].start
182 + strlen(__va(ia64_boot_param->command_line)) + 1);
183 n++;
184
185 rsvd_region[n].start = (unsigned long) ia64_imva((void *)KERNEL_START);
186 rsvd_region[n].end = (unsigned long) ia64_imva(_end);
187 n++;
188
189#ifdef CONFIG_BLK_DEV_INITRD
190 if (ia64_boot_param->initrd_start) {
191 rsvd_region[n].start = (unsigned long)__va(ia64_boot_param->initrd_start);
192 rsvd_region[n].end = rsvd_region[n].start + ia64_boot_param->initrd_size;
193 n++;
194 }
195#endif
196
197 /* end of memory marker */
198 rsvd_region[n].start = ~0UL;
199 rsvd_region[n].end = ~0UL;
200 n++;
201
202 num_rsvd_regions = n;
203
204 sort_regions(rsvd_region, num_rsvd_regions);
205}
206
207/**
208 * find_initrd - get initrd parameters from the boot parameter structure
209 *
210 * Grab the initrd start and end from the boot parameter struct given us by
211 * the boot loader.
212 */
213void
214find_initrd (void)
215{
216#ifdef CONFIG_BLK_DEV_INITRD
217 if (ia64_boot_param->initrd_start) {
218 initrd_start = (unsigned long)__va(ia64_boot_param->initrd_start);
219 initrd_end = initrd_start+ia64_boot_param->initrd_size;
220
221 printk(KERN_INFO "Initial ramdisk at: 0x%lx (%lu bytes)\n",
222 initrd_start, ia64_boot_param->initrd_size);
223 }
224#endif
225}
226
227static void __init
228io_port_init (void)
229{
230 extern unsigned long ia64_iobase;
231 unsigned long phys_iobase;
232
233 /*
234 * Set `iobase' to the appropriate address in region 6 (uncached access range).
235 *
236 * The EFI memory map is the "preferred" location to get the I/O port space base,
237	 * rather than relying on AR.KR0. This should become clearer in future SAL
238 * specs. We'll fall back to getting it out of AR.KR0 if no appropriate entry is
239 * found in the memory map.
240 */
241 phys_iobase = efi_get_iobase();
242 if (phys_iobase)
243 /* set AR.KR0 since this is all we use it for anyway */
244 ia64_set_kr(IA64_KR_IO_BASE, phys_iobase);
245 else {
246 phys_iobase = ia64_get_kr(IA64_KR_IO_BASE);
247 printk(KERN_INFO "No I/O port range found in EFI memory map, falling back "
248 "to AR.KR0\n");
249 printk(KERN_INFO "I/O port base = 0x%lx\n", phys_iobase);
250 }
251 ia64_iobase = (unsigned long) ioremap(phys_iobase, 0);
252
253 /* setup legacy IO port space */
254 io_space[0].mmio_base = ia64_iobase;
255 io_space[0].sparse = 1;
256 num_io_spaces = 1;
257}
258
259/**
260 * early_console_setup - setup debugging console
261 *
262 * Consoles started here require little enough setup that we can start using
263 * them very early in the boot process, either right after the machine
264 * vector initialization, or even before if the drivers can detect their hw.
265 *
266 * Returns non-zero if a console couldn't be set up.
267 */
268static inline int __init
269early_console_setup (char *cmdline)
270{
271#ifdef CONFIG_SERIAL_SGI_L1_CONSOLE
272 {
273 extern int sn_serial_console_early_setup(void);
274 if (!sn_serial_console_early_setup())
275 return 0;
276 }
277#endif
278#ifdef CONFIG_EFI_PCDP
279 if (!efi_setup_pcdp_console(cmdline))
280 return 0;
281#endif
282#ifdef CONFIG_SERIAL_8250_CONSOLE
283 if (!early_serial_console_init(cmdline))
284 return 0;
285#endif
286
287 return -1;
288}
289
290static inline void
291mark_bsp_online (void)
292{
293#ifdef CONFIG_SMP
294 /* If we register an early console, allow CPU 0 to printk */
295 cpu_set(smp_processor_id(), cpu_online_map);
296#endif
297}
298
299void __init
300setup_arch (char **cmdline_p)
301{
302 unw_init();
303
304 ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist);
305
306 *cmdline_p = __va(ia64_boot_param->command_line);
307 strlcpy(saved_command_line, *cmdline_p, COMMAND_LINE_SIZE);
308
309 efi_init();
310 io_port_init();
311
312#ifdef CONFIG_IA64_GENERIC
313 {
314 const char *mvec_name = strstr (*cmdline_p, "machvec=");
315 char str[64];
316
317 if (mvec_name) {
318 const char *end;
319 size_t len;
320
321 mvec_name += 8;
322 end = strchr (mvec_name, ' ');
323 if (end)
324 len = end - mvec_name;
325 else
326 len = strlen (mvec_name);
327 len = min(len, sizeof (str) - 1);
328 strncpy (str, mvec_name, len);
329 str[len] = '\0';
330 mvec_name = str;
331 } else
332 mvec_name = acpi_get_sysname();
333 machvec_init(mvec_name);
334 }
335#endif
336
337 if (early_console_setup(*cmdline_p) == 0)
338 mark_bsp_online();
339
340#ifdef CONFIG_ACPI_BOOT
341 /* Initialize the ACPI boot-time table parser */
342 acpi_table_init();
343# ifdef CONFIG_ACPI_NUMA
344 acpi_numa_init();
345# endif
346#else
347# ifdef CONFIG_SMP
348 smp_build_cpu_map(); /* happens, e.g., with the Ski simulator */
349# endif
350#endif /* CONFIG_ACPI_BOOT */
351
352 find_memory();
353
354 /* process SAL system table: */
355 ia64_sal_init(efi.sal_systab);
356
357#ifdef CONFIG_SMP
358 cpu_physical_id(0) = hard_smp_processor_id();
359#endif
360
361 cpu_init(); /* initialize the bootstrap CPU */
362
363#ifdef CONFIG_ACPI_BOOT
364 acpi_boot_init();
365#endif
366
367#ifdef CONFIG_VT
368 if (!conswitchp) {
369# if defined(CONFIG_DUMMY_CONSOLE)
370 conswitchp = &dummy_con;
371# endif
372# if defined(CONFIG_VGA_CONSOLE)
373 /*
374 * Non-legacy systems may route legacy VGA MMIO range to system
375 * memory. vga_con probes the MMIO hole, so memory looks like
376 * a VGA device to it. The EFI memory map can tell us if it's
377 * memory so we can avoid this problem.
378 */
379 if (efi_mem_type(0xA0000) != EFI_CONVENTIONAL_MEMORY)
380 conswitchp = &vga_con;
381# endif
382 }
383#endif
384
385 /* enable IA-64 Machine Check Abort Handling unless disabled */
386 if (!strstr(saved_command_line, "nomca"))
387 ia64_mca_init();
388
389 platform_setup(cmdline_p);
390 paging_init();
391}
392
393/*
394 * Display cpu info for all cpu's.
395 */
396static int
397show_cpuinfo (struct seq_file *m, void *v)
398{
399#ifdef CONFIG_SMP
400# define lpj c->loops_per_jiffy
401# define cpunum c->cpu
402#else
403# define lpj loops_per_jiffy
404# define cpunum 0
405#endif
406 static struct {
407 unsigned long mask;
408 const char *feature_name;
409 } feature_bits[] = {
410 { 1UL << 0, "branchlong" },
411 { 1UL << 1, "spontaneous deferral"},
412 { 1UL << 2, "16-byte atomic ops" }
413 };
414 char family[32], features[128], *cp, sep;
415 struct cpuinfo_ia64 *c = v;
416 unsigned long mask;
417 int i;
418
419 mask = c->features;
420
421 switch (c->family) {
422 case 0x07: memcpy(family, "Itanium", 8); break;
423 case 0x1f: memcpy(family, "Itanium 2", 10); break;
424 default: sprintf(family, "%u", c->family); break;
425 }
426
427 /* build the feature string: */
428 memcpy(features, " standard", 10);
429 cp = features;
430 sep = 0;
431 for (i = 0; i < (int) ARRAY_SIZE(feature_bits); ++i) {
432 if (mask & feature_bits[i].mask) {
433 if (sep)
434 *cp++ = sep;
435 sep = ',';
436 *cp++ = ' ';
437 strcpy(cp, feature_bits[i].feature_name);
438 cp += strlen(feature_bits[i].feature_name);
439 mask &= ~feature_bits[i].mask;
440 }
441 }
442 if (mask) {
443 /* print unknown features as a hex value: */
444 if (sep)
445 *cp++ = sep;
446 sprintf(cp, " 0x%lx", mask);
447 }
448
449 seq_printf(m,
450 "processor : %d\n"
451 "vendor : %s\n"
452 "arch : IA-64\n"
453 "family : %s\n"
454 "model : %u\n"
455 "revision : %u\n"
456 "archrev : %u\n"
457 "features :%s\n" /* don't change this---it _is_ right! */
458 "cpu number : %lu\n"
459 "cpu regs : %u\n"
460 "cpu MHz : %lu.%06lu\n"
461 "itc MHz : %lu.%06lu\n"
462 "BogoMIPS : %lu.%02lu\n\n",
463 cpunum, c->vendor, family, c->model, c->revision, c->archrev,
464 features, c->ppn, c->number,
465 c->proc_freq / 1000000, c->proc_freq % 1000000,
466 c->itc_freq / 1000000, c->itc_freq % 1000000,
467 lpj*HZ/500000, (lpj*HZ/5000) % 100);
468 return 0;
469}
470
471static void *
472c_start (struct seq_file *m, loff_t *pos)
473{
474#ifdef CONFIG_SMP
475 while (*pos < NR_CPUS && !cpu_isset(*pos, cpu_online_map))
476 ++*pos;
477#endif
478 return *pos < NR_CPUS ? cpu_data(*pos) : NULL;
479}
480
481static void *
482c_next (struct seq_file *m, void *v, loff_t *pos)
483{
484 ++*pos;
485 return c_start(m, pos);
486}
487
488static void
489c_stop (struct seq_file *m, void *v)
490{
491}
492
493struct seq_operations cpuinfo_op = {
494 .start = c_start,
495 .next = c_next,
496 .stop = c_stop,
497 .show = show_cpuinfo
498};
499
500void
501identify_cpu (struct cpuinfo_ia64 *c)
502{
503 union {
504 unsigned long bits[5];
505 struct {
506 /* id 0 & 1: */
507 char vendor[16];
508
509 /* id 2 */
510 u64 ppn; /* processor serial number */
511
512 /* id 3: */
513 unsigned number : 8;
514 unsigned revision : 8;
515 unsigned model : 8;
516 unsigned family : 8;
517 unsigned archrev : 8;
518 unsigned reserved : 24;
519
520 /* id 4: */
521 u64 features;
522 } field;
523 } cpuid;
524 pal_vm_info_1_u_t vm1;
525 pal_vm_info_2_u_t vm2;
526 pal_status_t status;
527 unsigned long impl_va_msb = 50, phys_addr_size = 44; /* Itanium defaults */
528 int i;
529
530 for (i = 0; i < 5; ++i)
531 cpuid.bits[i] = ia64_get_cpuid(i);
532
533 memcpy(c->vendor, cpuid.field.vendor, 16);
534#ifdef CONFIG_SMP
535 c->cpu = smp_processor_id();
536#endif
537 c->ppn = cpuid.field.ppn;
538 c->number = cpuid.field.number;
539 c->revision = cpuid.field.revision;
540 c->model = cpuid.field.model;
541 c->family = cpuid.field.family;
542 c->archrev = cpuid.field.archrev;
543 c->features = cpuid.field.features;
544
545 status = ia64_pal_vm_summary(&vm1, &vm2);
546 if (status == PAL_STATUS_SUCCESS) {
547 impl_va_msb = vm2.pal_vm_info_2_s.impl_va_msb;
548 phys_addr_size = vm1.pal_vm_info_1_s.phys_add_size;
549 }
550 c->unimpl_va_mask = ~((7L<<61) | ((1L << (impl_va_msb + 1)) - 1));
551 c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1));
552}
553
554void
555setup_per_cpu_areas (void)
556{
557 /* start_kernel() requires this... */
558}
559
560static void
561get_max_cacheline_size (void)
562{
563 unsigned long line_size, max = 1;
564 u64 l, levels, unique_caches;
565 pal_cache_config_info_t cci;
566 s64 status;
567
568 status = ia64_pal_cache_summary(&levels, &unique_caches);
569 if (status != 0) {
570 printk(KERN_ERR "%s: ia64_pal_cache_summary() failed (status=%ld)\n",
571 __FUNCTION__, status);
572 max = SMP_CACHE_BYTES;
573 goto out;
574 }
575
576 for (l = 0; l < levels; ++l) {
577 status = ia64_pal_cache_config_info(l, /* cache_type (data_or_unified)= */ 2,
578 &cci);
579 if (status != 0) {
580 printk(KERN_ERR
581 "%s: ia64_pal_cache_config_info(l=%lu) failed (status=%ld)\n",
582 __FUNCTION__, l, status);
583 max = SMP_CACHE_BYTES;
584 }
585 line_size = 1 << cci.pcci_line_size;
586 if (line_size > max)
587 max = line_size;
588 }
589 out:
590 if (max > ia64_max_cacheline_size)
591 ia64_max_cacheline_size = max;
592}
593
594/*
595 * cpu_init() initializes state that is per-CPU. This function acts
596 * as a 'CPU state barrier', nothing should get across.
597 */
598void
599cpu_init (void)
600{
601 extern void __devinit ia64_mmu_init (void *);
602 unsigned long num_phys_stacked;
603 pal_vm_info_2_u_t vmi;
604 unsigned int max_ctx;
605 struct cpuinfo_ia64 *cpu_info;
606 void *cpu_data;
607
608 cpu_data = per_cpu_init();
609
610 /*
611 * We set ar.k3 so that assembly code in MCA handler can compute
612 * physical addresses of per cpu variables with a simple:
613 * phys = ar.k3 + &per_cpu_var
614 */
615 ia64_set_kr(IA64_KR_PER_CPU_DATA,
616 ia64_tpa(cpu_data) - (long) __per_cpu_start);
617
618 get_max_cacheline_size();
619
620 /*
621 * We can't pass "local_cpu_data" to identify_cpu() because we haven't called
622 * ia64_mmu_init() yet. And we can't call ia64_mmu_init() first because it
623 * depends on the data returned by identify_cpu(). We break the dependency by
624 * accessing cpu_data() through the canonical per-CPU address.
625 */
626 cpu_info = cpu_data + ((char *) &__ia64_per_cpu_var(cpu_info) - __per_cpu_start);
627 identify_cpu(cpu_info);
628
629#ifdef CONFIG_MCKINLEY
630 {
631# define FEATURE_SET 16
632 struct ia64_pal_retval iprv;
633
634 if (cpu_info->family == 0x1f) {
635 PAL_CALL_PHYS(iprv, PAL_PROC_GET_FEATURES, 0, FEATURE_SET, 0);
636 if ((iprv.status == 0) && (iprv.v0 & 0x80) && (iprv.v2 & 0x80))
637 PAL_CALL_PHYS(iprv, PAL_PROC_SET_FEATURES,
638 (iprv.v1 | 0x80), FEATURE_SET, 0);
639 }
640 }
641#endif
642
643 /* Clear the stack memory reserved for pt_regs: */
644 memset(ia64_task_regs(current), 0, sizeof(struct pt_regs));
645
646 ia64_set_kr(IA64_KR_FPU_OWNER, 0);
647
648 /*
649 * Initialize the page-table base register to a global
650	 * directory with all zeroes. This ensures that we can handle
651	 * TLB misses to user address-space even before we have created the
652 * first user address-space. This may happen, e.g., due to
653 * aggressive use of lfetch.fault.
654 */
655 ia64_set_kr(IA64_KR_PT_BASE, __pa(ia64_imva(empty_zero_page)));
656
657 /*
658 * Initialize default control register to defer all speculative faults. The
659 * kernel MUST NOT depend on a particular setting of these bits (in other words,
660 * the kernel must have recovery code for all speculative accesses). Turn on
661 * dcr.lc as per recommendation by the architecture team. Most IA-32 apps
662 * shouldn't be affected by this (moral: keep your ia32 locks aligned and you'll
663 * be fine).
664 */
665 ia64_setreg(_IA64_REG_CR_DCR, ( IA64_DCR_DP | IA64_DCR_DK | IA64_DCR_DX | IA64_DCR_DR
666 | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
667 atomic_inc(&init_mm.mm_count);
668 current->active_mm = &init_mm;
669 if (current->mm)
670 BUG();
671
672 ia64_mmu_init(ia64_imva(cpu_data));
673 ia64_mca_cpu_init(ia64_imva(cpu_data));
674
675#ifdef CONFIG_IA32_SUPPORT
676 ia32_cpu_init();
677#endif
678
679	/* Clear ITC to eliminate sched_clock() overflows in human time. */
680 ia64_set_itc(0);
681
682 /* disable all local interrupt sources: */
683 ia64_set_itv(1 << 16);
684 ia64_set_lrr0(1 << 16);
685 ia64_set_lrr1(1 << 16);
686 ia64_setreg(_IA64_REG_CR_PMV, 1 << 16);
687 ia64_setreg(_IA64_REG_CR_CMCV, 1 << 16);
688
689 /* clear TPR & XTP to enable all interrupt classes: */
690 ia64_setreg(_IA64_REG_CR_TPR, 0);
691#ifdef CONFIG_SMP
692 normal_xtp();
693#endif
694
695	/* set ia64_ctx.max_ctx to the maximum context number supported by all CPUs: */
696 if (ia64_pal_vm_summary(NULL, &vmi) == 0)
697 max_ctx = (1U << (vmi.pal_vm_info_2_s.rid_size - 3)) - 1;
698 else {
699 printk(KERN_WARNING "cpu_init: PAL VM summary failed, assuming 18 RID bits\n");
700 max_ctx = (1U << 15) - 1; /* use architected minimum */
701 }
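	/*
	 * Lock-free minimum: lower the shared ia64_ctx.max_ctx to this CPU's
	 * limit if it is smaller, retrying the cmpxchg if another CPU updates
	 * the value concurrently.
	 */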
702 while (max_ctx < ia64_ctx.max_ctx) {
703 unsigned int old = ia64_ctx.max_ctx;
704 if (cmpxchg(&ia64_ctx.max_ctx, old, max_ctx) == old)
705 break;
706 }
707
708 if (ia64_pal_rse_info(&num_phys_stacked, NULL) != 0) {
709 printk(KERN_WARNING "cpu_init: PAL RSE info failed; assuming 96 physical "
710 "stacked regs\n");
711 num_phys_stacked = 96;
712 }
713 /* size of physical stacked register partition plus 8 bytes: */
714 __get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8;
715 platform_cpu_init();
716}
717
718void
719check_bugs (void)
720{
721 ia64_patch_mckinley_e9((unsigned long) __start___mckinley_e9_bundles,
722 (unsigned long) __end___mckinley_e9_bundles);
723}
diff --git a/arch/ia64/kernel/sigframe.h b/arch/ia64/kernel/sigframe.h
new file mode 100644
index 000000000000..37b986cb86e0
--- /dev/null
+++ b/arch/ia64/kernel/sigframe.h
@@ -0,0 +1,25 @@
1struct sigscratch {
2 unsigned long scratch_unat; /* ar.unat for the general registers saved in pt */
3 unsigned long ar_pfs; /* for syscalls, the user-level function-state */
4 struct pt_regs pt;
5};
6
7struct sigframe {
8 /*
9 * Place signal handler args where user-level unwinder can find them easily.
10 * DO NOT MOVE THESE. They are part of the IA-64 Linux ABI and there is
11 * user-level code that depends on their presence!
12 */
13 unsigned long arg0; /* signum */
14 unsigned long arg1; /* siginfo pointer */
15 unsigned long arg2; /* sigcontext pointer */
16 /*
17 * End of architected state.
18 */
19
20 void __user *handler; /* pointer to the plabel of the signal handler */
21 struct siginfo info;
22 struct sigcontext sc;
23};
24
25extern long ia64_do_signal (sigset_t *, struct sigscratch *, long);
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
new file mode 100644
index 000000000000..6891d86937d9
--- /dev/null
+++ b/arch/ia64/kernel/signal.c
@@ -0,0 +1,691 @@
1/*
2 * Architecture-specific signal handling support.
3 *
4 * Copyright (C) 1999-2004 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com>
6 *
7 * Derived from i386 and Alpha versions.
8 */
9
10#include <linux/config.h>
11#include <linux/errno.h>
12#include <linux/kernel.h>
13#include <linux/mm.h>
14#include <linux/ptrace.h>
15#include <linux/sched.h>
16#include <linux/signal.h>
17#include <linux/smp.h>
18#include <linux/smp_lock.h>
19#include <linux/stddef.h>
20#include <linux/tty.h>
21#include <linux/binfmts.h>
22#include <linux/unistd.h>
23#include <linux/wait.h>
24
25#include <asm/ia32.h>
26#include <asm/intrinsics.h>
27#include <asm/uaccess.h>
28#include <asm/rse.h>
29#include <asm/sigcontext.h>
30
31#include "sigframe.h"
32
33#define DEBUG_SIG 0
34#define STACK_ALIGN 16 /* minimal alignment for stack pointer */
35#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
36
37#if _NSIG_WORDS > 1
38# define PUT_SIGSET(k,u) __copy_to_user((u)->sig, (k)->sig, sizeof(sigset_t))
39# define GET_SIGSET(k,u) __copy_from_user((k)->sig, (u)->sig, sizeof(sigset_t))
40#else
41# define PUT_SIGSET(k,u) __put_user((k)->sig[0], &(u)->sig[0])
42# define GET_SIGSET(k,u) __get_user((k)->sig[0], &(u)->sig[0])
43#endif
44
45long
46ia64_rt_sigsuspend (sigset_t __user *uset, size_t sigsetsize, struct sigscratch *scr)
47{
48 sigset_t oldset, set;
49
50 /* XXX: Don't preclude handling different sized sigset_t's. */
51 if (sigsetsize != sizeof(sigset_t))
52 return -EINVAL;
53
54 if (!access_ok(VERIFY_READ, uset, sigsetsize))
55 return -EFAULT;
56
57 if (GET_SIGSET(&set, uset))
58 return -EFAULT;
59
60 sigdelsetmask(&set, ~_BLOCKABLE);
61
62 spin_lock_irq(&current->sighand->siglock);
63 {
64 oldset = current->blocked;
65 current->blocked = set;
66 recalc_sigpending();
67 }
68 spin_unlock_irq(&current->sighand->siglock);
69
70 /*
71 * The return below usually returns to the signal handler. We need to
72 * pre-set the correct error code here to ensure that the right values
73 * get saved in sigcontext by ia64_do_signal.
74 */
75 scr->pt.r8 = EINTR;
76 scr->pt.r10 = -1;
77
78 while (1) {
79 current->state = TASK_INTERRUPTIBLE;
80 schedule();
81 if (ia64_do_signal(&oldset, scr, 1))
82 return -EINTR;
83 }
84}
85
86asmlinkage long
87sys_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, long arg2,
88 long arg3, long arg4, long arg5, long arg6, long arg7,
89 struct pt_regs regs)
90{
91 return do_sigaltstack(uss, uoss, regs.r12);
92}
93
94static long
95restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr)
96{
97 unsigned long ip, flags, nat, um, cfm;
98 long err;
99
100 /* Always make any pending restarted system calls return -EINTR */
101 current_thread_info()->restart_block.fn = do_no_restart_syscall;
102
103	/* restore the scratch state that always gets updated during signal delivery: */
104 err = __get_user(flags, &sc->sc_flags);
105 err |= __get_user(nat, &sc->sc_nat);
106 err |= __get_user(ip, &sc->sc_ip); /* instruction pointer */
107 err |= __get_user(cfm, &sc->sc_cfm);
108 err |= __get_user(um, &sc->sc_um); /* user mask */
109 err |= __get_user(scr->pt.ar_rsc, &sc->sc_ar_rsc);
110 err |= __get_user(scr->pt.ar_unat, &sc->sc_ar_unat);
111 err |= __get_user(scr->pt.ar_fpsr, &sc->sc_ar_fpsr);
112 err |= __get_user(scr->pt.ar_pfs, &sc->sc_ar_pfs);
113 err |= __get_user(scr->pt.pr, &sc->sc_pr); /* predicates */
114 err |= __get_user(scr->pt.b0, &sc->sc_br[0]); /* b0 (rp) */
115 err |= __get_user(scr->pt.b6, &sc->sc_br[6]); /* b6 */
116 err |= __copy_from_user(&scr->pt.r1, &sc->sc_gr[1], 8); /* r1 */
117 err |= __copy_from_user(&scr->pt.r8, &sc->sc_gr[8], 4*8); /* r8-r11 */
118 err |= __copy_from_user(&scr->pt.r12, &sc->sc_gr[12], 2*8); /* r12-r13 */
119 err |= __copy_from_user(&scr->pt.r15, &sc->sc_gr[15], 8); /* r15 */
120
121 scr->pt.cr_ifs = cfm | (1UL << 63);
122
123 /* establish new instruction pointer: */
124 scr->pt.cr_iip = ip & ~0x3UL;
125 ia64_psr(&scr->pt)->ri = ip & 0x3;
126 scr->pt.cr_ipsr = (scr->pt.cr_ipsr & ~IA64_PSR_UM) | (um & IA64_PSR_UM);
127
128 scr->scratch_unat = ia64_put_scratch_nat_bits(&scr->pt, nat);
129
130 if (!(flags & IA64_SC_FLAG_IN_SYSCALL)) {
131 /* Restore most scratch-state only when not in syscall. */
132 err |= __get_user(scr->pt.ar_ccv, &sc->sc_ar_ccv); /* ar.ccv */
133 err |= __get_user(scr->pt.b7, &sc->sc_br[7]); /* b7 */
134 err |= __get_user(scr->pt.r14, &sc->sc_gr[14]); /* r14 */
135 err |= __copy_from_user(&scr->pt.ar_csd, &sc->sc_ar25, 2*8); /* ar.csd & ar.ssd */
136 err |= __copy_from_user(&scr->pt.r2, &sc->sc_gr[2], 2*8); /* r2-r3 */
137 err |= __copy_from_user(&scr->pt.r16, &sc->sc_gr[16], 16*8); /* r16-r31 */
138 }
139
140 if ((flags & IA64_SC_FLAG_FPH_VALID) != 0) {
141 struct ia64_psr *psr = ia64_psr(&scr->pt);
142
143 __copy_from_user(current->thread.fph, &sc->sc_fr[32], 96*16);
144 psr->mfh = 0; /* drop signal handler's fph contents... */
145 if (psr->dfh)
146 ia64_drop_fpu(current);
147 else {
148 /* We already own the local fph, otherwise psr->dfh wouldn't be 0. */
149 __ia64_load_fpu(current->thread.fph);
150 ia64_set_local_fpu_owner(current);
151 }
152 }
153 return err;
154}
155
156int
157copy_siginfo_to_user (siginfo_t __user *to, siginfo_t *from)
158{
159 if (!access_ok(VERIFY_WRITE, to, sizeof(siginfo_t)))
160 return -EFAULT;
161 if (from->si_code < 0) {
162 if (__copy_to_user(to, from, sizeof(siginfo_t)))
163 return -EFAULT;
164 return 0;
165 } else {
166 int err;
167
168 /*
169 * If you change siginfo_t structure, please be sure this code is fixed
170 * accordingly. It should never copy any pad contained in the structure
171 * to avoid security leaks, but must copy the generic 3 ints plus the
172 * relevant union member.
173 */
174 err = __put_user(from->si_signo, &to->si_signo);
175 err |= __put_user(from->si_errno, &to->si_errno);
176 err |= __put_user((short)from->si_code, &to->si_code);
177 switch (from->si_code >> 16) {
178 case __SI_FAULT >> 16:
179 err |= __put_user(from->si_flags, &to->si_flags);
180 err |= __put_user(from->si_isr, &to->si_isr);
181 case __SI_POLL >> 16:
182 err |= __put_user(from->si_addr, &to->si_addr);
183 err |= __put_user(from->si_imm, &to->si_imm);
184 break;
185 case __SI_TIMER >> 16:
186 err |= __put_user(from->si_tid, &to->si_tid);
187 err |= __put_user(from->si_overrun, &to->si_overrun);
188 err |= __put_user(from->si_ptr, &to->si_ptr);
189 break;
190 case __SI_RT >> 16: /* Not generated by the kernel as of now. */
191 case __SI_MESGQ >> 16:
192 err |= __put_user(from->si_uid, &to->si_uid);
193 err |= __put_user(from->si_pid, &to->si_pid);
194 err |= __put_user(from->si_ptr, &to->si_ptr);
195 break;
196 case __SI_CHLD >> 16:
197 err |= __put_user(from->si_utime, &to->si_utime);
198 err |= __put_user(from->si_stime, &to->si_stime);
199 err |= __put_user(from->si_status, &to->si_status);
200 default:
201 err |= __put_user(from->si_uid, &to->si_uid);
202 err |= __put_user(from->si_pid, &to->si_pid);
203 break;
204 }
205 return err;
206 }
207}
208
209long
210ia64_rt_sigreturn (struct sigscratch *scr)
211{
212 extern char ia64_strace_leave_kernel, ia64_leave_kernel;
213 struct sigcontext __user *sc;
214 struct siginfo si;
215 sigset_t set;
216 long retval;
217
218 sc = &((struct sigframe __user *) (scr->pt.r12 + 16))->sc;
219
220 /*
221 * When we return to the previously executing context, r8 and r10 have already
222	 * been set up the way we want them. Indeed, if the signal wasn't delivered while
223 * in a system call, we must not touch r8 or r10 as otherwise user-level state
224 * could be corrupted.
225 */
226 retval = (long) &ia64_leave_kernel;
227 if (test_thread_flag(TIF_SYSCALL_TRACE))
228 /*
229 * strace expects to be notified after sigreturn returns even though the
230 * context to which we return may not be in the middle of a syscall.
231 * Thus, the return-value that strace displays for sigreturn is
232 * meaningless.
233 */
234 retval = (long) &ia64_strace_leave_kernel;
235
236 if (!access_ok(VERIFY_READ, sc, sizeof(*sc)))
237 goto give_sigsegv;
238
239 if (GET_SIGSET(&set, &sc->sc_mask))
240 goto give_sigsegv;
241
242 sigdelsetmask(&set, ~_BLOCKABLE);
243
244 spin_lock_irq(&current->sighand->siglock);
245 {
246 current->blocked = set;
247 recalc_sigpending();
248 }
249 spin_unlock_irq(&current->sighand->siglock);
250
251 if (restore_sigcontext(sc, scr))
252 goto give_sigsegv;
253
254#if DEBUG_SIG
255 printk("SIG return (%s:%d): sp=%lx ip=%lx\n",
256 current->comm, current->pid, scr->pt.r12, scr->pt.cr_iip);
257#endif
258 /*
259 * It is more difficult to avoid calling this function than to
260 * call it and ignore errors.
261 */
262 do_sigaltstack(&sc->sc_stack, NULL, scr->pt.r12);
263 return retval;
264
265 give_sigsegv:
266 si.si_signo = SIGSEGV;
267 si.si_errno = 0;
268 si.si_code = SI_KERNEL;
269 si.si_pid = current->pid;
270 si.si_uid = current->uid;
271 si.si_addr = sc;
272 force_sig_info(SIGSEGV, &si, current);
273 return retval;
274}
275
276/*
277 * This does just the minimum required setup of sigcontext.
278 * Specifically, it only installs data that is either not knowable at
279 * the user-level or that gets modified before execution in the
280 * trampoline starts. Everything else is done at the user-level.
281 */
282static long
283setup_sigcontext (struct sigcontext __user *sc, sigset_t *mask, struct sigscratch *scr)
284{
285 unsigned long flags = 0, ifs, cfm, nat;
286 long err;
287
288 ifs = scr->pt.cr_ifs;
289
290 if (on_sig_stack((unsigned long) sc))
291 flags |= IA64_SC_FLAG_ONSTACK;
292 if ((ifs & (1UL << 63)) == 0)
293 /* if cr_ifs doesn't have the valid bit set, we got here through a syscall */
294 flags |= IA64_SC_FLAG_IN_SYSCALL;
295 cfm = ifs & ((1UL << 38) - 1);
296 ia64_flush_fph(current);
297 if ((current->thread.flags & IA64_THREAD_FPH_VALID)) {
298 flags |= IA64_SC_FLAG_FPH_VALID;
299 __copy_to_user(&sc->sc_fr[32], current->thread.fph, 96*16);
300 }
301
302 nat = ia64_get_scratch_nat_bits(&scr->pt, scr->scratch_unat);
303
304 err = __put_user(flags, &sc->sc_flags);
305 err |= __put_user(nat, &sc->sc_nat);
306 err |= PUT_SIGSET(mask, &sc->sc_mask);
307 err |= __put_user(cfm, &sc->sc_cfm);
308 err |= __put_user(scr->pt.cr_ipsr & IA64_PSR_UM, &sc->sc_um);
309 err |= __put_user(scr->pt.ar_rsc, &sc->sc_ar_rsc);
310 err |= __put_user(scr->pt.ar_unat, &sc->sc_ar_unat); /* ar.unat */
311 err |= __put_user(scr->pt.ar_fpsr, &sc->sc_ar_fpsr); /* ar.fpsr */
312 err |= __put_user(scr->pt.ar_pfs, &sc->sc_ar_pfs);
313 err |= __put_user(scr->pt.pr, &sc->sc_pr); /* predicates */
314 err |= __put_user(scr->pt.b0, &sc->sc_br[0]); /* b0 (rp) */
315 err |= __put_user(scr->pt.b6, &sc->sc_br[6]); /* b6 */
316 err |= __copy_to_user(&sc->sc_gr[1], &scr->pt.r1, 8); /* r1 */
317 err |= __copy_to_user(&sc->sc_gr[8], &scr->pt.r8, 4*8); /* r8-r11 */
318 err |= __copy_to_user(&sc->sc_gr[12], &scr->pt.r12, 2*8); /* r12-r13 */
319 err |= __copy_to_user(&sc->sc_gr[15], &scr->pt.r15, 8); /* r15 */
320 err |= __put_user(scr->pt.cr_iip + ia64_psr(&scr->pt)->ri, &sc->sc_ip);
321
322 if (flags & IA64_SC_FLAG_IN_SYSCALL) {
323 /* Clear scratch registers if the signal interrupted a system call. */
324 err |= __put_user(0, &sc->sc_ar_ccv); /* ar.ccv */
325 err |= __put_user(0, &sc->sc_br[7]); /* b7 */
326 err |= __put_user(0, &sc->sc_gr[14]); /* r14 */
327 err |= __clear_user(&sc->sc_ar25, 2*8); /* ar.csd & ar.ssd */
328 err |= __clear_user(&sc->sc_gr[2], 2*8); /* r2-r3 */
329 err |= __clear_user(&sc->sc_gr[16], 16*8); /* r16-r31 */
330 } else {
331 /* Copy scratch regs to sigcontext if the signal didn't interrupt a syscall. */
332 err |= __put_user(scr->pt.ar_ccv, &sc->sc_ar_ccv); /* ar.ccv */
333 err |= __put_user(scr->pt.b7, &sc->sc_br[7]); /* b7 */
334 err |= __put_user(scr->pt.r14, &sc->sc_gr[14]); /* r14 */
335 err |= __copy_to_user(&sc->sc_ar25, &scr->pt.ar_csd, 2*8); /* ar.csd & ar.ssd */
336 err |= __copy_to_user(&sc->sc_gr[2], &scr->pt.r2, 2*8); /* r2-r3 */
337 err |= __copy_to_user(&sc->sc_gr[16], &scr->pt.r16, 16*8); /* r16-r31 */
338 }
339 return err;
340}
341
342/*
343 * Check whether the register-backing store is already on the signal stack.
344 */
345static inline int
346rbs_on_sig_stack (unsigned long bsp)
347{
348 return (bsp - current->sas_ss_sp < current->sas_ss_size);
349}
350
351static long
352force_sigsegv_info (int sig, void __user *addr)
353{
354 unsigned long flags;
355 struct siginfo si;
356
357 if (sig == SIGSEGV) {
358 /*
359 * Acquiring siglock around the sa_handler-update is almost
360 * certainly overkill, but this isn't a
361 * performance-critical path and I'd rather play it safe
362 * here than having to debug a nasty race if and when
363 * something changes in kernel/signal.c that would make it
364 * no longer safe to modify sa_handler without holding the
365 * lock.
366 */
367 spin_lock_irqsave(&current->sighand->siglock, flags);
368 current->sighand->action[sig - 1].sa.sa_handler = SIG_DFL;
369 spin_unlock_irqrestore(&current->sighand->siglock, flags);
370 }
371 si.si_signo = SIGSEGV;
372 si.si_errno = 0;
373 si.si_code = SI_KERNEL;
374 si.si_pid = current->pid;
375 si.si_uid = current->uid;
376 si.si_addr = addr;
377 force_sig_info(SIGSEGV, &si, current);
378 return 0;
379}
380
381static long
382setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set,
383 struct sigscratch *scr)
384{
385 extern char __kernel_sigtramp[];
386 unsigned long tramp_addr, new_rbs = 0;
387 struct sigframe __user *frame;
388 long err;
389
390 frame = (void __user *) scr->pt.r12;
391 tramp_addr = (unsigned long) __kernel_sigtramp;
392 if ((ka->sa.sa_flags & SA_ONSTACK) && sas_ss_flags((unsigned long) frame) == 0) {
393 frame = (void __user *) ((current->sas_ss_sp + current->sas_ss_size)
394 & ~(STACK_ALIGN - 1));
395 /*
396 * We need to check for the register stack being on the signal stack
397 * separately, because it's switched separately (memory stack is switched
398 * in the kernel, register stack is switched in the signal trampoline).
399 */
400 if (!rbs_on_sig_stack(scr->pt.ar_bspstore))
401 new_rbs = (current->sas_ss_sp + sizeof(long) - 1) & ~(sizeof(long) - 1);
402 }
403 frame = (void __user *) frame - ((sizeof(*frame) + STACK_ALIGN - 1) & ~(STACK_ALIGN - 1));
404
405 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
406 return force_sigsegv_info(sig, frame);
407
408 err = __put_user(sig, &frame->arg0);
409 err |= __put_user(&frame->info, &frame->arg1);
410 err |= __put_user(&frame->sc, &frame->arg2);
411 err |= __put_user(new_rbs, &frame->sc.sc_rbs_base);
412 err |= __put_user(0, &frame->sc.sc_loadrs); /* initialize to zero */
413 err |= __put_user(ka->sa.sa_handler, &frame->handler);
414
415 err |= copy_siginfo_to_user(&frame->info, info);
416
417 err |= __put_user(current->sas_ss_sp, &frame->sc.sc_stack.ss_sp);
418 err |= __put_user(current->sas_ss_size, &frame->sc.sc_stack.ss_size);
419 err |= __put_user(sas_ss_flags(scr->pt.r12), &frame->sc.sc_stack.ss_flags);
420 err |= setup_sigcontext(&frame->sc, set, scr);
421
422 if (unlikely(err))
423 return force_sigsegv_info(sig, frame);
424
425 scr->pt.r12 = (unsigned long) frame - 16; /* new stack pointer */
426 scr->pt.ar_fpsr = FPSR_DEFAULT; /* reset fpsr for signal handler */
427 scr->pt.cr_iip = tramp_addr;
428 ia64_psr(&scr->pt)->ri = 0; /* start executing in first slot */
429 ia64_psr(&scr->pt)->be = 0; /* force little-endian byte-order */
430 /*
431 * Force the interruption function mask to zero. This has no effect when a
432	 * system call was interrupted by a signal (since, in that case, scr->pt.cr_ifs is
433 * ignored), but it has the desirable effect of making it possible to deliver a
434 * signal with an incomplete register frame (which happens when a mandatory RSE
435	 * load faults). Furthermore, it has no negative effect on getting the user's
436 * dirty partition preserved, because that's governed by scr->pt.loadrs.
437 */
438 scr->pt.cr_ifs = (1UL << 63);
439
440 /*
441 * Note: this affects only the NaT bits of the scratch regs (the ones saved in
442 * pt_regs), which is exactly what we want.
443 */
444	scr->scratch_unat = 0; /* ensure the NaT bit of r12 is clear */
445
446#if DEBUG_SIG
447 printk("SIG deliver (%s:%d): sig=%d sp=%lx ip=%lx handler=%p\n",
448 current->comm, current->pid, sig, scr->pt.r12, frame->sc.sc_ip, frame->handler);
449#endif
450 return 1;
451}
452
453static long
454handle_signal (unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset,
455 struct sigscratch *scr)
456{
457 if (IS_IA32_PROCESS(&scr->pt)) {
458 /* send signal to IA-32 process */
459 if (!ia32_setup_frame1(sig, ka, info, oldset, &scr->pt))
460 return 0;
461 } else
462 /* send signal to IA-64 process */
463 if (!setup_frame(sig, ka, info, oldset, scr))
464 return 0;
465
466 if (!(ka->sa.sa_flags & SA_NODEFER)) {
467 spin_lock_irq(&current->sighand->siglock);
468 {
469 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
470 sigaddset(&current->blocked, sig);
471 recalc_sigpending();
472 }
473 spin_unlock_irq(&current->sighand->siglock);
474 }
475 return 1;
476}
477
478/*
479 * Note that `init' is a special process: it doesn't get signals it doesn't want to
480 * handle. Thus you cannot kill init with SIGKILL, not even by mistake.
481 */
482long
483ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall)
484{
485 struct k_sigaction ka;
486 siginfo_t info;
487 long restart = in_syscall;
488 long errno = scr->pt.r8;
489# define ERR_CODE(c) (IS_IA32_PROCESS(&scr->pt) ? -(c) : (c))
490
491 /*
492 * In the ia64_leave_kernel code path, we want the common case to go fast, which
493 * is why we may in certain cases get here from kernel mode. Just return without
494 * doing anything if so.
495 */
496 if (!user_mode(&scr->pt))
497 return 0;
498
499 if (!oldset)
500 oldset = &current->blocked;
501
502 /*
503 * This only loops in the rare cases of handle_signal() failing, in which case we
504 * need to push through a forced SIGSEGV.
505 */
506 while (1) {
507 int signr = get_signal_to_deliver(&info, &ka, &scr->pt, NULL);
508
509 /*
510 * get_signal_to_deliver() may have run a debugger (via notify_parent())
511 * and the debugger may have modified the state (e.g., to arrange for an
512 * inferior call), thus it's important to check for restarting _after_
513 * get_signal_to_deliver().
514 */
515 if (IS_IA32_PROCESS(&scr->pt)) {
516 if (in_syscall) {
517 if (errno >= 0)
518 restart = 0;
519 else
520 errno = -errno;
521 }
522 } else if ((long) scr->pt.r10 != -1)
523 /*
524			 * A system call has to be restarted only if one of the error codes
525 * ERESTARTNOHAND, ERESTARTSYS, or ERESTARTNOINTR is returned. If r10
526 * isn't -1 then r8 doesn't hold an error code and we don't need to
527 * restart the syscall, so we can clear the "restart" flag here.
528 */
529 restart = 0;
530
531 if (signr <= 0)
532 break;
533
534 if (unlikely(restart)) {
535 switch (errno) {
536 case ERESTART_RESTARTBLOCK:
537 case ERESTARTNOHAND:
538 scr->pt.r8 = ERR_CODE(EINTR);
539 /* note: scr->pt.r10 is already -1 */
540 break;
541
542 case ERESTARTSYS:
543 if ((ka.sa.sa_flags & SA_RESTART) == 0) {
544 scr->pt.r8 = ERR_CODE(EINTR);
545 /* note: scr->pt.r10 is already -1 */
546 break;
547 }
548 case ERESTARTNOINTR:
549 if (IS_IA32_PROCESS(&scr->pt)) {
550 scr->pt.r8 = scr->pt.r1;
551 scr->pt.cr_iip -= 2;
552 } else
553 ia64_decrement_ip(&scr->pt);
554 restart = 0; /* don't restart twice if handle_signal() fails... */
555 }
556 }
557
558 /*
559 * Whee! Actually deliver the signal. If the delivery failed, we need to
560 * continue to iterate in this loop so we can deliver the SIGSEGV...
561 */
562 if (handle_signal(signr, &ka, &info, oldset, scr))
563 return 1;
564 }
565
566 /* Did we come from a system call? */
567 if (restart) {
568 /* Restart the system call - no handlers present */
569 if (errno == ERESTARTNOHAND || errno == ERESTARTSYS || errno == ERESTARTNOINTR
570 || errno == ERESTART_RESTARTBLOCK)
571 {
572 if (IS_IA32_PROCESS(&scr->pt)) {
573 scr->pt.r8 = scr->pt.r1;
574 scr->pt.cr_iip -= 2;
575 if (errno == ERESTART_RESTARTBLOCK)
576 scr->pt.r8 = 0; /* x86 version of __NR_restart_syscall */
577 } else {
578 /*
579 * Note: the syscall number is in r15 which is saved in
580 * pt_regs so all we need to do here is adjust ip so that
581 * the "break" instruction gets re-executed.
582 */
583 ia64_decrement_ip(&scr->pt);
584 if (errno == ERESTART_RESTARTBLOCK)
585 scr->pt.r15 = __NR_restart_syscall;
586 }
587 }
588 }
589 return 0;
590}
591
592/* Set a delayed signal that was detected in MCA/INIT/NMI/PMI context where it
593 * could not be delivered. It is important that the target process is not
594 * allowed to do any more work in user space. Possible cases for the target
595 * process:
596 *
597 * - It is sleeping and will wake up soon. Store the data in the current task,
598 * the signal will be sent when the current task returns from the next
599 * interrupt.
600 *
601 * - It is running in user context. Store the data in the current task, the
602 * signal will be sent when the current task returns from the next interrupt.
603 *
604 * - It is running in kernel context on this or another cpu and will return to
605 * user context. Store the data in the target task, the signal will be sent
606 * to itself when the target task returns to user space.
607 *
608 * - It is running in kernel context on this cpu and will sleep before
609 * returning to user context. Because this is also the current task, the
610 * signal will not get delivered and the task could sleep indefinitely.
611 * Store the data in the idle task for this cpu, the signal will be sent
612 * after the idle task processes its next interrupt.
613 *
614 * To cover all cases, store the data in the target task, the current task and
615 * the idle task on this cpu. Whatever happens, the signal will be delivered
616 * to the target task before it can do any useful user space work. Multiple
617 * deliveries have no unwanted side effects.
618 *
619 * Note: This code is executed in MCA/INIT/NMI/PMI context, with interrupts
620 * disabled. It must not take any locks nor use kernel structures or services
621 * that require locks.
622 */
623
624/* To ensure that we get the right pid, check its start time. To avoid extra
625 * include files in thread_info.h, convert the task start_time to unsigned long,
626 * giving us a cycle time of > 580 years.
627 */
628static inline unsigned long
629start_time_ul(const struct task_struct *t)
630{
631 return t->start_time.tv_sec * NSEC_PER_SEC + t->start_time.tv_nsec;
632}
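/*
 * Quick check of the "> 580 years" figure above: an unsigned long wraps after
 * 2^64 ns, roughly 1.8e19 ns, and one year is about 3.16e16 ns, so the wrap
 * period is roughly 584 years.
 */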
633
634void
635set_sigdelayed(pid_t pid, int signo, int code, void __user *addr)
636{
637 struct task_struct *t;
638 unsigned long start_time = 0;
639 int i;
640
641 for (i = 1; i <= 3; ++i) {
642 switch (i) {
643 case 1:
644 t = find_task_by_pid(pid);
645 if (t)
646 start_time = start_time_ul(t);
647 break;
648 case 2:
649 t = current;
650 break;
651 default:
652 t = idle_task(smp_processor_id());
653 break;
654 }
655
656 if (!t)
657 return;
658 t->thread_info->sigdelayed.signo = signo;
659 t->thread_info->sigdelayed.code = code;
660 t->thread_info->sigdelayed.addr = addr;
661 t->thread_info->sigdelayed.start_time = start_time;
662 t->thread_info->sigdelayed.pid = pid;
663 wmb();
664 set_tsk_thread_flag(t, TIF_SIGDELAYED);
665 }
666}
667
668/* Called from entry.S when it detects TIF_SIGDELAYED, a delayed signal that
669 * was detected in MCA/INIT/NMI/PMI context where it could not be delivered.
670 */
671
672void
673do_sigdelayed(void)
674{
675 struct siginfo siginfo;
676 pid_t pid;
677 struct task_struct *t;
678
679 clear_thread_flag(TIF_SIGDELAYED);
680 memset(&siginfo, 0, sizeof(siginfo));
681 siginfo.si_signo = current_thread_info()->sigdelayed.signo;
682 siginfo.si_code = current_thread_info()->sigdelayed.code;
683 siginfo.si_addr = current_thread_info()->sigdelayed.addr;
684 pid = current_thread_info()->sigdelayed.pid;
685 t = find_task_by_pid(pid);
686 if (!t)
687 return;
688 if (current_thread_info()->sigdelayed.start_time != start_time_ul(t))
689 return;
690 force_sig_info(siginfo.si_signo, &siginfo, t);
691}
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
new file mode 100644
index 000000000000..953095e2ce15
--- /dev/null
+++ b/arch/ia64/kernel/smp.c
@@ -0,0 +1,376 @@
1/*
2 * SMP Support
3 *
4 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
5 * Copyright (C) 1999, 2001, 2003 David Mosberger-Tang <davidm@hpl.hp.com>
6 *
7 * Lots of stuff stolen from arch/alpha/kernel/smp.c
8 *
9 * 01/05/16 Rohit Seth <rohit.seth@intel.com> IA64-SMP functions. Reorganized
10 * the existing code (on the lines of x86 port).
11 * 00/09/11 David Mosberger <davidm@hpl.hp.com> Do loops_per_jiffy
12 * calibration on each CPU.
13 * 00/08/23 Asit Mallick <asit.k.mallick@intel.com> fixed logical processor id
14 * 00/03/31 Rohit Seth <rohit.seth@intel.com> Fixes for Bootstrap Processor
15 * & cpu_online_map now gets done here (instead of setup.c)
16 * 99/10/05 davidm Update to bring it in sync with new command-line processing
17 * scheme.
18 * 10/13/00 Goutham Rao <goutham.rao@intel.com> Updated smp_call_function and
19 * smp_call_function_single to resend IPI on timeouts
20 */
21#include <linux/module.h>
22#include <linux/kernel.h>
23#include <linux/sched.h>
24#include <linux/init.h>
25#include <linux/interrupt.h>
26#include <linux/smp.h>
27#include <linux/kernel_stat.h>
28#include <linux/mm.h>
29#include <linux/cache.h>
30#include <linux/delay.h>
31#include <linux/efi.h>
32#include <linux/bitops.h>
33
34#include <asm/atomic.h>
35#include <asm/current.h>
36#include <asm/delay.h>
37#include <asm/machvec.h>
38#include <asm/io.h>
39#include <asm/irq.h>
40#include <asm/page.h>
41#include <asm/pgalloc.h>
42#include <asm/pgtable.h>
43#include <asm/processor.h>
44#include <asm/ptrace.h>
45#include <asm/sal.h>
46#include <asm/system.h>
47#include <asm/tlbflush.h>
48#include <asm/unistd.h>
49#include <asm/mca.h>
50
51/*
52 * Structure and data for smp_call_function(). This is designed to minimise static memory
53 * requirements. It also looks cleaner.
54 */
55static __cacheline_aligned DEFINE_SPINLOCK(call_lock);
56
57struct call_data_struct {
58 void (*func) (void *info);
59 void *info;
60 long wait;
61 atomic_t started;
62 atomic_t finished;
63};
64
65static volatile struct call_data_struct *call_data;
66
67#define IPI_CALL_FUNC 0
68#define IPI_CPU_STOP 1
69
70/* This needs to be cacheline aligned because it is written to by *other* CPUs. */
71static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned;
72
73extern void cpu_halt (void);
74
75void
76lock_ipi_calllock(void)
77{
78 spin_lock_irq(&call_lock);
79}
80
81void
82unlock_ipi_calllock(void)
83{
84 spin_unlock_irq(&call_lock);
85}
86
87static void
88stop_this_cpu (void)
89{
90 /*
91 * Remove this CPU:
92 */
93 cpu_clear(smp_processor_id(), cpu_online_map);
94 max_xtp();
95 local_irq_disable();
96 cpu_halt();
97}
98
99void
100cpu_die(void)
101{
102 max_xtp();
103 local_irq_disable();
104 cpu_halt();
105 /* Should never be here */
106 BUG();
107 for (;;);
108}
109
110irqreturn_t
111handle_IPI (int irq, void *dev_id, struct pt_regs *regs)
112{
113 int this_cpu = get_cpu();
114 unsigned long *pending_ipis = &__ia64_per_cpu_var(ipi_operation);
115 unsigned long ops;
116
117 mb(); /* Order interrupt and bit testing. */
118 while ((ops = xchg(pending_ipis, 0)) != 0) {
119 mb(); /* Order bit clearing and data access. */
120 do {
121 unsigned long which;
122
123 which = ffz(~ops);
124 ops &= ~(1 << which);
125
126 switch (which) {
127 case IPI_CALL_FUNC:
128 {
129 struct call_data_struct *data;
130 void (*func)(void *info);
131 void *info;
132 int wait;
133
134 /* release the 'pointer lock' */
135 data = (struct call_data_struct *) call_data;
136 func = data->func;
137 info = data->info;
138 wait = data->wait;
139
140 mb();
141 atomic_inc(&data->started);
142 /*
143 * At this point the structure may be gone unless
144 * wait is true.
145 */
146 (*func)(info);
147
148 /* Notify the sending CPU that the task is done. */
149 mb();
150 if (wait)
151 atomic_inc(&data->finished);
152 }
153 break;
154
155 case IPI_CPU_STOP:
156 stop_this_cpu();
157 break;
158
159 default:
160 printk(KERN_CRIT "Unknown IPI on CPU %d: %lu\n", this_cpu, which);
161 break;
162 }
163 } while (ops);
164 mb(); /* Order data access and bit testing. */
165 }
166 put_cpu();
167 return IRQ_HANDLED;
168}
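handle_IPI() treats ipi_operation as a bitmask of pending IPI types: xchg() grabs and clears the word atomically, and ffz(~ops) picks off the lowest set bit each time around the inner loop. A minimal user-space sketch of the same bit walk, with __builtin_ctzl() standing in for ffz(~ops):

#include <stdio.h>

int main(void)
{
	/* Each set bit in 'ops' names one pending IPI type; service them
	 * lowest bit first, clearing each bit as it is handled. */
	unsigned long ops = (1UL << 0) | (1UL << 1);	/* IPI_CALL_FUNC and IPI_CPU_STOP */

	while (ops) {
		unsigned long which = __builtin_ctzl(ops);	/* index of lowest set bit */

		ops &= ~(1UL << which);
		printf("servicing IPI op %lu\n", which);
	}
	return 0;
}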
169
170/*
171 * Called with preeemption disabled.
172 */
173static inline void
174send_IPI_single (int dest_cpu, int op)
175{
176 set_bit(op, &per_cpu(ipi_operation, dest_cpu));
177 platform_send_ipi(dest_cpu, IA64_IPI_VECTOR, IA64_IPI_DM_INT, 0);
178}
179
180/*
181 * Called with preemption disabled.
182 */
183static inline void
184send_IPI_allbutself (int op)
185{
186 unsigned int i;
187
188 for (i = 0; i < NR_CPUS; i++) {
189 if (cpu_online(i) && i != smp_processor_id())
190 send_IPI_single(i, op);
191 }
192}
193
194/*
195 * Called with preemption disabled.
196 */
197static inline void
198send_IPI_all (int op)
199{
200 int i;
201
202 for (i = 0; i < NR_CPUS; i++)
203 if (cpu_online(i))
204 send_IPI_single(i, op);
205}
206
207/*
208 * Called with preemption disabled.
209 */
210static inline void
211send_IPI_self (int op)
212{
213 send_IPI_single(smp_processor_id(), op);
214}
215
216/*
217 * Called with preemption disabled.
218 */
219void
220smp_send_reschedule (int cpu)
221{
222 platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0);
223}
224
225void
226smp_flush_tlb_all (void)
227{
228 on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1, 1);
229}
230
231void
232smp_flush_tlb_mm (struct mm_struct *mm)
233{
234 /* this happens for the common case of a single-threaded fork(): */
235 if (likely(mm == current->active_mm && atomic_read(&mm->mm_users) == 1))
236 {
237 local_finish_flush_tlb_mm(mm);
238 return;
239 }
240
241 /*
242 * We could optimize this further by using mm->cpu_vm_mask to track which CPUs
243 * have been running in the address space. It's not clear that this is worth the
244 * trouble though: to avoid races, we have to raise the IPI on the target CPU
245 * anyhow, and once a CPU is interrupted, the cost of local_flush_tlb_all() is
246 * rather trivial.
247 */
248 on_each_cpu((void (*)(void *))local_finish_flush_tlb_mm, mm, 1, 1);
249}
250
251/*
252 * Run a function on another CPU
253 * <func> The function to run. This must be fast and non-blocking.
254 * <info> An arbitrary pointer to pass to the function.
255 * <nonatomic> Currently unused.
256 * <wait> If true, wait until function has completed on other CPUs.
257 * [RETURNS] 0 on success, else a negative status code.
258 *
259 * Does not return until the remote CPU is nearly ready to execute <func>
260 * or is already executing or has executed it.
261 */
262
263int
264smp_call_function_single (int cpuid, void (*func) (void *info), void *info, int nonatomic,
265 int wait)
266{
267 struct call_data_struct data;
268 int cpus = 1;
269 int me = get_cpu(); /* prevent preemption and reschedule on another processor */
270
271 if (cpuid == me) {
272 printk("%s: trying to call self\n", __FUNCTION__);
273 put_cpu();
274 return -EBUSY;
275 }
276
277 data.func = func;
278 data.info = info;
279 atomic_set(&data.started, 0);
280 data.wait = wait;
281 if (wait)
282 atomic_set(&data.finished, 0);
283
284 spin_lock_bh(&call_lock);
285
286 call_data = &data;
287 mb(); /* ensure store to call_data precedes setting of IPI_CALL_FUNC */
288 send_IPI_single(cpuid, IPI_CALL_FUNC);
289
290 /* Wait for response */
291 while (atomic_read(&data.started) != cpus)
292 cpu_relax();
293
294 if (wait)
295 while (atomic_read(&data.finished) != cpus)
296 cpu_relax();
297 call_data = NULL;
298
299 spin_unlock_bh(&call_lock);
300 put_cpu();
301 return 0;
302}
303EXPORT_SYMBOL(smp_call_function_single);
304
305/*
306 * this function sends a 'generic call function' IPI to all other CPUs
307 * in the system.
308 */
309
310/*
311 * [SUMMARY] Run a function on all other CPUs.
312 * <func> The function to run. This must be fast and non-blocking.
313 * <info> An arbitrary pointer to pass to the function.
314 * <nonatomic> currently unused.
315 * <wait> If true, wait (atomically) until function has completed on other CPUs.
316 * [RETURNS] 0 on success, else a negative status code.
317 *
318 * Does not return until the remote CPUs are nearly ready to execute <func>, or
319 * are already executing or have executed it.
320 *
321 * You must not call this function with disabled interrupts or from a
322 * hardware interrupt handler or from a bottom half handler.
323 */
324int
325smp_call_function (void (*func) (void *info), void *info, int nonatomic, int wait)
326{
327 struct call_data_struct data;
328 int cpus = num_online_cpus()-1;
329
330 if (!cpus)
331 return 0;
332
333 /* Can deadlock when called with interrupts disabled */
334 WARN_ON(irqs_disabled());
335
336 data.func = func;
337 data.info = info;
338 atomic_set(&data.started, 0);
339 data.wait = wait;
340 if (wait)
341 atomic_set(&data.finished, 0);
342
343 spin_lock(&call_lock);
344
345 call_data = &data;
346 mb(); /* ensure store to call_data precedes setting of IPI_CALL_FUNC */
347 send_IPI_allbutself(IPI_CALL_FUNC);
348
349 /* Wait for response */
350 while (atomic_read(&data.started) != cpus)
351 cpu_relax();
352
353 if (wait)
354 while (atomic_read(&data.finished) != cpus)
355 cpu_relax();
356 call_data = NULL;
357
358 spin_unlock(&call_lock);
359 return 0;
360}
361EXPORT_SYMBOL(smp_call_function);
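Both smp_call_function variants rely on the same rendezvous: publish call_data, send the IPI, then spin on the started/finished counters, which handle_IPI() increments just before and just after running func. Below is a user-space sketch of that handshake with one pthread standing in for the remote CPU; C11 atomics replace atomic_t and mb(), and all names are illustrative rather than kernel APIs.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct call_data {
	void (*func)(void *info);
	void *info;
	int wait;
	atomic_int started;
	atomic_int finished;
};

static struct call_data *_Atomic call_data;	/* the published "pointer lock" */

static void say_hello(void *info) { printf("remote: %s\n", (char *) info); }

static void *remote_cpu(void *unused)
{
	struct call_data *data;

	(void) unused;
	while (!(data = atomic_load(&call_data)))	/* wait for the "IPI" */
		;
	atomic_fetch_add(&data->started, 1);		/* sender may now proceed */
	data->func(data->info);
	if (data->wait)
		atomic_fetch_add(&data->finished, 1);	/* sender may reuse the struct */
	return NULL;
}

int main(void)
{
	struct call_data data = { .func = say_hello, .info = "hi", .wait = 1 };
	pthread_t t;

	pthread_create(&t, NULL, remote_cpu, NULL);
	atomic_store(&call_data, &data);		/* "send the IPI" */
	while (atomic_load(&data.started) != 1)		/* wait for response */
		;
	while (atomic_load(&data.finished) != 1)
		;
	atomic_store(&call_data, NULL);
	pthread_join(t, NULL);
	return 0;
}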
362
363/*
364 * this function calls the 'stop' function on all other CPUs in the system.
365 */
366void
367smp_send_stop (void)
368{
369 send_IPI_allbutself(IPI_CPU_STOP);
370}
371
372int __init
373setup_profiling_timer (unsigned int multiplier)
374{
375 return -EINVAL;
376}
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
new file mode 100644
index 000000000000..5318f0cbfc26
--- /dev/null
+++ b/arch/ia64/kernel/smpboot.c
@@ -0,0 +1,692 @@
1/*
2 * SMP boot-related support
3 *
4 * Copyright (C) 1998-2003 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com>
6 *
7 * 01/05/16 Rohit Seth <rohit.seth@intel.com> Moved SMP booting functions from smp.c to here.
8 * 01/04/27 David Mosberger <davidm@hpl.hp.com> Added ITC synching code.
9 * 02/07/31 David Mosberger <davidm@hpl.hp.com> Switch over to hotplug-CPU boot-sequence.
10 * smp_boot_cpus()/smp_commence() is replaced by
11 * smp_prepare_cpus()/__cpu_up()/smp_cpus_done().
12 */
13#include <linux/config.h>
14
15#include <linux/module.h>
16#include <linux/acpi.h>
17#include <linux/bootmem.h>
18#include <linux/cpu.h>
19#include <linux/delay.h>
20#include <linux/init.h>
21#include <linux/interrupt.h>
22#include <linux/irq.h>
23#include <linux/kernel.h>
24#include <linux/kernel_stat.h>
25#include <linux/mm.h>
26#include <linux/notifier.h>
27#include <linux/smp.h>
28#include <linux/smp_lock.h>
29#include <linux/spinlock.h>
30#include <linux/efi.h>
31#include <linux/percpu.h>
32#include <linux/bitops.h>
33
34#include <asm/atomic.h>
35#include <asm/cache.h>
36#include <asm/current.h>
37#include <asm/delay.h>
38#include <asm/ia32.h>
39#include <asm/io.h>
40#include <asm/irq.h>
41#include <asm/machvec.h>
42#include <asm/mca.h>
43#include <asm/page.h>
44#include <asm/pgalloc.h>
45#include <asm/pgtable.h>
46#include <asm/processor.h>
47#include <asm/ptrace.h>
48#include <asm/sal.h>
49#include <asm/system.h>
50#include <asm/tlbflush.h>
51#include <asm/unistd.h>
52
53#define SMP_DEBUG 0
54
55#if SMP_DEBUG
56#define Dprintk(x...) printk(x)
57#else
58#define Dprintk(x...)
59#endif
60
61
62/*
63 * ITC synchronization related stuff:
64 */
65#define MASTER 0
66#define SLAVE (SMP_CACHE_BYTES/8)
67
68#define NUM_ROUNDS 64 /* magic value */
69#define NUM_ITERS 5 /* likewise */
70
71static DEFINE_SPINLOCK(itc_sync_lock);
72static volatile unsigned long go[SLAVE + 1];
73
74#define DEBUG_ITC_SYNC 0
75
76extern void __devinit calibrate_delay (void);
77extern void start_ap (void);
78extern unsigned long ia64_iobase;
79
80task_t *task_for_booting_cpu;
81
82/*
83 * State for each CPU
84 */
85DEFINE_PER_CPU(int, cpu_state);
86
87/* Bitmasks of currently online, and possible CPUs */
88cpumask_t cpu_online_map;
89EXPORT_SYMBOL(cpu_online_map);
90cpumask_t cpu_possible_map;
91EXPORT_SYMBOL(cpu_possible_map);
92
93/* which logical CPU number maps to which CPU (physical APIC ID) */
94volatile int ia64_cpu_to_sapicid[NR_CPUS];
95EXPORT_SYMBOL(ia64_cpu_to_sapicid);
96
97static volatile cpumask_t cpu_callin_map;
98
99struct smp_boot_data smp_boot_data __initdata;
100
101unsigned long ap_wakeup_vector = -1; /* External Int use to wakeup APs */
102
103char __initdata no_int_routing;
104
105unsigned char smp_int_redirect; /* are INT and IPI redirectable by the chipset? */
106
107static int __init
108nointroute (char *str)
109{
110 no_int_routing = 1;
111 printk ("no_int_routing on\n");
112 return 1;
113}
114
115__setup("nointroute", nointroute);
116
117void
118sync_master (void *arg)
119{
120 unsigned long flags, i;
121
122 go[MASTER] = 0;
123
124 local_irq_save(flags);
125 {
126 for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
127 while (!go[MASTER]);
128 go[MASTER] = 0;
129 go[SLAVE] = ia64_get_itc();
130 }
131 }
132 local_irq_restore(flags);
133}
134
135/*
136 * Return the number of cycles by which our itc differs from the itc on the master
137 * (time-keeper) CPU. A positive number indicates our itc is ahead of the master,
138 * negative that it is behind.
139 */
140static inline long
141get_delta (long *rt, long *master)
142{
143 unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
144 unsigned long tcenter, t0, t1, tm;
145 long i;
146
147 for (i = 0; i < NUM_ITERS; ++i) {
148 t0 = ia64_get_itc();
149 go[MASTER] = 1;
150 while (!(tm = go[SLAVE]));
151 go[SLAVE] = 0;
152 t1 = ia64_get_itc();
153
154 if (t1 - t0 < best_t1 - best_t0)
155 best_t0 = t0, best_t1 = t1, best_tm = tm;
156 }
157
158 *rt = best_t1 - best_t0;
159 *master = best_tm - best_t0;
160
161 /* average best_t0 and best_t1 without overflow: */
162 tcenter = (best_t0/2 + best_t1/2);
163 if (best_t0 % 2 + best_t1 % 2 == 2)
164 ++tcenter;
165 return tcenter - best_tm;
166}
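With toy numbers, the midpoint rule implemented by get_delta() works out as follows (the rounding term mirrors the tcenter arithmetic above; the values themselves are made up):

#include <stdio.h>

int main(void)
{
	/* The slave reads t0, the master replies tm, the slave reads t1.
	 * The slave's clock is ahead of the master by (t0 + t1)/2 - tm. */
	unsigned long t0 = 1000, t1 = 1040, tm = 1015;
	unsigned long tcenter = t0 / 2 + t1 / 2 + (t0 % 2 + t1 % 2 == 2);

	printf("roundtrip = %lu cycles, delta = %ld cycles\n",
	       t1 - t0, (long) (tcenter - tm));		/* 40 and +5 */
	return 0;
}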
167
168/*
169 * Synchronize ar.itc of the current (slave) CPU with the ar.itc of the MASTER CPU
170 * (normally the time-keeper CPU). We use a closed loop to eliminate the possibility of
171 * unaccounted-for errors (such as getting a machine check in the middle of a calibration
172 * step). The basic idea is for the slave to ask the master what itc value it has and to
173 * read its own itc before and after the master responds. Each iteration gives us three
174 * timestamps:
175 *
176 * slave master
177 *
178 * t0 ---\
179 * ---\
180 * --->
181 * tm
182 * /---
183 * /---
184 * t1 <---
185 *
186 *
187 * The goal is to adjust the slave's ar.itc such that tm falls exactly half-way between t0
188 * and t1. If we achieve this, the clocks are synchronized provided the interconnect
189 * between the slave and the master is symmetric. Even if the interconnect were
190 * asymmetric, we would still know that the synchronization error is smaller than the
191 * roundtrip latency (t1 - t0).
192 *
193 * When the interconnect is quiet and symmetric, this lets us synchronize the itc to
194 * within one or two cycles. However, we can only *guarantee* that the synchronization is
195 * accurate to within a round-trip time, which is typically in the range of several
196 * hundred cycles (e.g., ~500 cycles). In practice, this means that the itc's are usually
197 * almost perfectly synchronized, but we shouldn't assume that the accuracy is much better
198 * than half a microsecond or so.
199 */
200void
201ia64_sync_itc (unsigned int master)
202{
203 long i, delta, adj, adjust_latency = 0, done = 0;
204 unsigned long flags, rt, master_time_stamp, bound;
205#if DEBUG_ITC_SYNC
206 struct {
207 long rt; /* roundtrip time */
208 long master; /* master's timestamp */
209 long diff; /* difference between midpoint and master's timestamp */
210 long lat; /* estimate of itc adjustment latency */
211 } t[NUM_ROUNDS];
212#endif
213
214 /*
215 * Make sure local timer ticks are disabled while we sync. If
216 * they were enabled, we'd have to worry about nasty issues
217 * like setting the ITC ahead of (or a long time before) the
218 * next scheduled tick.
219 */
220 BUG_ON((ia64_get_itv() & (1 << 16)) == 0);
221
222 go[MASTER] = 1;
223
224 if (smp_call_function_single(master, sync_master, NULL, 1, 0) < 0) {
225 printk(KERN_ERR "sync_itc: failed to get attention of CPU %u!\n", master);
226 return;
227 }
228
229 while (go[MASTER]); /* wait for master to be ready */
230
231 spin_lock_irqsave(&itc_sync_lock, flags);
232 {
233 for (i = 0; i < NUM_ROUNDS; ++i) {
234 delta = get_delta(&rt, &master_time_stamp);
235 if (delta == 0) {
236 done = 1; /* let's lock on to this... */
237 bound = rt;
238 }
239
240 if (!done) {
241 if (i > 0) {
242 adjust_latency += -delta;
243 adj = -delta + adjust_latency/4;
244 } else
245 adj = -delta;
246
247 ia64_set_itc(ia64_get_itc() + adj);
248 }
249#if DEBUG_ITC_SYNC
250 t[i].rt = rt;
251 t[i].master = master_time_stamp;
252 t[i].diff = delta;
253 t[i].lat = adjust_latency/4;
254#endif
255 }
256 }
257 spin_unlock_irqrestore(&itc_sync_lock, flags);
258
259#if DEBUG_ITC_SYNC
260 for (i = 0; i < NUM_ROUNDS; ++i)
261 printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
262 t[i].rt, t[i].master, t[i].diff, t[i].lat);
263#endif
264
265 printk(KERN_INFO "CPU %d: synchronized ITC with CPU %u (last diff %ld cycles, "
266 "maxerr %lu cycles)\n", smp_processor_id(), master, delta, rt);
267}
268
269/*
270 * Ideally sets up per-cpu profiling hooks. Doesn't do much now...
271 */
272static inline void __devinit
273smp_setup_percpu_timer (void)
274{
275}
276
277static void __devinit
278smp_callin (void)
279{
280 int cpuid, phys_id;
281 extern void ia64_init_itm(void);
282
283#ifdef CONFIG_PERFMON
284 extern void pfm_init_percpu(void);
285#endif
286
287 cpuid = smp_processor_id();
288 phys_id = hard_smp_processor_id();
289
290 if (cpu_online(cpuid)) {
291 printk(KERN_ERR "huh, phys CPU#0x%x, CPU#0x%x already present??\n",
292 phys_id, cpuid);
293 BUG();
294 }
295
296 lock_ipi_calllock();
297 cpu_set(cpuid, cpu_online_map);
298 unlock_ipi_calllock();
299
300 smp_setup_percpu_timer();
301
302 ia64_mca_cmc_vector_setup(); /* Setup vector on AP */
303
304#ifdef CONFIG_PERFMON
305 pfm_init_percpu();
306#endif
307
308 local_irq_enable();
309
310 if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) {
311 /*
312 * Synchronize the ITC with the BP. Need to do this after irqs are
313 * enabled because ia64_sync_itc() calls smp_call_function_single(), which
314 * calls spin_unlock_bh(), which calls
315 * local_bh_enable(), which bugs out if irqs are not enabled...
316 */
317 Dprintk("Going to syncup ITC with BP.\n");
318 ia64_sync_itc(0);
319 }
320
321 /*
322 * Get our bogomips.
323 */
324 ia64_init_itm();
325 calibrate_delay();
326 local_cpu_data->loops_per_jiffy = loops_per_jiffy;
327
328#ifdef CONFIG_IA32_SUPPORT
329 ia32_gdt_init();
330#endif
331
332 /*
333 * Allow the master to continue.
334 */
335 cpu_set(cpuid, cpu_callin_map);
336 Dprintk("Stack on CPU %d at about %p\n",cpuid, &cpuid);
337}
338
339
340/*
341 * Activate a secondary processor. head.S calls this.
342 */
343int __devinit
344start_secondary (void *unused)
345{
346 /* Early console may use I/O ports */
347 ia64_set_kr(IA64_KR_IO_BASE, __pa(ia64_iobase));
348
349 Dprintk("start_secondary: starting CPU 0x%x\n", hard_smp_processor_id());
350 efi_map_pal_code();
351 cpu_init();
352 smp_callin();
353
354 cpu_idle();
355 return 0;
356}
357
358struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
359{
360 return NULL;
361}
362
363struct create_idle {
364 struct task_struct *idle;
365 struct completion done;
366 int cpu;
367};
368
369void
370do_fork_idle(void *_c_idle)
371{
372 struct create_idle *c_idle = _c_idle;
373
374 c_idle->idle = fork_idle(c_idle->cpu);
375 complete(&c_idle->done);
376}
377
378static int __devinit
379do_boot_cpu (int sapicid, int cpu)
380{
381 int timeout;
382 struct create_idle c_idle = {
383 .cpu = cpu,
384 .done = COMPLETION_INITIALIZER(c_idle.done),
385 };
386 DECLARE_WORK(work, do_fork_idle, &c_idle);
387 /*
388 * We can't use kernel_thread since we must avoid rescheduling the child.
389 */
390 if (!keventd_up() || current_is_keventd())
391 work.func(work.data);
392 else {
393 schedule_work(&work);
394 wait_for_completion(&c_idle.done);
395 }
396
397 if (IS_ERR(c_idle.idle))
398 panic("failed fork for CPU %d", cpu);
399 task_for_booting_cpu = c_idle.idle;
400
401 Dprintk("Sending wakeup vector %lu to AP 0x%x/0x%x.\n", ap_wakeup_vector, cpu, sapicid);
402
403 platform_send_ipi(cpu, ap_wakeup_vector, IA64_IPI_DM_INT, 0);
404
405 /*
406 * Wait 10s total for the AP to start
407 */
408 Dprintk("Waiting on callin_map ...");
409 for (timeout = 0; timeout < 100000; timeout++) {
410 if (cpu_isset(cpu, cpu_callin_map))
411 break; /* It has booted */
412 udelay(100);
413 }
414 Dprintk("\n");
415
416 if (!cpu_isset(cpu, cpu_callin_map)) {
417 printk(KERN_ERR "Processor 0x%x/0x%x is stuck.\n", cpu, sapicid);
418 ia64_cpu_to_sapicid[cpu] = -1;
419 cpu_clear(cpu, cpu_online_map); /* was set in smp_callin() */
420 return -EINVAL;
421 }
422 return 0;
423}
424
425static int __init
426decay (char *str)
427{
428 int ticks;
429 get_option (&str, &ticks);
430 return 1;
431}
432
433__setup("decay=", decay);
434
435/*
436 * Initialize the logical CPU number to SAPICID mapping
437 */
438void __init
439smp_build_cpu_map (void)
440{
441 int sapicid, cpu, i;
442 int boot_cpu_id = hard_smp_processor_id();
443
444 for (cpu = 0; cpu < NR_CPUS; cpu++) {
445 ia64_cpu_to_sapicid[cpu] = -1;
446#ifdef CONFIG_HOTPLUG_CPU
447 cpu_set(cpu, cpu_possible_map);
448#endif
449 }
450
451 ia64_cpu_to_sapicid[0] = boot_cpu_id;
452 cpus_clear(cpu_present_map);
453 cpu_set(0, cpu_present_map);
454 cpu_set(0, cpu_possible_map);
455 for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) {
456 sapicid = smp_boot_data.cpu_phys_id[i];
457 if (sapicid == boot_cpu_id)
458 continue;
459 cpu_set(cpu, cpu_present_map);
460 cpu_set(cpu, cpu_possible_map);
461 ia64_cpu_to_sapicid[cpu] = sapicid;
462 cpu++;
463 }
464}
465
466#ifdef CONFIG_NUMA
467
468/* on which node is each logical CPU (one cacheline even for 64 CPUs) */
469u8 cpu_to_node_map[NR_CPUS] __cacheline_aligned;
470EXPORT_SYMBOL(cpu_to_node_map);
471/* which logical CPUs are on which nodes */
472cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned;
473
474/*
475 * Build cpu to node mapping and initialize the per node cpu masks.
476 */
477void __init
478build_cpu_to_node_map (void)
479{
480 int cpu, i, node;
481
482 for(node=0; node<MAX_NUMNODES; node++)
483 cpus_clear(node_to_cpu_mask[node]);
484 for(cpu = 0; cpu < NR_CPUS; ++cpu) {
485 /*
486 * All Itanium NUMA platforms I know use ACPI, so maybe we
487 * can drop this ifdef completely. [EF]
488 */
489#ifdef CONFIG_ACPI_NUMA
490 node = -1;
491 for (i = 0; i < NR_CPUS; ++i)
492 if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) {
493 node = node_cpuid[i].nid;
494 break;
495 }
496#else
497# error Fixme: Dunno how to build CPU-to-node map.
498#endif
499 cpu_to_node_map[cpu] = (node >= 0) ? node : 0;
500 if (node >= 0)
501 cpu_set(cpu, node_to_cpu_mask[node]);
502 }
503}
504
505#endif /* CONFIG_NUMA */
506
507/*
508 * Cycle through the APs sending Wakeup IPIs to boot each.
509 */
510void __init
511smp_prepare_cpus (unsigned int max_cpus)
512{
513 int boot_cpu_id = hard_smp_processor_id();
514
515 /*
516 * Initialize the per-CPU profiling counter/multiplier
517 */
518
519 smp_setup_percpu_timer();
520
521 /*
522 * We have the boot CPU online for sure.
523 */
524 cpu_set(0, cpu_online_map);
525 cpu_set(0, cpu_callin_map);
526
527 local_cpu_data->loops_per_jiffy = loops_per_jiffy;
528 ia64_cpu_to_sapicid[0] = boot_cpu_id;
529
530 printk(KERN_INFO "Boot processor id 0x%x/0x%x\n", 0, boot_cpu_id);
531
532 current_thread_info()->cpu = 0;
533
534 /*
535 * If SMP should be disabled, then really disable it!
536 */
537 if (!max_cpus) {
538 printk(KERN_INFO "SMP mode deactivated.\n");
539 cpus_clear(cpu_online_map);
540 cpus_clear(cpu_present_map);
541 cpus_clear(cpu_possible_map);
542 cpu_set(0, cpu_online_map);
543 cpu_set(0, cpu_present_map);
544 cpu_set(0, cpu_possible_map);
545 return;
546 }
547}
548
549void __devinit smp_prepare_boot_cpu(void)
550{
551 cpu_set(smp_processor_id(), cpu_online_map);
552 cpu_set(smp_processor_id(), cpu_callin_map);
553}
554
555#ifdef CONFIG_HOTPLUG_CPU
556extern void fixup_irqs(void);
557/* must be called with cpucontrol mutex held */
558static int __devinit cpu_enable(unsigned int cpu)
559{
560 per_cpu(cpu_state,cpu) = CPU_UP_PREPARE;
561 wmb();
562
563 while (!cpu_online(cpu))
564 cpu_relax();
565 return 0;
566}
567
568int __cpu_disable(void)
569{
570 int cpu = smp_processor_id();
571
572 /*
573 * don't permit offlining the boot processor for now
574 */
575 if (cpu == 0)
576 return -EBUSY;
577
578 fixup_irqs();
579 local_flush_tlb_all();
580 printk ("Disabled cpu %u\n", smp_processor_id());
581 return 0;
582}
583
584void __cpu_die(unsigned int cpu)
585{
586 unsigned int i;
587
588 for (i = 0; i < 100; i++) {
589 /* They ack this in play_dead by setting CPU_DEAD */
590 if (per_cpu(cpu_state, cpu) == CPU_DEAD)
591 {
592 /*
593 * TBD: Enable this when physical removal is supported
594 * or when the processor is put in
595 * SAL_BOOT_RENDEZ mode
596 * cpu_clear(cpu, cpu_callin_map);
597 */
598 return;
599 }
600 msleep(100);
601 }
602 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
603}
604#else /* !CONFIG_HOTPLUG_CPU */
605static int __devinit cpu_enable(unsigned int cpu)
606{
607 return 0;
608}
609
610int __cpu_disable(void)
611{
612 return -ENOSYS;
613}
614
615void __cpu_die(unsigned int cpu)
616{
617 /* We said "no" in __cpu_disable */
618 BUG();
619}
620#endif /* CONFIG_HOTPLUG_CPU */
621
622void
623smp_cpus_done (unsigned int dummy)
624{
625 int cpu;
626 unsigned long bogosum = 0;
627
628 /*
629 * Allow the user to impress friends.
630 */
631
632 for (cpu = 0; cpu < NR_CPUS; cpu++)
633 if (cpu_online(cpu))
634 bogosum += cpu_data(cpu)->loops_per_jiffy;
635
636 printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
637 (int)num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100);
638}
639
640int __devinit
641__cpu_up (unsigned int cpu)
642{
643 int ret;
644 int sapicid;
645
646 sapicid = ia64_cpu_to_sapicid[cpu];
647 if (sapicid == -1)
648 return -EINVAL;
649
650 /*
651 * Already booted... just enable it and get out of the idle loop
652 */
653 if (cpu_isset(cpu, cpu_callin_map))
654 {
655 cpu_enable(cpu);
656 local_irq_enable();
657 while (!cpu_isset(cpu, cpu_online_map))
658 mb();
659 return 0;
660 }
661 /* Processor goes to start_secondary(), sets online flag */
662 ret = do_boot_cpu(sapicid, cpu);
663 if (ret < 0)
664 return ret;
665
666 return 0;
667}
668
669/*
670 * Assume that CPUs have been discovered by some platform-dependent interface. For
671 * SoftSDV/Lion, that would be ACPI.
672 *
673 * Setup of the IPI irq handler is done in irq.c:init_IRQ_SMP().
674 */
675void __init
676init_smp_config(void)
677{
678 struct fptr {
679 unsigned long fp;
680 unsigned long gp;
681 } *ap_startup;
682 long sal_ret;
683
684 /* Tell SAL where to drop the AP's. */
685 ap_startup = (struct fptr *) start_ap;
686 sal_ret = ia64_sal_set_vectors(SAL_VECTOR_OS_BOOT_RENDEZ,
687 ia64_tpa(ap_startup->fp), ia64_tpa(ap_startup->gp), 0, 0, 0, 0);
688 if (sal_ret < 0)
689 printk(KERN_ERR "SMP: Can't set SAL AP Boot Rendezvous: %s\n",
690 ia64_sal_strerror(sal_ret));
691}
692
diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c
new file mode 100644
index 000000000000..3ac216e1c8bb
--- /dev/null
+++ b/arch/ia64/kernel/sys_ia64.c
@@ -0,0 +1,298 @@
1/*
2 * This file contains various system calls that have different calling
3 * conventions on different platforms.
4 *
5 * Copyright (C) 1999-2000, 2002-2003, 2005 Hewlett-Packard Co
6 * David Mosberger-Tang <davidm@hpl.hp.com>
7 */
8#include <linux/config.h>
9#include <linux/errno.h>
10#include <linux/fs.h>
11#include <linux/mm.h>
12#include <linux/mman.h>
13#include <linux/sched.h>
14#include <linux/shm.h>
15#include <linux/file.h> /* doh, must come after sched.h... */
16#include <linux/smp.h>
17#include <linux/smp_lock.h>
18#include <linux/syscalls.h>
19#include <linux/highuid.h>
20#include <linux/hugetlb.h>
21
22#include <asm/shmparam.h>
23#include <asm/uaccess.h>
24
25unsigned long
26arch_get_unmapped_area (struct file *filp, unsigned long addr, unsigned long len,
27 unsigned long pgoff, unsigned long flags)
28{
29 long map_shared = (flags & MAP_SHARED);
30 unsigned long start_addr, align_mask = PAGE_SIZE - 1;
31 struct mm_struct *mm = current->mm;
32 struct vm_area_struct *vma;
33
34 if (len > RGN_MAP_LIMIT)
35 return -ENOMEM;
36
37#ifdef CONFIG_HUGETLB_PAGE
38 if (REGION_NUMBER(addr) == REGION_HPAGE)
39 addr = 0;
40#endif
41 if (!addr)
42 addr = mm->free_area_cache;
43
44 if (map_shared && (TASK_SIZE > 0xfffffffful))
45 /*
46 * For 64-bit tasks, align shared segments to 1MB to avoid potential
47 * performance penalty due to virtual aliasing (see ASDM). For 32-bit
48 * tasks, we prefer to avoid exhausting the address space too quickly by
49 * limiting alignment to a single page.
50 */
51 align_mask = SHMLBA - 1;
52
53 full_search:
54 start_addr = addr = (addr + align_mask) & ~align_mask;
55
56 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
57 /* At this point: (!vma || addr < vma->vm_end). */
58 if (TASK_SIZE - len < addr || RGN_MAP_LIMIT - len < REGION_OFFSET(addr)) {
59 if (start_addr != TASK_UNMAPPED_BASE) {
60 /* Start a new search --- just in case we missed some holes. */
61 addr = TASK_UNMAPPED_BASE;
62 goto full_search;
63 }
64 return -ENOMEM;
65 }
66 if (!vma || addr + len <= vma->vm_start) {
67 /* Remember the address where we stopped this search: */
68 mm->free_area_cache = addr + len;
69 return addr;
70 }
71 addr = (vma->vm_end + align_mask) & ~align_mask;
72 }
73}
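The alignment handling above is the usual round-up-to-mask idiom. A standalone illustration, assuming a 16 KB page size and a 1 MB SHMLBA purely for the sake of example:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 16384;		/* example 16 KB pages */
	unsigned long shmlba = 1UL << 20;		/* assumed 1 MB shared-mapping alignment */
	unsigned long addr = 0x2000a123UL;

	unsigned long page_mask = page_size - 1;
	unsigned long shm_mask = shmlba - 1;

	/* Round addr up to the chosen alignment, as the search loop does. */
	printf("page-aligned:   0x%lx\n", (addr + page_mask) & ~page_mask);
	printf("SHMLBA-aligned: 0x%lx\n", (addr + shm_mask) & ~shm_mask);
	return 0;
}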
74
75asmlinkage long
76ia64_getpriority (int which, int who)
77{
78 long prio;
79
80 prio = sys_getpriority(which, who);
81 if (prio >= 0) {
82 force_successful_syscall_return();
83 prio = 20 - prio;
84 }
85 return prio;
86}
87
88/* XXX obsolete, but leave it here until the old libc is gone... */
89asmlinkage unsigned long
90sys_getpagesize (void)
91{
92 return PAGE_SIZE;
93}
94
95asmlinkage unsigned long
96ia64_shmat (int shmid, void __user *shmaddr, int shmflg)
97{
98 unsigned long raddr;
99 int retval;
100
101 retval = do_shmat(shmid, shmaddr, shmflg, &raddr);
102 if (retval < 0)
103 return retval;
104
105 force_successful_syscall_return();
106 return raddr;
107}
108
109asmlinkage unsigned long
110ia64_brk (unsigned long brk)
111{
112 unsigned long rlim, retval, newbrk, oldbrk;
113 struct mm_struct *mm = current->mm;
114
115 /*
116 * Most of this replicates the code in sys_brk() except for an additional safety
117 * check and the clearing of r8. However, we can't call sys_brk() because we need
118 * to acquire the mmap_sem before we can do the test...
119 */
120 down_write(&mm->mmap_sem);
121
122 if (brk < mm->end_code)
123 goto out;
124 newbrk = PAGE_ALIGN(brk);
125 oldbrk = PAGE_ALIGN(mm->brk);
126 if (oldbrk == newbrk)
127 goto set_brk;
128
129 /* Always allow shrinking brk. */
130 if (brk <= mm->brk) {
131 if (!do_munmap(mm, newbrk, oldbrk-newbrk))
132 goto set_brk;
133 goto out;
134 }
135
136 /* Check against unimplemented/unmapped addresses: */
137 if ((newbrk - oldbrk) > RGN_MAP_LIMIT || REGION_OFFSET(newbrk) > RGN_MAP_LIMIT)
138 goto out;
139
140 /* Check against rlimit.. */
141 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
142 if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
143 goto out;
144
145 /* Check against existing mmap mappings. */
146 if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
147 goto out;
148
149 /* Ok, looks good - let it rip. */
150 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
151 goto out;
152set_brk:
153 mm->brk = brk;
154out:
155 retval = mm->brk;
156 up_write(&mm->mmap_sem);
157 force_successful_syscall_return();
158 return retval;
159}
160
161/*
162 * On IA-64, we return the two file descriptors in ret0 and ret1 (r8
163 * and r9) as this is faster than doing a copy_to_user().
164 */
165asmlinkage long
166sys_pipe (void)
167{
168 struct pt_regs *regs = ia64_task_regs(current);
169 int fd[2];
170 int retval;
171
172 retval = do_pipe(fd);
173 if (retval)
174 goto out;
175 retval = fd[0];
176 regs->r9 = fd[1];
177 out:
178 return retval;
179}
180
181static inline unsigned long
182do_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, unsigned long pgoff)
183{
184 unsigned long roff;
185 struct file *file = NULL;
186
187 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
188 if (!(flags & MAP_ANONYMOUS)) {
189 file = fget(fd);
190 if (!file)
191 return -EBADF;
192
193 if (!file->f_op || !file->f_op->mmap) {
194 addr = -ENODEV;
195 goto out;
196 }
197 }
198
199 /*
200 * A zero mmap always succeeds in Linux, independent of whether or not the
201 * remaining arguments are valid.
202 */
203 if (len == 0)
204 goto out;
205
206 /* Careful about overflows.. */
207 len = PAGE_ALIGN(len);
208 if (!len || len > TASK_SIZE) {
209 addr = -EINVAL;
210 goto out;
211 }
212
213 /*
214 * Don't permit mappings into unmapped space, the virtual page table of a region,
215 * or across a region boundary. Note: RGN_MAP_LIMIT is equal to 2^n-PAGE_SIZE
216 * (for some integer n <= 61) and len > 0.
217 */
218 roff = REGION_OFFSET(addr);
219 if ((len > RGN_MAP_LIMIT) || (roff > (RGN_MAP_LIMIT - len))) {
220 addr = -EINVAL;
221 goto out;
222 }
223
224 down_write(&current->mm->mmap_sem);
225 addr = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
226 up_write(&current->mm->mmap_sem);
227
228out: if (file)
229 fput(file);
230 return addr;
231}
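The region check above is written subtractively, roff > RGN_MAP_LIMIT - len, so that an absurdly large len cannot wrap the sum roff + len past the limit. A small demonstration with made-up stand-in values (the real RGN_MAP_LIMIT is architecture-defined):

#include <stdio.h>

int main(void)
{
	unsigned long limit = 0x10000000UL;	/* illustrative stand-in for RGN_MAP_LIMIT */
	unsigned long roff = 0x0ff00000UL;
	unsigned long len = ~0UL - 0x1000;	/* absurdly large mapping request */

	printf("naive check rejects:       %d\n", roff + len > limit);	/* 0: the sum wrapped */
	printf("subtractive check rejects: %d\n", len > limit || roff > limit - len);
	return 0;
}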
232
233/*
234 * mmap2() is like mmap() except that the offset is expressed in units
235 * of PAGE_SIZE (instead of bytes). This makes it possible to mmap2() (pieces
236 * of) files that are larger than the address space of the CPU.
237 */
238asmlinkage unsigned long
239sys_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, long pgoff)
240{
241 addr = do_mmap2(addr, len, prot, flags, fd, pgoff);
242 if (!IS_ERR((void *) addr))
243 force_successful_syscall_return();
244 return addr;
245}
246
247asmlinkage unsigned long
248sys_mmap (unsigned long addr, unsigned long len, int prot, int flags, int fd, long off)
249{
250 if (offset_in_page(off) != 0)
251 return -EINVAL;
252
253 addr = do_mmap2(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
254 if (!IS_ERR((void *) addr))
255 force_successful_syscall_return();
256 return addr;
257}
258
259asmlinkage unsigned long
260ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags,
261 unsigned long new_addr)
262{
263 extern unsigned long do_mremap (unsigned long addr,
264 unsigned long old_len,
265 unsigned long new_len,
266 unsigned long flags,
267 unsigned long new_addr);
268
269 down_write(&current->mm->mmap_sem);
270 {
271 addr = do_mremap(addr, old_len, new_len, flags, new_addr);
272 }
273 up_write(&current->mm->mmap_sem);
274
275 if (IS_ERR((void *) addr))
276 return addr;
277
278 force_successful_syscall_return();
279 return addr;
280}
281
282#ifndef CONFIG_PCI
283
284asmlinkage long
285sys_pciconfig_read (unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len,
286 void *buf)
287{
288 return -ENOSYS;
289}
290
291asmlinkage long
292sys_pciconfig_write (unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len,
293 void *buf)
294{
295 return -ENOSYS;
296}
297
298#endif /* CONFIG_PCI */
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
new file mode 100644
index 000000000000..8b8a5a45b621
--- /dev/null
+++ b/arch/ia64/kernel/time.c
@@ -0,0 +1,255 @@
1/*
2 * linux/arch/ia64/kernel/time.c
3 *
4 * Copyright (C) 1998-2003 Hewlett-Packard Co
5 * Stephane Eranian <eranian@hpl.hp.com>
6 * David Mosberger <davidm@hpl.hp.com>
7 * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
8 * Copyright (C) 1999-2000 VA Linux Systems
9 * Copyright (C) 1999-2000 Walt Drummond <drummond@valinux.com>
10 */
11#include <linux/config.h>
12
13#include <linux/cpu.h>
14#include <linux/init.h>
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/profile.h>
18#include <linux/sched.h>
19#include <linux/time.h>
20#include <linux/interrupt.h>
21#include <linux/efi.h>
22#include <linux/profile.h>
23#include <linux/timex.h>
24
25#include <asm/machvec.h>
26#include <asm/delay.h>
27#include <asm/hw_irq.h>
28#include <asm/ptrace.h>
29#include <asm/sal.h>
30#include <asm/sections.h>
31#include <asm/system.h>
32
33extern unsigned long wall_jiffies;
34
35u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
36
37EXPORT_SYMBOL(jiffies_64);
38
39#define TIME_KEEPER_ID 0 /* smp_processor_id() of time-keeper */
40
41#ifdef CONFIG_IA64_DEBUG_IRQ
42
43unsigned long last_cli_ip;
44EXPORT_SYMBOL(last_cli_ip);
45
46#endif
47
48static struct time_interpolator itc_interpolator = {
49 .shift = 16,
50 .mask = 0xffffffffffffffffLL,
51 .source = TIME_SOURCE_CPU
52};
53
54static irqreturn_t
55timer_interrupt (int irq, void *dev_id, struct pt_regs *regs)
56{
57 unsigned long new_itm;
58
59 if (unlikely(cpu_is_offline(smp_processor_id()))) {
60 return IRQ_HANDLED;
61 }
62
63 platform_timer_interrupt(irq, dev_id, regs);
64
65 new_itm = local_cpu_data->itm_next;
66
67 if (!time_after(ia64_get_itc(), new_itm))
68 printk(KERN_ERR "Oops: timer tick before it's due (itc=%lx,itm=%lx)\n",
69 ia64_get_itc(), new_itm);
70
71 profile_tick(CPU_PROFILING, regs);
72
73 while (1) {
74 update_process_times(user_mode(regs));
75
76 new_itm += local_cpu_data->itm_delta;
77
78 if (smp_processor_id() == TIME_KEEPER_ID) {
79 /*
80 * Here we are in the timer irq handler. We have irqs locally
81 * disabled, but we don't know if the timer_bh is running on
82 * another CPU. We need to avoid an SMP race by acquiring the
83 * xtime_lock.
84 */
85 write_seqlock(&xtime_lock);
86 do_timer(regs);
87 local_cpu_data->itm_next = new_itm;
88 write_sequnlock(&xtime_lock);
89 } else
90 local_cpu_data->itm_next = new_itm;
91
92 if (time_after(new_itm, ia64_get_itc()))
93 break;
94 }
95
96 do {
97 /*
98 * If we're too close to the next clock tick for
99 * comfort, we increase the safety margin by
100 * intentionally dropping the next tick(s). We do NOT
101 * update itm.next because that would force us to call
102 * do_timer() which in turn would let our clock run
103 * too fast (with the potentially devastating effect
104 * of losing the monotonicity of time).
105 */
106 while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_delta/2))
107 new_itm += local_cpu_data->itm_delta;
108 ia64_set_itm(new_itm);
109 /* double check, in case we got hit by a (slow) PMI: */
110 } while (time_after_eq(ia64_get_itc(), new_itm));
111 return IRQ_HANDLED;
112}
113
114/*
115 * Encapsulate access to the itm structure for SMP.
116 */
117void
118ia64_cpu_local_tick (void)
119{
120 int cpu = smp_processor_id();
121 unsigned long shift = 0, delta;
122
123 /* arrange for the cycle counter to generate a timer interrupt: */
124 ia64_set_itv(IA64_TIMER_VECTOR);
125
126 delta = local_cpu_data->itm_delta;
127 /*
128 * Stagger the timer tick for each CPU so they don't occur all at (almost) the
129 * same time:
130 */
131 if (cpu) {
132 unsigned long hi = 1UL << ia64_fls(cpu);
133 shift = (2*(cpu - hi) + 1) * delta/hi/2;
134 }
135 local_cpu_data->itm_next = ia64_get_itc() + delta + shift;
136 ia64_set_itm(local_cpu_data->itm_next);
137}
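The shift formula spreads the CPUs evenly over one tick interval: CPU 1 fires half a tick after CPU 0, CPUs 2 and 3 at the quarter points, and so on. A quick standalone check with delta = 1000, where the helper recomputes hi = 1 << ia64_fls(cpu), i.e. the largest power of two not exceeding cpu:

#include <stdio.h>

static unsigned long largest_pow2_below(unsigned long cpu)
{
	unsigned long hi = 1;

	while (hi * 2 <= cpu)
		hi *= 2;
	return hi;			/* same value as 1UL << ia64_fls(cpu) */
}

int main(void)
{
	unsigned long delta = 1000, cpu;

	for (cpu = 1; cpu < 8; cpu++) {
		unsigned long hi = largest_pow2_below(cpu);
		unsigned long shift = (2 * (cpu - hi) + 1) * delta / hi / 2;

		printf("cpu %lu: shift = %lu\n", cpu, shift);	/* 500, 250, 750, 125, ... */
	}
	return 0;
}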
138
139static int nojitter;
140
141static int __init nojitter_setup(char *str)
142{
143 nojitter = 1;
144 printk("Jitter checking for ITC timers disabled\n");
145 return 1;
146}
147
148__setup("nojitter", nojitter_setup);
149
150
151void __devinit
152ia64_init_itm (void)
153{
154 unsigned long platform_base_freq, itc_freq;
155 struct pal_freq_ratio itc_ratio, proc_ratio;
156 long status, platform_base_drift, itc_drift;
157
158 /*
159 * According to SAL v2.6, we need to use a SAL call to determine the platform base
160 * frequency and then a PAL call to determine the frequency ratio between the ITC
161 * and the base frequency.
162 */
163 status = ia64_sal_freq_base(SAL_FREQ_BASE_PLATFORM,
164 &platform_base_freq, &platform_base_drift);
165 if (status != 0) {
166 printk(KERN_ERR "SAL_FREQ_BASE_PLATFORM failed: %s\n", ia64_sal_strerror(status));
167 } else {
168 status = ia64_pal_freq_ratios(&proc_ratio, NULL, &itc_ratio);
169 if (status != 0)
170 printk(KERN_ERR "PAL_FREQ_RATIOS failed with status=%ld\n", status);
171 }
172 if (status != 0) {
173 /* invent "random" values */
174 printk(KERN_ERR
175 "SAL/PAL failed to obtain frequency info---inventing reasonable values\n");
176 platform_base_freq = 100000000;
177 platform_base_drift = -1; /* no drift info */
178 itc_ratio.num = 3;
179 itc_ratio.den = 1;
180 }
181 if (platform_base_freq < 40000000) {
182 printk(KERN_ERR "Platform base frequency %lu bogus---resetting to 75MHz!\n",
183 platform_base_freq);
184 platform_base_freq = 75000000;
185 platform_base_drift = -1;
186 }
187 if (!proc_ratio.den)
188 proc_ratio.den = 1; /* avoid division by zero */
189 if (!itc_ratio.den)
190 itc_ratio.den = 1; /* avoid division by zero */
191
192 itc_freq = (platform_base_freq*itc_ratio.num)/itc_ratio.den;
193
194 local_cpu_data->itm_delta = (itc_freq + HZ/2) / HZ;
195 printk(KERN_DEBUG "CPU %d: base freq=%lu.%03luMHz, ITC ratio=%lu/%lu, "
196 "ITC freq=%lu.%03luMHz", smp_processor_id(),
197 platform_base_freq / 1000000, (platform_base_freq / 1000) % 1000,
198 itc_ratio.num, itc_ratio.den, itc_freq / 1000000, (itc_freq / 1000) % 1000);
199
200 if (platform_base_drift != -1) {
201 itc_drift = platform_base_drift*itc_ratio.num/itc_ratio.den;
202 printk("+/-%ldppm\n", itc_drift);
203 } else {
204 itc_drift = -1;
205 printk("\n");
206 }
207
208 local_cpu_data->proc_freq = (platform_base_freq*proc_ratio.num)/proc_ratio.den;
209 local_cpu_data->itc_freq = itc_freq;
210 local_cpu_data->cyc_per_usec = (itc_freq + USEC_PER_SEC/2) / USEC_PER_SEC;
211 local_cpu_data->nsec_per_cyc = ((NSEC_PER_SEC<<IA64_NSEC_PER_CYC_SHIFT)
212 + itc_freq/2)/itc_freq;
213
214 if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) {
215 itc_interpolator.frequency = local_cpu_data->itc_freq;
216 itc_interpolator.drift = itc_drift;
217#ifdef CONFIG_SMP
218 /* On IA64 in an SMP configuration ITCs are never accurately synchronized.
219 * Jitter compensation requires a cmpxchg which may limit
220 * the scalability of the syscalls for retrieving time.
221 * The ITC synchronization is usually successful to within a few
222 * ITC ticks but this is not a sure thing. If you need to improve
223 * timer performance in SMP situations then boot the kernel with the
224 * "nojitter" option. However, doing so may result in time fluctuating (maybe
225 * even going backward) if the ITC offsets between the individual CPUs
226 * are too large.
227 */
228 if (!nojitter) itc_interpolator.jitter = 1;
229#endif
230 register_time_interpolator(&itc_interpolator);
231 }
232
233 /* Setup the CPU local timer tick */
234 ia64_cpu_local_tick();
235}
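To make the arithmetic concrete, the same frequency and fixed-point calculations can be redone in standalone C using the fallback values from above (100 MHz base frequency, ITC ratio 3/1). The tick rate of 1024 and the 30-bit shift are assumptions chosen for illustration; the kernel uses its configured HZ and IA64_NSEC_PER_CYC_SHIFT.

#include <stdio.h>

int main(void)
{
	unsigned long base_freq = 100000000;	/* 100 MHz fallback platform frequency */
	unsigned long num = 3, den = 1;		/* fallback ITC ratio */
	unsigned long hz = 1024;		/* assumed tick rate */
	unsigned int shift = 30;		/* assumed fixed-point shift */

	unsigned long itc_freq = base_freq * num / den;		/* 300 MHz */
	unsigned long itm_delta = (itc_freq + hz / 2) / hz;	/* ITC cycles per tick */
	unsigned long long nsec_per_cyc =
		((1000000000ULL << shift) + itc_freq / 2) / itc_freq;

	printf("itc_freq = %lu Hz, itm_delta = %lu cycles/tick\n", itc_freq, itm_delta);
	printf("about %llu ns per cycle (fixed point 0x%llx >> %u)\n",
	       nsec_per_cyc >> shift, nsec_per_cyc, shift);
	return 0;
}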
236
237static struct irqaction timer_irqaction = {
238 .handler = timer_interrupt,
239 .flags = SA_INTERRUPT,
240 .name = "timer"
241};
242
243void __init
244time_init (void)
245{
246 register_percpu_irq(IA64_TIMER_VECTOR, &timer_irqaction);
247 efi_gettimeofday(&xtime);
248 ia64_init_itm();
249
250 /*
251 * Initialize wall_to_monotonic such that adding it to xtime will yield zero, the
252 * tv_nsec field must be normalized (i.e., 0 <= nsec < NSEC_PER_SEC).
253 */
254 set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec);
255}
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
new file mode 100644
index 000000000000..f1aafd4c05f9
--- /dev/null
+++ b/arch/ia64/kernel/topology.c
@@ -0,0 +1,92 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * This file contains NUMA specific variables and functions which can
7 * be split away from DISCONTIGMEM and are used on NUMA machines with
8 * contiguous memory.
9 * 2002/08/07 Erich Focht <efocht@ess.nec.de>
10 * Populate cpu entries in sysfs for non-numa systems as well
11 * Intel Corporation - Ashok Raj
12 */
13
14#include <linux/config.h>
15#include <linux/cpu.h>
16#include <linux/kernel.h>
17#include <linux/mm.h>
18#include <linux/node.h>
19#include <linux/init.h>
20#include <linux/bootmem.h>
21#include <linux/nodemask.h>
22#include <asm/mmzone.h>
23#include <asm/numa.h>
24#include <asm/cpu.h>
25
26#ifdef CONFIG_NUMA
27static struct node *sysfs_nodes;
28#endif
29static struct ia64_cpu *sysfs_cpus;
30
31int arch_register_cpu(int num)
32{
33 struct node *parent = NULL;
34
35#ifdef CONFIG_NUMA
36 parent = &sysfs_nodes[cpu_to_node(num)];
37#endif /* CONFIG_NUMA */
38
39 return register_cpu(&sysfs_cpus[num].cpu, num, parent);
40}
41
42#ifdef CONFIG_HOTPLUG_CPU
43
44void arch_unregister_cpu(int num)
45{
46 struct node *parent = NULL;
47
48#ifdef CONFIG_NUMA
49 int node = cpu_to_node(num);
50 parent = &sysfs_nodes[node];
51#endif /* CONFIG_NUMA */
52
53 return unregister_cpu(&sysfs_cpus[num].cpu, parent);
54}
55EXPORT_SYMBOL(arch_register_cpu);
56EXPORT_SYMBOL(arch_unregister_cpu);
57#endif /*CONFIG_HOTPLUG_CPU*/
58
59
60static int __init topology_init(void)
61{
62 int i, err = 0;
63
64#ifdef CONFIG_NUMA
65 sysfs_nodes = kmalloc(sizeof(struct node) * MAX_NUMNODES, GFP_KERNEL);
66 if (!sysfs_nodes) {
67 err = -ENOMEM;
68 goto out;
69 }
70 memset(sysfs_nodes, 0, sizeof(struct node) * MAX_NUMNODES);
71
72 /* MCD - Do we want to register all ONLINE nodes, or all POSSIBLE nodes? */
73 for_each_online_node(i)
74 if ((err = register_node(&sysfs_nodes[i], i, 0)))
75 goto out;
76#endif
77
78 sysfs_cpus = kmalloc(sizeof(struct ia64_cpu) * NR_CPUS, GFP_KERNEL);
79 if (!sysfs_cpus) {
80 err = -ENOMEM;
81 goto out;
82 }
83 memset(sysfs_cpus, 0, sizeof(struct ia64_cpu) * NR_CPUS);
84
85 for_each_present_cpu(i)
86 if((err = arch_register_cpu(i)))
87 goto out;
88out:
89 return err;
90}
91
92__initcall(topology_init);
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
new file mode 100644
index 000000000000..e82ad78081b3
--- /dev/null
+++ b/arch/ia64/kernel/traps.c
@@ -0,0 +1,609 @@
1/*
2 * Architecture-specific trap handling.
3 *
4 * Copyright (C) 1998-2003 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com>
6 *
7 * 05/12/00 grao <goutham.rao@intel.com> : added isr in siginfo for SIGFPE
8 */
9
10#include <linux/config.h>
11#include <linux/kernel.h>
12#include <linux/init.h>
13#include <linux/sched.h>
14#include <linux/tty.h>
15#include <linux/vt_kern.h> /* For unblank_screen() */
16#include <linux/module.h> /* for EXPORT_SYMBOL */
17#include <linux/hardirq.h>
18
19#include <asm/fpswa.h>
20#include <asm/ia32.h>
21#include <asm/intrinsics.h>
22#include <asm/processor.h>
23#include <asm/uaccess.h>
24
25extern spinlock_t timerlist_lock;
26
27fpswa_interface_t *fpswa_interface;
28EXPORT_SYMBOL(fpswa_interface);
29
30void __init
31trap_init (void)
32{
33 if (ia64_boot_param->fpswa)
34 /* FPSWA fixup: make the interface pointer a kernel virtual address: */
35 fpswa_interface = __va(ia64_boot_param->fpswa);
36}
37
38/*
39 * Unlock any spinlocks which will prevent us from getting the message out (timerlist_lock
40 * is acquired through the console unblank code)
41 */
42void
43bust_spinlocks (int yes)
44{
45 int loglevel_save = console_loglevel;
46
47 if (yes) {
48 oops_in_progress = 1;
49 return;
50 }
51
52#ifdef CONFIG_VT
53 unblank_screen();
54#endif
55 oops_in_progress = 0;
56 /*
57 * OK, the message is on the console. Now we call printk() without
58 * oops_in_progress set so that printk will give klogd a poke. Hold onto
59 * your hats...
60 */
61 console_loglevel = 15; /* NMI oopser may have shut the console up */
62 printk(" ");
63 console_loglevel = loglevel_save;
64}
65
66void
67die (const char *str, struct pt_regs *regs, long err)
68{
69 static struct {
70 spinlock_t lock;
71 u32 lock_owner;
72 int lock_owner_depth;
73 } die = {
74 .lock = SPIN_LOCK_UNLOCKED,
75 .lock_owner = -1,
76 .lock_owner_depth = 0
77 };
78 static int die_counter;
79
80 if (die.lock_owner != smp_processor_id()) {
81 console_verbose();
82 spin_lock_irq(&die.lock);
83 die.lock_owner = smp_processor_id();
84 die.lock_owner_depth = 0;
85 bust_spinlocks(1);
86 }
87
88 if (++die.lock_owner_depth < 3) {
89 printk("%s[%d]: %s %ld [%d]\n",
90 current->comm, current->pid, str, err, ++die_counter);
91 show_regs(regs);
92 } else
93 printk(KERN_ERR "Recursive die() failure, output suppressed\n");
94
95 bust_spinlocks(0);
96 die.lock_owner = -1;
97 spin_unlock_irq(&die.lock);
98 do_exit(SIGSEGV);
99}
100
101void
102die_if_kernel (char *str, struct pt_regs *regs, long err)
103{
104 if (!user_mode(regs))
105 die(str, regs, err);
106}
107
108void
109ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
110{
111 siginfo_t siginfo;
112 int sig, code;
113
114 /* SIGILL, SIGFPE, SIGSEGV, and SIGBUS want these fields initialized: */
115 siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri);
116 siginfo.si_imm = break_num;
117 siginfo.si_flags = 0; /* clear __ISR_VALID */
118 siginfo.si_isr = 0;
119
120 switch (break_num) {
121 case 0: /* unknown error (used by GCC for __builtin_abort()) */
122 die_if_kernel("bugcheck!", regs, break_num);
123 sig = SIGILL; code = ILL_ILLOPC;
124 break;
125
126 case 1: /* integer divide by zero */
127 sig = SIGFPE; code = FPE_INTDIV;
128 break;
129
130 case 2: /* integer overflow */
131 sig = SIGFPE; code = FPE_INTOVF;
132 break;
133
134 case 3: /* range check/bounds check */
135 sig = SIGFPE; code = FPE_FLTSUB;
136 break;
137
138 case 4: /* null pointer dereference */
139 sig = SIGSEGV; code = SEGV_MAPERR;
140 break;
141
142 case 5: /* misaligned data */
143 sig = SIGSEGV; code = BUS_ADRALN;
144 break;
145
146 case 6: /* decimal overflow */
147 sig = SIGFPE; code = __FPE_DECOVF;
148 break;
149
150 case 7: /* decimal divide by zero */
151 sig = SIGFPE; code = __FPE_DECDIV;
152 break;
153
154 case 8: /* packed decimal error */
155 sig = SIGFPE; code = __FPE_DECERR;
156 break;
157
158 case 9: /* invalid ASCII digit */
159 sig = SIGFPE; code = __FPE_INVASC;
160 break;
161
162 case 10: /* invalid decimal digit */
163 sig = SIGFPE; code = __FPE_INVDEC;
164 break;
165
166 case 11: /* paragraph stack overflow */
167 sig = SIGSEGV; code = __SEGV_PSTKOVF;
168 break;
169
170 case 0x3f000 ... 0x3ffff: /* bundle-update in progress */
171 sig = SIGILL; code = __ILL_BNDMOD;
172 break;
173
174 default:
175 if (break_num < 0x40000 || break_num > 0x100000)
176 die_if_kernel("Bad break", regs, break_num);
177
178 if (break_num < 0x80000) {
179 sig = SIGILL; code = __ILL_BREAK;
180 } else {
181 sig = SIGTRAP; code = TRAP_BRKPT;
182 }
183 }
184 siginfo.si_signo = sig;
185 siginfo.si_errno = 0;
186 siginfo.si_code = code;
187 force_sig_info(sig, &siginfo, current);
188}
189
190/*
191 * disabled_fph_fault() is called when a user-level process attempts to access f32..f127
192 * and it doesn't own the fp-high register partition. When this happens, we save the
193 * current fph partition in the task_struct of the fpu-owner (if necessary) and then load
194 * the fp-high partition of the current task (if necessary). Note that the kernel has
195 * access to fph by the time we get here, as the IVT's "Disabled FP-Register" handler takes
196 * care of clearing psr.dfh.
197 */
198static inline void
199disabled_fph_fault (struct pt_regs *regs)
200{
201 struct ia64_psr *psr = ia64_psr(regs);
202
203 /* first, grant user-level access to fph partition: */
204 psr->dfh = 0;
205#ifndef CONFIG_SMP
206 {
207 struct task_struct *fpu_owner
208 = (struct task_struct *)ia64_get_kr(IA64_KR_FPU_OWNER);
209
210 if (ia64_is_local_fpu_owner(current))
211 return;
212
213 if (fpu_owner)
214 ia64_flush_fph(fpu_owner);
215 }
216#endif /* !CONFIG_SMP */
217 ia64_set_local_fpu_owner(current);
218 if ((current->thread.flags & IA64_THREAD_FPH_VALID) != 0) {
219 __ia64_load_fpu(current->thread.fph);
220 psr->mfh = 0;
221 } else {
222 __ia64_init_fpu();
223 /*
224 * Set mfh because the state in thread.fph does not match the state in
225 * the fph partition.
226 */
227 psr->mfh = 1;
228 }
229}
230
231static inline int
232fp_emulate (int fp_fault, void *bundle, long *ipsr, long *fpsr, long *isr, long *pr, long *ifs,
233 struct pt_regs *regs)
234{
235 fp_state_t fp_state;
236 fpswa_ret_t ret;
237
238 if (!fpswa_interface)
239 return -1;
240
241 memset(&fp_state, 0, sizeof(fp_state_t));
242
243 /*
244 * compute fp_state. only FP registers f6 - f11 are used by the
245 * kernel, so set those bits in the mask and set the low volatile
246 * pointer to point to these registers.
247 */
248 fp_state.bitmask_low64 = 0xfc0; /* bit6..bit11 */
249
250 fp_state.fp_state_low_volatile = (fp_state_low_volatile_t *) &regs->f6;
251 /*
252 * unsigned long (*EFI_FPSWA) (
253 * unsigned long trap_type,
254 * void *Bundle,
255 * unsigned long *pipsr,
256 * unsigned long *pfsr,
257 * unsigned long *pisr,
258 * unsigned long *ppreds,
259 * unsigned long *pifs,
260 * void *fp_state);
261 */
262 ret = (*fpswa_interface->fpswa)((unsigned long) fp_fault, bundle,
263 (unsigned long *) ipsr, (unsigned long *) fpsr,
264 (unsigned long *) isr, (unsigned long *) pr,
265 (unsigned long *) ifs, &fp_state);
266
267 return ret.status;
268}
269
270/*
271 * Handle floating-point assist faults and traps.
272 */
273static int
274handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr)
275{
276 long exception, bundle[2];
277 unsigned long fault_ip;
278 struct siginfo siginfo;
279 static int fpu_swa_count = 0;
280 static unsigned long last_time;
281
282 fault_ip = regs->cr_iip;
283 if (!fp_fault && (ia64_psr(regs)->ri == 0))
284 fault_ip -= 16;
285 if (copy_from_user(bundle, (void __user *) fault_ip, sizeof(bundle)))
286 return -1;
287
288 if (jiffies - last_time > 5*HZ)
289 fpu_swa_count = 0;
290 if ((fpu_swa_count < 4) && !(current->thread.flags & IA64_THREAD_FPEMU_NOPRINT)) {
291 last_time = jiffies;
292 ++fpu_swa_count;
293 printk(KERN_WARNING
294 "%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n",
295 current->comm, current->pid, regs->cr_iip + ia64_psr(regs)->ri, isr);
296 }
297
298 exception = fp_emulate(fp_fault, bundle, &regs->cr_ipsr, &regs->ar_fpsr, &isr, &regs->pr,
299 &regs->cr_ifs, regs);
300 if (fp_fault) {
301 if (exception == 0) {
302 /* emulation was successful */
303 ia64_increment_ip(regs);
304 } else if (exception == -1) {
305 printk(KERN_ERR "handle_fpu_swa: fp_emulate() returned -1\n");
306 return -1;
307 } else {
308 /* is next instruction a trap? */
309 if (exception & 2) {
310 ia64_increment_ip(regs);
311 }
312 siginfo.si_signo = SIGFPE;
313 siginfo.si_errno = 0;
314 siginfo.si_code = __SI_FAULT; /* default code */
315 siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri);
316 if (isr & 0x11) {
317 siginfo.si_code = FPE_FLTINV;
318 } else if (isr & 0x22) {
319 /* denormal operand gets the same si_code as underflow
320 * see arch/i386/kernel/traps.c:math_error() */
321 siginfo.si_code = FPE_FLTUND;
322 } else if (isr & 0x44) {
323 siginfo.si_code = FPE_FLTDIV;
324 }
325 siginfo.si_isr = isr;
326 siginfo.si_flags = __ISR_VALID;
327 siginfo.si_imm = 0;
328 force_sig_info(SIGFPE, &siginfo, current);
329 }
330 } else {
331 if (exception == -1) {
332 printk(KERN_ERR "handle_fpu_swa: fp_emulate() returned -1\n");
333 return -1;
334 } else if (exception != 0) {
335 /* raise exception */
336 siginfo.si_signo = SIGFPE;
337 siginfo.si_errno = 0;
338 siginfo.si_code = __SI_FAULT; /* default code */
339 siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri);
340 if (isr & 0x880) {
341 siginfo.si_code = FPE_FLTOVF;
342 } else if (isr & 0x1100) {
343 siginfo.si_code = FPE_FLTUND;
344 } else if (isr & 0x2200) {
345 siginfo.si_code = FPE_FLTRES;
346 }
347 siginfo.si_isr = isr;
348 siginfo.si_flags = __ISR_VALID;
349 siginfo.si_imm = 0;
350 force_sig_info(SIGFPE, &siginfo, current);
351 }
352 }
353 return 0;
354}
355
356struct illegal_op_return {
357 unsigned long fkt, arg1, arg2, arg3;
358};
359
360struct illegal_op_return
361ia64_illegal_op_fault (unsigned long ec, long arg1, long arg2, long arg3,
362 long arg4, long arg5, long arg6, long arg7,
363 struct pt_regs regs)
364{
365 struct illegal_op_return rv;
366 struct siginfo si;
367 char buf[128];
368
369#ifdef CONFIG_IA64_BRL_EMU
370 {
371 extern struct illegal_op_return ia64_emulate_brl (struct pt_regs *, unsigned long);
372
373 rv = ia64_emulate_brl(&regs, ec);
374 if (rv.fkt != (unsigned long) -1)
375 return rv;
376 }
377#endif
378
379 sprintf(buf, "IA-64 Illegal operation fault");
380 die_if_kernel(buf, &regs, 0);
381
382 memset(&si, 0, sizeof(si));
383 si.si_signo = SIGILL;
384 si.si_code = ILL_ILLOPC;
385 si.si_addr = (void __user *) (regs.cr_iip + ia64_psr(&regs)->ri);
386 force_sig_info(SIGILL, &si, current);
387 rv.fkt = 0;
388 return rv;
389}
390
391void
392ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
393 unsigned long iim, unsigned long itir, long arg5, long arg6,
394 long arg7, struct pt_regs regs)
395{
396 unsigned long code, error = isr, iip;
397 struct siginfo siginfo;
398 char buf[128];
399 int result, sig;
400 static const char *reason[] = {
401 "IA-64 Illegal Operation fault",
402 "IA-64 Privileged Operation fault",
403 "IA-64 Privileged Register fault",
404 "IA-64 Reserved Register/Field fault",
405 "Disabled Instruction Set Transition fault",
406 "Unknown fault 5", "Unknown fault 6", "Unknown fault 7", "Illegal Hazard fault",
407 "Unknown fault 9", "Unknown fault 10", "Unknown fault 11", "Unknown fault 12",
408 "Unknown fault 13", "Unknown fault 14", "Unknown fault 15"
409 };
410
411 if ((isr & IA64_ISR_NA) && ((isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH)) {
412 /*
413 * This fault was due to lfetch.fault, set "ed" bit in the psr to cancel
414 * the lfetch.
415 */
416 ia64_psr(&regs)->ed = 1;
417 return;
418 }
419
420 iip = regs.cr_iip + ia64_psr(&regs)->ri;
421
422 switch (vector) {
423 case 24: /* General Exception */
424 code = (isr >> 4) & 0xf;
425 sprintf(buf, "General Exception: %s%s", reason[code],
426 (code == 3) ? ((isr & (1UL << 37))
427 ? " (RSE access)" : " (data access)") : "");
428 if (code == 8) {
429# ifdef CONFIG_IA64_PRINT_HAZARDS
430 printk("%s[%d]: possible hazard @ ip=%016lx (pr = %016lx)\n",
431 current->comm, current->pid,
432 regs.cr_iip + ia64_psr(&regs)->ri, regs.pr);
433# endif
434 return;
435 }
436 break;
437
438 case 25: /* Disabled FP-Register */
439 if (isr & 2) {
440 disabled_fph_fault(&regs);
441 return;
442 }
443 sprintf(buf, "Disabled FPL fault---not supposed to happen!");
444 break;
445
446 case 26: /* NaT Consumption */
447 if (user_mode(&regs)) {
448 void __user *addr;
449
450 if (((isr >> 4) & 0xf) == 2) {
451 /* NaT page consumption */
452 sig = SIGSEGV;
453 code = SEGV_ACCERR;
454 addr = (void __user *) ifa;
455 } else {
456 /* register NaT consumption */
457 sig = SIGILL;
458 code = ILL_ILLOPN;
459 addr = (void __user *) (regs.cr_iip
460 + ia64_psr(&regs)->ri);
461 }
462 siginfo.si_signo = sig;
463 siginfo.si_code = code;
464 siginfo.si_errno = 0;
465 siginfo.si_addr = addr;
466 siginfo.si_imm = vector;
467 siginfo.si_flags = __ISR_VALID;
468 siginfo.si_isr = isr;
469 force_sig_info(sig, &siginfo, current);
470 return;
471 } else if (ia64_done_with_exception(&regs))
472 return;
473 sprintf(buf, "NaT consumption");
474 break;
475
476 case 31: /* Unsupported Data Reference */
477 if (user_mode(&regs)) {
478 siginfo.si_signo = SIGILL;
479 siginfo.si_code = ILL_ILLOPN;
480 siginfo.si_errno = 0;
481 siginfo.si_addr = (void __user *) iip;
482 siginfo.si_imm = vector;
483 siginfo.si_flags = __ISR_VALID;
484 siginfo.si_isr = isr;
485 force_sig_info(SIGILL, &siginfo, current);
486 return;
487 }
488 sprintf(buf, "Unsupported data reference");
489 break;
490
491 case 29: /* Debug */
492 case 35: /* Taken Branch Trap */
493 case 36: /* Single Step Trap */
494 if (fsys_mode(current, &regs)) {
495 extern char __kernel_syscall_via_break[];
496 /*
497 * Got a trap in fsys-mode: Taken Branch Trap and Single Step trap
498 * need special handling; Debug trap is not supposed to happen.
499 */
500 if (unlikely(vector == 29)) {
501 die("Got debug trap in fsys-mode---not supposed to happen!",
502 &regs, 0);
503 return;
504 }
505 /* re-do the system call via break 0x100000: */
506 regs.cr_iip = (unsigned long) __kernel_syscall_via_break;
507 ia64_psr(&regs)->ri = 0;
508 ia64_psr(&regs)->cpl = 3;
509 return;
510 }
511 switch (vector) {
512 case 29:
513 siginfo.si_code = TRAP_HWBKPT;
514#ifdef CONFIG_ITANIUM
515 /*
516 * Erratum 10 (IFA may contain incorrect address) now has
517 * "NoFix" status. There are no plans for fixing this.
518 */
519 if (ia64_psr(&regs)->is == 0)
520 ifa = regs.cr_iip;
521#endif
522 break;
523 case 35: siginfo.si_code = TRAP_BRANCH; ifa = 0; break;
524 case 36: siginfo.si_code = TRAP_TRACE; ifa = 0; break;
525 }
526 siginfo.si_signo = SIGTRAP;
527 siginfo.si_errno = 0;
528 siginfo.si_addr = (void __user *) ifa;
529 siginfo.si_imm = 0;
530 siginfo.si_flags = __ISR_VALID;
531 siginfo.si_isr = isr;
532 force_sig_info(SIGTRAP, &siginfo, current);
533 return;
534
535 case 32: /* fp fault */
536 case 33: /* fp trap */
537 result = handle_fpu_swa((vector == 32) ? 1 : 0, &regs, isr);
538 if ((result < 0) || (current->thread.flags & IA64_THREAD_FPEMU_SIGFPE)) {
539 siginfo.si_signo = SIGFPE;
540 siginfo.si_errno = 0;
541 siginfo.si_code = FPE_FLTINV;
542 siginfo.si_addr = (void __user *) iip;
543 siginfo.si_flags = __ISR_VALID;
544 siginfo.si_isr = isr;
545 siginfo.si_imm = 0;
546 force_sig_info(SIGFPE, &siginfo, current);
547 }
548 return;
549
550 case 34:
551 if (isr & 0x2) {
552 /* Lower-Privilege Transfer Trap */
553 /*
554			 * Just clear PSR.lp and then return immediately: all the
555			 * interesting work (e.g., signal delivery) is done in the kernel
556			 * exit path.
557 */
558 ia64_psr(&regs)->lp = 0;
559 return;
560 } else {
561 /* Unimplemented Instr. Address Trap */
562 if (user_mode(&regs)) {
563 siginfo.si_signo = SIGILL;
564 siginfo.si_code = ILL_BADIADDR;
565 siginfo.si_errno = 0;
566 siginfo.si_flags = 0;
567 siginfo.si_isr = 0;
568 siginfo.si_imm = 0;
569 siginfo.si_addr = (void __user *) iip;
570 force_sig_info(SIGILL, &siginfo, current);
571 return;
572 }
573 sprintf(buf, "Unimplemented Instruction Address fault");
574 }
575 break;
576
577 case 45:
578#ifdef CONFIG_IA32_SUPPORT
579 if (ia32_exception(&regs, isr) == 0)
580 return;
581#endif
582 printk(KERN_ERR "Unexpected IA-32 exception (Trap 45)\n");
583 printk(KERN_ERR " iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx\n",
584 iip, ifa, isr);
585 force_sig(SIGSEGV, current);
586 break;
587
588 case 46:
589#ifdef CONFIG_IA32_SUPPORT
590 if (ia32_intercept(&regs, isr) == 0)
591 return;
592#endif
593 printk(KERN_ERR "Unexpected IA-32 intercept trap (Trap 46)\n");
594 printk(KERN_ERR " iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx, iim - 0x%lx\n",
595 iip, ifa, isr, iim);
596 force_sig(SIGSEGV, current);
597 return;
598
599 case 47:
600 sprintf(buf, "IA-32 Interruption Fault (int 0x%lx)", isr >> 16);
601 break;
602
603 default:
604 sprintf(buf, "Fault %lu", vector);
605 break;
606 }
607 die_if_kernel(buf, &regs, error);
608 force_sig(SIGILL, current);
609}
diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c
new file mode 100644
index 000000000000..43b45b65ee5a
--- /dev/null
+++ b/arch/ia64/kernel/unaligned.c
@@ -0,0 +1,1521 @@
1/*
2 * Architecture-specific unaligned trap handling.
3 *
4 * Copyright (C) 1999-2002, 2004 Hewlett-Packard Co
5 * Stephane Eranian <eranian@hpl.hp.com>
6 * David Mosberger-Tang <davidm@hpl.hp.com>
7 *
8 * 2002/12/09 Fix rotating register handling (off-by-1 error, missing fr-rotation). Fix
9 * get_rse_reg() to not leak kernel bits to user-level (reading an out-of-frame
10 * stacked register returns an undefined value; it does NOT trigger a
11 * "rsvd register fault").
12 * 2001/10/11 Fix unaligned access to rotating registers in s/w pipelined loops.
13 * 2001/08/13 Correct size of extended floats (float_fsz) from 16 to 10 bytes.
14 * 2001/01/17	Add support for emulation of unaligned kernel accesses.
15 */
16#include <linux/kernel.h>
17#include <linux/sched.h>
18#include <linux/smp_lock.h>
19#include <linux/tty.h>
20
21#include <asm/intrinsics.h>
22#include <asm/processor.h>
23#include <asm/rse.h>
24#include <asm/uaccess.h>
25#include <asm/unaligned.h>
26
27extern void die_if_kernel(char *str, struct pt_regs *regs, long err) __attribute__ ((noreturn));
28
29#undef DEBUG_UNALIGNED_TRAP
30
31#ifdef DEBUG_UNALIGNED_TRAP
32# define DPRINT(a...) do { printk("%s %u: ", __FUNCTION__, __LINE__); printk (a); } while (0)
33# define DDUMP(str,vp,len) dump(str, vp, len)
34
35static void
36dump (const char *str, void *vp, size_t len)
37{
38 unsigned char *cp = vp;
39 int i;
40
41 printk("%s", str);
42 for (i = 0; i < len; ++i)
43 printk (" %02x", *cp++);
44 printk("\n");
45}
46#else
47# define DPRINT(a...)
48# define DDUMP(str,vp,len)
49#endif
50
51#define IA64_FIRST_STACKED_GR 32
52#define IA64_FIRST_ROTATING_FR 32
53#define SIGN_EXT9 0xffffffffffffff00ul
54
55/*
56 * For M-unit:
57 *
58 * opcode | m | x6 |
59 * --------|------|---------|
60 * [40-37] | [36] | [35:30] |
61 * --------|------|---------|
62 * 4 | 1 | 6 | = 11 bits
63 * --------------------------
64 * However bits [31:30] are not directly useful to distinguish between
65 * load/store, so we use [35:32] instead, which gives the following
66 * mask ([40:32]) using 9 bits. The 'e' in the mask (0x1ef) clears bit 36
67 * (the m-bit), because we defer checking the m-bit until later in the load/store emulation.
68 */
69#define IA64_OPCODE_MASK 0x1ef
70#define IA64_OPCODE_SHIFT 32
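/*
 * A rough usage sketch (illustrative only): assuming 'insn' holds the 41-bit
 * instruction word already extracted from its bundle slot, the 9-bit major
 * opcode used to index the tables below is obtained as
 *
 *	unsigned long opcode = (insn >> IA64_OPCODE_SHIFT) & IA64_OPCODE_MASK;
 *
 * i.e., bits [40:32] of the instruction with bit 36 (the m-bit) masked out,
 * which is exactly how ia64_handle_unaligned() decodes the opcode below.
 */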
71
72/*
73 * Table C-28 Integer Load/Store
74 *
75 * We ignore [35:32]= 0x6, 0x7, 0xE, 0xF
76 *
77 * ld8.fill, st8.fill MUST be aligned because the RNATs are based on
78 * the address (bits [8:3]), so we must failed.
79 */
80#define LD_OP 0x080
81#define LDS_OP 0x081
82#define LDA_OP 0x082
83#define LDSA_OP 0x083
84#define LDBIAS_OP 0x084
85#define LDACQ_OP 0x085
86/* 0x086, 0x087 are not relevant */
87#define LDCCLR_OP 0x088
88#define LDCNC_OP 0x089
89#define LDCCLRACQ_OP 0x08a
90#define ST_OP 0x08c
91#define STREL_OP 0x08d
92/* 0x08e,0x8f are not relevant */
93
94/*
95 * Table C-29 Integer Load +Reg
96 *
97 * we use the ld->m (bit [36:36]) field to determine whether or not we have
98 * a load/store of this form.
99 */
100
101/*
102 * Table C-30 Integer Load/Store +Imm
103 *
104 * We ignore [35:32]= 0x6, 0x7, 0xE, 0xF
105 *
106 * ld8.fill, st8.fill must be aligned because the NaT bits are based on
107 * the address, so we must fail and the program must be fixed.
108 */
109#define LD_IMM_OP 0x0a0
110#define LDS_IMM_OP 0x0a1
111#define LDA_IMM_OP 0x0a2
112#define LDSA_IMM_OP 0x0a3
113#define LDBIAS_IMM_OP 0x0a4
114#define LDACQ_IMM_OP 0x0a5
115/* 0x0a6, 0xa7 are not relevant */
116#define LDCCLR_IMM_OP 0x0a8
117#define LDCNC_IMM_OP 0x0a9
118#define LDCCLRACQ_IMM_OP 0x0aa
119#define ST_IMM_OP 0x0ac
120#define STREL_IMM_OP 0x0ad
121/* 0x0ae,0xaf are not relevant */
122
123/*
124 * Table C-32 Floating-point Load/Store
125 */
126#define LDF_OP 0x0c0
127#define LDFS_OP 0x0c1
128#define LDFA_OP 0x0c2
129#define LDFSA_OP 0x0c3
130/* 0x0c6 is irrelevant */
131#define LDFCCLR_OP 0x0c8
132#define LDFCNC_OP 0x0c9
133/* 0x0cb is irrelevant */
134#define STF_OP 0x0cc
135
136/*
137 * Table C-33 Floating-point Load +Reg
138 *
139 * we use the ld->m (bit [36:36]) field to determine whether or not we have
140 * a load/store of this form.
141 */
142
143/*
144 * Table C-34 Floating-point Load/Store +Imm
145 */
146#define LDF_IMM_OP 0x0e0
147#define LDFS_IMM_OP 0x0e1
148#define LDFA_IMM_OP 0x0e2
149#define LDFSA_IMM_OP 0x0e3
150/* 0x0e6 is irrelevant */
151#define LDFCCLR_IMM_OP 0x0e8
152#define LDFCNC_IMM_OP 0x0e9
153#define STF_IMM_OP 0x0ec
154
155typedef struct {
156 unsigned long qp:6; /* [0:5] */
157 unsigned long r1:7; /* [6:12] */
158 unsigned long imm:7; /* [13:19] */
159 unsigned long r3:7; /* [20:26] */
160 unsigned long x:1; /* [27:27] */
161 unsigned long hint:2; /* [28:29] */
162 unsigned long x6_sz:2; /* [30:31] */
163 unsigned long x6_op:4; /* [32:35], x6 = x6_sz|x6_op */
164 unsigned long m:1; /* [36:36] */
165 unsigned long op:4; /* [37:40] */
166 unsigned long pad:23; /* [41:63] */
167} load_store_t;
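/*
 * Illustrative sketch (not a definitive statement of the encoding): the
 * handler never fills this structure field by field; it overlays it on the
 * raw 41-bit instruction word through a union, roughly
 *
 *	union {
 *		unsigned long l;
 *		load_store_t insn;
 *	} u;
 *	u.l = slot;	-> u.insn.r1, u.insn.r3, u.insn.x6_op, ... become valid
 *
 * which is what ia64_handle_unaligned() does after extracting the slot from
 * the bundle. The bit positions noted in the comments assume the usual
 * little-endian, low-to-high bitfield layout.
 */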
168
169
170typedef enum {
171 UPD_IMMEDIATE, /* ldXZ r1=[r3],imm(9) */
172 UPD_REG /* ldXZ r1=[r3],r2 */
173} update_t;
174
175/*
176 * We use tables to keep track of the offsets of registers in the saved state.
177 * This way we save having big switch/case statements.
178 *
179 * We use bit 0 to indicate switch_stack or pt_regs.
180 * The offset is simply shifted by 1 bit.
181 * A 2-byte value should be enough to hold any kind of offset
182 *
183 * In case the calling convention changes (and thus pt_regs/switch_stack)
184 * simply use RSW instead of RPT or vice-versa.
185 */
186
187#define RPO(x) ((size_t) &((struct pt_regs *)0)->x)
188#define RSO(x) ((size_t) &((struct switch_stack *)0)->x)
189
190#define RPT(x) (RPO(x) << 1)
191#define RSW(x) (1| RSO(x)<<1)
192
193#define GR_OFFS(x) (gr_info[x]>>1)
194#define GR_IN_SW(x) (gr_info[x] & 0x1)
195
196#define FR_OFFS(x) (fr_info[x]>>1)
197#define FR_IN_SW(x) (fr_info[x] & 0x1)
198
199static u16 gr_info[32]={
200 0, /* r0 is read-only : WE SHOULD NEVER GET THIS */
201
202 RPT(r1), RPT(r2), RPT(r3),
203
204 RSW(r4), RSW(r5), RSW(r6), RSW(r7),
205
206 RPT(r8), RPT(r9), RPT(r10), RPT(r11),
207 RPT(r12), RPT(r13), RPT(r14), RPT(r15),
208
209 RPT(r16), RPT(r17), RPT(r18), RPT(r19),
210 RPT(r20), RPT(r21), RPT(r22), RPT(r23),
211 RPT(r24), RPT(r25), RPT(r26), RPT(r27),
212 RPT(r28), RPT(r29), RPT(r30), RPT(r31)
213};
214
215static u16 fr_info[32]={
216 0, /* constant : WE SHOULD NEVER GET THIS */
217 0, /* constant : WE SHOULD NEVER GET THIS */
218
219 RSW(f2), RSW(f3), RSW(f4), RSW(f5),
220
221 RPT(f6), RPT(f7), RPT(f8), RPT(f9),
222 RPT(f10), RPT(f11),
223
224 RSW(f12), RSW(f13), RSW(f14),
225 RSW(f15), RSW(f16), RSW(f17), RSW(f18), RSW(f19),
226 RSW(f20), RSW(f21), RSW(f22), RSW(f23), RSW(f24),
227 RSW(f25), RSW(f26), RSW(f27), RSW(f28), RSW(f29),
228 RSW(f30), RSW(f31)
229};
230
231/* Invalidate ALAT entry for integer register REGNO. */
232static void
233invala_gr (int regno)
234{
235# define F(reg) case reg: ia64_invala_gr(reg); break
236
237 switch (regno) {
238 F( 0); F( 1); F( 2); F( 3); F( 4); F( 5); F( 6); F( 7);
239 F( 8); F( 9); F( 10); F( 11); F( 12); F( 13); F( 14); F( 15);
240 F( 16); F( 17); F( 18); F( 19); F( 20); F( 21); F( 22); F( 23);
241 F( 24); F( 25); F( 26); F( 27); F( 28); F( 29); F( 30); F( 31);
242 F( 32); F( 33); F( 34); F( 35); F( 36); F( 37); F( 38); F( 39);
243 F( 40); F( 41); F( 42); F( 43); F( 44); F( 45); F( 46); F( 47);
244 F( 48); F( 49); F( 50); F( 51); F( 52); F( 53); F( 54); F( 55);
245 F( 56); F( 57); F( 58); F( 59); F( 60); F( 61); F( 62); F( 63);
246 F( 64); F( 65); F( 66); F( 67); F( 68); F( 69); F( 70); F( 71);
247 F( 72); F( 73); F( 74); F( 75); F( 76); F( 77); F( 78); F( 79);
248 F( 80); F( 81); F( 82); F( 83); F( 84); F( 85); F( 86); F( 87);
249 F( 88); F( 89); F( 90); F( 91); F( 92); F( 93); F( 94); F( 95);
250 F( 96); F( 97); F( 98); F( 99); F(100); F(101); F(102); F(103);
251 F(104); F(105); F(106); F(107); F(108); F(109); F(110); F(111);
252 F(112); F(113); F(114); F(115); F(116); F(117); F(118); F(119);
253 F(120); F(121); F(122); F(123); F(124); F(125); F(126); F(127);
254 }
255# undef F
256}
257
258/* Invalidate ALAT entry for floating-point register REGNO. */
259static void
260invala_fr (int regno)
261{
262# define F(reg) case reg: ia64_invala_fr(reg); break
263
264 switch (regno) {
265 F( 0); F( 1); F( 2); F( 3); F( 4); F( 5); F( 6); F( 7);
266 F( 8); F( 9); F( 10); F( 11); F( 12); F( 13); F( 14); F( 15);
267 F( 16); F( 17); F( 18); F( 19); F( 20); F( 21); F( 22); F( 23);
268 F( 24); F( 25); F( 26); F( 27); F( 28); F( 29); F( 30); F( 31);
269 F( 32); F( 33); F( 34); F( 35); F( 36); F( 37); F( 38); F( 39);
270 F( 40); F( 41); F( 42); F( 43); F( 44); F( 45); F( 46); F( 47);
271 F( 48); F( 49); F( 50); F( 51); F( 52); F( 53); F( 54); F( 55);
272 F( 56); F( 57); F( 58); F( 59); F( 60); F( 61); F( 62); F( 63);
273 F( 64); F( 65); F( 66); F( 67); F( 68); F( 69); F( 70); F( 71);
274 F( 72); F( 73); F( 74); F( 75); F( 76); F( 77); F( 78); F( 79);
275 F( 80); F( 81); F( 82); F( 83); F( 84); F( 85); F( 86); F( 87);
276 F( 88); F( 89); F( 90); F( 91); F( 92); F( 93); F( 94); F( 95);
277 F( 96); F( 97); F( 98); F( 99); F(100); F(101); F(102); F(103);
278 F(104); F(105); F(106); F(107); F(108); F(109); F(110); F(111);
279 F(112); F(113); F(114); F(115); F(116); F(117); F(118); F(119);
280 F(120); F(121); F(122); F(123); F(124); F(125); F(126); F(127);
281 }
282# undef F
283}
284
285static inline unsigned long
286rotate_reg (unsigned long sor, unsigned long rrb, unsigned long reg)
287{
288 reg += rrb;
289 if (reg >= sor)
290 reg -= sor;
291 return reg;
292}
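/*
 * Quick numeric check (illustrative only): with a rotating region of sor=16
 * registers and rrb=3, a frame-relative index of 14 maps to 14 + 3 = 17,
 * which wraps to 17 - 16 = 1, while index 2 simply maps to 5. Callers only
 * pass indices below sor, so a single conditional subtraction is enough.
 */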
293
294static void
295set_rse_reg (struct pt_regs *regs, unsigned long r1, unsigned long val, int nat)
296{
297 struct switch_stack *sw = (struct switch_stack *) regs - 1;
298 unsigned long *bsp, *bspstore, *addr, *rnat_addr, *ubs_end;
299 unsigned long *kbs = (void *) current + IA64_RBS_OFFSET;
300 unsigned long rnats, nat_mask;
301 unsigned long on_kbs;
302 long sof = (regs->cr_ifs) & 0x7f;
303 long sor = 8 * ((regs->cr_ifs >> 14) & 0xf);
304 long rrb_gr = (regs->cr_ifs >> 18) & 0x7f;
305 long ridx = r1 - 32;
306
307 if (ridx >= sof) {
308 /* this should never happen, as the "rsvd register fault" has higher priority */
309 DPRINT("ignoring write to r%lu; only %lu registers are allocated!\n", r1, sof);
310 return;
311 }
312
313 if (ridx < sor)
314 ridx = rotate_reg(sor, rrb_gr, ridx);
315
316 DPRINT("r%lu, sw.bspstore=%lx pt.bspstore=%lx sof=%ld sol=%ld ridx=%ld\n",
317 r1, sw->ar_bspstore, regs->ar_bspstore, sof, (regs->cr_ifs >> 7) & 0x7f, ridx);
318
319 on_kbs = ia64_rse_num_regs(kbs, (unsigned long *) sw->ar_bspstore);
320 addr = ia64_rse_skip_regs((unsigned long *) sw->ar_bspstore, -sof + ridx);
321 if (addr >= kbs) {
322 /* the register is on the kernel backing store: easy... */
323 rnat_addr = ia64_rse_rnat_addr(addr);
324 if ((unsigned long) rnat_addr >= sw->ar_bspstore)
325 rnat_addr = &sw->ar_rnat;
326 nat_mask = 1UL << ia64_rse_slot_num(addr);
327
328 *addr = val;
329 if (nat)
330 *rnat_addr |= nat_mask;
331 else
332 *rnat_addr &= ~nat_mask;
333 return;
334 }
335
336 if (!user_stack(current, regs)) {
337 DPRINT("ignoring kernel write to r%lu; register isn't on the kernel RBS!", r1);
338 return;
339 }
340
341 bspstore = (unsigned long *)regs->ar_bspstore;
342 ubs_end = ia64_rse_skip_regs(bspstore, on_kbs);
343 bsp = ia64_rse_skip_regs(ubs_end, -sof);
344 addr = ia64_rse_skip_regs(bsp, ridx);
345
346 DPRINT("ubs_end=%p bsp=%p addr=%p\n", (void *) ubs_end, (void *) bsp, (void *) addr);
347
348 ia64_poke(current, sw, (unsigned long) ubs_end, (unsigned long) addr, val);
349
350 rnat_addr = ia64_rse_rnat_addr(addr);
351
352 ia64_peek(current, sw, (unsigned long) ubs_end, (unsigned long) rnat_addr, &rnats);
353 DPRINT("rnat @%p = 0x%lx nat=%d old nat=%ld\n",
354 (void *) rnat_addr, rnats, nat, (rnats >> ia64_rse_slot_num(addr)) & 1);
355
356 nat_mask = 1UL << ia64_rse_slot_num(addr);
357 if (nat)
358 rnats |= nat_mask;
359 else
360 rnats &= ~nat_mask;
361 ia64_poke(current, sw, (unsigned long) ubs_end, (unsigned long) rnat_addr, rnats);
362
363 DPRINT("rnat changed to @%p = 0x%lx\n", (void *) rnat_addr, rnats);
364}
365
366
367static void
368get_rse_reg (struct pt_regs *regs, unsigned long r1, unsigned long *val, int *nat)
369{
370 struct switch_stack *sw = (struct switch_stack *) regs - 1;
371 unsigned long *bsp, *addr, *rnat_addr, *ubs_end, *bspstore;
372 unsigned long *kbs = (void *) current + IA64_RBS_OFFSET;
373 unsigned long rnats, nat_mask;
374 unsigned long on_kbs;
375 long sof = (regs->cr_ifs) & 0x7f;
376 long sor = 8 * ((regs->cr_ifs >> 14) & 0xf);
377 long rrb_gr = (regs->cr_ifs >> 18) & 0x7f;
378 long ridx = r1 - 32;
379
380 if (ridx >= sof) {
381 /* read of out-of-frame register returns an undefined value; 0 in our case. */
382 DPRINT("ignoring read from r%lu; only %lu registers are allocated!\n", r1, sof);
383 goto fail;
384 }
385
386 if (ridx < sor)
387 ridx = rotate_reg(sor, rrb_gr, ridx);
388
389 DPRINT("r%lu, sw.bspstore=%lx pt.bspstore=%lx sof=%ld sol=%ld ridx=%ld\n",
390 r1, sw->ar_bspstore, regs->ar_bspstore, sof, (regs->cr_ifs >> 7) & 0x7f, ridx);
391
392 on_kbs = ia64_rse_num_regs(kbs, (unsigned long *) sw->ar_bspstore);
393 addr = ia64_rse_skip_regs((unsigned long *) sw->ar_bspstore, -sof + ridx);
394 if (addr >= kbs) {
395 /* the register is on the kernel backing store: easy... */
396 *val = *addr;
397 if (nat) {
398 rnat_addr = ia64_rse_rnat_addr(addr);
399 if ((unsigned long) rnat_addr >= sw->ar_bspstore)
400 rnat_addr = &sw->ar_rnat;
401 nat_mask = 1UL << ia64_rse_slot_num(addr);
402 *nat = (*rnat_addr & nat_mask) != 0;
403 }
404 return;
405 }
406
407 if (!user_stack(current, regs)) {
408 DPRINT("ignoring kernel read of r%lu; register isn't on the RBS!", r1);
409 goto fail;
410 }
411
412 bspstore = (unsigned long *)regs->ar_bspstore;
413 ubs_end = ia64_rse_skip_regs(bspstore, on_kbs);
414 bsp = ia64_rse_skip_regs(ubs_end, -sof);
415 addr = ia64_rse_skip_regs(bsp, ridx);
416
417 DPRINT("ubs_end=%p bsp=%p addr=%p\n", (void *) ubs_end, (void *) bsp, (void *) addr);
418
419 ia64_peek(current, sw, (unsigned long) ubs_end, (unsigned long) addr, val);
420
421 if (nat) {
422 rnat_addr = ia64_rse_rnat_addr(addr);
423 nat_mask = 1UL << ia64_rse_slot_num(addr);
424
425		ia64_peek(current, sw, (unsigned long) ubs_end, (unsigned long) rnat_addr, &rnats);
426
427		DPRINT("rnat @%p = 0x%lx\n", (void *) rnat_addr, rnats);
428 *nat = (rnats & nat_mask) != 0;
429 }
430 return;
431
432 fail:
433 *val = 0;
434 if (nat)
435 *nat = 0;
436 return;
437}
438
439
440static void
441setreg (unsigned long regnum, unsigned long val, int nat, struct pt_regs *regs)
442{
443 struct switch_stack *sw = (struct switch_stack *) regs - 1;
444 unsigned long addr;
445 unsigned long bitmask;
446 unsigned long *unat;
447
448 /*
449	 * First, take care of stacked registers.
450 */
451 if (regnum >= IA64_FIRST_STACKED_GR) {
452 set_rse_reg(regs, regnum, val, nat);
453 return;
454 }
455
456 /*
457 * Using r0 as a target raises a General Exception fault which has higher priority
458 * than the Unaligned Reference fault.
459 */
460
461 /*
462 * Now look at registers in [0-31] range and init correct UNAT
463 */
464 if (GR_IN_SW(regnum)) {
465 addr = (unsigned long)sw;
466 unat = &sw->ar_unat;
467 } else {
468 addr = (unsigned long)regs;
469 unat = &sw->caller_unat;
470 }
471 DPRINT("tmp_base=%lx switch_stack=%s offset=%d\n",
472 addr, unat==&sw->ar_unat ? "yes":"no", GR_OFFS(regnum));
473 /*
474 * add offset from base of struct
475 * and do it !
476 */
477 addr += GR_OFFS(regnum);
478
479 *(unsigned long *)addr = val;
480
481 /*
482	 * We need to set or clear the corresponding UNAT bit to fully emulate the load:
483	 * UNAT bit_pos = GR[r3]{8:3}, from EAS-2.4
484 */
485 bitmask = 1UL << (addr >> 3 & 0x3f);
486 DPRINT("*0x%lx=0x%lx NaT=%d prev_unat @%p=%lx\n", addr, val, nat, (void *) unat, *unat);
487 if (nat) {
488 *unat |= bitmask;
489 } else {
490 *unat &= ~bitmask;
491 }
492 DPRINT("*0x%lx=0x%lx NaT=%d new unat: %p=%lx\n", addr, val, nat, (void *) unat,*unat);
493}
494
495/*
496 * Return the (rotated) index for floating point register REGNUM (REGNUM must be in the
497 * range from 32-127); the result is in the range from 0-95.
498 */
499static inline unsigned long
500fph_index (struct pt_regs *regs, long regnum)
501{
502 unsigned long rrb_fr = (regs->cr_ifs >> 25) & 0x7f;
503 return rotate_reg(96, rrb_fr, (regnum - IA64_FIRST_ROTATING_FR));
504}
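/*
 * Example (illustrative only): with CFM.rrb.fr == 2, a reference to f33
 * (regnum 33) yields rotate_reg(96, 2, 33 - 32) == 3, i.e. the value lives
 * in current->thread.fph[3] once the fph partition has been synced or
 * flushed, which is how setfpreg()/getfpreg() below use this helper.
 */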
505
506static void
507setfpreg (unsigned long regnum, struct ia64_fpreg *fpval, struct pt_regs *regs)
508{
509 struct switch_stack *sw = (struct switch_stack *)regs - 1;
510 unsigned long addr;
511
512 /*
513 * From EAS-2.5: FPDisableFault has higher priority than Unaligned
514 * Fault. Thus, when we get here, we know the partition is enabled.
515 * To update f32-f127, there are three choices:
516 *
517 * (1) save f32-f127 to thread.fph and update the values there
518 * (2) use a gigantic switch statement to directly access the registers
519 * (3) generate code on the fly to update the desired register
520 *
521 * For now, we are using approach (1).
522 */
523 if (regnum >= IA64_FIRST_ROTATING_FR) {
524 ia64_sync_fph(current);
525 current->thread.fph[fph_index(regs, regnum)] = *fpval;
526 } else {
527 /*
528 * pt_regs or switch_stack ?
529 */
530 if (FR_IN_SW(regnum)) {
531 addr = (unsigned long)sw;
532 } else {
533 addr = (unsigned long)regs;
534 }
535
536 DPRINT("tmp_base=%lx offset=%d\n", addr, FR_OFFS(regnum));
537
538 addr += FR_OFFS(regnum);
539 *(struct ia64_fpreg *)addr = *fpval;
540
541 /*
542 * mark the low partition as being used now
543 *
544 * It is highly unlikely that this bit is not already set, but
545 * let's do it for safety.
546 */
547 regs->cr_ipsr |= IA64_PSR_MFL;
548 }
549}
550
551/*
552 * These two inline functions generate the spilled versions of the constant floating-point
553 * registers, which can be used with stfX.
554 */
555static inline void
556float_spill_f0 (struct ia64_fpreg *final)
557{
558 ia64_stf_spill(final, 0);
559}
560
561static inline void
562float_spill_f1 (struct ia64_fpreg *final)
563{
564 ia64_stf_spill(final, 1);
565}
566
567static void
568getfpreg (unsigned long regnum, struct ia64_fpreg *fpval, struct pt_regs *regs)
569{
570 struct switch_stack *sw = (struct switch_stack *) regs - 1;
571 unsigned long addr;
572
573 /*
574 * From EAS-2.5: FPDisableFault has higher priority than
575 * Unaligned Fault. Thus, when we get here, we know the partition is
576 * enabled.
577 *
578 * When regnum > 31, the register is still live and we need to force a save
579 * to current->thread.fph to get access to it. See discussion in setfpreg()
580 * for reasons and other ways of doing this.
581 */
582 if (regnum >= IA64_FIRST_ROTATING_FR) {
583 ia64_flush_fph(current);
584 *fpval = current->thread.fph[fph_index(regs, regnum)];
585 } else {
586 /*
587 * f0 = 0.0, f1= 1.0. Those registers are constant and are thus
588 * not saved, we must generate their spilled form on the fly
589 */
590 switch(regnum) {
591 case 0:
592 float_spill_f0(fpval);
593 break;
594 case 1:
595 float_spill_f1(fpval);
596 break;
597 default:
598 /*
599 * pt_regs or switch_stack ?
600 */
601 addr = FR_IN_SW(regnum) ? (unsigned long)sw
602 : (unsigned long)regs;
603
604 DPRINT("is_sw=%d tmp_base=%lx offset=0x%x\n",
605 FR_IN_SW(regnum), addr, FR_OFFS(regnum));
606
607 addr += FR_OFFS(regnum);
608 *fpval = *(struct ia64_fpreg *)addr;
609 }
610 }
611}
612
613
614static void
615getreg (unsigned long regnum, unsigned long *val, int *nat, struct pt_regs *regs)
616{
617 struct switch_stack *sw = (struct switch_stack *) regs - 1;
618 unsigned long addr, *unat;
619
620 if (regnum >= IA64_FIRST_STACKED_GR) {
621 get_rse_reg(regs, regnum, val, nat);
622 return;
623 }
624
625 /*
626	 * take care of r0 (read-only, always evaluates to 0)
627 */
628 if (regnum == 0) {
629 *val = 0;
630 if (nat)
631 *nat = 0;
632 return;
633 }
634
635 /*
636 * Now look at registers in [0-31] range and init correct UNAT
637 */
638 if (GR_IN_SW(regnum)) {
639 addr = (unsigned long)sw;
640 unat = &sw->ar_unat;
641 } else {
642 addr = (unsigned long)regs;
643 unat = &sw->caller_unat;
644 }
645
646 DPRINT("addr_base=%lx offset=0x%x\n", addr, GR_OFFS(regnum));
647
648 addr += GR_OFFS(regnum);
649
650 *val = *(unsigned long *)addr;
651
652 /*
653 * do it only when requested
654 */
655 if (nat)
656 *nat = (*unat >> (addr >> 3 & 0x3f)) & 0x1UL;
657}
658
659static void
660emulate_load_updates (update_t type, load_store_t ld, struct pt_regs *regs, unsigned long ifa)
661{
662 /*
663 * IMPORTANT:
664 * Given the way we handle unaligned speculative loads, we should
665 * not get to this point in the code but we keep this sanity check,
666 * just in case.
667 */
668 if (ld.x6_op == 1 || ld.x6_op == 3) {
669 printk(KERN_ERR "%s: register update on speculative load, error\n", __FUNCTION__);
670 die_if_kernel("unaligned reference on speculative load with register update\n",
671 regs, 30);
672 }
673
674
675 /*
676 * at this point, we know that the base register to update is valid i.e.,
677 * it's not r0
678 */
679 if (type == UPD_IMMEDIATE) {
680 unsigned long imm;
681
682 /*
683 * Load +Imm: ldXZ r1=[r3],imm(9)
684 *
685 *
686 * form imm9: [13:19] contain the first 7 bits
687 */
688 imm = ld.x << 7 | ld.imm;
689
690 /*
691 * sign extend (1+8bits) if m set
692 */
693 if (ld.m) imm |= SIGN_EXT9;
694
695 /*
696 * ifa == r3 and we know that the NaT bit on r3 was clear so
697 * we can directly use ifa.
698 */
699 ifa += imm;
700
701 setreg(ld.r3, ifa, 0, regs);
702
703 DPRINT("ld.x=%d ld.m=%d imm=%ld r3=0x%lx\n", ld.x, ld.m, imm, ifa);
704
705 } else if (ld.m) {
706 unsigned long r2;
707 int nat_r2;
708
709 /*
710 * Load +Reg Opcode: ldXZ r1=[r3],r2
711 *
712		 * Note that we update r3 even in the case of ldfX.a
713 * (where the load does not happen)
714 *
715 * The way the load algorithm works, we know that r3 does not
716 * have its NaT bit set (would have gotten NaT consumption
717 * before getting the unaligned fault). So we can use ifa
718 * which equals r3 at this point.
719 *
720 * IMPORTANT:
721 * The above statement holds ONLY because we know that we
722 * never reach this code when trying to do a ldX.s.
723		 * If we ever make it to here on an ldfX.s, this assumption no longer holds.
724 */
725 getreg(ld.imm, &r2, &nat_r2, regs);
726
727 ifa += r2;
728
729 /*
730 * propagate Nat r2 -> r3
731 */
732 setreg(ld.r3, ifa, nat_r2, regs);
733
734 DPRINT("imm=%d r2=%ld r3=0x%lx nat_r2=%d\n",ld.imm, r2, ifa, nat_r2);
735 }
736}
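/*
 * Worked example of the imm9 reconstruction above (illustrative only, using
 * the field interpretation of this code): for an update immediate of -123
 * the fields would be ld.m = 1 (sign), ld.x = 1 and ld.imm = 0x05, so
 *
 *	imm = ld.x << 7 | ld.imm;	-> 0x85
 *	imm |= SIGN_EXT9;		-> 0xffffffffffffff85 == -123
 *
 * and the result is simply added to ifa (== r3) before being written back
 * with setreg().
 */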
737
738
739static int
740emulate_load_int (unsigned long ifa, load_store_t ld, struct pt_regs *regs)
741{
742 unsigned int len = 1 << ld.x6_sz;
743 unsigned long val = 0;
744
745 /*
746 * r0, as target, doesn't need to be checked because Illegal Instruction
747 * faults have higher priority than unaligned faults.
748 *
749 * r0 cannot be found as the base as it would never generate an
750 * unaligned reference.
751 */
752
753 /*
754	 * For ldX.a we will emulate the load and also invalidate the ALAT entry.
755 * See comment below for explanation on how we handle ldX.a
756 */
757
758 if (len != 2 && len != 4 && len != 8) {
759 DPRINT("unknown size: x6=%d\n", ld.x6_sz);
760 return -1;
761 }
762 /* this assumes little-endian byte-order: */
763 if (copy_from_user(&val, (void __user *) ifa, len))
764 return -1;
765 setreg(ld.r1, val, 0, regs);
766
767 /*
768 * check for updates on any kind of loads
769 */
770 if (ld.op == 0x5 || ld.m)
771 emulate_load_updates(ld.op == 0x5 ? UPD_IMMEDIATE: UPD_REG, ld, regs, ifa);
772
773 /*
774 * handling of various loads (based on EAS2.4):
775 *
776 * ldX.acq (ordered load):
777 * - acquire semantics would have been used, so force fence instead.
778 *
779 * ldX.c.clr (check load and clear):
780 * - if we get to this handler, it's because the entry was not in the ALAT.
781 * Therefore the operation reverts to a normal load
782 *
783 * ldX.c.nc (check load no clear):
784 * - same as previous one
785 *
786 * ldX.c.clr.acq (ordered check load and clear):
787 * - same as above for c.clr part. The load needs to have acquire semantics. So
788 * we use the fence semantics which is stronger and thus ensures correctness.
789 *
790 * ldX.a (advanced load):
791 * - suppose ldX.a r1=[r3]. If we get to the unaligned trap it's because the
792 * address doesn't match requested size alignment. This means that we would
793 * possibly need more than one load to get the result.
794 *
795 * The load part can be handled just like a normal load, however the difficult
796 * part is to get the right thing into the ALAT. The critical piece of information
797	 * is the base address of the load & its size. To do that, a ld.a must be executed,
798 * clearly any address can be pushed into the table by using ld1.a r1=[r3]. Now
799 * if we use the same target register, we will be okay for the check.a instruction.
800 * If we look at the store, basically a stX [r3]=r1 checks the ALAT for any entry
801	 * which would overlap within [r3,r3+X] (the size of the load was stored in the
802	 * ALAT). If such an entry is found, the entry is invalidated. But this is not good
803 * enough, take the following example:
804 * r3=3
805 * ld4.a r1=[r3]
806 *
807 * Could be emulated by doing:
808 * ld1.a r1=[r3],1
809 * store to temporary;
810 * ld1.a r1=[r3],1
811 * store & shift to temporary;
812 * ld1.a r1=[r3],1
813 * store & shift to temporary;
814 * ld1.a r1=[r3]
815 * store & shift to temporary;
816 * r1=temporary
817 *
818	 * So in this case, you would get the right value in r1 but the wrong info in
819 * the ALAT. Notice that you could do it in reverse to finish with address 3
820 * but you would still get the size wrong. To get the size right, one needs to
821	 * execute exactly the same kind of load. You could do it from an aligned
822 * temporary location, but you would get the address wrong.
823 *
824 * So no matter what, it is not possible to emulate an advanced load
825 * correctly. But is that really critical ?
826 *
827 * We will always convert ld.a into a normal load with ALAT invalidated. This
828	 * will enable the compiler to do optimizations where certain code paths after ld.a
829	 * are not required to have ld.c/chk.a, e.g., code paths with no intervening stores.
830 *
831 * If there is a store after the advanced load, one must either do a ld.c.* or
832 * chk.a.* to reuse the value stored in the ALAT. Both can "fail" (meaning no
833 * entry found in ALAT), and that's perfectly ok because:
834 *
835 * - ld.c.*, if the entry is not present a normal load is executed
836 * - chk.a.*, if the entry is not present, execution jumps to recovery code
837 *
838 * In either case, the load can be potentially retried in another form.
839 *
840 * ALAT must be invalidated for the register (so that chk.a or ld.c don't pick
841 * up a stale entry later). The register base update MUST also be performed.
842 */
843
844 /*
845 * when the load has the .acq completer then
846 * use ordering fence.
847 */
848 if (ld.x6_op == 0x5 || ld.x6_op == 0xa)
849 mb();
850
851 /*
852 * invalidate ALAT entry in case of advanced load
853 */
854 if (ld.x6_op == 0x2)
855 invala_gr(ld.r1);
856
857 return 0;
858}
859
860static int
861emulate_store_int (unsigned long ifa, load_store_t ld, struct pt_regs *regs)
862{
863 unsigned long r2;
864 unsigned int len = 1 << ld.x6_sz;
865
866 /*
867 * if we get to this handler, Nat bits on both r3 and r2 have already
868 * been checked. so we don't need to do it
869 *
870 * extract the value to be stored
871 */
872 getreg(ld.imm, &r2, NULL, regs);
873
874 /*
875 * we rely on the macros in unaligned.h for now i.e.,
876 * we let the compiler figure out how to read memory gracefully.
877 *
878	 * We need this switch/case because of the way the inline function
879 * works. The code is optimized by the compiler and looks like
880 * a single switch/case.
881 */
882 DPRINT("st%d [%lx]=%lx\n", len, ifa, r2);
883
884 if (len != 2 && len != 4 && len != 8) {
885 DPRINT("unknown size: x6=%d\n", ld.x6_sz);
886 return -1;
887 }
888
889 /* this assumes little-endian byte-order: */
890 if (copy_to_user((void __user *) ifa, &r2, len))
891 return -1;
892
893 /*
894 * stX [r3]=r2,imm(9)
895 *
896 * NOTE:
897 * ld.r3 can never be r0, because r0 would not generate an
898 * unaligned access.
899 */
900 if (ld.op == 0x5) {
901 unsigned long imm;
902
903 /*
904 * form imm9: [12:6] contain first 7bits
905 */
906 imm = ld.x << 7 | ld.r1;
907 /*
908 * sign extend (8bits) if m set
909 */
910 if (ld.m) imm |= SIGN_EXT9;
911 /*
912 * ifa == r3 (NaT is necessarily cleared)
913 */
914 ifa += imm;
915
916 DPRINT("imm=%lx r3=%lx\n", imm, ifa);
917
918 setreg(ld.r3, ifa, 0, regs);
919 }
920 /*
921 * we don't have alat_invalidate_multiple() so we need
922 * to do the complete flush :-<<
923 */
924 ia64_invala();
925
926 /*
927 * stX.rel: use fence instead of release
928 */
929 if (ld.x6_op == 0xd)
930 mb();
931
932 return 0;
933}
934
935/*
936 * floating point operations sizes in bytes
937 */
938static const unsigned char float_fsz[4]={
939 10, /* extended precision (e) */
940 8, /* integer (8) */
941 4, /* single precision (s) */
942 8 /* double precision (d) */
943};
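/*
 * For example (illustrative only), emulate_load_float() below sizes its user
 * copy with float_fsz[ld.x6_sz], so a single-precision access (x6_sz == 2)
 * moves 4 bytes, an integer form (x6_sz == 1) moves 8, and the floatpair
 * variants move 2*len bytes in total.
 */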
944
945static inline void
946mem2float_extended (struct ia64_fpreg *init, struct ia64_fpreg *final)
947{
948 ia64_ldfe(6, init);
949 ia64_stop();
950 ia64_stf_spill(final, 6);
951}
952
953static inline void
954mem2float_integer (struct ia64_fpreg *init, struct ia64_fpreg *final)
955{
956 ia64_ldf8(6, init);
957 ia64_stop();
958 ia64_stf_spill(final, 6);
959}
960
961static inline void
962mem2float_single (struct ia64_fpreg *init, struct ia64_fpreg *final)
963{
964 ia64_ldfs(6, init);
965 ia64_stop();
966 ia64_stf_spill(final, 6);
967}
968
969static inline void
970mem2float_double (struct ia64_fpreg *init, struct ia64_fpreg *final)
971{
972 ia64_ldfd(6, init);
973 ia64_stop();
974 ia64_stf_spill(final, 6);
975}
976
977static inline void
978float2mem_extended (struct ia64_fpreg *init, struct ia64_fpreg *final)
979{
980 ia64_ldf_fill(6, init);
981 ia64_stop();
982 ia64_stfe(final, 6);
983}
984
985static inline void
986float2mem_integer (struct ia64_fpreg *init, struct ia64_fpreg *final)
987{
988 ia64_ldf_fill(6, init);
989 ia64_stop();
990 ia64_stf8(final, 6);
991}
992
993static inline void
994float2mem_single (struct ia64_fpreg *init, struct ia64_fpreg *final)
995{
996 ia64_ldf_fill(6, init);
997 ia64_stop();
998 ia64_stfs(final, 6);
999}
1000
1001static inline void
1002float2mem_double (struct ia64_fpreg *init, struct ia64_fpreg *final)
1003{
1004 ia64_ldf_fill(6, init);
1005 ia64_stop();
1006 ia64_stfd(final, 6);
1007}
1008
1009static int
1010emulate_load_floatpair (unsigned long ifa, load_store_t ld, struct pt_regs *regs)
1011{
1012 struct ia64_fpreg fpr_init[2];
1013 struct ia64_fpreg fpr_final[2];
1014 unsigned long len = float_fsz[ld.x6_sz];
1015
1016 /*
1017 * fr0 & fr1 don't need to be checked because Illegal Instruction faults have
1018 * higher priority than unaligned faults.
1019 *
1020 * r0 cannot be found as the base as it would never generate an unaligned
1021 * reference.
1022 */
1023
1024 /*
1025 * make sure we get clean buffers
1026 */
1027 memset(&fpr_init, 0, sizeof(fpr_init));
1028 memset(&fpr_final, 0, sizeof(fpr_final));
1029
1030 /*
1031 * ldfpX.a: we don't try to emulate anything but we must
1032 * invalidate the ALAT entry and execute updates, if any.
1033 */
1034 if (ld.x6_op != 0x2) {
1035 /*
1036 * This assumes little-endian byte-order. Note that there is no "ldfpe"
1037 * instruction:
1038 */
1039 if (copy_from_user(&fpr_init[0], (void __user *) ifa, len)
1040 || copy_from_user(&fpr_init[1], (void __user *) (ifa + len), len))
1041 return -1;
1042
1043 DPRINT("ld.r1=%d ld.imm=%d x6_sz=%d\n", ld.r1, ld.imm, ld.x6_sz);
1044 DDUMP("frp_init =", &fpr_init, 2*len);
1045 /*
1046 * XXX fixme
1047 * Could optimize inlines by using ldfpX & 2 spills
1048 */
1049 switch( ld.x6_sz ) {
1050 case 0:
1051 mem2float_extended(&fpr_init[0], &fpr_final[0]);
1052 mem2float_extended(&fpr_init[1], &fpr_final[1]);
1053 break;
1054 case 1:
1055 mem2float_integer(&fpr_init[0], &fpr_final[0]);
1056 mem2float_integer(&fpr_init[1], &fpr_final[1]);
1057 break;
1058 case 2:
1059 mem2float_single(&fpr_init[0], &fpr_final[0]);
1060 mem2float_single(&fpr_init[1], &fpr_final[1]);
1061 break;
1062 case 3:
1063 mem2float_double(&fpr_init[0], &fpr_final[0]);
1064 mem2float_double(&fpr_init[1], &fpr_final[1]);
1065 break;
1066 }
1067 DDUMP("fpr_final =", &fpr_final, 2*len);
1068 /*
1069 * XXX fixme
1070 *
1071 * A possible optimization would be to drop fpr_final and directly
1072 * use the storage from the saved context i.e., the actual final
1073 * destination (pt_regs, switch_stack or thread structure).
1074 */
1075 setfpreg(ld.r1, &fpr_final[0], regs);
1076 setfpreg(ld.imm, &fpr_final[1], regs);
1077 }
1078
1079 /*
1080 * Check for updates: only immediate updates are available for this
1081 * instruction.
1082 */
1083 if (ld.m) {
1084 /*
1085 * the immediate is implicit given the ldsz of the operation:
1086 * single: 8 (2x4) and for all others it's 16 (2x8)
1087 */
1088 ifa += len<<1;
1089
1090 /*
1091 * IMPORTANT:
1092 * the fact that we force the NaT of r3 to zero is ONLY valid
1093 * as long as we don't come here with a ldfpX.s.
1094 * For this reason we keep this sanity check
1095 */
1096 if (ld.x6_op == 1 || ld.x6_op == 3)
1097 printk(KERN_ERR "%s: register update on speculative load pair, error\n",
1098 __FUNCTION__);
1099
1100 setreg(ld.r3, ifa, 0, regs);
1101 }
1102
1103 /*
1104 * Invalidate ALAT entries, if any, for both registers.
1105 */
1106 if (ld.x6_op == 0x2) {
1107 invala_fr(ld.r1);
1108 invala_fr(ld.imm);
1109 }
1110 return 0;
1111}
1112
1113
1114static int
1115emulate_load_float (unsigned long ifa, load_store_t ld, struct pt_regs *regs)
1116{
1117 struct ia64_fpreg fpr_init;
1118 struct ia64_fpreg fpr_final;
1119 unsigned long len = float_fsz[ld.x6_sz];
1120
1121 /*
1122 * fr0 & fr1 don't need to be checked because Illegal Instruction
1123 * faults have higher priority than unaligned faults.
1124 *
1125 * r0 cannot be found as the base as it would never generate an
1126 * unaligned reference.
1127 */
1128
1129 /*
1130 * make sure we get clean buffers
1131 */
1132 memset(&fpr_init,0, sizeof(fpr_init));
1133 memset(&fpr_final,0, sizeof(fpr_final));
1134
1135 /*
1136 * ldfX.a we don't try to emulate anything but we must
1137 * invalidate the ALAT entry.
1138 * See comments in ldX for descriptions on how the various loads are handled.
1139 */
1140 if (ld.x6_op != 0x2) {
1141 if (copy_from_user(&fpr_init, (void __user *) ifa, len))
1142 return -1;
1143
1144 DPRINT("ld.r1=%d x6_sz=%d\n", ld.r1, ld.x6_sz);
1145 DDUMP("fpr_init =", &fpr_init, len);
1146 /*
1147 * we only do something for x6_op={0,8,9}
1148 */
1149 switch( ld.x6_sz ) {
1150 case 0:
1151 mem2float_extended(&fpr_init, &fpr_final);
1152 break;
1153 case 1:
1154 mem2float_integer(&fpr_init, &fpr_final);
1155 break;
1156 case 2:
1157 mem2float_single(&fpr_init, &fpr_final);
1158 break;
1159 case 3:
1160 mem2float_double(&fpr_init, &fpr_final);
1161 break;
1162 }
1163 DDUMP("fpr_final =", &fpr_final, len);
1164 /*
1165 * XXX fixme
1166 *
1167 * A possible optimization would be to drop fpr_final and directly
1168 * use the storage from the saved context i.e., the actual final
1169 * destination (pt_regs, switch_stack or thread structure).
1170 */
1171 setfpreg(ld.r1, &fpr_final, regs);
1172 }
1173
1174 /*
1175 * check for updates on any loads
1176 */
1177 if (ld.op == 0x7 || ld.m)
1178 emulate_load_updates(ld.op == 0x7 ? UPD_IMMEDIATE: UPD_REG, ld, regs, ifa);
1179
1180 /*
1181 * invalidate ALAT entry in case of advanced floating point loads
1182 */
1183 if (ld.x6_op == 0x2)
1184 invala_fr(ld.r1);
1185
1186 return 0;
1187}
1188
1189
1190static int
1191emulate_store_float (unsigned long ifa, load_store_t ld, struct pt_regs *regs)
1192{
1193 struct ia64_fpreg fpr_init;
1194 struct ia64_fpreg fpr_final;
1195 unsigned long len = float_fsz[ld.x6_sz];
1196
1197 /*
1198 * make sure we get clean buffers
1199 */
1200 memset(&fpr_init,0, sizeof(fpr_init));
1201 memset(&fpr_final,0, sizeof(fpr_final));
1202
1203 /*
1204 * if we get to this handler, Nat bits on both r3 and r2 have already
1205 * been checked. so we don't need to do it
1206 *
1207 * extract the value to be stored
1208 */
1209 getfpreg(ld.imm, &fpr_init, regs);
1210 /*
1211 * during this step, we extract the spilled registers from the saved
1212 * context i.e., we refill. Then we store (no spill) to temporary
1213 * aligned location
1214 */
1215 switch( ld.x6_sz ) {
1216 case 0:
1217 float2mem_extended(&fpr_init, &fpr_final);
1218 break;
1219 case 1:
1220 float2mem_integer(&fpr_init, &fpr_final);
1221 break;
1222 case 2:
1223 float2mem_single(&fpr_init, &fpr_final);
1224 break;
1225 case 3:
1226 float2mem_double(&fpr_init, &fpr_final);
1227 break;
1228 }
1229 DPRINT("ld.r1=%d x6_sz=%d\n", ld.r1, ld.x6_sz);
1230 DDUMP("fpr_init =", &fpr_init, len);
1231 DDUMP("fpr_final =", &fpr_final, len);
1232
1233 if (copy_to_user((void __user *) ifa, &fpr_final, len))
1234 return -1;
1235
1236 /*
1237 * stfX [r3]=r2,imm(9)
1238 *
1239 * NOTE:
1240 * ld.r3 can never be r0, because r0 would not generate an
1241 * unaligned access.
1242 */
1243 if (ld.op == 0x7) {
1244 unsigned long imm;
1245
1246 /*
1247 * form imm9: [12:6] contain first 7bits
1248 */
1249 imm = ld.x << 7 | ld.r1;
1250 /*
1251 * sign extend (8bits) if m set
1252 */
1253 if (ld.m)
1254 imm |= SIGN_EXT9;
1255 /*
1256 * ifa == r3 (NaT is necessarily cleared)
1257 */
1258 ifa += imm;
1259
1260 DPRINT("imm=%lx r3=%lx\n", imm, ifa);
1261
1262 setreg(ld.r3, ifa, 0, regs);
1263 }
1264 /*
1265 * we don't have alat_invalidate_multiple() so we need
1266 * to do the complete flush :-<<
1267 */
1268 ia64_invala();
1269
1270 return 0;
1271}
1272
1273/*
1274 * Make sure we log the unaligned access, so that user/sysadmin can notice it and
1275 * eventually fix the program. However, we don't want to do that for every access so we
1276 * pace it with jiffies. This isn't really MP-safe, but it doesn't really have to be
1277 * either...
1278 */
1279static int
1280within_logging_rate_limit (void)
1281{
1282 static unsigned long count, last_time;
1283
1284 if (jiffies - last_time > 5*HZ)
1285 count = 0;
1286 if (++count < 5) {
1287 last_time = jiffies;
1288 return 1;
1289 }
1290 return 0;
1291
1292}
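/*
 * In other words: at most four messages are printed per burst, and once more
 * than five seconds have elapsed since the last logged access the counter is
 * reset and logging resumes.
 */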
1293
1294void
1295ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
1296{
1297 struct ia64_psr *ipsr = ia64_psr(regs);
1298 mm_segment_t old_fs = get_fs();
1299 unsigned long bundle[2];
1300 unsigned long opcode;
1301 struct siginfo si;
1302 const struct exception_table_entry *eh = NULL;
1303 union {
1304 unsigned long l;
1305 load_store_t insn;
1306 } u;
1307 int ret = -1;
1308
1309 if (ia64_psr(regs)->be) {
1310 /* we don't support big-endian accesses */
1311 die_if_kernel("big-endian unaligned accesses are not supported", regs, 0);
1312 goto force_sigbus;
1313 }
1314
1315 /*
1316 * Treat kernel accesses for which there is an exception handler entry the same as
1317 * user-level unaligned accesses. Otherwise, a clever program could trick this
1318	 * handler into reading arbitrary kernel addresses...
1319 */
1320 if (!user_mode(regs))
1321 eh = search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri);
1322 if (user_mode(regs) || eh) {
1323 if ((current->thread.flags & IA64_THREAD_UAC_SIGBUS) != 0)
1324 goto force_sigbus;
1325
1326 if (!(current->thread.flags & IA64_THREAD_UAC_NOPRINT)
1327 && within_logging_rate_limit())
1328 {
1329 char buf[200]; /* comm[] is at most 16 bytes... */
1330 size_t len;
1331
1332 len = sprintf(buf, "%s(%d): unaligned access to 0x%016lx, "
1333 "ip=0x%016lx\n\r", current->comm, current->pid,
1334 ifa, regs->cr_iip + ipsr->ri);
1335 /*
1336 * Don't call tty_write_message() if we're in the kernel; we might
1337 * be holding locks...
1338 */
1339 if (user_mode(regs))
1340 tty_write_message(current->signal->tty, buf);
1341 buf[len-1] = '\0'; /* drop '\r' */
1342 printk(KERN_WARNING "%s", buf); /* watch for command names containing %s */
1343 }
1344 } else {
1345 if (within_logging_rate_limit())
1346 printk(KERN_WARNING "kernel unaligned access to 0x%016lx, ip=0x%016lx\n",
1347 ifa, regs->cr_iip + ipsr->ri);
1348 set_fs(KERNEL_DS);
1349 }
1350
1351 DPRINT("iip=%lx ifa=%lx isr=%lx (ei=%d, sp=%d)\n",
1352 regs->cr_iip, ifa, regs->cr_ipsr, ipsr->ri, ipsr->it);
1353
1354 if (__copy_from_user(bundle, (void __user *) regs->cr_iip, 16))
1355 goto failure;
1356
1357 /*
1358 * extract the instruction from the bundle given the slot number
1359 */
1360 switch (ipsr->ri) {
1361 case 0: u.l = (bundle[0] >> 5); break;
1362 case 1: u.l = (bundle[0] >> 46) | (bundle[1] << 18); break;
1363 case 2: u.l = (bundle[1] >> 23); break;
1364 }
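	/*
	 * The shifts above follow the IA-64 bundle layout: a 128-bit bundle
	 * holds a 5-bit template in bits [4:0] followed by three 41-bit
	 * slots in bits [45:5], [86:46] and [127:87]. Slot 1 straddles the
	 * two 64-bit words, which is why it is assembled from both
	 * bundle[0] and bundle[1].
	 */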
1365 opcode = (u.l >> IA64_OPCODE_SHIFT) & IA64_OPCODE_MASK;
1366
1367 DPRINT("opcode=%lx ld.qp=%d ld.r1=%d ld.imm=%d ld.r3=%d ld.x=%d ld.hint=%d "
1368 "ld.x6=0x%x ld.m=%d ld.op=%d\n", opcode, u.insn.qp, u.insn.r1, u.insn.imm,
1369 u.insn.r3, u.insn.x, u.insn.hint, u.insn.x6_sz, u.insn.m, u.insn.op);
1370
1371 /*
1372 * IMPORTANT:
1373 * Notice that the switch statement DOES not cover all possible instructions
1374	 * that DO generate unaligned references. This is done on purpose because for some
1375 * instructions it DOES NOT make sense to try and emulate the access. Sometimes it
1376	 * is WRONG to try and emulate. Here is a list of instructions we don't emulate, i.e.,
1377 * the program will get a signal and die:
1378 *
1379 * load/store:
1380 * - ldX.spill
1381 * - stX.spill
1382 * Reason: RNATs are based on addresses
1383 * - ld16
1384 * - st16
1385 * Reason: ld16 and st16 are supposed to occur in a single
1386 * memory op
1387 *
1388 * synchronization:
1389 * - cmpxchg
1390 * - fetchadd
1391 * - xchg
1392 * Reason: ATOMIC operations cannot be emulated properly using multiple
1393 * instructions.
1394 *
1395 * speculative loads:
1396 * - ldX.sZ
1397 * Reason: side effects, code must be ready to deal with failure so simpler
1398 * to let the load fail.
1399 * ---------------------------------------------------------------------------------
1400 * XXX fixme
1401 *
1402 * I would like to get rid of this switch case and do something
1403 * more elegant.
1404 */
1405 switch (opcode) {
1406 case LDS_OP:
1407 case LDSA_OP:
1408 if (u.insn.x)
1409 /* oops, really a semaphore op (cmpxchg, etc) */
1410 goto failure;
1411 /* no break */
1412 case LDS_IMM_OP:
1413 case LDSA_IMM_OP:
1414 case LDFS_OP:
1415 case LDFSA_OP:
1416 case LDFS_IMM_OP:
1417 /*
1418 * The instruction will be retried with deferred exceptions turned on, and
1419		 * we should get the NaT bit installed.
1420 *
1421 * IMPORTANT: When PSR_ED is set, the register & immediate update forms
1422 * are actually executed even though the operation failed. So we don't
1423 * need to take care of this.
1424 */
1425 DPRINT("forcing PSR_ED\n");
1426 regs->cr_ipsr |= IA64_PSR_ED;
1427 goto done;
1428
1429 case LD_OP:
1430 case LDA_OP:
1431 case LDBIAS_OP:
1432 case LDACQ_OP:
1433 case LDCCLR_OP:
1434 case LDCNC_OP:
1435 case LDCCLRACQ_OP:
1436 if (u.insn.x)
1437 /* oops, really a semaphore op (cmpxchg, etc) */
1438 goto failure;
1439 /* no break */
1440 case LD_IMM_OP:
1441 case LDA_IMM_OP:
1442 case LDBIAS_IMM_OP:
1443 case LDACQ_IMM_OP:
1444 case LDCCLR_IMM_OP:
1445 case LDCNC_IMM_OP:
1446 case LDCCLRACQ_IMM_OP:
1447 ret = emulate_load_int(ifa, u.insn, regs);
1448 break;
1449
1450 case ST_OP:
1451 case STREL_OP:
1452 if (u.insn.x)
1453 /* oops, really a semaphore op (cmpxchg, etc) */
1454 goto failure;
1455 /* no break */
1456 case ST_IMM_OP:
1457 case STREL_IMM_OP:
1458 ret = emulate_store_int(ifa, u.insn, regs);
1459 break;
1460
1461 case LDF_OP:
1462 case LDFA_OP:
1463 case LDFCCLR_OP:
1464 case LDFCNC_OP:
1465 case LDF_IMM_OP:
1466 case LDFA_IMM_OP:
1467 case LDFCCLR_IMM_OP:
1468 case LDFCNC_IMM_OP:
1469 if (u.insn.x)
1470 ret = emulate_load_floatpair(ifa, u.insn, regs);
1471 else
1472 ret = emulate_load_float(ifa, u.insn, regs);
1473 break;
1474
1475 case STF_OP:
1476 case STF_IMM_OP:
1477 ret = emulate_store_float(ifa, u.insn, regs);
1478 break;
1479
1480 default:
1481 goto failure;
1482 }
1483 DPRINT("ret=%d\n", ret);
1484 if (ret)
1485 goto failure;
1486
1487 if (ipsr->ri == 2)
1488 /*
1489 * given today's architecture this case is not likely to happen because a
1490 * memory access instruction (M) can never be in the last slot of a
1491 * bundle. But let's keep it for now.
1492 */
1493 regs->cr_iip += 16;
1494 ipsr->ri = (ipsr->ri + 1) & 0x3;
1495
1496 DPRINT("ipsr->ri=%d iip=%lx\n", ipsr->ri, regs->cr_iip);
1497 done:
1498 set_fs(old_fs); /* restore original address limit */
1499 return;
1500
1501 failure:
1502 /* something went wrong... */
1503 if (!user_mode(regs)) {
1504 if (eh) {
1505 ia64_handle_exception(regs, eh);
1506 goto done;
1507 }
1508 die_if_kernel("error during unaligned kernel access\n", regs, ret);
1509 /* NOT_REACHED */
1510 }
1511 force_sigbus:
1512 si.si_signo = SIGBUS;
1513 si.si_errno = 0;
1514 si.si_code = BUS_ADRALN;
1515 si.si_addr = (void __user *) ifa;
1516 si.si_flags = 0;
1517 si.si_isr = 0;
1518 si.si_imm = 0;
1519 force_sig_info(SIGBUS, &si, current);
1520 goto done;
1521}
diff --git a/arch/ia64/kernel/unwind.c b/arch/ia64/kernel/unwind.c
new file mode 100644
index 000000000000..d494ff647cac
--- /dev/null
+++ b/arch/ia64/kernel/unwind.c
@@ -0,0 +1,2306 @@
1/*
2 * Copyright (C) 1999-2004 Hewlett-Packard Co
3 * David Mosberger-Tang <davidm@hpl.hp.com>
4 * Copyright (C) 2003 Fenghua Yu <fenghua.yu@intel.com>
5 *	- Change pt_regs_off() to make it less dependent on the pt_regs structure.
6 */
7/*
8 * This file implements call frame unwind support for the Linux
9 * kernel. Parsing and processing the unwind information is
10 * time-consuming, so this implementation translates the unwind
11 * descriptors into unwind scripts. These scripts are very simple
12 * (basically a sequence of assignments) and efficient to execute.
13 * They are cached for later re-use. Each script is specific for a
14 * given instruction pointer address and the set of predicate values
15 * that the script depends on (most unwind descriptors are
16 * unconditional and scripts often do not depend on predicates at
17 * all). This code is based on the unwind conventions described in
18 * the "IA-64 Software Conventions and Runtime Architecture" manual.
19 *
20 * SMP conventions:
21 * o updates to the global unwind data (in structure "unw") are serialized
22 * by the unw.lock spinlock
23 * o each unwind script has its own read-write lock; a thread must acquire
24 * a read lock before executing a script and must acquire a write lock
25 * before modifying a script
26 * o if both the unw.lock spinlock and a script's read-write lock must be
27 * acquired, then the read-write lock must be acquired first.
28 */
29#include <linux/module.h>
30#include <linux/bootmem.h>
31#include <linux/elf.h>
32#include <linux/kernel.h>
33#include <linux/sched.h>
34#include <linux/slab.h>
35
36#include <asm/unwind.h>
37
38#include <asm/delay.h>
39#include <asm/page.h>
40#include <asm/ptrace.h>
41#include <asm/ptrace_offsets.h>
42#include <asm/rse.h>
43#include <asm/sections.h>
44#include <asm/system.h>
45#include <asm/uaccess.h>
46
47#include "entry.h"
48#include "unwind_i.h"
49
50#define UNW_LOG_CACHE_SIZE 7 /* each unw_script is ~256 bytes in size */
51#define UNW_CACHE_SIZE (1 << UNW_LOG_CACHE_SIZE)
52
53#define UNW_LOG_HASH_SIZE (UNW_LOG_CACHE_SIZE + 1)
54#define UNW_HASH_SIZE (1 << UNW_LOG_HASH_SIZE)
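/*
 * With the sizes above, the script cache holds 1 << 7 == 128 scripts (at
 * roughly 256 bytes each, about 32KB total) and the hash table has
 * 1 << 8 == 256 buckets, i.e. two buckets per cached script.
 */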
55
56#define UNW_STATS	0	/* WARNING: this disables interrupts for long time spans!! */
57
58#ifdef UNW_DEBUG
59 static unsigned int unw_debug_level = UNW_DEBUG;
60# define UNW_DEBUG_ON(n) unw_debug_level >= n
61 /* Do not code a printk level, not all debug lines end in newline */
62# define UNW_DPRINT(n, ...) if (UNW_DEBUG_ON(n)) printk(__VA_ARGS__)
63# define inline
64#else /* !UNW_DEBUG */
65# define UNW_DEBUG_ON(n) 0
66# define UNW_DPRINT(n, ...)
67#endif /* UNW_DEBUG */
68
69#if UNW_STATS
70# define STAT(x...) x
71#else
72# define STAT(x...)
73#endif
74
75#define alloc_reg_state() kmalloc(sizeof(struct unw_reg_state), GFP_ATOMIC)
76#define free_reg_state(usr) kfree(usr)
77#define alloc_labeled_state() kmalloc(sizeof(struct unw_labeled_state), GFP_ATOMIC)
78#define free_labeled_state(usr) kfree(usr)
79
80typedef unsigned long unw_word;
81typedef unsigned char unw_hash_index_t;
82
83static struct {
84 spinlock_t lock; /* spinlock for unwind data */
85
86 /* list of unwind tables (one per load-module) */
87 struct unw_table *tables;
88
89 unsigned long r0; /* constant 0 for r0 */
90
91 /* table of registers that prologues can save (and order in which they're saved): */
92 const unsigned char save_order[8];
93
94 /* maps a preserved register index (preg_index) to corresponding switch_stack offset: */
95 unsigned short sw_off[sizeof(struct unw_frame_info) / 8];
96
97	unsigned short lru_head;	/* index of least-recently used script */
98 unsigned short lru_tail; /* index of most-recently used script */
99
100 /* index into unw_frame_info for preserved register i */
101 unsigned short preg_index[UNW_NUM_REGS];
102
103 short pt_regs_offsets[32];
104
105 /* unwind table for the kernel: */
106 struct unw_table kernel_table;
107
108 /* unwind table describing the gate page (kernel code that is mapped into user space): */
109 size_t gate_table_size;
110 unsigned long *gate_table;
111
112 /* hash table that maps instruction pointer to script index: */
113 unsigned short hash[UNW_HASH_SIZE];
114
115 /* script cache: */
116 struct unw_script cache[UNW_CACHE_SIZE];
117
118# ifdef UNW_DEBUG
119 const char *preg_name[UNW_NUM_REGS];
120# endif
121# if UNW_STATS
122 struct {
123 struct {
124 int lookups;
125 int hinted_hits;
126 int normal_hits;
127 int collision_chain_traversals;
128 } cache;
129 struct {
130 unsigned long build_time;
131 unsigned long run_time;
132 unsigned long parse_time;
133 int builds;
134 int news;
135 int collisions;
136 int runs;
137 } script;
138 struct {
139 unsigned long init_time;
140 unsigned long unwind_time;
141 int inits;
142 int unwinds;
143 } api;
144 } stat;
145# endif
146} unw = {
147 .tables = &unw.kernel_table,
148 .lock = SPIN_LOCK_UNLOCKED,
149 .save_order = {
150 UNW_REG_RP, UNW_REG_PFS, UNW_REG_PSP, UNW_REG_PR,
151 UNW_REG_UNAT, UNW_REG_LC, UNW_REG_FPSR, UNW_REG_PRI_UNAT_GR
152 },
153 .preg_index = {
154 offsetof(struct unw_frame_info, pri_unat_loc)/8, /* PRI_UNAT_GR */
155 offsetof(struct unw_frame_info, pri_unat_loc)/8, /* PRI_UNAT_MEM */
156 offsetof(struct unw_frame_info, bsp_loc)/8,
157 offsetof(struct unw_frame_info, bspstore_loc)/8,
158 offsetof(struct unw_frame_info, pfs_loc)/8,
159 offsetof(struct unw_frame_info, rnat_loc)/8,
160 offsetof(struct unw_frame_info, psp)/8,
161 offsetof(struct unw_frame_info, rp_loc)/8,
162 offsetof(struct unw_frame_info, r4)/8,
163 offsetof(struct unw_frame_info, r5)/8,
164 offsetof(struct unw_frame_info, r6)/8,
165 offsetof(struct unw_frame_info, r7)/8,
166 offsetof(struct unw_frame_info, unat_loc)/8,
167 offsetof(struct unw_frame_info, pr_loc)/8,
168 offsetof(struct unw_frame_info, lc_loc)/8,
169 offsetof(struct unw_frame_info, fpsr_loc)/8,
170 offsetof(struct unw_frame_info, b1_loc)/8,
171 offsetof(struct unw_frame_info, b2_loc)/8,
172 offsetof(struct unw_frame_info, b3_loc)/8,
173 offsetof(struct unw_frame_info, b4_loc)/8,
174 offsetof(struct unw_frame_info, b5_loc)/8,
175 offsetof(struct unw_frame_info, f2_loc)/8,
176 offsetof(struct unw_frame_info, f3_loc)/8,
177 offsetof(struct unw_frame_info, f4_loc)/8,
178 offsetof(struct unw_frame_info, f5_loc)/8,
179 offsetof(struct unw_frame_info, fr_loc[16 - 16])/8,
180 offsetof(struct unw_frame_info, fr_loc[17 - 16])/8,
181 offsetof(struct unw_frame_info, fr_loc[18 - 16])/8,
182 offsetof(struct unw_frame_info, fr_loc[19 - 16])/8,
183 offsetof(struct unw_frame_info, fr_loc[20 - 16])/8,
184 offsetof(struct unw_frame_info, fr_loc[21 - 16])/8,
185 offsetof(struct unw_frame_info, fr_loc[22 - 16])/8,
186 offsetof(struct unw_frame_info, fr_loc[23 - 16])/8,
187 offsetof(struct unw_frame_info, fr_loc[24 - 16])/8,
188 offsetof(struct unw_frame_info, fr_loc[25 - 16])/8,
189 offsetof(struct unw_frame_info, fr_loc[26 - 16])/8,
190 offsetof(struct unw_frame_info, fr_loc[27 - 16])/8,
191 offsetof(struct unw_frame_info, fr_loc[28 - 16])/8,
192 offsetof(struct unw_frame_info, fr_loc[29 - 16])/8,
193 offsetof(struct unw_frame_info, fr_loc[30 - 16])/8,
194 offsetof(struct unw_frame_info, fr_loc[31 - 16])/8,
195 },
196 .pt_regs_offsets = {
197 [0] = -1,
198 offsetof(struct pt_regs, r1),
199 offsetof(struct pt_regs, r2),
200 offsetof(struct pt_regs, r3),
201 [4] = -1, [5] = -1, [6] = -1, [7] = -1,
202 offsetof(struct pt_regs, r8),
203 offsetof(struct pt_regs, r9),
204 offsetof(struct pt_regs, r10),
205 offsetof(struct pt_regs, r11),
206 offsetof(struct pt_regs, r12),
207 offsetof(struct pt_regs, r13),
208 offsetof(struct pt_regs, r14),
209 offsetof(struct pt_regs, r15),
210 offsetof(struct pt_regs, r16),
211 offsetof(struct pt_regs, r17),
212 offsetof(struct pt_regs, r18),
213 offsetof(struct pt_regs, r19),
214 offsetof(struct pt_regs, r20),
215 offsetof(struct pt_regs, r21),
216 offsetof(struct pt_regs, r22),
217 offsetof(struct pt_regs, r23),
218 offsetof(struct pt_regs, r24),
219 offsetof(struct pt_regs, r25),
220 offsetof(struct pt_regs, r26),
221 offsetof(struct pt_regs, r27),
222 offsetof(struct pt_regs, r28),
223 offsetof(struct pt_regs, r29),
224 offsetof(struct pt_regs, r30),
225 offsetof(struct pt_regs, r31),
226 },
227 .hash = { [0 ... UNW_HASH_SIZE - 1] = -1 },
228#ifdef UNW_DEBUG
229 .preg_name = {
230 "pri_unat_gr", "pri_unat_mem", "bsp", "bspstore", "ar.pfs", "ar.rnat", "psp", "rp",
231 "r4", "r5", "r6", "r7",
232 "ar.unat", "pr", "ar.lc", "ar.fpsr",
233 "b1", "b2", "b3", "b4", "b5",
234 "f2", "f3", "f4", "f5",
235 "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
236 "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31"
237 }
238#endif
239};
240
241static inline int
242read_only (void *addr)
243{
244 return (unsigned long) ((char *) addr - (char *) &unw.r0) < sizeof(unw.r0);
245}
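/*
 * Illustrative sketch: read_only() above relies on unsigned wrap-around so
 * that a single compare covers both "addr below &unw.r0" and "addr at or
 * beyond the end of unw.r0".  The same idiom, generalized (hypothetical
 * helper, not used by the unwinder):
 */
#if 0	/* sketch only */
static inline int addr_within_object(const void *addr, const void *obj, unsigned long size)
{
	/* if addr < obj, the subtraction wraps to a huge value and fails the test */
	return (unsigned long) ((const char *) addr - (const char *) obj) < size;
}
#endif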
246
247/*
248 * Returns offset of rREG in struct pt_regs.
249 */
250static inline unsigned long
251pt_regs_off (unsigned long reg)
252{
253 short off = -1;
254
255 if (reg < ARRAY_SIZE(unw.pt_regs_offsets))
256 off = unw.pt_regs_offsets[reg];
257
258 if (off < 0) {
259 UNW_DPRINT(0, "unwind.%s: bad scratch reg r%lu\n", __FUNCTION__, reg);
260 off = 0;
261 }
262 return (unsigned long) off;
263}
264
265static inline struct pt_regs *
266get_scratch_regs (struct unw_frame_info *info)
267{
268 if (!info->pt) {
269 /* This should not happen with valid unwind info. */
270 UNW_DPRINT(0, "unwind.%s: bad unwind info: resetting info->pt\n", __FUNCTION__);
271 if (info->flags & UNW_FLAG_INTERRUPT_FRAME)
272 info->pt = (unsigned long) ((struct pt_regs *) info->psp - 1);
273 else
274 info->pt = info->sp - 16;
275 }
276 UNW_DPRINT(3, "unwind.%s: sp 0x%lx pt 0x%lx\n", __FUNCTION__, info->sp, info->pt);
277 return (struct pt_regs *) info->pt;
278}
279
280/* Unwind accessors. */
281
282int
283unw_access_gr (struct unw_frame_info *info, int regnum, unsigned long *val, char *nat, int write)
284{
285 unsigned long *addr, *nat_addr, nat_mask = 0, dummy_nat;
286 struct unw_ireg *ireg;
287 struct pt_regs *pt;
288
289 if ((unsigned) regnum - 1 >= 127) {
290 if (regnum == 0 && !write) {
291 *val = 0; /* read r0 always returns 0 */
292 *nat = 0;
293 return 0;
294 }
295 UNW_DPRINT(0, "unwind.%s: trying to access non-existent r%u\n",
296 __FUNCTION__, regnum);
297 return -1;
298 }
299
300 if (regnum < 32) {
301 if (regnum >= 4 && regnum <= 7) {
302 /* access a preserved register */
303 ireg = &info->r4 + (regnum - 4);
304 addr = ireg->loc;
305 if (addr) {
306 nat_addr = addr + ireg->nat.off;
307 switch (ireg->nat.type) {
308 case UNW_NAT_VAL:
309 /* simulate getf.sig/setf.sig */
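					/*
					 * In the 16-byte spill image, word 1 holds the sign/exponent:
					 * setf.sig produces biased exponent 0x1003e (an unnormalized
					 * integer), while NaTVal is exponent 0x1fffe with a zero
					 * significand in word 0.
					 */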
310 if (write) {
311 if (*nat) {
312 /* write NaTVal and be done with it */
313 addr[0] = 0;
314 addr[1] = 0x1fffe;
315 return 0;
316 }
317 addr[1] = 0x1003e;
318 } else {
319					if (addr[0] == 0 && addr[1] == 0x1fffe) {
320 /* return NaT and be done with it */
321 *val = 0;
322 *nat = 1;
323 return 0;
324 }
325 }
326 /* fall through */
327 case UNW_NAT_NONE:
328 dummy_nat = 0;
329 nat_addr = &dummy_nat;
330 break;
331
332 case UNW_NAT_MEMSTK:
333 nat_mask = (1UL << ((long) addr & 0x1f8)/8);
334 break;
335
336 case UNW_NAT_REGSTK:
337 nat_addr = ia64_rse_rnat_addr(addr);
338 if ((unsigned long) addr < info->regstk.limit
339 || (unsigned long) addr >= info->regstk.top)
340 {
341 UNW_DPRINT(0, "unwind.%s: %p outside of regstk "
342 "[0x%lx-0x%lx)\n",
343 __FUNCTION__, (void *) addr,
344 info->regstk.limit,
345 info->regstk.top);
346 return -1;
347 }
348 if ((unsigned long) nat_addr >= info->regstk.top)
349 nat_addr = &info->sw->ar_rnat;
350 nat_mask = (1UL << ia64_rse_slot_num(addr));
351 break;
352 }
353 } else {
354 addr = &info->sw->r4 + (regnum - 4);
355 nat_addr = &info->sw->ar_unat;
356 nat_mask = (1UL << ((long) addr & 0x1f8)/8);
357 }
358 } else {
359 /* access a scratch register */
360 pt = get_scratch_regs(info);
361 addr = (unsigned long *) ((unsigned long)pt + pt_regs_off(regnum));
362 if (info->pri_unat_loc)
363 nat_addr = info->pri_unat_loc;
364 else
365 nat_addr = &info->sw->ar_unat;
366 nat_mask = (1UL << ((long) addr & 0x1f8)/8);
367 }
368 } else {
369 /* access a stacked register */
370 addr = ia64_rse_skip_regs((unsigned long *) info->bsp, regnum - 32);
371 nat_addr = ia64_rse_rnat_addr(addr);
372 if ((unsigned long) addr < info->regstk.limit
373 || (unsigned long) addr >= info->regstk.top)
374 {
375 UNW_DPRINT(0, "unwind.%s: ignoring attempt to access register outside "
376 "of rbs\n", __FUNCTION__);
377 return -1;
378 }
379 if ((unsigned long) nat_addr >= info->regstk.top)
380 nat_addr = &info->sw->ar_rnat;
381 nat_mask = (1UL << ia64_rse_slot_num(addr));
382 }
383
384 if (write) {
385 if (read_only(addr)) {
386 UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
387 __FUNCTION__);
388 } else {
389 *addr = *val;
390 if (*nat)
391 *nat_addr |= nat_mask;
392 else
393 *nat_addr &= ~nat_mask;
394 }
395 } else {
396 if ((*nat_addr & nat_mask) == 0) {
397 *val = *addr;
398 *nat = 0;
399 } else {
400 *val = 0; /* if register is a NaT, *addr may contain kernel data! */
401 *nat = 1;
402 }
403 }
404 return 0;
405}
406EXPORT_SYMBOL(unw_access_gr);
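/*
 * Illustrative sketch: for registers spilled to the memory stack, the NaT
 * collection bit used above is selected by bits 8:3 of the spill address,
 * i.e. which of the 64 double-words covered by one ar.unat the address
 * falls into.  A hypothetical helper equivalent to the inline expression:
 */
#if 0	/* sketch only */
static inline unsigned long memstk_nat_mask(unsigned long spill_addr)
{
	return 1UL << ((spill_addr & 0x1f8) >> 3);
}
#endif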
407
408int
409unw_access_br (struct unw_frame_info *info, int regnum, unsigned long *val, int write)
410{
411 unsigned long *addr;
412 struct pt_regs *pt;
413
414 switch (regnum) {
415 /* scratch: */
416 case 0: pt = get_scratch_regs(info); addr = &pt->b0; break;
417 case 6: pt = get_scratch_regs(info); addr = &pt->b6; break;
418 case 7: pt = get_scratch_regs(info); addr = &pt->b7; break;
419
420 /* preserved: */
421 case 1: case 2: case 3: case 4: case 5:
422 addr = *(&info->b1_loc + (regnum - 1));
423 if (!addr)
424 addr = &info->sw->b1 + (regnum - 1);
425 break;
426
427 default:
428 UNW_DPRINT(0, "unwind.%s: trying to access non-existent b%u\n",
429 __FUNCTION__, regnum);
430 return -1;
431 }
432 if (write)
433 if (read_only(addr)) {
434 UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
435 __FUNCTION__);
436 } else
437 *addr = *val;
438 else
439 *val = *addr;
440 return 0;
441}
442EXPORT_SYMBOL(unw_access_br);
443
444int
445unw_access_fr (struct unw_frame_info *info, int regnum, struct ia64_fpreg *val, int write)
446{
447 struct ia64_fpreg *addr = NULL;
448 struct pt_regs *pt;
449
450 if ((unsigned) (regnum - 2) >= 126) {
451 UNW_DPRINT(0, "unwind.%s: trying to access non-existent f%u\n",
452 __FUNCTION__, regnum);
453 return -1;
454 }
455
456 if (regnum <= 5) {
457 addr = *(&info->f2_loc + (regnum - 2));
458 if (!addr)
459 addr = &info->sw->f2 + (regnum - 2);
460 } else if (regnum <= 15) {
461 if (regnum <= 11) {
462 pt = get_scratch_regs(info);
463 addr = &pt->f6 + (regnum - 6);
464 }
465 else
466 addr = &info->sw->f12 + (regnum - 12);
467 } else if (regnum <= 31) {
468 addr = info->fr_loc[regnum - 16];
469 if (!addr)
470 addr = &info->sw->f16 + (regnum - 16);
471 } else {
472 struct task_struct *t = info->task;
473
474 if (write)
475 ia64_sync_fph(t);
476 else
477 ia64_flush_fph(t);
478 addr = t->thread.fph + (regnum - 32);
479 }
480
481 if (write)
482 if (read_only(addr)) {
483 UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
484 __FUNCTION__);
485 } else
486 *addr = *val;
487 else
488 *val = *addr;
489 return 0;
490}
491EXPORT_SYMBOL(unw_access_fr);
492
493int
494unw_access_ar (struct unw_frame_info *info, int regnum, unsigned long *val, int write)
495{
496 unsigned long *addr;
497 struct pt_regs *pt;
498
499 switch (regnum) {
500 case UNW_AR_BSP:
501 addr = info->bsp_loc;
502 if (!addr)
503 addr = &info->sw->ar_bspstore;
504 break;
505
506 case UNW_AR_BSPSTORE:
507 addr = info->bspstore_loc;
508 if (!addr)
509 addr = &info->sw->ar_bspstore;
510 break;
511
512 case UNW_AR_PFS:
513 addr = info->pfs_loc;
514 if (!addr)
515 addr = &info->sw->ar_pfs;
516 break;
517
518 case UNW_AR_RNAT:
519 addr = info->rnat_loc;
520 if (!addr)
521 addr = &info->sw->ar_rnat;
522 break;
523
524 case UNW_AR_UNAT:
525 addr = info->unat_loc;
526 if (!addr)
527 addr = &info->sw->ar_unat;
528 break;
529
530 case UNW_AR_LC:
531 addr = info->lc_loc;
532 if (!addr)
533 addr = &info->sw->ar_lc;
534 break;
535
536 case UNW_AR_EC:
537 if (!info->cfm_loc)
538 return -1;
539 if (write)
540 *info->cfm_loc =
541 (*info->cfm_loc & ~(0x3fUL << 52)) | ((*val & 0x3f) << 52);
542 else
543 *val = (*info->cfm_loc >> 52) & 0x3f;
544 return 0;
545
546 case UNW_AR_FPSR:
547 addr = info->fpsr_loc;
548 if (!addr)
549 addr = &info->sw->ar_fpsr;
550 break;
551
552 case UNW_AR_RSC:
553 pt = get_scratch_regs(info);
554 addr = &pt->ar_rsc;
555 break;
556
557 case UNW_AR_CCV:
558 pt = get_scratch_regs(info);
559 addr = &pt->ar_ccv;
560 break;
561
562 case UNW_AR_CSD:
563 pt = get_scratch_regs(info);
564 addr = &pt->ar_csd;
565 break;
566
567 case UNW_AR_SSD:
568 pt = get_scratch_regs(info);
569 addr = &pt->ar_ssd;
570 break;
571
572 default:
573 UNW_DPRINT(0, "unwind.%s: trying to access non-existent ar%u\n",
574 __FUNCTION__, regnum);
575 return -1;
576 }
577
578 if (write) {
579 if (read_only(addr)) {
580 UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
581 __FUNCTION__);
582 } else
583 *addr = *val;
584 } else
585 *val = *addr;
586 return 0;
587}
588EXPORT_SYMBOL(unw_access_ar);
589
590int
591unw_access_pr (struct unw_frame_info *info, unsigned long *val, int write)
592{
593 unsigned long *addr;
594
595 addr = info->pr_loc;
596 if (!addr)
597 addr = &info->sw->pr;
598
599 if (write) {
600 if (read_only(addr)) {
601 UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
602 __FUNCTION__);
603 } else
604 *addr = *val;
605 } else
606 *val = *addr;
607 return 0;
608}
609EXPORT_SYMBOL(unw_access_pr);
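/*
 * Illustrative sketch of how a caller typically uses the accessors above:
 * "info" is assumed to have been set up by one of the unw_init_* routines
 * below and advanced with unw_unwind().  The function name is hypothetical.
 */
#if 0	/* sketch only */
static void show_preserved_r4(struct unw_frame_info *info)
{
	unsigned long val;
	char nat;

	if (unw_access_gr(info, 4, &val, &nat, 0) == 0)	/* write=0: read r4 */
		printk(KERN_DEBUG "r4=0x%lx nat=%d\n", val, nat);
}
#endif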
610
611
612/* Routines to manipulate the state stack. */
613
614static inline void
615push (struct unw_state_record *sr)
616{
617 struct unw_reg_state *rs;
618
619 rs = alloc_reg_state();
620 if (!rs) {
621 printk(KERN_ERR "unwind: cannot stack reg state!\n");
622 return;
623 }
624 memcpy(rs, &sr->curr, sizeof(*rs));
625 sr->curr.next = rs;
626}
627
628static void
629pop (struct unw_state_record *sr)
630{
631 struct unw_reg_state *rs = sr->curr.next;
632
633 if (!rs) {
634 printk(KERN_ERR "unwind: stack underflow!\n");
635 return;
636 }
637 memcpy(&sr->curr, rs, sizeof(*rs));
638 free_reg_state(rs);
639}
640
641/* Make a copy of the state stack. Non-recursive to avoid stack overflows. */
642static struct unw_reg_state *
643dup_state_stack (struct unw_reg_state *rs)
644{
645 struct unw_reg_state *copy, *prev = NULL, *first = NULL;
646
647 while (rs) {
648 copy = alloc_reg_state();
649 if (!copy) {
650 printk(KERN_ERR "unwind.dup_state_stack: out of memory\n");
651 return NULL;
652 }
653 memcpy(copy, rs, sizeof(*copy));
654 if (first)
655 prev->next = copy;
656 else
657 first = copy;
658 rs = rs->next;
659 prev = copy;
660 }
661 return first;
662}
663
664/* Free all stacked register states (but not RS itself). */
665static void
666free_state_stack (struct unw_reg_state *rs)
667{
668 struct unw_reg_state *p, *next;
669
670 for (p = rs->next; p != NULL; p = next) {
671 next = p->next;
672 free_reg_state(p);
673 }
674 rs->next = NULL;
675}
676
677/* Unwind decoder routines */
678
679static enum unw_register_index __attribute_const__
680decode_abreg (unsigned char abreg, int memory)
681{
682 switch (abreg) {
683 case 0x04 ... 0x07: return UNW_REG_R4 + (abreg - 0x04);
684 case 0x22 ... 0x25: return UNW_REG_F2 + (abreg - 0x22);
685 case 0x30 ... 0x3f: return UNW_REG_F16 + (abreg - 0x30);
686 case 0x41 ... 0x45: return UNW_REG_B1 + (abreg - 0x41);
687 case 0x60: return UNW_REG_PR;
688 case 0x61: return UNW_REG_PSP;
689 case 0x62: return memory ? UNW_REG_PRI_UNAT_MEM : UNW_REG_PRI_UNAT_GR;
690 case 0x63: return UNW_REG_RP;
691 case 0x64: return UNW_REG_BSP;
692 case 0x65: return UNW_REG_BSPSTORE;
693 case 0x66: return UNW_REG_RNAT;
694 case 0x67: return UNW_REG_UNAT;
695 case 0x68: return UNW_REG_FPSR;
696 case 0x69: return UNW_REG_PFS;
697 case 0x6a: return UNW_REG_LC;
698 default:
699 break;
700 }
701 UNW_DPRINT(0, "unwind.%s: bad abreg=0x%x\n", __FUNCTION__, abreg);
702 return UNW_REG_LC;
703}
704
705static void
706set_reg (struct unw_reg_info *reg, enum unw_where where, int when, unsigned long val)
707{
708 reg->val = val;
709 reg->where = where;
710 if (reg->when == UNW_WHEN_NEVER)
711 reg->when = when;
712}
713
714static void
715alloc_spill_area (unsigned long *offp, unsigned long regsize,
716 struct unw_reg_info *lo, struct unw_reg_info *hi)
717{
718 struct unw_reg_info *reg;
719
720 for (reg = hi; reg >= lo; --reg) {
721 if (reg->where == UNW_WHERE_SPILL_HOME) {
722 reg->where = UNW_WHERE_PSPREL;
723 *offp -= regsize;
724 reg->val = *offp;
725 }
726 }
727}
728
729static inline void
730spill_next_when (struct unw_reg_info **regp, struct unw_reg_info *lim, unw_word t)
731{
732 struct unw_reg_info *reg;
733
734 for (reg = *regp; reg <= lim; ++reg) {
735 if (reg->where == UNW_WHERE_SPILL_HOME) {
736 reg->when = t;
737 *regp = reg + 1;
738 return;
739 }
740 }
741 UNW_DPRINT(0, "unwind.%s: excess spill!\n", __FUNCTION__);
742}
743
744static inline void
745finish_prologue (struct unw_state_record *sr)
746{
747 struct unw_reg_info *reg;
748 unsigned long off;
749 int i;
750
751 /*
752 * First, resolve implicit register save locations (see Section "11.4.2.3 Rules
753 * for Using Unwind Descriptors", rule 3):
754 */
755 for (i = 0; i < (int) ARRAY_SIZE(unw.save_order); ++i) {
756 reg = sr->curr.reg + unw.save_order[i];
757 if (reg->where == UNW_WHERE_GR_SAVE) {
758 reg->where = UNW_WHERE_GR;
759 reg->val = sr->gr_save_loc++;
760 }
761 }
762
763 /*
764 * Next, compute when the fp, general, and branch registers get
765 * saved. This must come before alloc_spill_area() because
766 * we need to know which registers are spilled to their home
767 * locations.
768 */
769 if (sr->imask) {
770 unsigned char kind, mask = 0, *cp = sr->imask;
771 int t;
772 static const unsigned char limit[3] = {
773 UNW_REG_F31, UNW_REG_R7, UNW_REG_B5
774 };
775 struct unw_reg_info *(regs[3]);
776
777 regs[0] = sr->curr.reg + UNW_REG_F2;
778 regs[1] = sr->curr.reg + UNW_REG_R4;
779 regs[2] = sr->curr.reg + UNW_REG_B1;
780
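		/*
		 * imask holds one 2-bit code per instruction slot, four codes per
		 * byte, most-significant pair first: 0 = no spill in that slot,
		 * 1 = fr spill, 2 = gr spill, 3 = br spill (hence regs[kind - 1]).
		 */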
781 for (t = 0; t < sr->region_len; ++t) {
782 if ((t & 3) == 0)
783 mask = *cp++;
784 kind = (mask >> 2*(3-(t & 3))) & 3;
785 if (kind > 0)
786 spill_next_when(&regs[kind - 1], sr->curr.reg + limit[kind - 1],
787 sr->region_start + t);
788 }
789 }
790 /*
791 * Next, lay out the memory stack spill area:
792 */
793 if (sr->any_spills) {
794 off = sr->spill_offset;
795 alloc_spill_area(&off, 16, sr->curr.reg + UNW_REG_F2, sr->curr.reg + UNW_REG_F31);
796 alloc_spill_area(&off, 8, sr->curr.reg + UNW_REG_B1, sr->curr.reg + UNW_REG_B5);
797 alloc_spill_area(&off, 8, sr->curr.reg + UNW_REG_R4, sr->curr.reg + UNW_REG_R7);
798 }
799}
800
801/*
802 * Region header descriptors.
803 */
804
805static void
806desc_prologue (int body, unw_word rlen, unsigned char mask, unsigned char grsave,
807 struct unw_state_record *sr)
808{
809 int i, region_start;
810
811 if (!(sr->in_body || sr->first_region))
812 finish_prologue(sr);
813 sr->first_region = 0;
814
815 /* check if we're done: */
816 if (sr->when_target < sr->region_start + sr->region_len) {
817 sr->done = 1;
818 return;
819 }
820
821 region_start = sr->region_start + sr->region_len;
822
823 for (i = 0; i < sr->epilogue_count; ++i)
824 pop(sr);
825 sr->epilogue_count = 0;
826 sr->epilogue_start = UNW_WHEN_NEVER;
827
828 sr->region_start = region_start;
829 sr->region_len = rlen;
830 sr->in_body = body;
831
832 if (!body) {
833 push(sr);
834
835 for (i = 0; i < 4; ++i) {
836 if (mask & 0x8)
837 set_reg(sr->curr.reg + unw.save_order[i], UNW_WHERE_GR,
838 sr->region_start + sr->region_len - 1, grsave++);
839 mask <<= 1;
840 }
841 sr->gr_save_loc = grsave;
842 sr->any_spills = 0;
843 sr->imask = NULL;
844 sr->spill_offset = 0x10; /* default to psp+16 */
845 }
846}
847
848/*
849 * Prologue descriptors.
850 */
851
852static inline void
853desc_abi (unsigned char abi, unsigned char context, struct unw_state_record *sr)
854{
855 if (abi == 3 && context == 'i') {
856 sr->flags |= UNW_FLAG_INTERRUPT_FRAME;
857 UNW_DPRINT(3, "unwind.%s: interrupt frame\n", __FUNCTION__);
858 }
859 else
860		UNW_DPRINT(0, "unwind.%s: ignoring unwabi(abi=0x%x,context=0x%x)\n",
861 __FUNCTION__, abi, context);
862}
863
864static inline void
865desc_br_gr (unsigned char brmask, unsigned char gr, struct unw_state_record *sr)
866{
867 int i;
868
869 for (i = 0; i < 5; ++i) {
870 if (brmask & 1)
871 set_reg(sr->curr.reg + UNW_REG_B1 + i, UNW_WHERE_GR,
872 sr->region_start + sr->region_len - 1, gr++);
873 brmask >>= 1;
874 }
875}
876
877static inline void
878desc_br_mem (unsigned char brmask, struct unw_state_record *sr)
879{
880 int i;
881
882 for (i = 0; i < 5; ++i) {
883 if (brmask & 1) {
884 set_reg(sr->curr.reg + UNW_REG_B1 + i, UNW_WHERE_SPILL_HOME,
885 sr->region_start + sr->region_len - 1, 0);
886 sr->any_spills = 1;
887 }
888 brmask >>= 1;
889 }
890}
891
892static inline void
893desc_frgr_mem (unsigned char grmask, unw_word frmask, struct unw_state_record *sr)
894{
895 int i;
896
897 for (i = 0; i < 4; ++i) {
898 if ((grmask & 1) != 0) {
899 set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_SPILL_HOME,
900 sr->region_start + sr->region_len - 1, 0);
901 sr->any_spills = 1;
902 }
903 grmask >>= 1;
904 }
905 for (i = 0; i < 20; ++i) {
906 if ((frmask & 1) != 0) {
907 int base = (i < 4) ? UNW_REG_F2 : UNW_REG_F16 - 4;
908 set_reg(sr->curr.reg + base + i, UNW_WHERE_SPILL_HOME,
909 sr->region_start + sr->region_len - 1, 0);
910 sr->any_spills = 1;
911 }
912 frmask >>= 1;
913 }
914}
915
916static inline void
917desc_fr_mem (unsigned char frmask, struct unw_state_record *sr)
918{
919 int i;
920
921 for (i = 0; i < 4; ++i) {
922 if ((frmask & 1) != 0) {
923 set_reg(sr->curr.reg + UNW_REG_F2 + i, UNW_WHERE_SPILL_HOME,
924 sr->region_start + sr->region_len - 1, 0);
925 sr->any_spills = 1;
926 }
927 frmask >>= 1;
928 }
929}
930
931static inline void
932desc_gr_gr (unsigned char grmask, unsigned char gr, struct unw_state_record *sr)
933{
934 int i;
935
936 for (i = 0; i < 4; ++i) {
937 if ((grmask & 1) != 0)
938 set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_GR,
939 sr->region_start + sr->region_len - 1, gr++);
940 grmask >>= 1;
941 }
942}
943
944static inline void
945desc_gr_mem (unsigned char grmask, struct unw_state_record *sr)
946{
947 int i;
948
949 for (i = 0; i < 4; ++i) {
950 if ((grmask & 1) != 0) {
951 set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_SPILL_HOME,
952 sr->region_start + sr->region_len - 1, 0);
953 sr->any_spills = 1;
954 }
955 grmask >>= 1;
956 }
957}
958
959static inline void
960desc_mem_stack_f (unw_word t, unw_word size, struct unw_state_record *sr)
961{
962 set_reg(sr->curr.reg + UNW_REG_PSP, UNW_WHERE_NONE,
963 sr->region_start + min_t(int, t, sr->region_len - 1), 16*size);
964}
965
966static inline void
967desc_mem_stack_v (unw_word t, struct unw_state_record *sr)
968{
969 sr->curr.reg[UNW_REG_PSP].when = sr->region_start + min_t(int, t, sr->region_len - 1);
970}
971
972static inline void
973desc_reg_gr (unsigned char reg, unsigned char dst, struct unw_state_record *sr)
974{
975 set_reg(sr->curr.reg + reg, UNW_WHERE_GR, sr->region_start + sr->region_len - 1, dst);
976}
977
978static inline void
979desc_reg_psprel (unsigned char reg, unw_word pspoff, struct unw_state_record *sr)
980{
981 set_reg(sr->curr.reg + reg, UNW_WHERE_PSPREL, sr->region_start + sr->region_len - 1,
982 0x10 - 4*pspoff);
983}
984
985static inline void
986desc_reg_sprel (unsigned char reg, unw_word spoff, struct unw_state_record *sr)
987{
988 set_reg(sr->curr.reg + reg, UNW_WHERE_SPREL, sr->region_start + sr->region_len - 1,
989 4*spoff);
990}
991
992static inline void
993desc_rp_br (unsigned char dst, struct unw_state_record *sr)
994{
995 sr->return_link_reg = dst;
996}
997
998static inline void
999desc_reg_when (unsigned char regnum, unw_word t, struct unw_state_record *sr)
1000{
1001 struct unw_reg_info *reg = sr->curr.reg + regnum;
1002
1003 if (reg->where == UNW_WHERE_NONE)
1004 reg->where = UNW_WHERE_GR_SAVE;
1005 reg->when = sr->region_start + min_t(int, t, sr->region_len - 1);
1006}
1007
1008static inline void
1009desc_spill_base (unw_word pspoff, struct unw_state_record *sr)
1010{
1011 sr->spill_offset = 0x10 - 4*pspoff;
1012}
1013
1014static inline unsigned char *
1015desc_spill_mask (unsigned char *imaskp, struct unw_state_record *sr)
1016{
1017 sr->imask = imaskp;
1018 return imaskp + (2*sr->region_len + 7)/8;
1019}
1020
1021/*
1022 * Body descriptors.
1023 */
1024static inline void
1025desc_epilogue (unw_word t, unw_word ecount, struct unw_state_record *sr)
1026{
1027 sr->epilogue_start = sr->region_start + sr->region_len - 1 - t;
1028 sr->epilogue_count = ecount + 1;
1029}
1030
1031static inline void
1032desc_copy_state (unw_word label, struct unw_state_record *sr)
1033{
1034 struct unw_labeled_state *ls;
1035
1036 for (ls = sr->labeled_states; ls; ls = ls->next) {
1037 if (ls->label == label) {
1038 free_state_stack(&sr->curr);
1039 memcpy(&sr->curr, &ls->saved_state, sizeof(sr->curr));
1040 sr->curr.next = dup_state_stack(ls->saved_state.next);
1041 return;
1042 }
1043 }
1044 printk(KERN_ERR "unwind: failed to find state labeled 0x%lx\n", label);
1045}
1046
1047static inline void
1048desc_label_state (unw_word label, struct unw_state_record *sr)
1049{
1050 struct unw_labeled_state *ls;
1051
1052 ls = alloc_labeled_state();
1053 if (!ls) {
1054 printk(KERN_ERR "unwind.desc_label_state(): out of memory\n");
1055 return;
1056 }
1057 ls->label = label;
1058 memcpy(&ls->saved_state, &sr->curr, sizeof(ls->saved_state));
1059 ls->saved_state.next = dup_state_stack(sr->curr.next);
1060
1061 /* insert into list of labeled states: */
1062 ls->next = sr->labeled_states;
1063 sr->labeled_states = ls;
1064}
1065
1066/*
1067 * General descriptors.
1068 */
1069
1070static inline int
1071desc_is_active (unsigned char qp, unw_word t, struct unw_state_record *sr)
1072{
1073 if (sr->when_target <= sr->region_start + min_t(int, t, sr->region_len - 1))
1074 return 0;
1075 if (qp > 0) {
1076 if ((sr->pr_val & (1UL << qp)) == 0)
1077 return 0;
1078 sr->pr_mask |= (1UL << qp);
1079 }
1080 return 1;
1081}
1082
1083static inline void
1084desc_restore_p (unsigned char qp, unw_word t, unsigned char abreg, struct unw_state_record *sr)
1085{
1086 struct unw_reg_info *r;
1087
1088 if (!desc_is_active(qp, t, sr))
1089 return;
1090
1091 r = sr->curr.reg + decode_abreg(abreg, 0);
1092 r->where = UNW_WHERE_NONE;
1093 r->when = UNW_WHEN_NEVER;
1094 r->val = 0;
1095}
1096
1097static inline void
1098desc_spill_reg_p (unsigned char qp, unw_word t, unsigned char abreg, unsigned char x,
1099 unsigned char ytreg, struct unw_state_record *sr)
1100{
1101 enum unw_where where = UNW_WHERE_GR;
1102 struct unw_reg_info *r;
1103
1104 if (!desc_is_active(qp, t, sr))
1105 return;
1106
1107 if (x)
1108 where = UNW_WHERE_BR;
1109 else if (ytreg & 0x80)
1110 where = UNW_WHERE_FR;
1111
1112 r = sr->curr.reg + decode_abreg(abreg, 0);
1113 r->where = where;
1114 r->when = sr->region_start + min_t(int, t, sr->region_len - 1);
1115 r->val = (ytreg & 0x7f);
1116}
1117
1118static inline void
1119desc_spill_psprel_p (unsigned char qp, unw_word t, unsigned char abreg, unw_word pspoff,
1120 struct unw_state_record *sr)
1121{
1122 struct unw_reg_info *r;
1123
1124 if (!desc_is_active(qp, t, sr))
1125 return;
1126
1127 r = sr->curr.reg + decode_abreg(abreg, 1);
1128 r->where = UNW_WHERE_PSPREL;
1129 r->when = sr->region_start + min_t(int, t, sr->region_len - 1);
1130 r->val = 0x10 - 4*pspoff;
1131}
1132
1133static inline void
1134desc_spill_sprel_p (unsigned char qp, unw_word t, unsigned char abreg, unw_word spoff,
1135 struct unw_state_record *sr)
1136{
1137 struct unw_reg_info *r;
1138
1139 if (!desc_is_active(qp, t, sr))
1140 return;
1141
1142 r = sr->curr.reg + decode_abreg(abreg, 1);
1143 r->where = UNW_WHERE_SPREL;
1144 r->when = sr->region_start + min_t(int, t, sr->region_len - 1);
1145 r->val = 4*spoff;
1146}
1147
1148#define UNW_DEC_BAD_CODE(code) printk(KERN_ERR "unwind: unknown code 0x%02x\n", \
1149 code);
1150
1151/*
1152 * region headers:
1153 */
1154#define UNW_DEC_PROLOGUE_GR(fmt,r,m,gr,arg) desc_prologue(0,r,m,gr,arg)
1155#define UNW_DEC_PROLOGUE(fmt,b,r,arg) desc_prologue(b,r,0,32,arg)
1156/*
1157 * prologue descriptors:
1158 */
1159#define UNW_DEC_ABI(fmt,a,c,arg) desc_abi(a,c,arg)
1160#define UNW_DEC_BR_GR(fmt,b,g,arg) desc_br_gr(b,g,arg)
1161#define UNW_DEC_BR_MEM(fmt,b,arg) desc_br_mem(b,arg)
1162#define UNW_DEC_FRGR_MEM(fmt,g,f,arg) desc_frgr_mem(g,f,arg)
1163#define UNW_DEC_FR_MEM(fmt,f,arg) desc_fr_mem(f,arg)
1164#define UNW_DEC_GR_GR(fmt,m,g,arg) desc_gr_gr(m,g,arg)
1165#define UNW_DEC_GR_MEM(fmt,m,arg) desc_gr_mem(m,arg)
1166#define UNW_DEC_MEM_STACK_F(fmt,t,s,arg) desc_mem_stack_f(t,s,arg)
1167#define UNW_DEC_MEM_STACK_V(fmt,t,arg) desc_mem_stack_v(t,arg)
1168#define UNW_DEC_REG_GR(fmt,r,d,arg) desc_reg_gr(r,d,arg)
1169#define UNW_DEC_REG_PSPREL(fmt,r,o,arg) desc_reg_psprel(r,o,arg)
1170#define UNW_DEC_REG_SPREL(fmt,r,o,arg) desc_reg_sprel(r,o,arg)
1171#define UNW_DEC_REG_WHEN(fmt,r,t,arg) desc_reg_when(r,t,arg)
1172#define UNW_DEC_PRIUNAT_WHEN_GR(fmt,t,arg) desc_reg_when(UNW_REG_PRI_UNAT_GR,t,arg)
1173#define UNW_DEC_PRIUNAT_WHEN_MEM(fmt,t,arg) desc_reg_when(UNW_REG_PRI_UNAT_MEM,t,arg)
1174#define UNW_DEC_PRIUNAT_GR(fmt,r,arg) desc_reg_gr(UNW_REG_PRI_UNAT_GR,r,arg)
1175#define UNW_DEC_PRIUNAT_PSPREL(fmt,o,arg) desc_reg_psprel(UNW_REG_PRI_UNAT_MEM,o,arg)
1176#define UNW_DEC_PRIUNAT_SPREL(fmt,o,arg) desc_reg_sprel(UNW_REG_PRI_UNAT_MEM,o,arg)
1177#define UNW_DEC_RP_BR(fmt,d,arg) desc_rp_br(d,arg)
1178#define UNW_DEC_SPILL_BASE(fmt,o,arg) desc_spill_base(o,arg)
1179#define UNW_DEC_SPILL_MASK(fmt,m,arg) (m = desc_spill_mask(m,arg))
1180/*
1181 * body descriptors:
1182 */
1183#define UNW_DEC_EPILOGUE(fmt,t,c,arg) desc_epilogue(t,c,arg)
1184#define UNW_DEC_COPY_STATE(fmt,l,arg) desc_copy_state(l,arg)
1185#define UNW_DEC_LABEL_STATE(fmt,l,arg) desc_label_state(l,arg)
1186/*
1187 * general unwind descriptors:
1188 */
1189#define UNW_DEC_SPILL_REG_P(f,p,t,a,x,y,arg) desc_spill_reg_p(p,t,a,x,y,arg)
1190#define UNW_DEC_SPILL_REG(f,t,a,x,y,arg) desc_spill_reg_p(0,t,a,x,y,arg)
1191#define UNW_DEC_SPILL_PSPREL_P(f,p,t,a,o,arg) desc_spill_psprel_p(p,t,a,o,arg)
1192#define UNW_DEC_SPILL_PSPREL(f,t,a,o,arg) desc_spill_psprel_p(0,t,a,o,arg)
1193#define UNW_DEC_SPILL_SPREL_P(f,p,t,a,o,arg) desc_spill_sprel_p(p,t,a,o,arg)
1194#define UNW_DEC_SPILL_SPREL(f,t,a,o,arg) desc_spill_sprel_p(0,t,a,o,arg)
1195#define UNW_DEC_RESTORE_P(f,p,t,a,arg) desc_restore_p(p,t,a,arg)
1196#define UNW_DEC_RESTORE(f,t,a,arg) desc_restore_p(0,t,a,arg)
1197
1198#include "unwind_decoder.c"
1199
1200
1201/* Unwind scripts. */
1202
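/*
 * Multiplicative (Fibonacci) hash: ip >> 4 discards the slot number and the
 * low bundle bits so all three slots of a bundle share a bucket; multiplying
 * by 2^64/golden-ratio spreads consecutive bundles, and the top
 * UNW_LOG_HASH_SIZE bits of the product select one of UNW_HASH_SIZE buckets.
 */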
1203static inline unw_hash_index_t
1204hash (unsigned long ip)
1205{
1206#	define hashmagic	0x9e3779b97f4a7c16UL	/* based on ((sqrt(5)-1)/2)*2^64 */
1207
1208 return (ip >> 4)*hashmagic >> (64 - UNW_LOG_HASH_SIZE);
1209#undef hashmagic
1210}
1211
1212static inline long
1213cache_match (struct unw_script *script, unsigned long ip, unsigned long pr)
1214{
1215 read_lock(&script->lock);
1216 if (ip == script->ip && ((pr ^ script->pr_val) & script->pr_mask) == 0)
1217 /* keep the read lock... */
1218 return 1;
1219 read_unlock(&script->lock);
1220 return 0;
1221}
1222
1223static inline struct unw_script *
1224script_lookup (struct unw_frame_info *info)
1225{
1226 struct unw_script *script = unw.cache + info->hint;
1227 unsigned short index;
1228 unsigned long ip, pr;
1229
1230 if (UNW_DEBUG_ON(0))
1231 return NULL; /* Always regenerate scripts in debug mode */
1232
1233 STAT(++unw.stat.cache.lookups);
1234
1235 ip = info->ip;
1236 pr = info->pr;
1237
1238 if (cache_match(script, ip, pr)) {
1239 STAT(++unw.stat.cache.hinted_hits);
1240 return script;
1241 }
1242
1243 index = unw.hash[hash(ip)];
1244 if (index >= UNW_CACHE_SIZE)
1245 return NULL;
1246
1247 script = unw.cache + index;
1248 while (1) {
1249 if (cache_match(script, ip, pr)) {
1250 /* update hint; no locking required as single-word writes are atomic */
1251 STAT(++unw.stat.cache.normal_hits);
1252 unw.cache[info->prev_script].hint = script - unw.cache;
1253 return script;
1254 }
1255 if (script->coll_chain >= UNW_HASH_SIZE)
1256 return NULL;
1257 script = unw.cache + script->coll_chain;
1258 STAT(++unw.stat.cache.collision_chain_traversals);
1259 }
1260}
1261
1262/*
1263 * On return, the write lock on SCRIPT is still held.
1264 */
1265static inline struct unw_script *
1266script_new (unsigned long ip)
1267{
1268 struct unw_script *script, *prev, *tmp;
1269 unw_hash_index_t index;
1270 unsigned short head;
1271
1272 STAT(++unw.stat.script.news);
1273
1274 /*
1275	 * Can't (easily) use cmpxchg() here because of the ABA problem
1276	 * that is intrinsic to cmpxchg()...
1277 */
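	/*
	 * The script cache is kept in recency order by an index-linked list:
	 * lru_chain threads the cache slots from unw.lru_head (least recently
	 * used) to unw.lru_tail (most recently used).  We reuse the head slot
	 * and splice it back in at the tail below.
	 */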
1278 head = unw.lru_head;
1279 script = unw.cache + head;
1280 unw.lru_head = script->lru_chain;
1281
1282 /*
1283 * We'd deadlock here if we interrupted a thread that is holding a read lock on
1284 * script->lock. Thus, if the write_trylock() fails, we simply bail out. The
1285 * alternative would be to disable interrupts whenever we hold a read-lock, but
1286 * that seems silly.
1287 */
1288 if (!write_trylock(&script->lock))
1289 return NULL;
1290
1291 /* re-insert script at the tail of the LRU chain: */
1292 unw.cache[unw.lru_tail].lru_chain = head;
1293 unw.lru_tail = head;
1294
1295 /* remove the old script from the hash table (if it's there): */
1296 if (script->ip) {
1297 index = hash(script->ip);
1298 tmp = unw.cache + unw.hash[index];
1299 prev = NULL;
1300 while (1) {
1301 if (tmp == script) {
1302 if (prev)
1303 prev->coll_chain = tmp->coll_chain;
1304 else
1305 unw.hash[index] = tmp->coll_chain;
1306 break;
1307 } else
1308 prev = tmp;
1309 if (tmp->coll_chain >= UNW_CACHE_SIZE)
1310 /* old script wasn't in the hash-table */
1311 break;
1312 tmp = unw.cache + tmp->coll_chain;
1313 }
1314 }
1315
1316 /* enter new script in the hash table */
1317 index = hash(ip);
1318 script->coll_chain = unw.hash[index];
1319 unw.hash[index] = script - unw.cache;
1320
1321 script->ip = ip; /* set new IP while we're holding the locks */
1322
1323 STAT(if (script->coll_chain < UNW_CACHE_SIZE) ++unw.stat.script.collisions);
1324
1325 script->flags = 0;
1326 script->hint = 0;
1327 script->count = 0;
1328 return script;
1329}
1330
1331static void
1332script_finalize (struct unw_script *script, struct unw_state_record *sr)
1333{
1334 script->pr_mask = sr->pr_mask;
1335 script->pr_val = sr->pr_val;
1336 /*
1337 * We could down-grade our write-lock on script->lock here but
1338 * the rwlock API doesn't offer atomic lock downgrading, so
1339 * we'll just keep the write-lock and release it later when
1340 * we're done using the script.
1341 */
1342}
1343
1344static inline void
1345script_emit (struct unw_script *script, struct unw_insn insn)
1346{
1347 if (script->count >= UNW_MAX_SCRIPT_LEN) {
1348 UNW_DPRINT(0, "unwind.%s: script exceeds maximum size of %u instructions!\n",
1349 __FUNCTION__, UNW_MAX_SCRIPT_LEN);
1350 return;
1351 }
1352 script->insn[script->count++] = insn;
1353}
1354
1355static inline void
1356emit_nat_info (struct unw_state_record *sr, int i, struct unw_script *script)
1357{
1358 struct unw_reg_info *r = sr->curr.reg + i;
1359 enum unw_insn_opcode opc;
1360 struct unw_insn insn;
1361 unsigned long val = 0;
1362
1363 switch (r->where) {
1364 case UNW_WHERE_GR:
1365 if (r->val >= 32) {
1366 /* register got spilled to a stacked register */
1367 opc = UNW_INSN_SETNAT_TYPE;
1368 val = UNW_NAT_REGSTK;
1369 } else
1370 /* register got spilled to a scratch register */
1371 opc = UNW_INSN_SETNAT_MEMSTK;
1372 break;
1373
1374 case UNW_WHERE_FR:
1375 opc = UNW_INSN_SETNAT_TYPE;
1376 val = UNW_NAT_VAL;
1377 break;
1378
1379 case UNW_WHERE_BR:
1380 opc = UNW_INSN_SETNAT_TYPE;
1381 val = UNW_NAT_NONE;
1382 break;
1383
1384 case UNW_WHERE_PSPREL:
1385 case UNW_WHERE_SPREL:
1386 opc = UNW_INSN_SETNAT_MEMSTK;
1387 break;
1388
1389 default:
1390 UNW_DPRINT(0, "unwind.%s: don't know how to emit nat info for where = %u\n",
1391 __FUNCTION__, r->where);
1392 return;
1393 }
1394 insn.opc = opc;
1395 insn.dst = unw.preg_index[i];
1396 insn.val = val;
1397 script_emit(script, insn);
1398}
1399
1400static void
1401compile_reg (struct unw_state_record *sr, int i, struct unw_script *script)
1402{
1403 struct unw_reg_info *r = sr->curr.reg + i;
1404 enum unw_insn_opcode opc;
1405 unsigned long val, rval;
1406 struct unw_insn insn;
1407 long need_nat_info;
1408
1409 if (r->where == UNW_WHERE_NONE || r->when >= sr->when_target)
1410 return;
1411
1412 opc = UNW_INSN_MOVE;
1413 val = rval = r->val;
1414 need_nat_info = (i >= UNW_REG_R4 && i <= UNW_REG_R7);
1415
1416 switch (r->where) {
1417 case UNW_WHERE_GR:
1418 if (rval >= 32) {
1419 opc = UNW_INSN_MOVE_STACKED;
1420 val = rval - 32;
1421 } else if (rval >= 4 && rval <= 7) {
1422 if (need_nat_info) {
1423 opc = UNW_INSN_MOVE2;
1424 need_nat_info = 0;
1425 }
1426 val = unw.preg_index[UNW_REG_R4 + (rval - 4)];
1427 } else if (rval == 0) {
1428 opc = UNW_INSN_MOVE_CONST;
1429 val = 0;
1430 } else {
1431 /* register got spilled to a scratch register */
1432 opc = UNW_INSN_MOVE_SCRATCH;
1433 val = pt_regs_off(rval);
1434 }
1435 break;
1436
1437 case UNW_WHERE_FR:
1438 if (rval <= 5)
1439 val = unw.preg_index[UNW_REG_F2 + (rval - 2)];
1440 else if (rval >= 16 && rval <= 31)
1441 val = unw.preg_index[UNW_REG_F16 + (rval - 16)];
1442 else {
1443 opc = UNW_INSN_MOVE_SCRATCH;
1444 if (rval <= 11)
1445 val = offsetof(struct pt_regs, f6) + 16*(rval - 6);
1446 else
1447 UNW_DPRINT(0, "unwind.%s: kernel may not touch f%lu\n",
1448 __FUNCTION__, rval);
1449 }
1450 break;
1451
1452 case UNW_WHERE_BR:
1453 if (rval >= 1 && rval <= 5)
1454 val = unw.preg_index[UNW_REG_B1 + (rval - 1)];
1455 else {
1456 opc = UNW_INSN_MOVE_SCRATCH;
1457 if (rval == 0)
1458 val = offsetof(struct pt_regs, b0);
1459 else if (rval == 6)
1460 val = offsetof(struct pt_regs, b6);
1461 else
1462 val = offsetof(struct pt_regs, b7);
1463 }
1464 break;
1465
1466 case UNW_WHERE_SPREL:
1467 opc = UNW_INSN_ADD_SP;
1468 break;
1469
1470 case UNW_WHERE_PSPREL:
1471 opc = UNW_INSN_ADD_PSP;
1472 break;
1473
1474 default:
1475		UNW_DPRINT(0, "unwind.%s: register %u has unexpected `where' value of %u\n",
1476 __FUNCTION__, i, r->where);
1477 break;
1478 }
1479 insn.opc = opc;
1480 insn.dst = unw.preg_index[i];
1481 insn.val = val;
1482 script_emit(script, insn);
1483 if (need_nat_info)
1484 emit_nat_info(sr, i, script);
1485
1486 if (i == UNW_REG_PSP) {
1487 /*
1488 * info->psp must contain the _value_ of the previous
1489		 * sp, not its save location.  We get this by
1490 * dereferencing the value we just stored in
1491 * info->psp:
1492 */
1493 insn.opc = UNW_INSN_LOAD;
1494 insn.dst = insn.val = unw.preg_index[UNW_REG_PSP];
1495 script_emit(script, insn);
1496 }
1497}
1498
1499static inline const struct unw_table_entry *
1500lookup (struct unw_table *table, unsigned long rel_ip)
1501{
1502 const struct unw_table_entry *e = NULL;
1503 unsigned long lo, hi, mid;
1504
1505 /* do a binary search for right entry: */
1506 for (lo = 0, hi = table->length; lo < hi; ) {
1507 mid = (lo + hi) / 2;
1508 e = &table->array[mid];
1509 if (rel_ip < e->start_offset)
1510 hi = mid;
1511 else if (rel_ip >= e->end_offset)
1512 lo = mid + 1;
1513 else
1514 break;
1515 }
1516 if (rel_ip < e->start_offset || rel_ip >= e->end_offset)
1517 return NULL;
1518 return e;
1519}
1520
1521/*
1522 * Build an unwind script for the frame described by INFO; running it (see
1523 * run_script()) recovers the state at entry to the function containing INFO->ip.
1524 */
1525static inline struct unw_script *
1526build_script (struct unw_frame_info *info)
1527{
1528 const struct unw_table_entry *e = NULL;
1529 struct unw_script *script = NULL;
1530 struct unw_labeled_state *ls, *next;
1531 unsigned long ip = info->ip;
1532 struct unw_state_record sr;
1533 struct unw_table *table;
1534 struct unw_reg_info *r;
1535 struct unw_insn insn;
1536 u8 *dp, *desc_end;
1537 u64 hdr;
1538 int i;
1539 STAT(unsigned long start, parse_start;)
1540
1541 STAT(++unw.stat.script.builds; start = ia64_get_itc());
1542
1543 /* build state record */
1544 memset(&sr, 0, sizeof(sr));
1545 for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r)
1546 r->when = UNW_WHEN_NEVER;
1547 sr.pr_val = info->pr;
1548
1549 UNW_DPRINT(3, "unwind.%s: ip 0x%lx\n", __FUNCTION__, ip);
1550 script = script_new(ip);
1551 if (!script) {
1552 UNW_DPRINT(0, "unwind.%s: failed to create unwind script\n", __FUNCTION__);
1553 STAT(unw.stat.script.build_time += ia64_get_itc() - start);
1554 return NULL;
1555 }
1556 unw.cache[info->prev_script].hint = script - unw.cache;
1557
1558	/* search the kernel's and the modules' unwind tables for IP: */
1559
1560 STAT(parse_start = ia64_get_itc());
1561
1562 for (table = unw.tables; table; table = table->next) {
1563 if (ip >= table->start && ip < table->end) {
1564 e = lookup(table, ip - table->segment_base);
1565 break;
1566 }
1567 }
1568 if (!e) {
1569 /* no info, return default unwinder (leaf proc, no mem stack, no saved regs) */
1570 UNW_DPRINT(1, "unwind.%s: no unwind info for ip=0x%lx (prev ip=0x%lx)\n",
1571 __FUNCTION__, ip, unw.cache[info->prev_script].ip);
1572 sr.curr.reg[UNW_REG_RP].where = UNW_WHERE_BR;
1573 sr.curr.reg[UNW_REG_RP].when = -1;
1574 sr.curr.reg[UNW_REG_RP].val = 0;
1575 compile_reg(&sr, UNW_REG_RP, script);
1576 script_finalize(script, &sr);
1577 STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start);
1578 STAT(unw.stat.script.build_time += ia64_get_itc() - start);
1579 return script;
1580 }
1581
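	/* when_target counts instruction slots from the start of the function:
	   IA-64 bundles are 16 bytes and hold three slots, and the slot number
	   rides in the low bits of ip, hence 3*(bundle offset)/16 + slot. */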
1582 sr.when_target = (3*((ip & ~0xfUL) - (table->segment_base + e->start_offset))/16
1583 + (ip & 0xfUL));
1584 hdr = *(u64 *) (table->segment_base + e->info_offset);
1585 dp = (u8 *) (table->segment_base + e->info_offset + 8);
1586 desc_end = dp + 8*UNW_LENGTH(hdr);
1587
1588 while (!sr.done && dp < desc_end)
1589 dp = unw_decode(dp, sr.in_body, &sr);
1590
1591 if (sr.when_target > sr.epilogue_start) {
1592 /*
1593 * sp has been restored and all values on the memory stack below
1594 * psp also have been restored.
1595 */
1596 sr.curr.reg[UNW_REG_PSP].val = 0;
1597 sr.curr.reg[UNW_REG_PSP].where = UNW_WHERE_NONE;
1598 sr.curr.reg[UNW_REG_PSP].when = UNW_WHEN_NEVER;
1599 for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r)
1600 if ((r->where == UNW_WHERE_PSPREL && r->val <= 0x10)
1601 || r->where == UNW_WHERE_SPREL)
1602 {
1603 r->val = 0;
1604 r->where = UNW_WHERE_NONE;
1605 r->when = UNW_WHEN_NEVER;
1606 }
1607 }
1608
1609 script->flags = sr.flags;
1610
1611 /*
1612	 * If RP didn't get saved, generate an entry for the return link
1613 * register.
1614 */
1615 if (sr.curr.reg[UNW_REG_RP].when >= sr.when_target) {
1616 sr.curr.reg[UNW_REG_RP].where = UNW_WHERE_BR;
1617 sr.curr.reg[UNW_REG_RP].when = -1;
1618 sr.curr.reg[UNW_REG_RP].val = sr.return_link_reg;
1619 UNW_DPRINT(1, "unwind.%s: using default for rp at ip=0x%lx where=%d val=0x%lx\n",
1620 __FUNCTION__, ip, sr.curr.reg[UNW_REG_RP].where,
1621 sr.curr.reg[UNW_REG_RP].val);
1622 }
1623
1624#ifdef UNW_DEBUG
1625 UNW_DPRINT(1, "unwind.%s: state record for func 0x%lx, t=%u:\n",
1626 __FUNCTION__, table->segment_base + e->start_offset, sr.when_target);
1627 for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r) {
1628 if (r->where != UNW_WHERE_NONE || r->when != UNW_WHEN_NEVER) {
1629 UNW_DPRINT(1, " %s <- ", unw.preg_name[r - sr.curr.reg]);
1630 switch (r->where) {
1631 case UNW_WHERE_GR: UNW_DPRINT(1, "r%lu", r->val); break;
1632 case UNW_WHERE_FR: UNW_DPRINT(1, "f%lu", r->val); break;
1633 case UNW_WHERE_BR: UNW_DPRINT(1, "b%lu", r->val); break;
1634 case UNW_WHERE_SPREL: UNW_DPRINT(1, "[sp+0x%lx]", r->val); break;
1635 case UNW_WHERE_PSPREL: UNW_DPRINT(1, "[psp+0x%lx]", r->val); break;
1636 case UNW_WHERE_NONE:
1637 UNW_DPRINT(1, "%s+0x%lx", unw.preg_name[r - sr.curr.reg], r->val);
1638 break;
1639
1640 default:
1641 UNW_DPRINT(1, "BADWHERE(%d)", r->where);
1642 break;
1643 }
1644 UNW_DPRINT(1, "\t\t%d\n", r->when);
1645 }
1646 }
1647#endif
1648
1649 STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start);
1650
1651 /* translate state record into unwinder instructions: */
1652
1653 /*
1654 * First, set psp if we're dealing with a fixed-size frame;
1655 * subsequent instructions may depend on this value.
1656 */
1657 if (sr.when_target > sr.curr.reg[UNW_REG_PSP].when
1658 && (sr.curr.reg[UNW_REG_PSP].where == UNW_WHERE_NONE)
1659 && sr.curr.reg[UNW_REG_PSP].val != 0) {
1660 /* new psp is sp plus frame size */
1661 insn.opc = UNW_INSN_ADD;
1662 insn.dst = offsetof(struct unw_frame_info, psp)/8;
1663 insn.val = sr.curr.reg[UNW_REG_PSP].val; /* frame size */
1664 script_emit(script, insn);
1665 }
1666
1667 /* determine where the primary UNaT is: */
1668 if (sr.when_target < sr.curr.reg[UNW_REG_PRI_UNAT_GR].when)
1669 i = UNW_REG_PRI_UNAT_MEM;
1670 else if (sr.when_target < sr.curr.reg[UNW_REG_PRI_UNAT_MEM].when)
1671 i = UNW_REG_PRI_UNAT_GR;
1672 else if (sr.curr.reg[UNW_REG_PRI_UNAT_MEM].when > sr.curr.reg[UNW_REG_PRI_UNAT_GR].when)
1673 i = UNW_REG_PRI_UNAT_MEM;
1674 else
1675 i = UNW_REG_PRI_UNAT_GR;
1676
1677 compile_reg(&sr, i, script);
1678
1679 for (i = UNW_REG_BSP; i < UNW_NUM_REGS; ++i)
1680 compile_reg(&sr, i, script);
1681
1682 /* free labeled register states & stack: */
1683
1684 STAT(parse_start = ia64_get_itc());
1685 for (ls = sr.labeled_states; ls; ls = next) {
1686 next = ls->next;
1687 free_state_stack(&ls->saved_state);
1688 free_labeled_state(ls);
1689 }
1690 free_state_stack(&sr.curr);
1691 STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start);
1692
1693 script_finalize(script, &sr);
1694 STAT(unw.stat.script.build_time += ia64_get_itc() - start);
1695 return script;
1696}
1697
1698/*
1699 * Apply the unwinding actions in SCRIPT and update STATE to reflect the
1700 * state that existed upon entry to the function that the script was
1701 * built for.
1702 */
1703static inline void
1704run_script (struct unw_script *script, struct unw_frame_info *state)
1705{
1706 struct unw_insn *ip, *limit, next_insn;
1707 unsigned long opc, dst, val, off;
1708 unsigned long *s = (unsigned long *) state;
1709 STAT(unsigned long start;)
1710
1711 STAT(++unw.stat.script.runs; start = ia64_get_itc());
1712 state->flags = script->flags;
1713 ip = script->insn;
1714 limit = script->insn + script->count;
1715 next_insn = *ip;
1716
1717 while (ip++ < limit) {
1718 opc = next_insn.opc;
1719 dst = next_insn.dst;
1720 val = next_insn.val;
1721 next_insn = *ip;
1722
1723 redo:
1724 switch (opc) {
1725 case UNW_INSN_ADD:
1726 s[dst] += val;
1727 break;
1728
1729 case UNW_INSN_MOVE2:
1730 if (!s[val])
1731 goto lazy_init;
1732 s[dst+1] = s[val+1];
1733 s[dst] = s[val];
1734 break;
1735
1736 case UNW_INSN_MOVE:
1737 if (!s[val])
1738 goto lazy_init;
1739 s[dst] = s[val];
1740 break;
1741
1742 case UNW_INSN_MOVE_SCRATCH:
1743 if (state->pt) {
1744 s[dst] = (unsigned long) get_scratch_regs(state) + val;
1745 } else {
1746 s[dst] = 0;
1747 UNW_DPRINT(0, "unwind.%s: no state->pt, dst=%ld, val=%ld\n",
1748 __FUNCTION__, dst, val);
1749 }
1750 break;
1751
1752 case UNW_INSN_MOVE_CONST:
1753 if (val == 0)
1754 s[dst] = (unsigned long) &unw.r0;
1755 else {
1756 s[dst] = 0;
1757 UNW_DPRINT(0, "unwind.%s: UNW_INSN_MOVE_CONST bad val=%ld\n",
1758 __FUNCTION__, val);
1759 }
1760 break;
1761
1762
1763 case UNW_INSN_MOVE_STACKED:
1764 s[dst] = (unsigned long) ia64_rse_skip_regs((unsigned long *)state->bsp,
1765 val);
1766 break;
1767
1768 case UNW_INSN_ADD_PSP:
1769 s[dst] = state->psp + val;
1770 break;
1771
1772 case UNW_INSN_ADD_SP:
1773 s[dst] = state->sp + val;
1774 break;
1775
1776 case UNW_INSN_SETNAT_MEMSTK:
1777 if (!state->pri_unat_loc)
1778 state->pri_unat_loc = &state->sw->ar_unat;
1779			/* the register offset is a multiple of 8, so the low 3 bits (the type) are 0 */
1780 s[dst+1] = ((unsigned long) state->pri_unat_loc - s[dst]) | UNW_NAT_MEMSTK;
1781 break;
1782
1783 case UNW_INSN_SETNAT_TYPE:
1784 s[dst+1] = val;
1785 break;
1786
1787 case UNW_INSN_LOAD:
1788#ifdef UNW_DEBUG
1789 if ((s[val] & (local_cpu_data->unimpl_va_mask | 0x7)) != 0
1790 || s[val] < TASK_SIZE)
1791 {
1792 UNW_DPRINT(0, "unwind.%s: rejecting bad psp=0x%lx\n",
1793 __FUNCTION__, s[val]);
1794 break;
1795 }
1796#endif
1797 s[dst] = *(unsigned long *) s[val];
1798 break;
1799 }
1800 }
1801 STAT(unw.stat.script.run_time += ia64_get_itc() - start);
1802 return;
1803
1804 lazy_init:
1805 off = unw.sw_off[val];
1806 s[val] = (unsigned long) state->sw + off;
1807 if (off >= offsetof(struct switch_stack, r4) && off <= offsetof(struct switch_stack, r7))
1808 /*
1809 * We're initializing a general register: init NaT info, too. Note that
1810 * the offset is a multiple of 8 which gives us the 3 bits needed for
1811 * the type field.
1812 */
1813 s[val+1] = (offsetof(struct switch_stack, ar_unat) - off) | UNW_NAT_MEMSTK;
1814 goto redo;
1815}
1816
1817static int
1818find_save_locs (struct unw_frame_info *info)
1819{
1820 int have_write_lock = 0;
1821 struct unw_script *scr;
1822 unsigned long flags = 0;
1823
1824 if ((info->ip & (local_cpu_data->unimpl_va_mask | 0xf)) || info->ip < TASK_SIZE) {
1825 /* don't let obviously bad addresses pollute the cache */
1826 /* FIXME: should really be level 0 but it occurs too often. KAO */
1827 UNW_DPRINT(1, "unwind.%s: rejecting bad ip=0x%lx\n", __FUNCTION__, info->ip);
1828 info->rp_loc = NULL;
1829 return -1;
1830 }
1831
1832 scr = script_lookup(info);
1833 if (!scr) {
1834 spin_lock_irqsave(&unw.lock, flags);
1835 scr = build_script(info);
1836 if (!scr) {
1837 spin_unlock_irqrestore(&unw.lock, flags);
1838 UNW_DPRINT(0,
1839 "unwind.%s: failed to locate/build unwind script for ip %lx\n",
1840 __FUNCTION__, info->ip);
1841 return -1;
1842 }
1843 have_write_lock = 1;
1844 }
1845 info->hint = scr->hint;
1846 info->prev_script = scr - unw.cache;
1847
1848 run_script(scr, info);
1849
1850 if (have_write_lock) {
1851 write_unlock(&scr->lock);
1852 spin_unlock_irqrestore(&unw.lock, flags);
1853 } else
1854 read_unlock(&scr->lock);
1855 return 0;
1856}
1857
1858int
1859unw_unwind (struct unw_frame_info *info)
1860{
1861 unsigned long prev_ip, prev_sp, prev_bsp;
1862 unsigned long ip, pr, num_regs;
1863 STAT(unsigned long start, flags;)
1864 int retval;
1865
1866 STAT(local_irq_save(flags); ++unw.stat.api.unwinds; start = ia64_get_itc());
1867
1868 prev_ip = info->ip;
1869 prev_sp = info->sp;
1870 prev_bsp = info->bsp;
1871
1872 /* restore the ip */
1873 if (!info->rp_loc) {
1874 /* FIXME: should really be level 0 but it occurs too often. KAO */
1875 UNW_DPRINT(1, "unwind.%s: failed to locate return link (ip=0x%lx)!\n",
1876 __FUNCTION__, info->ip);
1877 STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
1878 return -1;
1879 }
1880 ip = info->ip = *info->rp_loc;
1881 if (ip < GATE_ADDR) {
1882 UNW_DPRINT(2, "unwind.%s: reached user-space (ip=0x%lx)\n", __FUNCTION__, ip);
1883 STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
1884 return -1;
1885 }
1886
1887 /* restore the cfm: */
1888 if (!info->pfs_loc) {
1889 UNW_DPRINT(0, "unwind.%s: failed to locate ar.pfs!\n", __FUNCTION__);
1890 STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
1891 return -1;
1892 }
1893 info->cfm_loc = info->pfs_loc;
1894
1895 /* restore the bsp: */
1896 pr = info->pr;
1897 num_regs = 0;
1898 if ((info->flags & UNW_FLAG_INTERRUPT_FRAME)) {
1899 info->pt = info->sp + 16;
1900 if ((pr & (1UL << PRED_NON_SYSCALL)) != 0)
1901 num_regs = *info->cfm_loc & 0x7f; /* size of frame */
1902 info->pfs_loc =
1903 (unsigned long *) (info->pt + offsetof(struct pt_regs, ar_pfs));
1904 UNW_DPRINT(3, "unwind.%s: interrupt_frame pt 0x%lx\n", __FUNCTION__, info->pt);
1905 } else
1906 num_regs = (*info->cfm_loc >> 7) & 0x7f; /* size of locals */
1907 info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->bsp, -num_regs);
1908 if (info->bsp < info->regstk.limit || info->bsp > info->regstk.top) {
1909 UNW_DPRINT(0, "unwind.%s: bsp (0x%lx) out of range [0x%lx-0x%lx]\n",
1910 __FUNCTION__, info->bsp, info->regstk.limit, info->regstk.top);
1911 STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
1912 return -1;
1913 }
1914
1915 /* restore the sp: */
1916 info->sp = info->psp;
1917 if (info->sp < info->memstk.top || info->sp > info->memstk.limit) {
1918 UNW_DPRINT(0, "unwind.%s: sp (0x%lx) out of range [0x%lx-0x%lx]\n",
1919 __FUNCTION__, info->sp, info->memstk.top, info->memstk.limit);
1920 STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
1921 return -1;
1922 }
1923
1924 if (info->ip == prev_ip && info->sp == prev_sp && info->bsp == prev_bsp) {
1925 UNW_DPRINT(0, "unwind.%s: ip, sp, bsp unchanged; stopping here (ip=0x%lx)\n",
1926 __FUNCTION__, ip);
1927 STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
1928 return -1;
1929 }
1930
1931 /* as we unwind, the saved ar.unat becomes the primary unat: */
1932 info->pri_unat_loc = info->unat_loc;
1933
1934 /* finally, restore the predicates: */
1935 unw_get_pr(info, &info->pr);
1936
1937 retval = find_save_locs(info);
1938 STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
1939 return retval;
1940}
1941EXPORT_SYMBOL(unw_unwind);
1942
1943int
1944unw_unwind_to_user (struct unw_frame_info *info)
1945{
1946 unsigned long ip, sp;
1947
1948 while (unw_unwind(info) >= 0) {
1949 if (unw_get_rp(info, &ip) < 0) {
1950 unw_get_ip(info, &ip);
1951 UNW_DPRINT(0, "unwind.%s: failed to read return pointer (ip=0x%lx)\n",
1952 __FUNCTION__, ip);
1953 return -1;
1954 }
1955 unw_get_sp(info, &sp);
1956 if (sp >= (unsigned long)info->task + IA64_STK_OFFSET)
1957 break;
1958 if (ip < FIXADDR_USER_END)
1959 return 0;
1960 }
1961 unw_get_ip(info, &ip);
1962 UNW_DPRINT(0, "unwind.%s: failed to unwind to user-level (ip=0x%lx)\n", __FUNCTION__, ip);
1963 return -1;
1964}
1965EXPORT_SYMBOL(unw_unwind_to_user);
1966
1967static void
1968init_frame_info (struct unw_frame_info *info, struct task_struct *t,
1969 struct switch_stack *sw, unsigned long stktop)
1970{
1971 unsigned long rbslimit, rbstop, stklimit;
1972 STAT(unsigned long start, flags;)
1973
1974 STAT(local_irq_save(flags); ++unw.stat.api.inits; start = ia64_get_itc());
1975
1976 /*
1977 * Subtle stuff here: we _could_ unwind through the switch_stack frame but we
1978 * don't want to do that because it would be slow as each preserved register would
1979 * have to be processed. Instead, what we do here is zero out the frame info and
1980 * start the unwind process at the function that created the switch_stack frame.
1981 * When a preserved value in switch_stack needs to be accessed, run_script() will
1982 * initialize the appropriate pointer on demand.
1983 */
1984 memset(info, 0, sizeof(*info));
1985
1986 rbslimit = (unsigned long) t + IA64_RBS_OFFSET;
1987 rbstop = sw->ar_bspstore;
1988 if (rbstop - (unsigned long) t >= IA64_STK_OFFSET)
1989 rbstop = rbslimit;
1990
1991 stklimit = (unsigned long) t + IA64_STK_OFFSET;
1992 if (stktop <= rbstop)
1993 stktop = rbstop;
1994
1995 info->regstk.limit = rbslimit;
1996 info->regstk.top = rbstop;
1997 info->memstk.limit = stklimit;
1998 info->memstk.top = stktop;
1999 info->task = t;
2000 info->sw = sw;
2001 info->sp = info->psp = stktop;
2002 info->pr = sw->pr;
2003 UNW_DPRINT(3, "unwind.%s:\n"
2004 " task 0x%lx\n"
2005 " rbs = [0x%lx-0x%lx)\n"
2006 " stk = [0x%lx-0x%lx)\n"
2007 " pr 0x%lx\n"
2008 " sw 0x%lx\n"
2009 " sp 0x%lx\n",
2010 __FUNCTION__, (unsigned long) t, rbslimit, rbstop, stktop, stklimit,
2011 info->pr, (unsigned long) info->sw, info->sp);
2012 STAT(unw.stat.api.init_time += ia64_get_itc() - start; local_irq_restore(flags));
2013}
2014
2015void
2016unw_init_from_interruption (struct unw_frame_info *info, struct task_struct *t,
2017 struct pt_regs *pt, struct switch_stack *sw)
2018{
2019 unsigned long sof;
2020
2021 init_frame_info(info, t, sw, pt->r12);
2022 info->cfm_loc = &pt->cr_ifs;
2023 info->unat_loc = &pt->ar_unat;
2024 info->pfs_loc = &pt->ar_pfs;
2025 sof = *info->cfm_loc & 0x7f;
2026 info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->regstk.top, -sof);
2027 info->ip = pt->cr_iip + ia64_psr(pt)->ri;
2028 info->pt = (unsigned long) pt;
2029 UNW_DPRINT(3, "unwind.%s:\n"
2030 " bsp 0x%lx\n"
2031 " sof 0x%lx\n"
2032 " ip 0x%lx\n",
2033 __FUNCTION__, info->bsp, sof, info->ip);
2034 find_save_locs(info);
2035}
2036
2037void
2038unw_init_frame_info (struct unw_frame_info *info, struct task_struct *t, struct switch_stack *sw)
2039{
2040 unsigned long sol;
2041
2042 init_frame_info(info, t, sw, (unsigned long) (sw + 1) - 16);
2043 info->cfm_loc = &sw->ar_pfs;
2044 sol = (*info->cfm_loc >> 7) & 0x7f;
2045 info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->regstk.top, -sol);
2046 info->ip = sw->b0;
2047 UNW_DPRINT(3, "unwind.%s:\n"
2048 " bsp 0x%lx\n"
2049 " sol 0x%lx\n"
2050 " ip 0x%lx\n",
2051 __FUNCTION__, info->bsp, sol, info->ip);
2052 find_save_locs(info);
2053}
2054
2055EXPORT_SYMBOL(unw_init_frame_info);
2056
2057void
2058unw_init_from_blocked_task (struct unw_frame_info *info, struct task_struct *t)
2059{
2060 struct switch_stack *sw = (struct switch_stack *) (t->thread.ksp + 16);
2061
2062 UNW_DPRINT(1, "unwind.%s\n", __FUNCTION__);
2063 unw_init_frame_info(info, t, sw);
2064}
2065EXPORT_SYMBOL(unw_init_from_blocked_task);
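
The init entry points above (from an interruption, from a switch_stack, from a blocked task) all feed the same unw_unwind()/unw_get_*() accessors. The sketch below is not part of the original file and uses a made-up helper name; it only shows the typical back-trace loop a caller would build on top of this API.

/* Illustrative sketch only -- dump_blocked_task_stack() is a hypothetical helper. */
static void dump_blocked_task_stack (struct task_struct *t)
{
	struct unw_frame_info info;
	unsigned long ip;

	unw_init_from_blocked_task(&info, t);	/* start at the frame saved by the context switch */
	do {
		unw_get_ip(&info, &ip);		/* instruction pointer of the current frame */
		if (ip == 0)
			break;
		printk(KERN_DEBUG " [<%016lx>]\n", ip);
	} while (unw_unwind(&info) >= 0);	/* step to the caller's frame; < 0 means done or failed */
}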
2066
2067static void
2068init_unwind_table (struct unw_table *table, const char *name, unsigned long segment_base,
2069 unsigned long gp, const void *table_start, const void *table_end)
2070{
2071 const struct unw_table_entry *start = table_start, *end = table_end;
2072
2073 table->name = name;
2074 table->segment_base = segment_base;
2075 table->gp = gp;
2076 table->start = segment_base + start[0].start_offset;
2077 table->end = segment_base + end[-1].end_offset;
2078 table->array = start;
2079 table->length = end - start;
2080}
2081
2082void *
2083unw_add_unwind_table (const char *name, unsigned long segment_base, unsigned long gp,
2084 const void *table_start, const void *table_end)
2085{
2086 const struct unw_table_entry *start = table_start, *end = table_end;
2087 struct unw_table *table;
2088 unsigned long flags;
2089
2090 if (end - start <= 0) {
2091 UNW_DPRINT(0, "unwind.%s: ignoring attempt to insert empty unwind table\n",
2092 __FUNCTION__);
2093 return NULL;
2094 }
2095
2096 table = kmalloc(sizeof(*table), GFP_USER);
2097 if (!table)
2098 return NULL;
2099
2100 init_unwind_table(table, name, segment_base, gp, table_start, table_end);
2101
2102 spin_lock_irqsave(&unw.lock, flags);
2103 {
2104 /* keep kernel unwind table at the front (it's searched most commonly): */
2105 table->next = unw.tables->next;
2106 unw.tables->next = table;
2107 }
2108 spin_unlock_irqrestore(&unw.lock, flags);
2109
2110 return table;
2111}
2112
2113void
2114unw_remove_unwind_table (void *handle)
2115{
2116 struct unw_table *table, *prev;
2117 struct unw_script *tmp;
2118 unsigned long flags;
2119 long index;
2120
2121 if (!handle) {
2122 UNW_DPRINT(0, "unwind.%s: ignoring attempt to remove non-existent unwind table\n",
2123 __FUNCTION__);
2124 return;
2125 }
2126
2127 table = handle;
2128 if (table == &unw.kernel_table) {
2129 UNW_DPRINT(0, "unwind.%s: sorry, freeing the kernel's unwind table is a "
2130 "no-can-do!\n", __FUNCTION__);
2131 return;
2132 }
2133
2134 spin_lock_irqsave(&unw.lock, flags);
2135 {
2136 /* first, delete the table: */
2137
2138 for (prev = (struct unw_table *) &unw.tables; prev; prev = prev->next)
2139 if (prev->next == table)
2140 break;
2141 if (!prev) {
2142 UNW_DPRINT(0, "unwind.%s: failed to find unwind table %p\n",
2143 __FUNCTION__, (void *) table);
2144 spin_unlock_irqrestore(&unw.lock, flags);
2145 return;
2146 }
2147 prev->next = table->next;
2148 }
2149 spin_unlock_irqrestore(&unw.lock, flags);
2150
2151 /* next, remove hash table entries for this table */
2152
2153	for (index = 0; index < UNW_HASH_SIZE; ++index) {
2154 tmp = unw.cache + unw.hash[index];
2155 if (unw.hash[index] >= UNW_CACHE_SIZE
2156 || tmp->ip < table->start || tmp->ip >= table->end)
2157 continue;
2158
2159 write_lock(&tmp->lock);
2160 {
2161 if (tmp->ip >= table->start && tmp->ip < table->end) {
2162 unw.hash[index] = tmp->coll_chain;
2163 tmp->ip = 0;
2164 }
2165 }
2166 write_unlock(&tmp->lock);
2167 }
2168
2169 kfree(table);
2170}
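
unw_add_unwind_table() and unw_remove_unwind_table() bracket the lifetime of a load module's unwind info; the handle returned by the former is what the latter expects. Below is a minimal sketch of that lifecycle, not part of the original file; the loader-side names and values are hypothetical.

/* Illustrative sketch only -- the surrounding loader code is hypothetical. */
static void *my_unw_handle;	/* hypothetical: one handle per load module */

static int register_unwind_info (const char *name, unsigned long segbase, unsigned long gp,
				 void *unw_start, void *unw_end)
{
	my_unw_handle = unw_add_unwind_table(name, segbase, gp, unw_start, unw_end);
	return my_unw_handle ? 0 : -ENOMEM;	/* NULL: empty table or allocation failure */
}

static void unregister_unwind_info (void)
{
	if (my_unw_handle) {
		unw_remove_unwind_table(my_unw_handle);	/* also evicts cached scripts for that range */
		my_unw_handle = NULL;
	}
}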
2171
2172static int __init
2173create_gate_table (void)
2174{
2175 const struct unw_table_entry *entry, *start, *end;
2176 unsigned long *lp, segbase = GATE_ADDR;
2177 size_t info_size, size;
2178 char *info;
2179 Elf64_Phdr *punw = NULL, *phdr = (Elf64_Phdr *) (GATE_ADDR + GATE_EHDR->e_phoff);
2180 int i;
2181
2182 for (i = 0; i < GATE_EHDR->e_phnum; ++i, ++phdr)
2183 if (phdr->p_type == PT_IA_64_UNWIND) {
2184 punw = phdr;
2185 break;
2186 }
2187
2188 if (!punw) {
2189		printk(KERN_ERR "%s: failed to find gate DSO's unwind table!\n", __FUNCTION__);
2190 return 0;
2191 }
2192
2193 start = (const struct unw_table_entry *) punw->p_vaddr;
2194 end = (struct unw_table_entry *) ((char *) start + punw->p_memsz);
2195 size = 0;
2196
2197 unw_add_unwind_table("linux-gate.so", segbase, 0, start, end);
2198
2199 for (entry = start; entry < end; ++entry)
2200 size += 3*8 + 8 + 8*UNW_LENGTH(*(u64 *) (segbase + entry->info_offset));
2201 size += 8; /* reserve space for "end of table" marker */
2202
2203 unw.gate_table = kmalloc(size, GFP_KERNEL);
2204 if (!unw.gate_table) {
2205 unw.gate_table_size = 0;
2206 printk(KERN_ERR "%s: unable to create unwind data for gate page!\n", __FUNCTION__);
2207 return 0;
2208 }
2209 unw.gate_table_size = size;
2210
2211 lp = unw.gate_table;
2212 info = (char *) unw.gate_table + size;
2213
2214 for (entry = start; entry < end; ++entry, lp += 3) {
2215 info_size = 8 + 8*UNW_LENGTH(*(u64 *) (segbase + entry->info_offset));
2216 info -= info_size;
2217 memcpy(info, (char *) segbase + entry->info_offset, info_size);
2218
2219 lp[0] = segbase + entry->start_offset; /* start */
2220 lp[1] = segbase + entry->end_offset; /* end */
2221 lp[2] = info - (char *) unw.gate_table; /* info */
2222 }
2223 *lp = 0; /* end-of-table marker */
2224 return 0;
2225}
2226
2227__initcall(create_gate_table);
2228
2229void __init
2230unw_init (void)
2231{
2232 extern char __gp[];
2233 extern void unw_hash_index_t_is_too_narrow (void);
2234 long i, off;
2235
2236 if (8*sizeof(unw_hash_index_t) < UNW_LOG_HASH_SIZE)
2237 unw_hash_index_t_is_too_narrow();
2238
2239 unw.sw_off[unw.preg_index[UNW_REG_PRI_UNAT_GR]] = SW(AR_UNAT);
2240 unw.sw_off[unw.preg_index[UNW_REG_BSPSTORE]] = SW(AR_BSPSTORE);
2241	unw.sw_off[unw.preg_index[UNW_REG_PFS]] = SW(AR_PFS);
2242 unw.sw_off[unw.preg_index[UNW_REG_RP]] = SW(B0);
2243 unw.sw_off[unw.preg_index[UNW_REG_UNAT]] = SW(AR_UNAT);
2244 unw.sw_off[unw.preg_index[UNW_REG_PR]] = SW(PR);
2245 unw.sw_off[unw.preg_index[UNW_REG_LC]] = SW(AR_LC);
2246 unw.sw_off[unw.preg_index[UNW_REG_FPSR]] = SW(AR_FPSR);
2247 for (i = UNW_REG_R4, off = SW(R4); i <= UNW_REG_R7; ++i, off += 8)
2248 unw.sw_off[unw.preg_index[i]] = off;
2249 for (i = UNW_REG_B1, off = SW(B1); i <= UNW_REG_B5; ++i, off += 8)
2250 unw.sw_off[unw.preg_index[i]] = off;
2251 for (i = UNW_REG_F2, off = SW(F2); i <= UNW_REG_F5; ++i, off += 16)
2252 unw.sw_off[unw.preg_index[i]] = off;
2253 for (i = UNW_REG_F16, off = SW(F16); i <= UNW_REG_F31; ++i, off += 16)
2254 unw.sw_off[unw.preg_index[i]] = off;
2255
2256 for (i = 0; i < UNW_CACHE_SIZE; ++i) {
2257 if (i > 0)
2258 unw.cache[i].lru_chain = (i - 1);
2259 unw.cache[i].coll_chain = -1;
2260 rwlock_init(&unw.cache[i].lock);
2261 }
2262 unw.lru_head = UNW_CACHE_SIZE - 1;
2263 unw.lru_tail = 0;
2264
2265 init_unwind_table(&unw.kernel_table, "kernel", KERNEL_START, (unsigned long) __gp,
2266 __start_unwind, __end_unwind);
2267}
2268
2269/*
2270 * DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED
2271 *
2272 * This system call has been deprecated. The new and improved way to get
2273 * at the kernel's unwind info is via the gate DSO. The address of the
2274 * ELF header for this DSO is passed to user-level via AT_SYSINFO_EHDR.
2275 *
2276 * DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED
2277 *
2278 * This system call copies the unwind data into the buffer pointed to by BUF and returns
2279 * the size of the unwind data. If BUF_SIZE is smaller than the size of the unwind data
2280 * or if BUF is NULL, nothing is copied, but the system call still returns the size of the
2281 * unwind data.
2282 *
2283 * The first portion of the unwind data contains an unwind table and the rest contains the
2284 * associated unwind info (in no particular order). The unwind table consists of a table
2285 * of entries of the form:
2286 *
2287 * u64 start; (64-bit address of start of function)
2288 *	u64 end; (64-bit address of end of function)
2289 * u64 info; (BUF-relative offset to unwind info)
2290 *
2291 * The end of the unwind table is indicated by an entry with a START address of zero.
2292 *
2293 * Please see the IA-64 Software Conventions and Runtime Architecture manual for details
2294 * on the format of the unwind info.
2295 *
2296 * ERRORS
2297 * EFAULT BUF points outside your accessible address space.
2298 */
2299asmlinkage long
2300sys_getunwind (void __user *buf, size_t buf_size)
2301{
2302 if (buf && buf_size >= unw.gate_table_size)
2303 if (copy_to_user(buf, unw.gate_table, unw.gate_table_size) != 0)
2304 return -EFAULT;
2305 return unw.gate_table_size;
2306}
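
For illustration only (not part of the original file), a user-level consumer of this deprecated interface could parse the returned buffer exactly as the comment above describes; getunwind() here stands for a hypothetical libc wrapper around the system call, and on modern kernels the gate DSO advertised via AT_SYSINFO_EHDR is the preferred source of this data.

/* Illustrative user-space sketch only; getunwind() is a hypothetical syscall wrapper. */
#include <stdio.h>
#include <stdlib.h>

extern long getunwind (void *buf, size_t buf_size);

static void dump_kernel_unwind_table (void)
{
	unsigned long *buf, *lp;
	long size;

	size = getunwind(NULL, 0);		/* query size only */
	if (size <= 0)
		return;
	buf = malloc(size);
	if (!buf)
		return;
	if (getunwind(buf, size) != size) {
		free(buf);
		return;
	}
	/* entries are (start, end, BUF-relative info offset) triples; start == 0 ends the table: */
	for (lp = buf; lp[0] != 0; lp += 3)
		printf("func [0x%lx-0x%lx), unwind info at buf+0x%lx\n", lp[0], lp[1], lp[2]);
	free(buf);
}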
diff --git a/arch/ia64/kernel/unwind_decoder.c b/arch/ia64/kernel/unwind_decoder.c
new file mode 100644
index 000000000000..50ac2d82f9bf
--- /dev/null
+++ b/arch/ia64/kernel/unwind_decoder.c
@@ -0,0 +1,459 @@
1/*
2 * Copyright (C) 2000 Hewlett-Packard Co
3 * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
4 *
5 * Generic IA-64 unwind info decoder.
6 *
7 * This file is used both by the Linux kernel and objdump. Please keep
8 * the two copies of this file in sync.
9 *
10 * You need to customize the decoder by defining the following
11 * macros/constants before including this file:
12 *
13 * Types:
14 * unw_word Unsigned integer type with at least 64 bits
15 *
16 * Register names:
17 * UNW_REG_BSP
18 * UNW_REG_BSPSTORE
19 * UNW_REG_FPSR
20 * UNW_REG_LC
21 * UNW_REG_PFS
22 * UNW_REG_PR
23 * UNW_REG_RNAT
24 * UNW_REG_PSP
25 * UNW_REG_RP
26 * UNW_REG_UNAT
27 *
28 * Decoder action macros:
29 * UNW_DEC_BAD_CODE(code)
30 * UNW_DEC_ABI(fmt,abi,context,arg)
31 * UNW_DEC_BR_GR(fmt,brmask,gr,arg)
32 * UNW_DEC_BR_MEM(fmt,brmask,arg)
33 * UNW_DEC_COPY_STATE(fmt,label,arg)
34 * UNW_DEC_EPILOGUE(fmt,t,ecount,arg)
35 * UNW_DEC_FRGR_MEM(fmt,grmask,frmask,arg)
36 * UNW_DEC_FR_MEM(fmt,frmask,arg)
37 * UNW_DEC_GR_GR(fmt,grmask,gr,arg)
38 * UNW_DEC_GR_MEM(fmt,grmask,arg)
39 * UNW_DEC_LABEL_STATE(fmt,label,arg)
40 * UNW_DEC_MEM_STACK_F(fmt,t,size,arg)
41 * UNW_DEC_MEM_STACK_V(fmt,t,arg)
42 * UNW_DEC_PRIUNAT_GR(fmt,r,arg)
43 * UNW_DEC_PRIUNAT_WHEN_GR(fmt,t,arg)
44 * UNW_DEC_PRIUNAT_WHEN_MEM(fmt,t,arg)
45 * UNW_DEC_PRIUNAT_WHEN_PSPREL(fmt,pspoff,arg)
46 * UNW_DEC_PRIUNAT_WHEN_SPREL(fmt,spoff,arg)
47 * UNW_DEC_PROLOGUE(fmt,body,rlen,arg)
48 * UNW_DEC_PROLOGUE_GR(fmt,rlen,mask,grsave,arg)
49 * UNW_DEC_REG_PSPREL(fmt,reg,pspoff,arg)
50 * UNW_DEC_REG_REG(fmt,src,dst,arg)
51 * UNW_DEC_REG_SPREL(fmt,reg,spoff,arg)
52 * UNW_DEC_REG_WHEN(fmt,reg,t,arg)
53 * UNW_DEC_RESTORE(fmt,t,abreg,arg)
54 * UNW_DEC_RESTORE_P(fmt,qp,t,abreg,arg)
55 * UNW_DEC_SPILL_BASE(fmt,pspoff,arg)
56 * UNW_DEC_SPILL_MASK(fmt,imaskp,arg)
57 * UNW_DEC_SPILL_PSPREL(fmt,t,abreg,pspoff,arg)
58 * UNW_DEC_SPILL_PSPREL_P(fmt,qp,t,abreg,pspoff,arg)
59 * UNW_DEC_SPILL_REG(fmt,t,abreg,x,ytreg,arg)
60 * UNW_DEC_SPILL_REG_P(fmt,qp,t,abreg,x,ytreg,arg)
61 * UNW_DEC_SPILL_SPREL(fmt,t,abreg,spoff,arg)
62 * UNW_DEC_SPILL_SPREL_P(fmt,qp,t,abreg,pspoff,arg)
63 */
64
65static unw_word
66unw_decode_uleb128 (unsigned char **dpp)
67{
68 unsigned shift = 0;
69 unw_word byte, result = 0;
70 unsigned char *bp = *dpp;
71
72 while (1)
73 {
74 byte = *bp++;
75 result |= (byte & 0x7f) << shift;
76 if ((byte & 0x80) == 0)
77 break;
78 shift += 7;
79 }
80 *dpp = bp;
81 return result;
82}
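
The helper above is a standard unsigned LEB128 decoder: each byte contributes seven payload bits, least-significant group first, and a clear top bit ends the sequence. A small worked example follows; it is illustrative only and not part of the original file.

/* Illustrative sketch only: decode the 3-byte ULEB128 sequence e5 8e 26. */
static void uleb128_example (void)
{
	unsigned char buf[] = { 0xe5, 0x8e, 0x26 };
	unsigned char *dp = buf;
	unw_word val;

	val = unw_decode_uleb128(&dp);
	/* val == 0x65 + (0x0e << 7) + (0x26 << 14) == 624485; dp == buf + 3 */
	(void) val;
}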
83
84static unsigned char *
85unw_decode_x1 (unsigned char *dp, unsigned char code, void *arg)
86{
87 unsigned char byte1, abreg;
88 unw_word t, off;
89
90 byte1 = *dp++;
91 t = unw_decode_uleb128 (&dp);
92 off = unw_decode_uleb128 (&dp);
93 abreg = (byte1 & 0x7f);
94 if (byte1 & 0x80)
95 UNW_DEC_SPILL_SPREL(X1, t, abreg, off, arg);
96 else
97 UNW_DEC_SPILL_PSPREL(X1, t, abreg, off, arg);
98 return dp;
99}
100
101static unsigned char *
102unw_decode_x2 (unsigned char *dp, unsigned char code, void *arg)
103{
104 unsigned char byte1, byte2, abreg, x, ytreg;
105 unw_word t;
106
107 byte1 = *dp++; byte2 = *dp++;
108 t = unw_decode_uleb128 (&dp);
109 abreg = (byte1 & 0x7f);
110 ytreg = byte2;
111 x = (byte1 >> 7) & 1;
112 if ((byte1 & 0x80) == 0 && ytreg == 0)
113 UNW_DEC_RESTORE(X2, t, abreg, arg);
114 else
115 UNW_DEC_SPILL_REG(X2, t, abreg, x, ytreg, arg);
116 return dp;
117}
118
119static unsigned char *
120unw_decode_x3 (unsigned char *dp, unsigned char code, void *arg)
121{
122 unsigned char byte1, byte2, abreg, qp;
123 unw_word t, off;
124
125 byte1 = *dp++; byte2 = *dp++;
126 t = unw_decode_uleb128 (&dp);
127 off = unw_decode_uleb128 (&dp);
128
129 qp = (byte1 & 0x3f);
130 abreg = (byte2 & 0x7f);
131
132 if (byte1 & 0x80)
133 UNW_DEC_SPILL_SPREL_P(X3, qp, t, abreg, off, arg);
134 else
135 UNW_DEC_SPILL_PSPREL_P(X3, qp, t, abreg, off, arg);
136 return dp;
137}
138
139static unsigned char *
140unw_decode_x4 (unsigned char *dp, unsigned char code, void *arg)
141{
142 unsigned char byte1, byte2, byte3, qp, abreg, x, ytreg;
143 unw_word t;
144
145 byte1 = *dp++; byte2 = *dp++; byte3 = *dp++;
146 t = unw_decode_uleb128 (&dp);
147
148 qp = (byte1 & 0x3f);
149 abreg = (byte2 & 0x7f);
150 x = (byte2 >> 7) & 1;
151 ytreg = byte3;
152
153 if ((byte2 & 0x80) == 0 && byte3 == 0)
154 UNW_DEC_RESTORE_P(X4, qp, t, abreg, arg);
155 else
156 UNW_DEC_SPILL_REG_P(X4, qp, t, abreg, x, ytreg, arg);
157 return dp;
158}
159
160static unsigned char *
161unw_decode_r1 (unsigned char *dp, unsigned char code, void *arg)
162{
163 int body = (code & 0x20) != 0;
164 unw_word rlen;
165
166 rlen = (code & 0x1f);
167 UNW_DEC_PROLOGUE(R1, body, rlen, arg);
168 return dp;
169}
170
171static unsigned char *
172unw_decode_r2 (unsigned char *dp, unsigned char code, void *arg)
173{
174 unsigned char byte1, mask, grsave;
175 unw_word rlen;
176
177 byte1 = *dp++;
178
179 mask = ((code & 0x7) << 1) | ((byte1 >> 7) & 1);
180 grsave = (byte1 & 0x7f);
181 rlen = unw_decode_uleb128 (&dp);
182 UNW_DEC_PROLOGUE_GR(R2, rlen, mask, grsave, arg);
183 return dp;
184}
185
186static unsigned char *
187unw_decode_r3 (unsigned char *dp, unsigned char code, void *arg)
188{
189 unw_word rlen;
190
191 rlen = unw_decode_uleb128 (&dp);
192 UNW_DEC_PROLOGUE(R3, ((code & 0x3) == 1), rlen, arg);
193 return dp;
194}
195
196static unsigned char *
197unw_decode_p1 (unsigned char *dp, unsigned char code, void *arg)
198{
199 unsigned char brmask = (code & 0x1f);
200
201 UNW_DEC_BR_MEM(P1, brmask, arg);
202 return dp;
203}
204
205static unsigned char *
206unw_decode_p2_p5 (unsigned char *dp, unsigned char code, void *arg)
207{
208 if ((code & 0x10) == 0)
209 {
210 unsigned char byte1 = *dp++;
211
212 UNW_DEC_BR_GR(P2, ((code & 0xf) << 1) | ((byte1 >> 7) & 1),
213 (byte1 & 0x7f), arg);
214 }
215 else if ((code & 0x08) == 0)
216 {
217 unsigned char byte1 = *dp++, r, dst;
218
219 r = ((code & 0x7) << 1) | ((byte1 >> 7) & 1);
220 dst = (byte1 & 0x7f);
221 switch (r)
222 {
223 case 0: UNW_DEC_REG_GR(P3, UNW_REG_PSP, dst, arg); break;
224 case 1: UNW_DEC_REG_GR(P3, UNW_REG_RP, dst, arg); break;
225 case 2: UNW_DEC_REG_GR(P3, UNW_REG_PFS, dst, arg); break;
226 case 3: UNW_DEC_REG_GR(P3, UNW_REG_PR, dst, arg); break;
227 case 4: UNW_DEC_REG_GR(P3, UNW_REG_UNAT, dst, arg); break;
228 case 5: UNW_DEC_REG_GR(P3, UNW_REG_LC, dst, arg); break;
229 case 6: UNW_DEC_RP_BR(P3, dst, arg); break;
230 case 7: UNW_DEC_REG_GR(P3, UNW_REG_RNAT, dst, arg); break;
231 case 8: UNW_DEC_REG_GR(P3, UNW_REG_BSP, dst, arg); break;
232 case 9: UNW_DEC_REG_GR(P3, UNW_REG_BSPSTORE, dst, arg); break;
233 case 10: UNW_DEC_REG_GR(P3, UNW_REG_FPSR, dst, arg); break;
234 case 11: UNW_DEC_PRIUNAT_GR(P3, dst, arg); break;
235 default: UNW_DEC_BAD_CODE(r); break;
236 }
237 }
238 else if ((code & 0x7) == 0)
239 UNW_DEC_SPILL_MASK(P4, dp, arg);
240 else if ((code & 0x7) == 1)
241 {
242 unw_word grmask, frmask, byte1, byte2, byte3;
243
244 byte1 = *dp++; byte2 = *dp++; byte3 = *dp++;
245 grmask = ((byte1 >> 4) & 0xf);
246 frmask = ((byte1 & 0xf) << 16) | (byte2 << 8) | byte3;
247 UNW_DEC_FRGR_MEM(P5, grmask, frmask, arg);
248 }
249 else
250 UNW_DEC_BAD_CODE(code);
251 return dp;
252}
253
254static unsigned char *
255unw_decode_p6 (unsigned char *dp, unsigned char code, void *arg)
256{
257 int gregs = (code & 0x10) != 0;
258 unsigned char mask = (code & 0x0f);
259
260 if (gregs)
261 UNW_DEC_GR_MEM(P6, mask, arg);
262 else
263 UNW_DEC_FR_MEM(P6, mask, arg);
264 return dp;
265}
266
267static unsigned char *
268unw_decode_p7_p10 (unsigned char *dp, unsigned char code, void *arg)
269{
270 unsigned char r, byte1, byte2;
271 unw_word t, size;
272
273 if ((code & 0x10) == 0)
274 {
275 r = (code & 0xf);
276 t = unw_decode_uleb128 (&dp);
277 switch (r)
278 {
279 case 0:
280 size = unw_decode_uleb128 (&dp);
281 UNW_DEC_MEM_STACK_F(P7, t, size, arg);
282 break;
283
284 case 1: UNW_DEC_MEM_STACK_V(P7, t, arg); break;
285 case 2: UNW_DEC_SPILL_BASE(P7, t, arg); break;
286 case 3: UNW_DEC_REG_SPREL(P7, UNW_REG_PSP, t, arg); break;
287 case 4: UNW_DEC_REG_WHEN(P7, UNW_REG_RP, t, arg); break;
288 case 5: UNW_DEC_REG_PSPREL(P7, UNW_REG_RP, t, arg); break;
289 case 6: UNW_DEC_REG_WHEN(P7, UNW_REG_PFS, t, arg); break;
290 case 7: UNW_DEC_REG_PSPREL(P7, UNW_REG_PFS, t, arg); break;
291 case 8: UNW_DEC_REG_WHEN(P7, UNW_REG_PR, t, arg); break;
292 case 9: UNW_DEC_REG_PSPREL(P7, UNW_REG_PR, t, arg); break;
293 case 10: UNW_DEC_REG_WHEN(P7, UNW_REG_LC, t, arg); break;
294 case 11: UNW_DEC_REG_PSPREL(P7, UNW_REG_LC, t, arg); break;
295 case 12: UNW_DEC_REG_WHEN(P7, UNW_REG_UNAT, t, arg); break;
296 case 13: UNW_DEC_REG_PSPREL(P7, UNW_REG_UNAT, t, arg); break;
297 case 14: UNW_DEC_REG_WHEN(P7, UNW_REG_FPSR, t, arg); break;
298 case 15: UNW_DEC_REG_PSPREL(P7, UNW_REG_FPSR, t, arg); break;
299 default: UNW_DEC_BAD_CODE(r); break;
300 }
301 }
302 else
303 {
304 switch (code & 0xf)
305 {
306 case 0x0: /* p8 */
307 {
308 r = *dp++;
309 t = unw_decode_uleb128 (&dp);
310 switch (r)
311 {
312 case 1: UNW_DEC_REG_SPREL(P8, UNW_REG_RP, t, arg); break;
313 case 2: UNW_DEC_REG_SPREL(P8, UNW_REG_PFS, t, arg); break;
314 case 3: UNW_DEC_REG_SPREL(P8, UNW_REG_PR, t, arg); break;
315 case 4: UNW_DEC_REG_SPREL(P8, UNW_REG_LC, t, arg); break;
316 case 5: UNW_DEC_REG_SPREL(P8, UNW_REG_UNAT, t, arg); break;
317 case 6: UNW_DEC_REG_SPREL(P8, UNW_REG_FPSR, t, arg); break;
318 case 7: UNW_DEC_REG_WHEN(P8, UNW_REG_BSP, t, arg); break;
319 case 8: UNW_DEC_REG_PSPREL(P8, UNW_REG_BSP, t, arg); break;
320 case 9: UNW_DEC_REG_SPREL(P8, UNW_REG_BSP, t, arg); break;
321 case 10: UNW_DEC_REG_WHEN(P8, UNW_REG_BSPSTORE, t, arg); break;
322 case 11: UNW_DEC_REG_PSPREL(P8, UNW_REG_BSPSTORE, t, arg); break;
323 case 12: UNW_DEC_REG_SPREL(P8, UNW_REG_BSPSTORE, t, arg); break;
324 case 13: UNW_DEC_REG_WHEN(P8, UNW_REG_RNAT, t, arg); break;
325 case 14: UNW_DEC_REG_PSPREL(P8, UNW_REG_RNAT, t, arg); break;
326 case 15: UNW_DEC_REG_SPREL(P8, UNW_REG_RNAT, t, arg); break;
327 case 16: UNW_DEC_PRIUNAT_WHEN_GR(P8, t, arg); break;
328 case 17: UNW_DEC_PRIUNAT_PSPREL(P8, t, arg); break;
329 case 18: UNW_DEC_PRIUNAT_SPREL(P8, t, arg); break;
330 case 19: UNW_DEC_PRIUNAT_WHEN_MEM(P8, t, arg); break;
331 default: UNW_DEC_BAD_CODE(r); break;
332 }
333 }
334 break;
335
336 case 0x1:
337 byte1 = *dp++; byte2 = *dp++;
338 UNW_DEC_GR_GR(P9, (byte1 & 0xf), (byte2 & 0x7f), arg);
339 break;
340
341 case 0xf: /* p10 */
342 byte1 = *dp++; byte2 = *dp++;
343 UNW_DEC_ABI(P10, byte1, byte2, arg);
344 break;
345
346 case 0x9:
347 return unw_decode_x1 (dp, code, arg);
348
349 case 0xa:
350 return unw_decode_x2 (dp, code, arg);
351
352 case 0xb:
353 return unw_decode_x3 (dp, code, arg);
354
355 case 0xc:
356 return unw_decode_x4 (dp, code, arg);
357
358 default:
359 UNW_DEC_BAD_CODE(code);
360 break;
361 }
362 }
363 return dp;
364}
365
366static unsigned char *
367unw_decode_b1 (unsigned char *dp, unsigned char code, void *arg)
368{
369 unw_word label = (code & 0x1f);
370
371 if ((code & 0x20) != 0)
372 UNW_DEC_COPY_STATE(B1, label, arg);
373 else
374 UNW_DEC_LABEL_STATE(B1, label, arg);
375 return dp;
376}
377
378static unsigned char *
379unw_decode_b2 (unsigned char *dp, unsigned char code, void *arg)
380{
381 unw_word t;
382
383 t = unw_decode_uleb128 (&dp);
384 UNW_DEC_EPILOGUE(B2, t, (code & 0x1f), arg);
385 return dp;
386}
387
388static unsigned char *
389unw_decode_b3_x4 (unsigned char *dp, unsigned char code, void *arg)
390{
391 unw_word t, ecount, label;
392
393 if ((code & 0x10) == 0)
394 {
395 t = unw_decode_uleb128 (&dp);
396 ecount = unw_decode_uleb128 (&dp);
397 UNW_DEC_EPILOGUE(B3, t, ecount, arg);
398 }
399 else if ((code & 0x07) == 0)
400 {
401 label = unw_decode_uleb128 (&dp);
402 if ((code & 0x08) != 0)
403 UNW_DEC_COPY_STATE(B4, label, arg);
404 else
405 UNW_DEC_LABEL_STATE(B4, label, arg);
406 }
407 else
408 switch (code & 0x7)
409 {
410 case 1: return unw_decode_x1 (dp, code, arg);
411 case 2: return unw_decode_x2 (dp, code, arg);
412 case 3: return unw_decode_x3 (dp, code, arg);
413 case 4: return unw_decode_x4 (dp, code, arg);
414 default: UNW_DEC_BAD_CODE(code); break;
415 }
416 return dp;
417}
418
419typedef unsigned char *(*unw_decoder) (unsigned char *, unsigned char, void *);
420
421static unw_decoder unw_decode_table[2][8] =
422{
423 /* prologue table: */
424 {
425 unw_decode_r1, /* 0 */
426 unw_decode_r1,
427 unw_decode_r2,
428 unw_decode_r3,
429 unw_decode_p1, /* 4 */
430 unw_decode_p2_p5,
431 unw_decode_p6,
432 unw_decode_p7_p10
433 },
434 {
435 unw_decode_r1, /* 0 */
436 unw_decode_r1,
437 unw_decode_r2,
438 unw_decode_r3,
439 unw_decode_b1, /* 4 */
440 unw_decode_b1,
441 unw_decode_b2,
442 unw_decode_b3_x4
443 }
444};
445
446/*
447 * Decode one descriptor and return address of next descriptor.
448 */
449static inline unsigned char *
450unw_decode (unsigned char *dp, int inside_body, void *arg)
451{
452 unw_decoder decoder;
453 unsigned char code;
454
455 code = *dp++;
456 decoder = unw_decode_table[inside_body][code >> 5];
457 dp = (*decoder) (dp, code, arg);
458 return dp;
459}
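
A consumer drives unw_decode() in a loop over the descriptor area of one unwind info block. The sketch below is not part of the original file; in a real consumer the in_body flag is taken from the decoder state, which the UNW_DEC_* action macros update once a body region begins.

/* Illustrative sketch only: 'arg' is whatever state the UNW_DEC_* macros operate on. */
static void decode_descriptor_area (unsigned char *desc, unsigned char *desc_end, void *arg)
{
	unsigned char *dp = desc;
	int in_body = 0;	/* a real consumer flips this when a body region begins */

	while (dp < desc_end)
		dp = unw_decode(dp, in_body, arg);	/* returns address of the next descriptor */
}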
diff --git a/arch/ia64/kernel/unwind_i.h b/arch/ia64/kernel/unwind_i.h
new file mode 100644
index 000000000000..96693a6ae370
--- /dev/null
+++ b/arch/ia64/kernel/unwind_i.h
@@ -0,0 +1,164 @@
1/*
2 * Copyright (C) 2000, 2002-2003 Hewlett-Packard Co
3 * David Mosberger-Tang <davidm@hpl.hp.com>
4 *
5 * Kernel unwind support.
6 */
7
8#define UNW_VER(x) ((x) >> 48)
9#define UNW_FLAG_MASK 0x0000ffff00000000
10#define UNW_FLAG_OSMASK 0x0000f00000000000
11#define UNW_FLAG_EHANDLER(x) ((x) & 0x0000000100000000L)
12#define UNW_FLAG_UHANDLER(x) ((x) & 0x0000000200000000L)
13#define UNW_LENGTH(x) ((x) & 0x00000000ffffffffL)
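
These macros carve up the 64-bit header word at the start of an unwind info block: version in the top 16 bits, flags in bits 32-47, and the descriptor length, counted in 8-byte words, in the low 32 bits, which is why create_gate_table() in unwind.c multiplies UNW_LENGTH() by 8 to get a byte count. A hedged sketch of unpacking such a header (illustrative only, not part of the original header):

/* Illustrative sketch only -- not part of the original header file. */
static inline void unw_print_info_header (u64 header)
{
	printk("unwind info: version=%lu%s%s, %lu descriptor bytes\n",
	       (unsigned long) UNW_VER(header),
	       UNW_FLAG_EHANDLER(header) ? ", ehandler" : "",
	       UNW_FLAG_UHANDLER(header) ? ", uhandler" : "",
	       (unsigned long) (8*UNW_LENGTH(header)));
}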
14
15enum unw_register_index {
16 /* primary unat: */
17 UNW_REG_PRI_UNAT_GR,
18 UNW_REG_PRI_UNAT_MEM,
19
20 /* register stack */
21 UNW_REG_BSP, /* register stack pointer */
22 UNW_REG_BSPSTORE,
23 UNW_REG_PFS, /* previous function state */
24 UNW_REG_RNAT,
25 /* memory stack */
26 UNW_REG_PSP, /* previous memory stack pointer */
27 /* return pointer: */
28 UNW_REG_RP,
29
30 /* preserved registers: */
31 UNW_REG_R4, UNW_REG_R5, UNW_REG_R6, UNW_REG_R7,
32 UNW_REG_UNAT, UNW_REG_PR, UNW_REG_LC, UNW_REG_FPSR,
33 UNW_REG_B1, UNW_REG_B2, UNW_REG_B3, UNW_REG_B4, UNW_REG_B5,
34 UNW_REG_F2, UNW_REG_F3, UNW_REG_F4, UNW_REG_F5,
35 UNW_REG_F16, UNW_REG_F17, UNW_REG_F18, UNW_REG_F19,
36 UNW_REG_F20, UNW_REG_F21, UNW_REG_F22, UNW_REG_F23,
37 UNW_REG_F24, UNW_REG_F25, UNW_REG_F26, UNW_REG_F27,
38 UNW_REG_F28, UNW_REG_F29, UNW_REG_F30, UNW_REG_F31,
39 UNW_NUM_REGS
40};
41
42struct unw_info_block {
43 u64 header;
44 u64 desc[0]; /* unwind descriptors */
45 /* personality routine and language-specific data follow behind descriptors */
46};
47
48struct unw_table {
49 struct unw_table *next; /* must be first member! */
50 const char *name;
51 unsigned long gp; /* global pointer for this load-module */
52 unsigned long segment_base; /* base for offsets in the unwind table entries */
53 unsigned long start;
54 unsigned long end;
55 const struct unw_table_entry *array;
56 unsigned long length;
57};
58
59enum unw_where {
60 UNW_WHERE_NONE, /* register isn't saved at all */
61 UNW_WHERE_GR, /* register is saved in a general register */
62 UNW_WHERE_FR, /* register is saved in a floating-point register */
63 UNW_WHERE_BR, /* register is saved in a branch register */
64 UNW_WHERE_SPREL, /* register is saved on memstack (sp-relative) */
65 UNW_WHERE_PSPREL, /* register is saved on memstack (psp-relative) */
66 /*
67 * At the end of each prologue these locations get resolved to
68 * UNW_WHERE_PSPREL and UNW_WHERE_GR, respectively:
69 */
70 UNW_WHERE_SPILL_HOME, /* register is saved in its spill home */
71 UNW_WHERE_GR_SAVE /* register is saved in next general register */
72};
73
74#define UNW_WHEN_NEVER 0x7fffffff
75
76struct unw_reg_info {
77 unsigned long val; /* save location: register number or offset */
78 enum unw_where where; /* where the register gets saved */
79 int when; /* when the register gets saved */
80};
81
82struct unw_reg_state {
83 struct unw_reg_state *next; /* next (outer) element on state stack */
84 struct unw_reg_info reg[UNW_NUM_REGS]; /* register save locations */
85};
86
87struct unw_labeled_state {
88 struct unw_labeled_state *next; /* next labeled state (or NULL) */
89 unsigned long label; /* label for this state */
90 struct unw_reg_state saved_state;
91};
92
93struct unw_state_record {
94 unsigned int first_region : 1; /* is this the first region? */
95 unsigned int done : 1; /* are we done scanning descriptors? */
96 unsigned int any_spills : 1; /* got any register spills? */
97 unsigned int in_body : 1; /* are we inside a body (as opposed to a prologue)? */
98 unsigned long flags; /* see UNW_FLAG_* in unwind.h */
99
100 u8 *imask; /* imask of spill_mask record or NULL */
101 unsigned long pr_val; /* predicate values */
102 unsigned long pr_mask; /* predicate mask */
103 long spill_offset; /* psp-relative offset for spill base */
104 int region_start;
105 int region_len;
106 int epilogue_start;
107 int epilogue_count;
108 int when_target;
109
110 u8 gr_save_loc; /* next general register to use for saving a register */
111 u8 return_link_reg; /* branch register in which the return link is passed */
112
113 struct unw_labeled_state *labeled_states; /* list of all labeled states */
114 struct unw_reg_state curr; /* current state */
115};
116
117enum unw_nat_type {
118 UNW_NAT_NONE, /* NaT not represented */
119 UNW_NAT_VAL, /* NaT represented by NaT value (fp reg) */
120 UNW_NAT_MEMSTK, /* NaT value is in unat word at offset OFF */
121 UNW_NAT_REGSTK /* NaT is in rnat */
122};
123
124enum unw_insn_opcode {
125 UNW_INSN_ADD, /* s[dst] += val */
126 UNW_INSN_ADD_PSP, /* s[dst] = (s.psp + val) */
127 UNW_INSN_ADD_SP, /* s[dst] = (s.sp + val) */
128 UNW_INSN_MOVE, /* s[dst] = s[val] */
129 UNW_INSN_MOVE2, /* s[dst] = s[val]; s[dst+1] = s[val+1] */
130 UNW_INSN_MOVE_STACKED, /* s[dst] = ia64_rse_skip(*s.bsp, val) */
131 UNW_INSN_SETNAT_MEMSTK, /* s[dst+1].nat.type = MEMSTK;
132 s[dst+1].nat.off = *s.pri_unat - s[dst] */
133 UNW_INSN_SETNAT_TYPE, /* s[dst+1].nat.type = val */
134 UNW_INSN_LOAD, /* s[dst] = *s[val] */
135 UNW_INSN_MOVE_SCRATCH, /* s[dst] = scratch reg "val" */
136 UNW_INSN_MOVE_CONST, /* s[dst] = constant reg "val" */
137};
138
139struct unw_insn {
140 unsigned int opc : 4;
141 unsigned int dst : 9;
142 signed int val : 19;
143};
144
145/*
146 * Preserved general static registers (r4-r7) give rise to two script
147 * instructions; everything else yields at most one instruction; at
148 * the end of the script, the psp gets popped, accounting for one more
149 * instruction.  Hence the bound below: UNW_NUM_REGS + 4 (extra for r4-r7) + 1 (psp pop) = UNW_NUM_REGS + 5.
150 */
151#define UNW_MAX_SCRIPT_LEN (UNW_NUM_REGS + 5)
152
153struct unw_script {
154 unsigned long ip; /* ip this script is for */
155 unsigned long pr_mask; /* mask of predicates script depends on */
156 unsigned long pr_val; /* predicate values this script is for */
157 rwlock_t lock;
158 unsigned int flags; /* see UNW_FLAG_* in unwind.h */
159 unsigned short lru_chain; /* used for least-recently-used chain */
160 unsigned short coll_chain; /* used for hash collisions */
161 unsigned short hint; /* hint for next script to try (or -1) */
162 unsigned short count; /* number of instructions in script */
163 struct unw_insn insn[UNW_MAX_SCRIPT_LEN];
164};
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
new file mode 100644
index 000000000000..b9f0db4c1b04
--- /dev/null
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -0,0 +1,251 @@
1#include <linux/config.h>
2
3#include <asm/cache.h>
4#include <asm/ptrace.h>
5#include <asm/system.h>
6#include <asm/pgtable.h>
7
8#define LOAD_OFFSET (KERNEL_START - KERNEL_TR_PAGE_SIZE)
9#include <asm-generic/vmlinux.lds.h>
10
11OUTPUT_FORMAT("elf64-ia64-little")
12OUTPUT_ARCH(ia64)
13ENTRY(phys_start)
14jiffies = jiffies_64;
15PHDRS {
16 code PT_LOAD;
17 percpu PT_LOAD;
18 data PT_LOAD;
19}
20SECTIONS
21{
22 /* Sections to be discarded */
23 /DISCARD/ : {
24 *(.exit.text)
25 *(.exit.data)
26 *(.exitcall.exit)
27 *(.IA_64.unwind.exit.text)
28 *(.IA_64.unwind_info.exit.text)
29 }
30
31 v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */
32 phys_start = _start - LOAD_OFFSET;
33
34 code : { } :code
35 . = KERNEL_START;
36
37 _text = .;
38 _stext = .;
39
40 .text : AT(ADDR(.text) - LOAD_OFFSET)
41 {
42 *(.text.ivt)
43 *(.text)
44 SCHED_TEXT
45 LOCK_TEXT
46 *(.gnu.linkonce.t*)
47 }
48 .text2 : AT(ADDR(.text2) - LOAD_OFFSET)
49 { *(.text2) }
50#ifdef CONFIG_SMP
51 .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET)
52 { *(.text.lock) }
53#endif
54 _etext = .;
55
56 /* Read-only data */
57
58 /* Exception table */
59 . = ALIGN(16);
60 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET)
61 {
62 __start___ex_table = .;
63 *(__ex_table)
64 __stop___ex_table = .;
65 }
66
67 .data.patch.vtop : AT(ADDR(.data.patch.vtop) - LOAD_OFFSET)
68 {
69 __start___vtop_patchlist = .;
70 *(.data.patch.vtop)
71 __end___vtop_patchlist = .;
72 }
73
74 .data.patch.mckinley_e9 : AT(ADDR(.data.patch.mckinley_e9) - LOAD_OFFSET)
75 {
76 __start___mckinley_e9_bundles = .;
77 *(.data.patch.mckinley_e9)
78 __end___mckinley_e9_bundles = .;
79 }
80
81 /* Global data */
82 _data = .;
83
84#if defined(CONFIG_IA64_GENERIC)
85 /* Machine Vector */
86 . = ALIGN(16);
87 .machvec : AT(ADDR(.machvec) - LOAD_OFFSET)
88 {
89 machvec_start = .;
90 *(.machvec)
91 machvec_end = .;
92 }
93#endif
94
95 /* Unwind info & table: */
96 . = ALIGN(8);
97 .IA_64.unwind_info : AT(ADDR(.IA_64.unwind_info) - LOAD_OFFSET)
98 { *(.IA_64.unwind_info*) }
99 .IA_64.unwind : AT(ADDR(.IA_64.unwind) - LOAD_OFFSET)
100 {
101 __start_unwind = .;
102 *(.IA_64.unwind*)
103 __end_unwind = .;
104 }
105
106 RODATA
107
108 .opd : AT(ADDR(.opd) - LOAD_OFFSET)
109 { *(.opd) }
110
111 /* Initialization code and data: */
112
113 . = ALIGN(PAGE_SIZE);
114 __init_begin = .;
115 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET)
116 {
117 _sinittext = .;
118 *(.init.text)
119 _einittext = .;
120 }
121
122 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET)
123 { *(.init.data) }
124
125 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET)
126 {
127 __initramfs_start = .;
128 *(.init.ramfs)
129 __initramfs_end = .;
130 }
131
132 . = ALIGN(16);
133 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET)
134 {
135 __setup_start = .;
136 *(.init.setup)
137 __setup_end = .;
138 }
139 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET)
140 {
141 __initcall_start = .;
142 *(.initcall1.init)
143 *(.initcall2.init)
144 *(.initcall3.init)
145 *(.initcall4.init)
146 *(.initcall5.init)
147 *(.initcall6.init)
148 *(.initcall7.init)
149 __initcall_end = .;
150 }
151 __con_initcall_start = .;
152 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET)
153 { *(.con_initcall.init) }
154 __con_initcall_end = .;
155 __security_initcall_start = .;
156 .security_initcall.init : AT(ADDR(.security_initcall.init) - LOAD_OFFSET)
157 { *(.security_initcall.init) }
158 __security_initcall_end = .;
159 . = ALIGN(PAGE_SIZE);
160 __init_end = .;
161
162 /* The initial task and kernel stack */
163 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET)
164 { *(.data.init_task) }
165
166 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET)
167 { *(__special_page_section)
168 __start_gate_section = .;
169 *(.data.gate)
170 __stop_gate_section = .;
171 }
172 . = ALIGN(PAGE_SIZE); /* make sure the gate page doesn't expose kernel data */
173
174 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET)
175 { *(.data.cacheline_aligned) }
176
177 /* Per-cpu data: */
178 percpu : { } :percpu
179 . = ALIGN(PERCPU_PAGE_SIZE);
180 __phys_per_cpu_start = .;
181 .data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET)
182 {
183 __per_cpu_start = .;
184 *(.data.percpu)
185 __per_cpu_end = .;
186 }
187 . = __phys_per_cpu_start + PERCPU_PAGE_SIZE; /* ensure percpu data fits into percpu page size */
188
189 data : { } :data
190 .data : AT(ADDR(.data) - LOAD_OFFSET)
191 { *(.data) *(.data1) *(.gnu.linkonce.d*) CONSTRUCTORS }
192
193 . = ALIGN(16); /* gp must be 16-byte aligned for exc. table */
194 .got : AT(ADDR(.got) - LOAD_OFFSET)
195 { *(.got.plt) *(.got) }
196 __gp = ADDR(.got) + 0x200000;
197 /* We want the small data sections together, so single-instruction offsets
198 can access them all, and initialized data all before uninitialized, so
199 we can shorten the on-disk segment size. */
200 .sdata : AT(ADDR(.sdata) - LOAD_OFFSET)
201 { *(.sdata) *(.sdata1) *(.srdata) }
202 _edata = .;
203 _bss = .;
204 .sbss : AT(ADDR(.sbss) - LOAD_OFFSET)
205 { *(.sbss) *(.scommon) }
206 .bss : AT(ADDR(.bss) - LOAD_OFFSET)
207 { *(.bss) *(COMMON) }
208
209 _end = .;
210
211 code : { } :code
212 /* Stabs debugging sections. */
213 .stab 0 : { *(.stab) }
214 .stabstr 0 : { *(.stabstr) }
215 .stab.excl 0 : { *(.stab.excl) }
216 .stab.exclstr 0 : { *(.stab.exclstr) }
217 .stab.index 0 : { *(.stab.index) }
218 .stab.indexstr 0 : { *(.stab.indexstr) }
219 /* DWARF debug sections.
220 Symbols in the DWARF debugging sections are relative to the beginning
221 of the section so we begin them at 0. */
222 /* DWARF 1 */
223 .debug 0 : { *(.debug) }
224 .line 0 : { *(.line) }
225 /* GNU DWARF 1 extensions */
226 .debug_srcinfo 0 : { *(.debug_srcinfo) }
227 .debug_sfnames 0 : { *(.debug_sfnames) }
228 /* DWARF 1.1 and DWARF 2 */
229 .debug_aranges 0 : { *(.debug_aranges) }
230 .debug_pubnames 0 : { *(.debug_pubnames) }
231 /* DWARF 2 */
232 .debug_info 0 : { *(.debug_info) }
233 .debug_abbrev 0 : { *(.debug_abbrev) }
234 .debug_line 0 : { *(.debug_line) }
235 .debug_frame 0 : { *(.debug_frame) }
236 .debug_str 0 : { *(.debug_str) }
237 .debug_loc 0 : { *(.debug_loc) }
238 .debug_macinfo 0 : { *(.debug_macinfo) }
239 /* SGI/MIPS DWARF 2 extensions */
240 .debug_weaknames 0 : { *(.debug_weaknames) }
241 .debug_funcnames 0 : { *(.debug_funcnames) }
242 .debug_typenames 0 : { *(.debug_typenames) }
243 .debug_varnames 0 : { *(.debug_varnames) }
244 /* These must appear regardless of . */
245 /* Discard them for now since Intel SoftSDV cannot handle them.
246 .comment 0 : { *(.comment) }
247 .note 0 : { *(.note) }
248 */
249 /DISCARD/ : { *(.comment) }
250 /DISCARD/ : { *(.note) }
251}