Diffstat (limited to 'arch/ia64/mm')
-rw-r--r--  arch/ia64/mm/Makefile        |  12
-rw-r--r--  arch/ia64/mm/contig.c        | 299
-rw-r--r--  arch/ia64/mm/discontig.c     | 737
-rw-r--r--  arch/ia64/mm/extable.c       |  90
-rw-r--r--  arch/ia64/mm/fault.c         | 261
-rw-r--r--  arch/ia64/mm/hugetlbpage.c   | 357
-rw-r--r--  arch/ia64/mm/init.c          | 597
-rw-r--r--  arch/ia64/mm/numa.c          |  49
-rw-r--r--  arch/ia64/mm/tlb.c           | 190
9 files changed, 2592 insertions, 0 deletions
diff --git a/arch/ia64/mm/Makefile b/arch/ia64/mm/Makefile
new file mode 100644
index 000000000000..7078f67887ec
--- /dev/null
+++ b/arch/ia64/mm/Makefile
@@ -0,0 +1,12 @@
1#
2# Makefile for the ia64-specific parts of the memory manager.
3#
4
5obj-y := init.o fault.o tlb.o extable.o
6
7obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
8obj-$(CONFIG_NUMA) += numa.o
9obj-$(CONFIG_DISCONTIGMEM) += discontig.o
10ifndef CONFIG_DISCONTIGMEM
11obj-y += contig.o
12endif
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
new file mode 100644
index 000000000000..6daf15ac8940
--- /dev/null
+++ b/arch/ia64/mm/contig.c
@@ -0,0 +1,299 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (C) 1998-2003 Hewlett-Packard Co
7 * David Mosberger-Tang <davidm@hpl.hp.com>
8 * Stephane Eranian <eranian@hpl.hp.com>
9 * Copyright (C) 2000, Rohit Seth <rohit.seth@intel.com>
10 * Copyright (C) 1999 VA Linux Systems
11 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
12 * Copyright (C) 2003 Silicon Graphics, Inc. All rights reserved.
13 *
14 * Routines used by ia64 machines with contiguous (or virtually contiguous)
15 * memory.
16 */
17#include <linux/config.h>
18#include <linux/bootmem.h>
19#include <linux/efi.h>
20#include <linux/mm.h>
21#include <linux/swap.h>
22
23#include <asm/meminit.h>
24#include <asm/pgalloc.h>
25#include <asm/pgtable.h>
26#include <asm/sections.h>
27#include <asm/mca.h>
28
29#ifdef CONFIG_VIRTUAL_MEM_MAP
30static unsigned long num_dma_physpages;
31#endif
32
33/**
34 * show_mem - display a memory statistics summary
35 *
36 * Just walks the pages in the system and describes where they're allocated.
37 */
38void
39show_mem (void)
40{
41 int i, total = 0, reserved = 0;
42 int shared = 0, cached = 0;
43
44 printk("Mem-info:\n");
45 show_free_areas();
46
47 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
48 i = max_mapnr;
49 while (i-- > 0) {
50 if (!pfn_valid(i))
51 continue;
52 total++;
53 if (PageReserved(mem_map+i))
54 reserved++;
55 else if (PageSwapCache(mem_map+i))
56 cached++;
57 else if (page_count(mem_map + i))
58 shared += page_count(mem_map + i) - 1;
59 }
60 printk("%d pages of RAM\n", total);
61 printk("%d reserved pages\n", reserved);
62 printk("%d pages shared\n", shared);
63 printk("%d pages swap cached\n", cached);
64 printk("%ld pages in page table cache\n", pgtable_cache_size);
65}
66
67/* physical address where the bootmem map is located */
68unsigned long bootmap_start;
69
70/**
71 * find_max_pfn - adjust the maximum page number callback
72 * @start: start of range
73 * @end: end of range
74 * @arg: address of pointer to global max_pfn variable
75 *
76 * Passed as a callback function to efi_memmap_walk() to determine the highest
77 * available page frame number in the system.
78 */
79int
80find_max_pfn (unsigned long start, unsigned long end, void *arg)
81{
82 unsigned long *max_pfnp = arg, pfn;
83
84 pfn = (PAGE_ALIGN(end - 1) - PAGE_OFFSET) >> PAGE_SHIFT;
85 if (pfn > *max_pfnp)
86 *max_pfnp = pfn;
87 return 0;
88}
89
90/**
91 * find_bootmap_location - callback to find a memory area for the bootmap
92 * @start: start of region
93 * @end: end of region
94 * @arg: unused callback data
95 *
96 * Find a place to put the bootmap and return its starting address in
97 * bootmap_start. This address must be page-aligned.
98 */
99int
100find_bootmap_location (unsigned long start, unsigned long end, void *arg)
101{
102 unsigned long needed = *(unsigned long *)arg;
103 unsigned long range_start, range_end, free_start;
104 int i;
105
106#if IGNORE_PFN0
107 if (start == PAGE_OFFSET) {
108 start += PAGE_SIZE;
109 if (start >= end)
110 return 0;
111 }
112#endif
113
114 free_start = PAGE_OFFSET;
115
116 for (i = 0; i < num_rsvd_regions; i++) {
117 range_start = max(start, free_start);
118 range_end = min(end, rsvd_region[i].start & PAGE_MASK);
119
120 free_start = PAGE_ALIGN(rsvd_region[i].end);
121
122 if (range_end <= range_start)
123 continue; /* skip over empty range */
124
125 if (range_end - range_start >= needed) {
126 bootmap_start = __pa(range_start);
127 return -1; /* done */
128 }
129
130 /* nothing more available in this segment */
131 if (range_end == end)
132 return 0;
133 }
134 return 0;
135}
136
137/**
138 * find_memory - setup memory map
139 *
140 * Walk the EFI memory map and find usable memory for the system, taking
141 * into account reserved areas.
142 */
143void
144find_memory (void)
145{
146 unsigned long bootmap_size;
147
148 reserve_memory();
149
150 /* first find highest page frame number */
151 max_pfn = 0;
152 efi_memmap_walk(find_max_pfn, &max_pfn);
153
154 /* how many bytes to cover all the pages */
155 bootmap_size = bootmem_bootmap_pages(max_pfn) << PAGE_SHIFT;
156
157 /* look for a location to hold the bootmap */
158 bootmap_start = ~0UL;
159 efi_memmap_walk(find_bootmap_location, &bootmap_size);
160 if (bootmap_start == ~0UL)
161 panic("Cannot find %ld bytes for bootmap\n", bootmap_size);
162
163 bootmap_size = init_bootmem(bootmap_start >> PAGE_SHIFT, max_pfn);
164
165 /* Free all available memory, then mark bootmem-map as being in use. */
166 efi_memmap_walk(filter_rsvd_memory, free_bootmem);
167 reserve_bootmem(bootmap_start, bootmap_size);
168
169 find_initrd();
170}
171
172#ifdef CONFIG_SMP
173/**
174 * per_cpu_init - setup per-cpu variables
175 *
176 * Allocate and setup per-cpu data areas.
177 */
178void *
179per_cpu_init (void)
180{
181 void *cpu_data;
182 int cpu;
183
184 /*
185	 * get_free_pages() cannot be used before cpu_init() is done.  The BSP
186	 * allocates NR_CPUS pages for all CPUs so that the APs do not have to
187	 * call get_zeroed_page().
188 */
189 if (smp_processor_id() == 0) {
190 cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS,
191 PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
192 for (cpu = 0; cpu < NR_CPUS; cpu++) {
193 memcpy(cpu_data, __phys_per_cpu_start, __per_cpu_end - __per_cpu_start);
194 __per_cpu_offset[cpu] = (char *) cpu_data - __per_cpu_start;
195 cpu_data += PERCPU_PAGE_SIZE;
196 per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
197 }
198 }
199 return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
200}
201#endif /* CONFIG_SMP */
202
203static int
204count_pages (u64 start, u64 end, void *arg)
205{
206 unsigned long *count = arg;
207
208 *count += (end - start) >> PAGE_SHIFT;
209 return 0;
210}
211
212#ifdef CONFIG_VIRTUAL_MEM_MAP
213static int
214count_dma_pages (u64 start, u64 end, void *arg)
215{
216 unsigned long *count = arg;
217
218 if (start < MAX_DMA_ADDRESS)
219 *count += (min(end, MAX_DMA_ADDRESS) - start) >> PAGE_SHIFT;
220 return 0;
221}
222#endif
223
224/*
225 * Set up the page tables.
226 */
227
228void
229paging_init (void)
230{
231 unsigned long max_dma;
232 unsigned long zones_size[MAX_NR_ZONES];
233#ifdef CONFIG_VIRTUAL_MEM_MAP
234 unsigned long zholes_size[MAX_NR_ZONES];
235 unsigned long max_gap;
236#endif
237
238 /* initialize mem_map[] */
239
240 memset(zones_size, 0, sizeof(zones_size));
241
242 num_physpages = 0;
243 efi_memmap_walk(count_pages, &num_physpages);
244
245 max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
246
247#ifdef CONFIG_VIRTUAL_MEM_MAP
248 memset(zholes_size, 0, sizeof(zholes_size));
249
250 num_dma_physpages = 0;
251 efi_memmap_walk(count_dma_pages, &num_dma_physpages);
252
253 if (max_low_pfn < max_dma) {
254 zones_size[ZONE_DMA] = max_low_pfn;
255 zholes_size[ZONE_DMA] = max_low_pfn - num_dma_physpages;
256 } else {
257 zones_size[ZONE_DMA] = max_dma;
258 zholes_size[ZONE_DMA] = max_dma - num_dma_physpages;
259 if (num_physpages > num_dma_physpages) {
260 zones_size[ZONE_NORMAL] = max_low_pfn - max_dma;
261 zholes_size[ZONE_NORMAL] =
262 ((max_low_pfn - max_dma) -
263 (num_physpages - num_dma_physpages));
264 }
265 }
266
267 max_gap = 0;
268 efi_memmap_walk(find_largest_hole, (u64 *)&max_gap);
269 if (max_gap < LARGE_GAP) {
270 vmem_map = (struct page *) 0;
271 free_area_init_node(0, &contig_page_data, zones_size, 0,
272 zholes_size);
273 } else {
274 unsigned long map_size;
275
276 /* allocate virtual_mem_map */
277
278 map_size = PAGE_ALIGN(max_low_pfn * sizeof(struct page));
279 vmalloc_end -= map_size;
280 vmem_map = (struct page *) vmalloc_end;
281 efi_memmap_walk(create_mem_map_page_table, NULL);
282
283 NODE_DATA(0)->node_mem_map = vmem_map;
284 free_area_init_node(0, &contig_page_data, zones_size,
285 0, zholes_size);
286
287 printk("Virtual mem_map starts at 0x%p\n", mem_map);
288 }
289#else /* !CONFIG_VIRTUAL_MEM_MAP */
290 if (max_low_pfn < max_dma)
291 zones_size[ZONE_DMA] = max_low_pfn;
292 else {
293 zones_size[ZONE_DMA] = max_dma;
294 zones_size[ZONE_NORMAL] = max_low_pfn - max_dma;
295 }
296 free_area_init(zones_size);
297#endif /* !CONFIG_VIRTUAL_MEM_MAP */
298 zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
299}
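
(Not part of the patch: a rough userspace sketch of the sizing step in
find_memory() above. The boot allocator needs one bit per page frame, rounded
up to whole pages; the 16KB PAGE_SHIFT and the 4GB example below are assumed
values for illustration only, not taken from the code.)

#include <stdio.h>

#define PAGE_SHIFT	14			/* assumed: 16KB pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

static unsigned long bootmap_bytes(unsigned long max_pfn)
{
	unsigned long bytes = (max_pfn + 7) / 8;	/* one bit per page frame */

	/* round up to whole pages, like bootmem_bootmap_pages() << PAGE_SHIFT */
	return (bytes + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

int main(void)
{
	unsigned long max_pfn = (4UL << 30) >> PAGE_SHIFT;	/* e.g. 4GB of RAM */
	unsigned long bytes = bootmap_bytes(max_pfn);

	printf("max_pfn=%lu -> bootmap needs %lu bytes (%lu pages)\n",
	       max_pfn, bytes, bytes >> PAGE_SHIFT);
	return 0;
}
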
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
new file mode 100644
index 000000000000..3456a9b6971e
--- /dev/null
+++ b/arch/ia64/mm/discontig.c
@@ -0,0 +1,737 @@
1/*
2 * Copyright (c) 2000, 2003 Silicon Graphics, Inc. All rights reserved.
3 * Copyright (c) 2001 Intel Corp.
4 * Copyright (c) 2001 Tony Luck <tony.luck@intel.com>
5 * Copyright (c) 2002 NEC Corp.
6 * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
7 * Copyright (c) 2004 Silicon Graphics, Inc
8 * Russ Anderson <rja@sgi.com>
9 * Jesse Barnes <jbarnes@sgi.com>
10 * Jack Steiner <steiner@sgi.com>
11 */
12
13/*
14 * Platform initialization for Discontig Memory
15 */
16
17#include <linux/kernel.h>
18#include <linux/mm.h>
19#include <linux/swap.h>
20#include <linux/bootmem.h>
21#include <linux/acpi.h>
22#include <linux/efi.h>
23#include <linux/nodemask.h>
24#include <asm/pgalloc.h>
25#include <asm/tlb.h>
26#include <asm/meminit.h>
27#include <asm/numa.h>
28#include <asm/sections.h>
29
30/*
31 * Track per-node information needed to setup the boot memory allocator, the
32 * per-node areas, and the real VM.
33 */
34struct early_node_data {
35 struct ia64_node_data *node_data;
36 pg_data_t *pgdat;
37 unsigned long pernode_addr;
38 unsigned long pernode_size;
39 struct bootmem_data bootmem_data;
40 unsigned long num_physpages;
41 unsigned long num_dma_physpages;
42 unsigned long min_pfn;
43 unsigned long max_pfn;
44};
45
46static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
47
48/**
49 * reassign_cpu_only_nodes - called from find_memory to move CPU-only nodes to a memory node
50 *
51 * This function will move nodes with only CPUs (no memory)
52 * to a node with memory which is at the minimum numa_slit distance.
53 * Any reassignments will result in the compression of the nodes
54 * and renumbering the nid values where appropriate.
55 * The static declarations below are to avoid large stack size which
56 * makes the code not re-entrant.
57 */
58static void __init reassign_cpu_only_nodes(void)
59{
60 struct node_memblk_s *p;
61 int i, j, k, nnode, nid, cpu, cpunid, pxm;
62 u8 cslit, slit;
63 static DECLARE_BITMAP(nodes_with_mem, MAX_NUMNODES) __initdata;
64 static u8 numa_slit_fix[MAX_NUMNODES * MAX_NUMNODES] __initdata;
65 static int node_flip[MAX_NUMNODES] __initdata;
66 static int old_nid_map[NR_CPUS] __initdata;
67
68 for (nnode = 0, p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
69 if (!test_bit(p->nid, (void *) nodes_with_mem)) {
70 set_bit(p->nid, (void *) nodes_with_mem);
71 nnode++;
72 }
73
74 /*
75 * All nids with memory.
76 */
77 if (nnode == num_online_nodes())
78 return;
79
80 /*
81 * Change nids and attempt to migrate CPU-only nodes
82 * to the best numa_slit (closest neighbor) possible.
83 * For reassigned CPU nodes a nid can't be arrived at
84 * until after this loop because the target nid's new
85 * identity might not have been established yet. So
86 * new nid values are fabricated above num_online_nodes() and
87 * mapped back later to their true value.
88 */
89 /* MCD - This code is a bit complicated, but may be unnecessary now.
90 * We can now handle much more interesting node-numbering.
91 * The old requirement that 0 <= nid <= numnodes <= MAX_NUMNODES
92 * and that there be no holes in the numbering 0..numnodes
93 * has become simply 0 <= nid <= MAX_NUMNODES.
94 */
95 nid = 0;
96 for_each_online_node(i) {
97 if (test_bit(i, (void *) nodes_with_mem)) {
98 /*
99 * Save original nid value for numa_slit
100 * fixup and node_cpuid reassignments.
101 */
102 node_flip[nid] = i;
103
104 if (i == nid) {
105 nid++;
106 continue;
107 }
108
109 for (p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
110 if (p->nid == i)
111 p->nid = nid;
112
113 cpunid = nid;
114 nid++;
115 } else
116 cpunid = MAX_NUMNODES;
117
118 for (cpu = 0; cpu < NR_CPUS; cpu++)
119 if (node_cpuid[cpu].nid == i) {
120 /*
121 * For nodes not being reassigned just
122 * fix the cpu's nid and reverse pxm map
123 */
124 if (cpunid < MAX_NUMNODES) {
125 pxm = nid_to_pxm_map[i];
126 pxm_to_nid_map[pxm] =
127 node_cpuid[cpu].nid = cpunid;
128 continue;
129 }
130
131 /*
132 * For nodes being reassigned, find best node by
133 * numa_slit information and then make a temporary
134 * nid value based on current nid and num_online_nodes().
135 */
136 slit = 0xff;
137 k = 2*num_online_nodes();
138 for_each_online_node(j) {
139 if (i == j)
140 continue;
141 else if (test_bit(j, (void *) nodes_with_mem)) {
142 cslit = numa_slit[i * num_online_nodes() + j];
143 if (cslit < slit) {
144 k = num_online_nodes() + j;
145 slit = cslit;
146 }
147 }
148 }
149
150 /* save old nid map so we can update the pxm */
151 old_nid_map[cpu] = node_cpuid[cpu].nid;
152 node_cpuid[cpu].nid = k;
153 }
154 }
155
156 /*
157 * Fixup temporary nid values for CPU-only nodes.
158 */
159 for (cpu = 0; cpu < NR_CPUS; cpu++)
160 if (node_cpuid[cpu].nid == (2*num_online_nodes())) {
161 pxm = nid_to_pxm_map[old_nid_map[cpu]];
162 pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = nnode - 1;
163 } else {
164 for (i = 0; i < nnode; i++) {
165 if (node_flip[i] != (node_cpuid[cpu].nid - num_online_nodes()))
166 continue;
167
168 pxm = nid_to_pxm_map[old_nid_map[cpu]];
169 pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = i;
170 break;
171 }
172 }
173
174 /*
175 * Fix numa_slit by compressing from larger
176 * nid array to reduced nid array.
177 */
178 for (i = 0; i < nnode; i++)
179 for (j = 0; j < nnode; j++)
180 numa_slit_fix[i * nnode + j] =
181 numa_slit[node_flip[i] * num_online_nodes() + node_flip[j]];
182
183 memcpy(numa_slit, numa_slit_fix, sizeof (numa_slit));
184
185 nodes_clear(node_online_map);
186 for (i = 0; i < nnode; i++)
187 node_set_online(i);
188
189 return;
190}
191
192/*
193 * To prevent cache aliasing effects, align per-node structures so that they
194 * start at addresses that are strided by node number.
195 */
196#define NODEDATA_ALIGN(addr, node) \
197 ((((addr) + 1024*1024-1) & ~(1024*1024-1)) + (node)*PERCPU_PAGE_SIZE)
198
199/**
200 * build_node_maps - callback to setup bootmem structs for each node
201 * @start: physical start of range
202 * @len: length of range
203 * @node: node where this range resides
204 *
205 * We allocate a struct bootmem_data for each piece of memory that we wish to
206 * treat as a virtually contiguous block (i.e. each node). Each such block
207 * must start on an %IA64_GRANULE_SIZE boundary, so we round the address down
208 * if necessary. Any non-existent pages will simply be part of the virtual
209 * memmap. We also update min_low_pfn and max_low_pfn here as we receive
210 * memory ranges from the caller.
211 */
212static int __init build_node_maps(unsigned long start, unsigned long len,
213 int node)
214{
215 unsigned long cstart, epfn, end = start + len;
216 struct bootmem_data *bdp = &mem_data[node].bootmem_data;
217
218 epfn = GRANULEROUNDUP(end) >> PAGE_SHIFT;
219 cstart = GRANULEROUNDDOWN(start);
220
221 if (!bdp->node_low_pfn) {
222 bdp->node_boot_start = cstart;
223 bdp->node_low_pfn = epfn;
224 } else {
225 bdp->node_boot_start = min(cstart, bdp->node_boot_start);
226 bdp->node_low_pfn = max(epfn, bdp->node_low_pfn);
227 }
228
229 min_low_pfn = min(min_low_pfn, bdp->node_boot_start>>PAGE_SHIFT);
230 max_low_pfn = max(max_low_pfn, bdp->node_low_pfn);
231
232 return 0;
233}
234
235/**
236 * early_nr_phys_cpus_node - return number of physical cpus on a given node
237 * @node: node to check
238 *
239 * Count the number of physical cpus on @node. These are cpus that actually
240 * exist. We can't use nr_cpus_node() yet because
241 * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
242 * called yet.
243 */
244static int early_nr_phys_cpus_node(int node)
245{
246 int cpu, n = 0;
247
248 for (cpu = 0; cpu < NR_CPUS; cpu++)
249 if (node == node_cpuid[cpu].nid)
250 if ((cpu == 0) || node_cpuid[cpu].phys_id)
251 n++;
252
253 return n;
254}
255
256
257/**
258 * early_nr_cpus_node - return number of cpus on a given node
259 * @node: node to check
260 *
261 * Count the number of cpus on @node. We can't use nr_cpus_node() yet because
262 * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
263 * called yet. Note that node 0 will also count all non-existent cpus.
264 */
265static int early_nr_cpus_node(int node)
266{
267 int cpu, n = 0;
268
269 for (cpu = 0; cpu < NR_CPUS; cpu++)
270 if (node == node_cpuid[cpu].nid)
271 n++;
272
273 return n;
274}
275
276/**
277 * find_pernode_space - allocate memory for memory map and per-node structures
278 * @start: physical start of range
279 * @len: length of range
280 * @node: node where this range resides
281 *
282 * This routine reserves space for the per-cpu data struct, the list of
283 * pg_data_ts and the per-node data struct. Each node will have something like
284 * the following in the first chunk of addr. space large enough to hold it.
285 * the following in the first chunk of address space large enough to hold it.
286 * ________________________
287 * | |
288 * |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first
289 * | PERCPU_PAGE_SIZE * | start and length big enough
290 * | cpus_on_this_node | Node 0 will also have entries for all non-existent cpus.
291 * |------------------------|
292 * | local pg_data_t * |
293 * |------------------------|
294 * | local ia64_node_data |
295 * |------------------------|
296 * | ??? |
297 * |________________________|
298 *
299 * Once this space has been set aside, the bootmem maps are initialized. We
300 * could probably move the allocation of the per-cpu and ia64_node_data space
301 * outside of this function and use alloc_bootmem_node(), but doing it here
302 * is straightforward and we get the alignments we want so...
303 */
304static int __init find_pernode_space(unsigned long start, unsigned long len,
305 int node)
306{
307 unsigned long epfn, cpu, cpus, phys_cpus;
308 unsigned long pernodesize = 0, pernode, pages, mapsize;
309 void *cpu_data;
310 struct bootmem_data *bdp = &mem_data[node].bootmem_data;
311
312 epfn = (start + len) >> PAGE_SHIFT;
313
314 pages = bdp->node_low_pfn - (bdp->node_boot_start >> PAGE_SHIFT);
315 mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT;
316
317 /*
318 * Make sure this memory falls within this node's usable memory
319 * since we may have thrown some away in build_maps().
320 */
321 if (start < bdp->node_boot_start || epfn > bdp->node_low_pfn)
322 return 0;
323
324 /* Don't setup this node's local space twice... */
325 if (mem_data[node].pernode_addr)
326 return 0;
327
328 /*
329 * Calculate total size needed, incl. what's necessary
330 * for good alignment and alias prevention.
331 */
332 cpus = early_nr_cpus_node(node);
333 phys_cpus = early_nr_phys_cpus_node(node);
334 pernodesize += PERCPU_PAGE_SIZE * cpus;
335 pernodesize += node * L1_CACHE_BYTES;
336 pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
337 pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
338 pernodesize = PAGE_ALIGN(pernodesize);
339 pernode = NODEDATA_ALIGN(start, node);
340
341 /* Is this range big enough for what we want to store here? */
342 if (start + len > (pernode + pernodesize + mapsize)) {
343 mem_data[node].pernode_addr = pernode;
344 mem_data[node].pernode_size = pernodesize;
345 memset(__va(pernode), 0, pernodesize);
346
347 cpu_data = (void *)pernode;
348 pernode += PERCPU_PAGE_SIZE * cpus;
349 pernode += node * L1_CACHE_BYTES;
350
351 mem_data[node].pgdat = __va(pernode);
352 pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
353
354 mem_data[node].node_data = __va(pernode);
355 pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
356
357 mem_data[node].pgdat->bdata = bdp;
358 pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
359
360 /*
361 * Copy the static per-cpu data into the region we
362 * just set aside and then setup __per_cpu_offset
363 * for each CPU on this node.
364 */
365 for (cpu = 0; cpu < NR_CPUS; cpu++) {
366 if (node == node_cpuid[cpu].nid) {
367 memcpy(__va(cpu_data), __phys_per_cpu_start,
368 __per_cpu_end - __per_cpu_start);
369 __per_cpu_offset[cpu] = (char*)__va(cpu_data) -
370 __per_cpu_start;
371 cpu_data += PERCPU_PAGE_SIZE;
372 }
373 }
374 }
375
376 return 0;
377}
378
379/**
380 * free_node_bootmem - free bootmem allocator memory for use
381 * @start: physical start of range
382 * @len: length of range
383 * @node: node where this range resides
384 *
385 * Simply calls the bootmem allocator to free the specified range from
386 * the given pg_data_t's bdata struct. After this function has been called
387 * for all the entries in the EFI memory map, the bootmem allocator will
388 * be ready to service allocation requests.
389 */
390static int __init free_node_bootmem(unsigned long start, unsigned long len,
391 int node)
392{
393 free_bootmem_node(mem_data[node].pgdat, start, len);
394
395 return 0;
396}
397
398/**
399 * reserve_pernode_space - reserve memory for per-node space
400 *
401 * Reserve the space used by the bootmem maps & per-node space in the boot
402 * allocator so that when we actually create the real mem maps we don't
403 * use their memory.
404 */
405static void __init reserve_pernode_space(void)
406{
407 unsigned long base, size, pages;
408 struct bootmem_data *bdp;
409 int node;
410
411 for_each_online_node(node) {
412 pg_data_t *pdp = mem_data[node].pgdat;
413
414 bdp = pdp->bdata;
415
416 /* First the bootmem_map itself */
417 pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT);
418 size = bootmem_bootmap_pages(pages) << PAGE_SHIFT;
419 base = __pa(bdp->node_bootmem_map);
420 reserve_bootmem_node(pdp, base, size);
421
422 /* Now the per-node space */
423 size = mem_data[node].pernode_size;
424 base = __pa(mem_data[node].pernode_addr);
425 reserve_bootmem_node(pdp, base, size);
426 }
427}
428
429/**
430 * initialize_pernode_data - fixup per-cpu & per-node pointers
431 *
432 * Each node's per-node area has a copy of the global pg_data_t list, so
433 * we copy that to each node here, as well as setting the per-cpu pointer
434 * to the local node data structure. The active_cpus field of the per-node
435 * structure gets set up by the platform_cpu_init() function later.
436 */
437static void __init initialize_pernode_data(void)
438{
439 int cpu, node;
440 pg_data_t *pgdat_list[MAX_NUMNODES];
441
442 for_each_online_node(node)
443 pgdat_list[node] = mem_data[node].pgdat;
444
445 /* Copy the pg_data_t list to each node and init the node field */
446 for_each_online_node(node) {
447 memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list,
448 sizeof(pgdat_list));
449 }
450
451 /* Set the node_data pointer for each per-cpu struct */
452 for (cpu = 0; cpu < NR_CPUS; cpu++) {
453 node = node_cpuid[cpu].nid;
454 per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data;
455 }
456}
457
458/**
459 * find_memory - walk the EFI memory map and setup the bootmem allocator
460 *
461 * Called early in boot to setup the bootmem allocator, and to
462 * allocate the per-cpu and per-node structures.
463 */
464void __init find_memory(void)
465{
466 int node;
467
468 reserve_memory();
469
470 if (num_online_nodes() == 0) {
471 printk(KERN_ERR "node info missing!\n");
472 node_set_online(0);
473 }
474
475 min_low_pfn = -1;
476 max_low_pfn = 0;
477
478 if (num_online_nodes() > 1)
479 reassign_cpu_only_nodes();
480
481 /* These actually end up getting called by call_pernode_memory() */
482 efi_memmap_walk(filter_rsvd_memory, build_node_maps);
483 efi_memmap_walk(filter_rsvd_memory, find_pernode_space);
484
485 /*
486 * Initialize the boot memory maps in reverse order since that's
487 * what the bootmem allocator expects
488 */
489 for (node = MAX_NUMNODES - 1; node >= 0; node--) {
490 unsigned long pernode, pernodesize, map;
491 struct bootmem_data *bdp;
492
493 if (!node_online(node))
494 continue;
495
496 bdp = &mem_data[node].bootmem_data;
497 pernode = mem_data[node].pernode_addr;
498 pernodesize = mem_data[node].pernode_size;
499 map = pernode + pernodesize;
500
501 /* Sanity check... */
502 if (!pernode)
503 panic("pernode space for node %d "
504 "could not be allocated!", node);
505
506 init_bootmem_node(mem_data[node].pgdat,
507 map>>PAGE_SHIFT,
508 bdp->node_boot_start>>PAGE_SHIFT,
509 bdp->node_low_pfn);
510 }
511
512 efi_memmap_walk(filter_rsvd_memory, free_node_bootmem);
513
514 reserve_pernode_space();
515 initialize_pernode_data();
516
517 max_pfn = max_low_pfn;
518
519 find_initrd();
520}
521
522/**
523 * per_cpu_init - setup per-cpu variables
524 *
525 * find_pernode_space() does most of this already, we just need to set
526 * local_per_cpu_offset
527 */
528void *per_cpu_init(void)
529{
530 int cpu;
531
532 if (smp_processor_id() == 0) {
533 for (cpu = 0; cpu < NR_CPUS; cpu++) {
534 per_cpu(local_per_cpu_offset, cpu) =
535 __per_cpu_offset[cpu];
536 }
537 }
538
539 return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
540}
541
542/**
543 * show_mem - give short summary of memory stats
544 *
545 * Shows a simple page count of reserved and used pages in the system.
546 * For discontig machines, it does this on a per-pgdat basis.
547 */
548void show_mem(void)
549{
550 int i, total_reserved = 0;
551 int total_shared = 0, total_cached = 0;
552 unsigned long total_present = 0;
553 pg_data_t *pgdat;
554
555 printk("Mem-info:\n");
556 show_free_areas();
557 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
558 for_each_pgdat(pgdat) {
559 unsigned long present = pgdat->node_present_pages;
560 int shared = 0, cached = 0, reserved = 0;
561 printk("Node ID: %d\n", pgdat->node_id);
562 for(i = 0; i < pgdat->node_spanned_pages; i++) {
563 if (!ia64_pfn_valid(pgdat->node_start_pfn+i))
564 continue;
565 if (PageReserved(pgdat->node_mem_map+i))
566 reserved++;
567 else if (PageSwapCache(pgdat->node_mem_map+i))
568 cached++;
569 else if (page_count(pgdat->node_mem_map+i))
570 shared += page_count(pgdat->node_mem_map+i)-1;
571 }
572 total_present += present;
573 total_reserved += reserved;
574 total_cached += cached;
575 total_shared += shared;
576 printk("\t%ld pages of RAM\n", present);
577 printk("\t%d reserved pages\n", reserved);
578 printk("\t%d pages shared\n", shared);
579 printk("\t%d pages swap cached\n", cached);
580 }
581 printk("%ld pages of RAM\n", total_present);
582 printk("%d reserved pages\n", total_reserved);
583 printk("%d pages shared\n", total_shared);
584 printk("%d pages swap cached\n", total_cached);
585 printk("Total of %ld pages in page table cache\n", pgtable_cache_size);
586 printk("%d free buffer pages\n", nr_free_buffer_pages());
587}
588
589/**
590 * call_pernode_memory - use SRAT to call callback functions with node info
591 * @start: physical start of range
592 * @len: length of range
593 * @arg: function to call for each range
594 *
595 * efi_memmap_walk() knows nothing about layout of memory across nodes. Find
596 * out to which node a block of memory belongs. Ignore memory that we cannot
597 * identify, and split blocks that run across multiple nodes.
598 *
599 * Take this opportunity to round the start address up and the end address
600 * down to page boundaries.
601 */
602void call_pernode_memory(unsigned long start, unsigned long len, void *arg)
603{
604 unsigned long rs, re, end = start + len;
605 void (*func)(unsigned long, unsigned long, int);
606 int i;
607
608 start = PAGE_ALIGN(start);
609 end &= PAGE_MASK;
610 if (start >= end)
611 return;
612
613 func = arg;
614
615 if (!num_node_memblks) {
616 /* No SRAT table, so assume one node (node 0) */
617 if (start < end)
618 (*func)(start, end - start, 0);
619 return;
620 }
621
622 for (i = 0; i < num_node_memblks; i++) {
623 rs = max(start, node_memblk[i].start_paddr);
624 re = min(end, node_memblk[i].start_paddr +
625 node_memblk[i].size);
626
627 if (rs < re)
628 (*func)(rs, re - rs, node_memblk[i].nid);
629
630 if (re == end)
631 break;
632 }
633}
634
635/**
636 * count_node_pages - callback to build per-node memory info structures
637 * @start: physical start of range
638 * @len: length of range
639 * @node: node where this range resides
640 *
641 * Each node has its own number of physical pages, DMAable pages, start, and
642 * end page frame number. This routine will be called by call_pernode_memory()
643 * for each piece of usable memory and will set up these values for each node.
644 * Very similar to build_maps().
645 */
646static __init int count_node_pages(unsigned long start, unsigned long len, int node)
647{
648 unsigned long end = start + len;
649
650 mem_data[node].num_physpages += len >> PAGE_SHIFT;
651 if (start <= __pa(MAX_DMA_ADDRESS))
652 mem_data[node].num_dma_physpages +=
653 (min(end, __pa(MAX_DMA_ADDRESS)) - start) >>PAGE_SHIFT;
654 start = GRANULEROUNDDOWN(start);
655 start = ORDERROUNDDOWN(start);
656 end = GRANULEROUNDUP(end);
657 mem_data[node].max_pfn = max(mem_data[node].max_pfn,
658 end >> PAGE_SHIFT);
659 mem_data[node].min_pfn = min(mem_data[node].min_pfn,
660 start >> PAGE_SHIFT);
661
662 return 0;
663}
664
665/**
666 * paging_init - setup page tables
667 *
668 * paging_init() sets up the page tables for each node of the system and frees
669 * the bootmem allocator memory for general use.
670 */
671void __init paging_init(void)
672{
673 unsigned long max_dma;
674 unsigned long zones_size[MAX_NR_ZONES];
675 unsigned long zholes_size[MAX_NR_ZONES];
676 unsigned long pfn_offset = 0;
677 int node;
678
679 max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
680
681 /* so min() will work in count_node_pages */
682 for_each_online_node(node)
683 mem_data[node].min_pfn = ~0UL;
684
685 efi_memmap_walk(filter_rsvd_memory, count_node_pages);
686
687 for_each_online_node(node) {
688 memset(zones_size, 0, sizeof(zones_size));
689 memset(zholes_size, 0, sizeof(zholes_size));
690
691 num_physpages += mem_data[node].num_physpages;
692
693 if (mem_data[node].min_pfn >= max_dma) {
694 /* All of this node's memory is above ZONE_DMA */
695 zones_size[ZONE_NORMAL] = mem_data[node].max_pfn -
696 mem_data[node].min_pfn;
697 zholes_size[ZONE_NORMAL] = mem_data[node].max_pfn -
698 mem_data[node].min_pfn -
699 mem_data[node].num_physpages;
700 } else if (mem_data[node].max_pfn < max_dma) {
701 /* All of this node's memory is in ZONE_DMA */
702 zones_size[ZONE_DMA] = mem_data[node].max_pfn -
703 mem_data[node].min_pfn;
704 zholes_size[ZONE_DMA] = mem_data[node].max_pfn -
705 mem_data[node].min_pfn -
706 mem_data[node].num_dma_physpages;
707 } else {
708 /* This node has memory in both zones */
709 zones_size[ZONE_DMA] = max_dma -
710 mem_data[node].min_pfn;
711 zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
712 mem_data[node].num_dma_physpages;
713 zones_size[ZONE_NORMAL] = mem_data[node].max_pfn -
714 max_dma;
715 zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] -
716 (mem_data[node].num_physpages -
717 mem_data[node].num_dma_physpages);
718 }
719
720 if (node == 0) {
721 vmalloc_end -=
722 PAGE_ALIGN(max_low_pfn * sizeof(struct page));
723 vmem_map = (struct page *) vmalloc_end;
724
725 efi_memmap_walk(create_mem_map_page_table, NULL);
726 printk("Virtual mem_map starts at 0x%p\n", vmem_map);
727 }
728
729 pfn_offset = mem_data[node].min_pfn;
730
731 NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset;
732 free_area_init_node(node, NODE_DATA(node), zones_size,
733 pfn_offset, zholes_size);
734 }
735
736 zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
737}
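
(Not part of the patch: the NODEDATA_ALIGN() macro near the top of discontig.c
is what strides each node's per-node area so that hot per-node data from
different nodes does not alias in the cache. Below is a standalone sketch of
the same arithmetic; the 64KB PERCPU_PAGE_SIZE and the start address are
assumptions for the demo only.)

#include <stdio.h>

#define PERCPU_PAGE_SIZE	(64UL * 1024)	/* assumed value, demo only */
#define MB			(1024UL * 1024)

/* same shape as the NODEDATA_ALIGN() macro in discontig.c above */
#define NODEDATA_ALIGN(addr, node) \
	((((addr) + MB - 1) & ~(MB - 1)) + (node) * PERCPU_PAGE_SIZE)

int main(void)
{
	unsigned long start = 0x4003210UL;	/* arbitrary start of a node's memory */
	int node;

	for (node = 0; node < 4; node++)
		printf("node %d: per-node area would start at 0x%lx\n",
		       node, NODEDATA_ALIGN(start, node));
	return 0;
}
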
diff --git a/arch/ia64/mm/extable.c b/arch/ia64/mm/extable.c
new file mode 100644
index 000000000000..6d259e34f359
--- /dev/null
+++ b/arch/ia64/mm/extable.c
@@ -0,0 +1,90 @@
1/*
2 * Kernel exception handling table support. Derived from arch/alpha/mm/extable.c.
3 *
4 * Copyright (C) 1998, 1999, 2001-2002, 2004 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com>
6 */
7
8#include <linux/config.h>
9#include <linux/sort.h>
10
11#include <asm/uaccess.h>
12#include <asm/module.h>
13
14static int cmp_ex(const void *a, const void *b)
15{
16 const struct exception_table_entry *l = a, *r = b;
17 u64 lip = (u64) &l->addr + l->addr;
18 u64 rip = (u64) &r->addr + r->addr;
19
20 /* avoid overflow */
21 if (lip > rip)
22 return 1;
23 if (lip < rip)
24 return -1;
25 return 0;
26}
27
28static void swap_ex(void *a, void *b, int size)
29{
30 struct exception_table_entry *l = a, *r = b, tmp;
31 u64 delta = (u64) r - (u64) l;
32
33 tmp = *l;
34 l->addr = r->addr + delta;
35 l->cont = r->cont + delta;
36 r->addr = tmp.addr - delta;
37 r->cont = tmp.cont - delta;
38}
39
40/*
41 * Sort the exception table. It's usually already sorted, but there
42 * may be unordered entries due to multiple text sections (such as the
43 * .init text section). Note that the exception-table-entries contain
44 * location-relative addresses, which requires a bit of care during
45 * sorting to avoid overflows in the offset members (e.g., it would
46 * not be safe to make a temporary copy of an exception-table entry on
47 * the stack, because the stack may be more than 2GB away from the
48 * exception-table).
49 */
50void sort_extable (struct exception_table_entry *start,
51 struct exception_table_entry *finish)
52{
53 sort(start, finish - start, sizeof(struct exception_table_entry),
54 cmp_ex, swap_ex);
55}
56
57const struct exception_table_entry *
58search_extable (const struct exception_table_entry *first,
59 const struct exception_table_entry *last,
60 unsigned long ip)
61{
62 const struct exception_table_entry *mid;
63 unsigned long mid_ip;
64 long diff;
65
66 while (first <= last) {
67 mid = &first[(last - first)/2];
68 mid_ip = (u64) &mid->addr + mid->addr;
69 diff = mid_ip - ip;
70 if (diff == 0)
71 return mid;
72 else if (diff < 0)
73 first = mid + 1;
74 else
75 last = mid - 1;
76 }
77 return NULL;
78}
79
80void
81ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e)
82{
83 long fix = (u64) &e->cont + e->cont;
84
85 regs->r8 = -EFAULT;
86 if (fix & 4)
87 regs->r9 = 0;
88 regs->cr_iip = fix & ~0xf;
89 ia64_psr(regs)->ri = fix & 0x3; /* set continuation slot number */
90}
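
(Not part of the patch: the exception-table entries above hold
location-relative offsets, which is why swap_ex() corrects each offset by
`delta` when it moves an entry. Below is a small userspace sketch of that
encoding; the names rel_entry/resolve/encode are hypothetical, not kernel
APIs.)

#include <stdint.h>
#include <stdio.h>

struct rel_entry {
	int32_t addr;		/* offset of the target, relative to &addr itself */
};

/* Resolve a self-relative entry back to an absolute address. */
static uintptr_t resolve(const struct rel_entry *e)
{
	return (uintptr_t)&e->addr + e->addr;
}

/* Encode an absolute address as a self-relative offset. */
static void encode(struct rel_entry *e, uintptr_t target)
{
	e->addr = (int32_t)(target - (uintptr_t)&e->addr);
}

int main(void)
{
	char text[16];			/* stand-in for a code address */
	struct rel_entry a, b;

	encode(&a, (uintptr_t)&text[4]);
	printf("resolved: %p (expect %p)\n", (void *)resolve(&a), (void *)&text[4]);

	/*
	 * "Move" the entry from a to b: the raw offset must be corrected by
	 * the distance between the old and new locations, which is exactly
	 * what swap_ex() above does with its delta.
	 */
	b.addr = a.addr + (int32_t)((uintptr_t)&a - (uintptr_t)&b);
	printf("moved:    %p (expect %p)\n", (void *)resolve(&b), (void *)&text[4]);
	return 0;
}
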
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
new file mode 100644
index 000000000000..da859125aaef
--- /dev/null
+++ b/arch/ia64/mm/fault.c
@@ -0,0 +1,261 @@
1/*
2 * MMU fault handling support.
3 *
4 * Copyright (C) 1998-2002 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com>
6 */
7#include <linux/sched.h>
8#include <linux/kernel.h>
9#include <linux/mm.h>
10#include <linux/smp_lock.h>
11#include <linux/interrupt.h>
12
13#include <asm/pgtable.h>
14#include <asm/processor.h>
15#include <asm/system.h>
16#include <asm/uaccess.h>
17
18extern void die (char *, struct pt_regs *, long);
19
20/*
21 * This routine is analogous to expand_stack() but instead grows the
22 * register backing store (which grows towards higher addresses).
23 * Since the register backing store is accessed sequentially, we
24 * disallow growing the RBS by more than a page at a time. Note that
25 * the VM_GROWSUP flag can be set on any VM area but that's fine
26 * because the total process size is still limited by RLIMIT_STACK and
27 * RLIMIT_AS.
28 */
29static inline long
30expand_backing_store (struct vm_area_struct *vma, unsigned long address)
31{
32 unsigned long grow;
33
34 grow = PAGE_SIZE >> PAGE_SHIFT;
35 if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur
36 || (((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->signal->rlim[RLIMIT_AS].rlim_cur))
37 return -ENOMEM;
38 vma->vm_end += PAGE_SIZE;
39 vma->vm_mm->total_vm += grow;
40 if (vma->vm_flags & VM_LOCKED)
41 vma->vm_mm->locked_vm += grow;
42 __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow);
43 return 0;
44}
45
46/*
47 * Return TRUE if ADDRESS points at a page in the kernel's mapped segment
48 * (inside region 5, on ia64) and that page is present.
49 */
50static int
51mapped_kernel_page_is_present (unsigned long address)
52{
53 pgd_t *pgd;
54 pud_t *pud;
55 pmd_t *pmd;
56 pte_t *ptep, pte;
57
58 pgd = pgd_offset_k(address);
59 if (pgd_none(*pgd) || pgd_bad(*pgd))
60 return 0;
61
62 pud = pud_offset(pgd, address);
63 if (pud_none(*pud) || pud_bad(*pud))
64 return 0;
65
66 pmd = pmd_offset(pud, address);
67 if (pmd_none(*pmd) || pmd_bad(*pmd))
68 return 0;
69
70 ptep = pte_offset_kernel(pmd, address);
71 if (!ptep)
72 return 0;
73
74 pte = *ptep;
75 return pte_present(pte);
76}
77
78void
79ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *regs)
80{
81 int signal = SIGSEGV, code = SEGV_MAPERR;
82 struct vm_area_struct *vma, *prev_vma;
83 struct mm_struct *mm = current->mm;
84 struct siginfo si;
85 unsigned long mask;
86
87 /*
88	 * If we're in an interrupt or have no user context, we must not take the fault.
89 */
90 if (in_atomic() || !mm)
91 goto no_context;
92
93#ifdef CONFIG_VIRTUAL_MEM_MAP
94 /*
95 * If fault is in region 5 and we are in the kernel, we may already
96 * have the mmap_sem (pfn_valid macro is called during mmap). There
97 * is no vma for region 5 addr's anyway, so skip getting the semaphore
98 * and go directly to the exception handling code.
99 */
100
101 if ((REGION_NUMBER(address) == 5) && !user_mode(regs))
102 goto bad_area_no_up;
103#endif
104
105 down_read(&mm->mmap_sem);
106
107 vma = find_vma_prev(mm, address, &prev_vma);
108 if (!vma)
109 goto bad_area;
110
111 /* find_vma_prev() returns vma such that address < vma->vm_end or NULL */
112 if (address < vma->vm_start)
113 goto check_expansion;
114
115 good_area:
116 code = SEGV_ACCERR;
117
118 /* OK, we've got a good vm_area for this memory area. Check the access permissions: */
119
120# define VM_READ_BIT 0
121# define VM_WRITE_BIT 1
122# define VM_EXEC_BIT 2
123
124# if (((1 << VM_READ_BIT) != VM_READ || (1 << VM_WRITE_BIT) != VM_WRITE) \
125 || (1 << VM_EXEC_BIT) != VM_EXEC)
126# error File is out of sync with <linux/mm.h>. Please update.
127# endif
128
129 mask = ( (((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT)
130 | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT)
131 | (((isr >> IA64_ISR_R_BIT) & 1UL) << VM_READ_BIT));
132
133 if ((vma->vm_flags & mask) != mask)
134 goto bad_area;
135
136 survive:
137 /*
138 * If for any reason at all we couldn't handle the fault, make
139 * sure we exit gracefully rather than endlessly redo the
140 * fault.
141 */
142 switch (handle_mm_fault(mm, vma, address, (mask & VM_WRITE) != 0)) {
143 case VM_FAULT_MINOR:
144 ++current->min_flt;
145 break;
146 case VM_FAULT_MAJOR:
147 ++current->maj_flt;
148 break;
149 case VM_FAULT_SIGBUS:
150 /*
151 * We ran out of memory, or some other thing happened
152 * to us that made us unable to handle the page fault
153 * gracefully.
154 */
155 signal = SIGBUS;
156 goto bad_area;
157 case VM_FAULT_OOM:
158 goto out_of_memory;
159 default:
160 BUG();
161 }
162 up_read(&mm->mmap_sem);
163 return;
164
165 check_expansion:
166 if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) {
167 if (!(vma->vm_flags & VM_GROWSDOWN))
168 goto bad_area;
169 if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
170 || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
171 goto bad_area;
172 if (expand_stack(vma, address))
173 goto bad_area;
174 } else {
175 vma = prev_vma;
176 if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
177 || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
178 goto bad_area;
179 if (expand_backing_store(vma, address))
180 goto bad_area;
181 }
182 goto good_area;
183
184 bad_area:
185 up_read(&mm->mmap_sem);
186#ifdef CONFIG_VIRTUAL_MEM_MAP
187 bad_area_no_up:
188#endif
189 if ((isr & IA64_ISR_SP)
190 || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH))
191 {
192 /*
193 * This fault was due to a speculative load or lfetch.fault, set the "ed"
194 * bit in the psr to ensure forward progress. (Target register will get a
195 * NaT for ld.s, lfetch will be canceled.)
196 */
197 ia64_psr(regs)->ed = 1;
198 return;
199 }
200 if (user_mode(regs)) {
201 si.si_signo = signal;
202 si.si_errno = 0;
203 si.si_code = code;
204 si.si_addr = (void __user *) address;
205 si.si_isr = isr;
206 si.si_flags = __ISR_VALID;
207 force_sig_info(signal, &si, current);
208 return;
209 }
210
211 no_context:
212 if (isr & IA64_ISR_SP) {
213 /*
214		 * This fault was due to a speculative load; set the "ed" bit in the psr to
215 * ensure forward progress (target register will get a NaT).
216 */
217 ia64_psr(regs)->ed = 1;
218 return;
219 }
220
221 if (ia64_done_with_exception(regs))
222 return;
223
224 /*
225 * Since we have no vma's for region 5, we might get here even if the address is
226 * valid, due to the VHPT walker inserting a non present translation that becomes
227 * stale. If that happens, the non present fault handler already purged the stale
228 * translation, which fixed the problem. So, we check to see if the translation is
229 * valid, and return if it is.
230 */
231 if (REGION_NUMBER(address) == 5 && mapped_kernel_page_is_present(address))
232 return;
233
234 /*
235 * Oops. The kernel tried to access some bad page. We'll have to terminate things
236 * with extreme prejudice.
237 */
238 bust_spinlocks(1);
239
240 if (address < PAGE_SIZE)
241 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference (address %016lx)\n", address);
242 else
243 printk(KERN_ALERT "Unable to handle kernel paging request at "
244 "virtual address %016lx\n", address);
245 die("Oops", regs, isr);
246 bust_spinlocks(0);
247 do_exit(SIGKILL);
248 return;
249
250 out_of_memory:
251 up_read(&mm->mmap_sem);
252 if (current->pid == 1) {
253 yield();
254 down_read(&mm->mmap_sem);
255 goto survive;
256 }
257 printk(KERN_CRIT "VM: killing process %s\n", current->comm);
258 if (user_mode(regs))
259 do_exit(SIGKILL);
260 goto no_context;
261}
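
(Not part of the patch: a userspace sketch of the permission check in
ia64_do_page_fault() above, where the faulting access type is folded into a
vm_flags-style mask and compared against the VMA. The ISR bit positions below
are placeholders, not the real IA64_ISR_*_BIT values; only VM_READ, VM_WRITE
and VM_EXEC occupying bits 0-2 is taken from the code above.)

#include <stdio.h>

#define VM_READ		0x1UL
#define VM_WRITE	0x2UL
#define VM_EXEC		0x4UL

/* placeholder ISR bit positions, for illustration only */
#define ISR_X_BIT	8
#define ISR_W_BIT	9
#define ISR_R_BIT	10

static unsigned long access_mask(unsigned long isr)
{
	return (((isr >> ISR_X_BIT) & 1UL) << 2)	/* VM_EXEC  */
	     | (((isr >> ISR_W_BIT) & 1UL) << 1)	/* VM_WRITE */
	     | (((isr >> ISR_R_BIT) & 1UL) << 0);	/* VM_READ  */
}

int main(void)
{
	unsigned long vm_flags = VM_READ | VM_EXEC;	/* e.g. a text mapping */
	unsigned long isr = 1UL << ISR_W_BIT;		/* a store faulted */
	unsigned long mask = access_mask(isr);

	if ((vm_flags & mask) != mask)
		printf("SEGV_ACCERR: store to a mapping without VM_WRITE\n");
	else
		printf("access allowed, fault would be handled normally\n");
	return 0;
}
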
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
new file mode 100644
index 000000000000..40ad8328ffd5
--- /dev/null
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -0,0 +1,357 @@
1/*
2 * IA-64 Huge TLB Page Support for Kernel.
3 *
4 * Copyright (C) 2002-2004 Rohit Seth <rohit.seth@intel.com>
5 * Copyright (C) 2003-2004 Ken Chen <kenneth.w.chen@intel.com>
6 *
7 * Sep, 2003: add numa support
8 * Feb, 2004: dynamic hugetlb page size via boot parameter
9 */
10
11#include <linux/config.h>
12#include <linux/init.h>
13#include <linux/fs.h>
14#include <linux/mm.h>
15#include <linux/hugetlb.h>
16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18#include <linux/slab.h>
19#include <linux/sysctl.h>
20#include <asm/mman.h>
21#include <asm/pgalloc.h>
22#include <asm/tlb.h>
23#include <asm/tlbflush.h>
24
25unsigned int hpage_shift=HPAGE_SHIFT_DEFAULT;
26
27static pte_t *
28huge_pte_alloc (struct mm_struct *mm, unsigned long addr)
29{
30 unsigned long taddr = htlbpage_to_page(addr);
31 pgd_t *pgd;
32 pud_t *pud;
33 pmd_t *pmd;
34 pte_t *pte = NULL;
35
36 pgd = pgd_offset(mm, taddr);
37 pud = pud_alloc(mm, pgd, taddr);
38 if (pud) {
39 pmd = pmd_alloc(mm, pud, taddr);
40 if (pmd)
41 pte = pte_alloc_map(mm, pmd, taddr);
42 }
43 return pte;
44}
45
46static pte_t *
47huge_pte_offset (struct mm_struct *mm, unsigned long addr)
48{
49 unsigned long taddr = htlbpage_to_page(addr);
50 pgd_t *pgd;
51 pud_t *pud;
52 pmd_t *pmd;
53 pte_t *pte = NULL;
54
55 pgd = pgd_offset(mm, taddr);
56 if (pgd_present(*pgd)) {
57 pud = pud_offset(pgd, taddr);
58 if (pud_present(*pud)) {
59 pmd = pmd_offset(pud, taddr);
60 if (pmd_present(*pmd))
61 pte = pte_offset_map(pmd, taddr);
62 }
63 }
64
65 return pte;
66}
67
68#define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; }
69
70static void
71set_huge_pte (struct mm_struct *mm, struct vm_area_struct *vma,
72 struct page *page, pte_t * page_table, int write_access)
73{
74 pte_t entry;
75
76 add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
77 if (write_access) {
78 entry =
79 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
80 } else
81 entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
82 entry = pte_mkyoung(entry);
83 mk_pte_huge(entry);
84 set_pte(page_table, entry);
85 return;
86}
87/*
88 * This function checks for proper alignment of input addr and len parameters.
89 */
90int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
91{
92 if (len & ~HPAGE_MASK)
93 return -EINVAL;
94 if (addr & ~HPAGE_MASK)
95 return -EINVAL;
96 if (REGION_NUMBER(addr) != REGION_HPAGE)
97 return -EINVAL;
98
99 return 0;
100}
101
102int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
103 struct vm_area_struct *vma)
104{
105 pte_t *src_pte, *dst_pte, entry;
106 struct page *ptepage;
107 unsigned long addr = vma->vm_start;
108 unsigned long end = vma->vm_end;
109
110 while (addr < end) {
111 dst_pte = huge_pte_alloc(dst, addr);
112 if (!dst_pte)
113 goto nomem;
114 src_pte = huge_pte_offset(src, addr);
115 entry = *src_pte;
116 ptepage = pte_page(entry);
117 get_page(ptepage);
118 set_pte(dst_pte, entry);
119 add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
120 addr += HPAGE_SIZE;
121 }
122 return 0;
123nomem:
124 return -ENOMEM;
125}
126
127int
128follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
129 struct page **pages, struct vm_area_struct **vmas,
130 unsigned long *st, int *length, int i)
131{
132 pte_t *ptep, pte;
133 unsigned long start = *st;
134 unsigned long pstart;
135 int len = *length;
136 struct page *page;
137
138 do {
139 pstart = start & HPAGE_MASK;
140 ptep = huge_pte_offset(mm, start);
141 pte = *ptep;
142
143back1:
144 page = pte_page(pte);
145 if (pages) {
146 page += ((start & ~HPAGE_MASK) >> PAGE_SHIFT);
147 get_page(page);
148 pages[i] = page;
149 }
150 if (vmas)
151 vmas[i] = vma;
152 i++;
153 len--;
154 start += PAGE_SIZE;
155 if (((start & HPAGE_MASK) == pstart) && len &&
156 (start < vma->vm_end))
157 goto back1;
158 } while (len && start < vma->vm_end);
159 *length = len;
160 *st = start;
161 return i;
162}
163
164struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write)
165{
166 struct page *page;
167 pte_t *ptep;
168
169 if (REGION_NUMBER(addr) != REGION_HPAGE)
170 return ERR_PTR(-EINVAL);
171
172 ptep = huge_pte_offset(mm, addr);
173 if (!ptep || pte_none(*ptep))
174 return NULL;
175 page = pte_page(*ptep);
176 page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
177 return page;
178}
179int pmd_huge(pmd_t pmd)
180{
181 return 0;
182}
183struct page *
184follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write)
185{
186 return NULL;
187}
188
189/*
190 * Same as generic free_pgtables(), except constant PGDIR_* and pgd_offset
191 * are hugetlb region specific.
192 */
193void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
194 unsigned long start, unsigned long end)
195{
196 unsigned long first = start & HUGETLB_PGDIR_MASK;
197 unsigned long last = end + HUGETLB_PGDIR_SIZE - 1;
198 struct mm_struct *mm = tlb->mm;
199
200 if (!prev) {
201 prev = mm->mmap;
202 if (!prev)
203 goto no_mmaps;
204 if (prev->vm_end > start) {
205 if (last > prev->vm_start)
206 last = prev->vm_start;
207 goto no_mmaps;
208 }
209 }
210 for (;;) {
211 struct vm_area_struct *next = prev->vm_next;
212
213 if (next) {
214 if (next->vm_start < start) {
215 prev = next;
216 continue;
217 }
218 if (last > next->vm_start)
219 last = next->vm_start;
220 }
221 if (prev->vm_end > first)
222 first = prev->vm_end;
223 break;
224 }
225no_mmaps:
226 if (last < first) /* for arches with discontiguous pgd indices */
227 return;
228 clear_page_range(tlb, first, last);
229}
230
231void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
232{
233 struct mm_struct *mm = vma->vm_mm;
234 unsigned long address;
235 pte_t *pte;
236 struct page *page;
237
238 BUG_ON(start & (HPAGE_SIZE - 1));
239 BUG_ON(end & (HPAGE_SIZE - 1));
240
241 for (address = start; address < end; address += HPAGE_SIZE) {
242 pte = huge_pte_offset(mm, address);
243 if (pte_none(*pte))
244 continue;
245 page = pte_page(*pte);
246 put_page(page);
247 pte_clear(mm, address, pte);
248 }
249 add_mm_counter(mm, rss, - ((end - start) >> PAGE_SHIFT));
250 flush_tlb_range(vma, start, end);
251}
252
253int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
254{
255 struct mm_struct *mm = current->mm;
256 unsigned long addr;
257 int ret = 0;
258
259 BUG_ON(vma->vm_start & ~HPAGE_MASK);
260 BUG_ON(vma->vm_end & ~HPAGE_MASK);
261
262 spin_lock(&mm->page_table_lock);
263 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
264 unsigned long idx;
265 pte_t *pte = huge_pte_alloc(mm, addr);
266 struct page *page;
267
268 if (!pte) {
269 ret = -ENOMEM;
270 goto out;
271 }
272 if (!pte_none(*pte))
273 continue;
274
275 idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
276 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
277 page = find_get_page(mapping, idx);
278 if (!page) {
279 /* charge the fs quota first */
280 if (hugetlb_get_quota(mapping)) {
281 ret = -ENOMEM;
282 goto out;
283 }
284 page = alloc_huge_page();
285 if (!page) {
286 hugetlb_put_quota(mapping);
287 ret = -ENOMEM;
288 goto out;
289 }
290 ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
291 if (! ret) {
292 unlock_page(page);
293 } else {
294 hugetlb_put_quota(mapping);
295 page_cache_release(page);
296 goto out;
297 }
298 }
299 set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
300 }
301out:
302 spin_unlock(&mm->page_table_lock);
303 return ret;
304}
305
306unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
307 unsigned long pgoff, unsigned long flags)
308{
309 struct vm_area_struct *vmm;
310
311 if (len > RGN_MAP_LIMIT)
312 return -ENOMEM;
313 if (len & ~HPAGE_MASK)
314 return -EINVAL;
315 /* This code assumes that REGION_HPAGE != 0. */
316 if ((REGION_NUMBER(addr) != REGION_HPAGE) || (addr & (HPAGE_SIZE - 1)))
317 addr = HPAGE_REGION_BASE;
318 else
319 addr = ALIGN(addr, HPAGE_SIZE);
320 for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
321 /* At this point: (!vmm || addr < vmm->vm_end). */
322 if (REGION_OFFSET(addr) + len > RGN_MAP_LIMIT)
323 return -ENOMEM;
324 if (!vmm || (addr + len) <= vmm->vm_start)
325 return addr;
326 addr = ALIGN(vmm->vm_end, HPAGE_SIZE);
327 }
328}
329
330static int __init hugetlb_setup_sz(char *str)
331{
332 u64 tr_pages;
333 unsigned long long size;
334
335 if (ia64_pal_vm_page_size(&tr_pages, NULL) != 0)
336 /*
337 * shouldn't happen, but just in case.
338 */
339 tr_pages = 0x15557000UL;
340
341 size = memparse(str, &str);
342 if (*str || (size & (size-1)) || !(tr_pages & size) ||
343 size <= PAGE_SIZE ||
344 size >= (1UL << PAGE_SHIFT << MAX_ORDER)) {
345 printk(KERN_WARNING "Invalid huge page size specified\n");
346 return 1;
347 }
348
349 hpage_shift = __ffs(size);
350 /*
351	 * The boot CPU has already executed ia64_mmu_init() with HPAGE_SHIFT_DEFAULT;
352	 * override the region register here with the new page shift.
353 */
354 ia64_set_rr(HPAGE_REGION_BASE, hpage_shift << 2);
355 return 1;
356}
357__setup("hugepagesz=", hugetlb_setup_sz);
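
(Not part of the patch: a standalone sketch of the sanity checks
hugetlb_setup_sz() applies to a "hugepagesz=" boot parameter: the size must be
a power of two, insertable by the TLB (the tr_pages bitmask), larger than a
base page, and small enough for the buddy allocator. PAGE_SHIFT and MAX_ORDER
below are assumed demo values; the fallback tr_pages mask is the one used
above.)

#include <stdio.h>

#define PAGE_SHIFT	14	/* assumed demo value */
#define MAX_ORDER	11	/* assumed demo value */

static int valid_hugepage_size(unsigned long long size, unsigned long long tr_pages)
{
	if (size & (size - 1))				/* not a power of two */
		return 0;
	if (!(tr_pages & size))				/* TLB cannot insert this size */
		return 0;
	if (size <= (1ULL << PAGE_SHIFT))		/* not larger than a base page */
		return 0;
	if (size >= (1ULL << PAGE_SHIFT << MAX_ORDER))	/* too big for the buddy allocator */
		return 0;
	return 1;
}

/* For a power of two, this is the page shift that __ffs(size) would yield. */
static int shift_of(unsigned long long size)
{
	int shift = 0;

	while (size > 1) {
		size >>= 1;
		shift++;
	}
	return shift;
}

int main(void)
{
	unsigned long long tr_pages = 0x15557000ULL;	/* fallback mask from the code above */
	unsigned long long size = 16ULL << 20;		/* hugepagesz=16M */

	if (valid_hugepage_size(size, tr_pages))
		printf("ok: hpage_shift would become %d\n", shift_of(size));
	else
		printf("invalid huge page size\n");
	return 0;
}
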
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
new file mode 100644
index 000000000000..65cf839573ea
--- /dev/null
+++ b/arch/ia64/mm/init.c
@@ -0,0 +1,597 @@
1/*
2 * Initialize MMU support.
3 *
4 * Copyright (C) 1998-2003 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com>
6 */
7#include <linux/config.h>
8#include <linux/kernel.h>
9#include <linux/init.h>
10
11#include <linux/bootmem.h>
12#include <linux/efi.h>
13#include <linux/elf.h>
14#include <linux/mm.h>
15#include <linux/mmzone.h>
16#include <linux/module.h>
17#include <linux/personality.h>
18#include <linux/reboot.h>
19#include <linux/slab.h>
20#include <linux/swap.h>
21#include <linux/proc_fs.h>
22#include <linux/bitops.h>
23
24#include <asm/a.out.h>
25#include <asm/dma.h>
26#include <asm/ia32.h>
27#include <asm/io.h>
28#include <asm/machvec.h>
29#include <asm/numa.h>
30#include <asm/patch.h>
31#include <asm/pgalloc.h>
32#include <asm/sal.h>
33#include <asm/sections.h>
34#include <asm/system.h>
35#include <asm/tlb.h>
36#include <asm/uaccess.h>
37#include <asm/unistd.h>
38#include <asm/mca.h>
39
40DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
41
42extern void ia64_tlb_init (void);
43
44unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL;
45
46#ifdef CONFIG_VIRTUAL_MEM_MAP
47unsigned long vmalloc_end = VMALLOC_END_INIT;
48EXPORT_SYMBOL(vmalloc_end);
49struct page *vmem_map;
50EXPORT_SYMBOL(vmem_map);
51#endif
52
53static int pgt_cache_water[2] = { 25, 50 };
54
55struct page *zero_page_memmap_ptr; /* map entry for zero page */
56EXPORT_SYMBOL(zero_page_memmap_ptr);
57
58void
59check_pgt_cache (void)
60{
61 int low, high;
62
63 low = pgt_cache_water[0];
64 high = pgt_cache_water[1];
65
66 preempt_disable();
67 if (pgtable_cache_size > (u64) high) {
68 do {
69 if (pgd_quicklist)
70 free_page((unsigned long)pgd_alloc_one_fast(NULL));
71 if (pmd_quicklist)
72 free_page((unsigned long)pmd_alloc_one_fast(NULL, 0));
73 } while (pgtable_cache_size > (u64) low);
74 }
75 preempt_enable();
76}
77
78void
79lazy_mmu_prot_update (pte_t pte)
80{
81 unsigned long addr;
82 struct page *page;
83
84 if (!pte_exec(pte))
85 return; /* not an executable page... */
86
87 page = pte_page(pte);
88 addr = (unsigned long) page_address(page);
89
90 if (test_bit(PG_arch_1, &page->flags))
91 return; /* i-cache is already coherent with d-cache */
92
93 flush_icache_range(addr, addr + PAGE_SIZE);
94 set_bit(PG_arch_1, &page->flags); /* mark page as clean */
95}
96
97inline void
98ia64_set_rbs_bot (void)
99{
100 unsigned long stack_size = current->signal->rlim[RLIMIT_STACK].rlim_max & -16;
101
102 if (stack_size > MAX_USER_STACK_SIZE)
103 stack_size = MAX_USER_STACK_SIZE;
104 current->thread.rbs_bot = STACK_TOP - stack_size;
105}
106
107/*
108 * This performs some platform-dependent address space initialization.
109 * On IA-64, we want to setup the VM area for the register backing
110 * store (which grows upwards) and install the gateway page which is
111 * used for signal trampolines, etc.
112 */
113void
114ia64_init_addr_space (void)
115{
116 struct vm_area_struct *vma;
117
118 ia64_set_rbs_bot();
119
120 /*
121 * If we're out of memory and kmem_cache_alloc() returns NULL, we simply ignore
122 * the problem. When the process attempts to write to the register backing store
123 * for the first time, it will get a SEGFAULT in this case.
124 */
125 vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
126 if (vma) {
127 memset(vma, 0, sizeof(*vma));
128 vma->vm_mm = current->mm;
129 vma->vm_start = current->thread.rbs_bot & PAGE_MASK;
130 vma->vm_end = vma->vm_start + PAGE_SIZE;
131 vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7];
132 vma->vm_flags = VM_DATA_DEFAULT_FLAGS | VM_GROWSUP;
133 down_write(&current->mm->mmap_sem);
134 if (insert_vm_struct(current->mm, vma)) {
135 up_write(&current->mm->mmap_sem);
136 kmem_cache_free(vm_area_cachep, vma);
137 return;
138 }
139 up_write(&current->mm->mmap_sem);
140 }
141
142 /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */
143 if (!(current->personality & MMAP_PAGE_ZERO)) {
144 vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
145 if (vma) {
146 memset(vma, 0, sizeof(*vma));
147 vma->vm_mm = current->mm;
148 vma->vm_end = PAGE_SIZE;
149 vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT);
150 vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED;
151 down_write(&current->mm->mmap_sem);
152 if (insert_vm_struct(current->mm, vma)) {
153 up_write(&current->mm->mmap_sem);
154 kmem_cache_free(vm_area_cachep, vma);
155 return;
156 }
157 up_write(&current->mm->mmap_sem);
158 }
159 }
160}
161
162void
163free_initmem (void)
164{
165 unsigned long addr, eaddr;
166
167 addr = (unsigned long) ia64_imva(__init_begin);
168 eaddr = (unsigned long) ia64_imva(__init_end);
169 while (addr < eaddr) {
170 ClearPageReserved(virt_to_page(addr));
171 set_page_count(virt_to_page(addr), 1);
172 free_page(addr);
173 ++totalram_pages;
174 addr += PAGE_SIZE;
175 }
176 printk(KERN_INFO "Freeing unused kernel memory: %ldkB freed\n",
177 (__init_end - __init_begin) >> 10);
178}
179
180void
181free_initrd_mem (unsigned long start, unsigned long end)
182{
183 struct page *page;
184 /*
185 * EFI uses 4KB pages while the kernel can use 4KB or bigger.
186 * Thus EFI and the kernel may have different page sizes. It is
187 * therefore possible to have the initrd share the same page as
188 * the end of the kernel (given current setup).
189 *
190 * To avoid freeing/using the wrong page (kernel sized) we:
191 * - align up the beginning of initrd
192 * - align down the end of initrd
193 *
194 * | |
195 * |=============| a000
196 * | |
197 * | |
198 * | | 9000
199 * |/////////////|
200 * |/////////////|
201 * |=============| 8000
202 * |///INITRD////|
203 * |/////////////|
204 * |/////////////| 7000
205 * | |
206 * |KKKKKKKKKKKKK|
207 * |=============| 6000
208 * |KKKKKKKKKKKKK|
209 * |KKKKKKKKKKKKK|
210 * K=kernel using 8KB pages
211 *
212 * In this example, we must free page 8000 ONLY. So we must align up
213 * initrd_start and keep initrd_end as is.
214 */
215 start = PAGE_ALIGN(start);
216 end = end & PAGE_MASK;
217
218 if (start < end)
219 printk(KERN_INFO "Freeing initrd memory: %ldkB freed\n", (end - start) >> 10);
220
221 for (; start < end; start += PAGE_SIZE) {
222 if (!virt_addr_valid(start))
223 continue;
224 page = virt_to_page(start);
225 ClearPageReserved(page);
226 set_page_count(page, 1);
227 free_page(start);
228 ++totalram_pages;
229 }
230}
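
Editorial aside: the trimming above is plain mask arithmetic and can be checked outside the kernel. Below is a minimal userspace sketch; the 8 KB kernel page size matches the diagram, while the start/end addresses are made-up values chosen only so that exactly one whole kernel page survives the trim. None of the macros are taken from kernel headers.

/* Illustrative only: mirrors the PAGE_ALIGN/PAGE_MASK trimming in free_initrd_mem(). */
#include <stdio.h>

#define PAGE_SHIFT	13			/* hypothetical 8 KB kernel pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PAGE_ALIGN(a)	(((a) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	unsigned long start = 0x7400;		/* made-up initrd_start (not page aligned) */
	unsigned long end   = 0xa000;		/* made-up initrd_end (already aligned) */

	start = PAGE_ALIGN(start);		/* 0x7400 -> 0x8000: skip the page shared with the kernel */
	end  &= PAGE_MASK;			/* 0xa000 stays 0xa000 */

	printf("would free %lu kernel page(s): [%#lx, %#lx)\n",
	       (end - start) / PAGE_SIZE, start, end);	/* 1 page: [0x8000, 0xa000) */
	return 0;
}
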
231
232/*
233 * This installs a clean page in the kernel's page table.
234 */
235struct page *
236put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
237{
238 pgd_t *pgd;
239 pud_t *pud;
240 pmd_t *pmd;
241 pte_t *pte;
242
243 if (!PageReserved(page))
244 printk(KERN_ERR "put_kernel_page: page at 0x%p not in reserved memory\n",
245 page_address(page));
246
247 pgd = pgd_offset_k(address); /* note: this is NOT pgd_offset()! */
248
249 spin_lock(&init_mm.page_table_lock);
250 {
251 pud = pud_alloc(&init_mm, pgd, address);
252 if (!pud)
253 goto out;
254
255 pmd = pmd_alloc(&init_mm, pud, address);
256 if (!pmd)
257 goto out;
258 pte = pte_alloc_map(&init_mm, pmd, address);
259 if (!pte)
260 goto out;
261 if (!pte_none(*pte)) {
262 pte_unmap(pte);
263 goto out;
264 }
265 set_pte(pte, mk_pte(page, pgprot));
266 pte_unmap(pte);
267 }
268 out: spin_unlock(&init_mm.page_table_lock);
269 /* no need for flush_tlb */
270 return page;
271}
272
273static void
274setup_gate (void)
275{
276 struct page *page;
277
278 /*
279	 * Map the gate page twice: once read-only to export the ELF headers etc. and
280	 * once as an execute-only page to enable privilege promotion via "epc":
281 */
282 page = virt_to_page(ia64_imva(__start_gate_section));
283 put_kernel_page(page, GATE_ADDR, PAGE_READONLY);
284#ifdef HAVE_BUGGY_SEGREL
285 page = virt_to_page(ia64_imva(__start_gate_section + PAGE_SIZE));
286 put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE);
287#else
288 put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE);
289#endif
290 ia64_patch_gate();
291}
292
293void __devinit
294ia64_mmu_init (void *my_cpu_data)
295{
296 unsigned long psr, pta, impl_va_bits;
297 extern void __devinit tlb_init (void);
298
299#ifdef CONFIG_DISABLE_VHPT
300# define VHPT_ENABLE_BIT 0
301#else
302# define VHPT_ENABLE_BIT 1
303#endif
304
305 /* Pin mapping for percpu area into TLB */
306 psr = ia64_clear_ic();
307 ia64_itr(0x2, IA64_TR_PERCPU_DATA, PERCPU_ADDR,
308 pte_val(pfn_pte(__pa(my_cpu_data) >> PAGE_SHIFT, PAGE_KERNEL)),
309 PERCPU_PAGE_SHIFT);
310
311 ia64_set_psr(psr);
312 ia64_srlz_i();
313
314 /*
315 * Check if the virtually mapped linear page table (VMLPT) overlaps with a mapped
316 * address space. The IA-64 architecture guarantees that at least 50 bits of
317 * virtual address space are implemented but if we pick a large enough page size
318 * (e.g., 64KB), the mapped address space is big enough that it will overlap with
319 * VMLPT. I assume that once we run on machines big enough to warrant 64KB pages,
320 * IMPL_VA_MSB will be significantly bigger, so this is unlikely to become a
321 * problem in practice. Alternatively, we could truncate the top of the mapped
322 * address space to not permit mappings that would overlap with the VMLPT.
323 * --davidm 00/12/06
324 */
325# define pte_bits 3
326# define mapped_space_bits (3*(PAGE_SHIFT - pte_bits) + PAGE_SHIFT)
327 /*
328 * The virtual page table has to cover the entire implemented address space within
329 * a region even though not all of this space may be mappable. The reason for
330 * this is that the Access bit and Dirty bit fault handlers perform
331 * non-speculative accesses to the virtual page table, so the address range of the
332	 * virtual page table itself needs to be covered by the virtual page table.
333 */
334# define vmlpt_bits (impl_va_bits - PAGE_SHIFT + pte_bits)
335# define POW2(n) (1ULL << (n))
336
337 impl_va_bits = ffz(~(local_cpu_data->unimpl_va_mask | (7UL << 61)));
338
339 if (impl_va_bits < 51 || impl_va_bits > 61)
340 panic("CPU has bogus IMPL_VA_MSB value of %lu!\n", impl_va_bits - 1);
341
342 /* place the VMLPT at the end of each page-table mapped region: */
343 pta = POW2(61) - POW2(vmlpt_bits);
344
345 if (POW2(mapped_space_bits) >= pta)
346 panic("mm/init: overlap between virtually mapped linear page table and "
347 "mapped kernel space!");
348 /*
349 * Set the (virtually mapped linear) page table address. Bit
350 * 8 selects between the short and long format, bits 2-7 the
351 * size of the table, and bit 0 whether the VHPT walker is
352 * enabled.
353 */
354 ia64_set_pta(pta | (0 << 8) | (vmlpt_bits << 2) | VHPT_ENABLE_BIT);
355
356 ia64_tlb_init();
357
358#ifdef CONFIG_HUGETLB_PAGE
359 ia64_set_rr(HPAGE_REGION_BASE, HPAGE_SHIFT << 2);
360 ia64_srlz_d();
361#endif
362}
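
Editorial aside: for readers following the PTA arithmetic, the sketch below redoes the same computation in userspace for one hypothetical configuration (16 KB pages, 51 implemented virtual-address bits); the constants are example values, not taken from any particular CPU.

/* Illustrative only: recomputes mapped_space_bits, vmlpt_bits and the PTA value
 * for an assumed PAGE_SHIFT of 14 and 51 implemented virtual-address bits. */
#include <stdio.h>

#define PAGE_SHIFT	14			/* hypothetical 16 KB pages */
#define pte_bits	3			/* log2(sizeof(pte_t)) == log2(8) */
#define POW2(n)		(1ULL << (n))

int main(void)
{
	unsigned long long impl_va_bits = 51;	/* example: IMPL_VA_MSB == 50 */
	unsigned long long mapped_space_bits = 3 * (PAGE_SHIFT - pte_bits) + PAGE_SHIFT;
	unsigned long long vmlpt_bits = impl_va_bits - PAGE_SHIFT + pte_bits;
	unsigned long long pta = POW2(61) - POW2(vmlpt_bits);

	printf("mapped_space_bits = %llu\n", mapped_space_bits);	/* 47 */
	printf("vmlpt_bits        = %llu\n", vmlpt_bits);		/* 40 */
	printf("VMLPT base        = %#llx\n", pta);			/* 2^61 - 2^40 */
	printf("overlap check     : %s\n",
	       POW2(mapped_space_bits) >= pta ? "OVERLAP!" : "ok");
	/* value written to the PTA register: base | size<<2 | short format | walker enabled */
	printf("PTA register      = %#llx\n", pta | (0ULL << 8) | (vmlpt_bits << 2) | 1ULL);
	return 0;
}

With these example numbers the overlap check passes; the panic above fires only when the mapped kernel space would reach into the VMLPT window.
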
363
364#ifdef CONFIG_VIRTUAL_MEM_MAP
365
366int
367create_mem_map_page_table (u64 start, u64 end, void *arg)
368{
369 unsigned long address, start_page, end_page;
370 struct page *map_start, *map_end;
371 int node;
372 pgd_t *pgd;
373 pud_t *pud;
374 pmd_t *pmd;
375 pte_t *pte;
376
377 map_start = vmem_map + (__pa(start) >> PAGE_SHIFT);
378 map_end = vmem_map + (__pa(end) >> PAGE_SHIFT);
379
380 start_page = (unsigned long) map_start & PAGE_MASK;
381 end_page = PAGE_ALIGN((unsigned long) map_end);
382 node = paddr_to_nid(__pa(start));
383
384 for (address = start_page; address < end_page; address += PAGE_SIZE) {
385 pgd = pgd_offset_k(address);
386 if (pgd_none(*pgd))
387 pgd_populate(&init_mm, pgd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
388 pud = pud_offset(pgd, address);
389
390 if (pud_none(*pud))
391 pud_populate(&init_mm, pud, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
392 pmd = pmd_offset(pud, address);
393
394 if (pmd_none(*pmd))
395 pmd_populate_kernel(&init_mm, pmd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
396 pte = pte_offset_kernel(pmd, address);
397
398 if (pte_none(*pte))
399 set_pte(pte, pfn_pte(__pa(alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)) >> PAGE_SHIFT,
400 PAGE_KERNEL));
401 }
402 return 0;
403}
404
405struct memmap_init_callback_data {
406 struct page *start;
407 struct page *end;
408 int nid;
409 unsigned long zone;
410};
411
412static int
413virtual_memmap_init (u64 start, u64 end, void *arg)
414{
415 struct memmap_init_callback_data *args;
416 struct page *map_start, *map_end;
417
418 args = (struct memmap_init_callback_data *) arg;
419 map_start = vmem_map + (__pa(start) >> PAGE_SHIFT);
420 map_end = vmem_map + (__pa(end) >> PAGE_SHIFT);
421
422 if (map_start < args->start)
423 map_start = args->start;
424 if (map_end > args->end)
425 map_end = args->end;
426
427 /*
428 * We have to initialize "out of bounds" struct page elements that fit completely
429 * on the same pages that were allocated for the "in bounds" elements because they
430 * may be referenced later (and found to be "reserved").
431 */
432 map_start -= ((unsigned long) map_start & (PAGE_SIZE - 1)) / sizeof(struct page);
433 map_end += ((PAGE_ALIGN((unsigned long) map_end) - (unsigned long) map_end)
434 / sizeof(struct page));
435
436 if (map_start < map_end)
437 memmap_init_zone((unsigned long)(map_end - map_start),
438 args->nid, args->zone, page_to_pfn(map_start));
439 return 0;
440}
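
Editorial aside: the widening of [map_start, map_end) above is just rounding in units of sizeof(struct page). The sketch below redoes it with byte addresses and a made-up descriptor size; the kernel version does the same thing with struct page pointer arithmetic, so no multiplication back into bytes is needed there.

/* Illustrative only: widens a descriptor range to cover full backing pages.
 * DESC_SIZE stands in for sizeof(struct page); all numbers are made up. */
#include <stdio.h>

#define PAGE_SHIFT	14			/* hypothetical 16 KB pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_ALIGN(a)	(((a) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define DESC_SIZE	56UL			/* assumed sizeof(struct page) */

int main(void)
{
	/* byte addresses of the first/last descriptor of one memory chunk */
	unsigned long map_start = 0xa0000000UL + 5 * DESC_SIZE;
	unsigned long map_end   = map_start + 10000 * DESC_SIZE;

	/* pull the start back over the descriptors sharing its backing page... */
	map_start -= (map_start & (PAGE_SIZE - 1)) / DESC_SIZE * DESC_SIZE;
	/* ...and push the end forward over those filling out its last page */
	map_end   += (PAGE_ALIGN(map_end) - map_end) / DESC_SIZE * DESC_SIZE;

	printf("initializing %lu descriptors instead of 10000\n",
	       (map_end - map_start) / DESC_SIZE);	/* 10240 with these numbers */
	return 0;
}
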
441
442void
443memmap_init (unsigned long size, int nid, unsigned long zone,
444 unsigned long start_pfn)
445{
446 if (!vmem_map)
447 memmap_init_zone(size, nid, zone, start_pfn);
448 else {
449 struct page *start;
450 struct memmap_init_callback_data args;
451
452 start = pfn_to_page(start_pfn);
453 args.start = start;
454 args.end = start + size;
455 args.nid = nid;
456 args.zone = zone;
457
458 efi_memmap_walk(virtual_memmap_init, &args);
459 }
460}
461
462int
463ia64_pfn_valid (unsigned long pfn)
464{
465 char byte;
466 struct page *pg = pfn_to_page(pfn);
467
468 return (__get_user(byte, (char __user *) pg) == 0)
469 && ((((u64)pg & PAGE_MASK) == (((u64)(pg + 1) - 1) & PAGE_MASK))
470 || (__get_user(byte, (char __user *) (pg + 1) - 1) == 0));
471}
472EXPORT_SYMBOL(ia64_pfn_valid);
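
Editorial aside: the second comparison above is a generic "does this object cross a page boundary" test, which decides whether the last byte of the struct page must be probed separately. A minimal standalone version of just that test, with a hypothetical page size and object size:

/* Illustrative only: the boundary test used by ia64_pfn_valid() above. */
#include <stdio.h>

#define PAGE_SHIFT	14			/* hypothetical 16 KB pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* nonzero when [addr, addr + size) stays within a single page */
static int fits_in_one_page(unsigned long addr, unsigned long size)
{
	return (addr & PAGE_MASK) == ((addr + size - 1) & PAGE_MASK);
}

int main(void)
{
	unsigned long desc = 56;		/* assumed sizeof(struct page) */

	printf("%d\n", fits_in_one_page(0x4000, desc));	/* 1: well inside a page */
	printf("%d\n", fits_in_one_page(0x7ff0, desc));	/* 0: straddles 0x8000 */
	return 0;
}
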
473
474int
475find_largest_hole (u64 start, u64 end, void *arg)
476{
477 u64 *max_gap = arg;
478
479 static u64 last_end = PAGE_OFFSET;
480
481	/* NOTE: this algorithm assumes the EFI memmap table is ordered */
482
483 if (*max_gap < (start - last_end))
484 *max_gap = start - last_end;
485 last_end = end;
486 return 0;
487}
488#endif /* CONFIG_VIRTUAL_MEM_MAP */
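
Editorial aside: because the callback relies on the EFI memory map being sorted, the whole scan reduces to remembering the previous range's end. A standalone rendering of that idea over a made-up, ordered list of ranges (the kernel version starts last_end at PAGE_OFFSET; this sketch simply starts at 0):

/* Illustrative only: largest gap between ordered [start, end) ranges,
 * the same bookkeeping find_largest_hole() does one callback at a time. */
#include <stdio.h>

struct range { unsigned long start, end; };

int main(void)
{
	/* made-up, ordered descriptors (think: EFI memory map entries) */
	struct range map[] = {
		{ 0x00100000, 0x3ff00000 },
		{ 0x40000000, 0x7fe00000 },
		{ 0xc0000000, 0xffe00000 },	/* note the hole before this one */
	};
	unsigned long last_end = 0, max_gap = 0;

	for (unsigned i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
		if (map[i].start - last_end > max_gap)
			max_gap = map[i].start - last_end;
		last_end = map[i].end;
	}
	printf("largest hole: %#lx bytes\n", max_gap);	/* 0x40200000 */
	return 0;
}
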
489
490static int
491count_reserved_pages (u64 start, u64 end, void *arg)
492{
493 unsigned long num_reserved = 0;
494 unsigned long *count = arg;
495
496 for (; start < end; start += PAGE_SIZE)
497 if (PageReserved(virt_to_page(start)))
498 ++num_reserved;
499 *count += num_reserved;
500 return 0;
501}
502
503/*
504 * Boot command-line option "nolwsys" can be used to disable the use of any light-weight
505 * system call handler. When this option is in effect, all fsyscalls will end up bubbling
506 * down into the kernel and calling the normal (heavy-weight) syscall handler. This is
507 * useful for performance testing, but conceivably could also come in handy for debugging
508 * purposes.
509 */
510
511static int nolwsys;
512
513static int __init
514nolwsys_setup (char *s)
515{
516 nolwsys = 1;
517 return 1;
518}
519
520__setup("nolwsys", nolwsys_setup);
521
522void
523mem_init (void)
524{
525 long reserved_pages, codesize, datasize, initsize;
526 unsigned long num_pgt_pages;
527 pg_data_t *pgdat;
528 int i;
529 static struct kcore_list kcore_mem, kcore_vmem, kcore_kernel;
530
531#ifdef CONFIG_PCI
532 /*
533 * This needs to be called _after_ the command line has been parsed but _before_
534 * any drivers that may need the PCI DMA interface are initialized or bootmem has
535 * been freed.
536 */
537 platform_dma_init();
538#endif
539
540#ifndef CONFIG_DISCONTIGMEM
541 if (!mem_map)
542 BUG();
543 max_mapnr = max_low_pfn;
544#endif
545
546 high_memory = __va(max_low_pfn * PAGE_SIZE);
547
548 kclist_add(&kcore_mem, __va(0), max_low_pfn * PAGE_SIZE);
549 kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
550 kclist_add(&kcore_kernel, _stext, _end - _stext);
551
552 for_each_pgdat(pgdat)
553 totalram_pages += free_all_bootmem_node(pgdat);
554
555 reserved_pages = 0;
556 efi_memmap_walk(count_reserved_pages, &reserved_pages);
557
558 codesize = (unsigned long) _etext - (unsigned long) _stext;
559 datasize = (unsigned long) _edata - (unsigned long) _etext;
560 initsize = (unsigned long) __init_end - (unsigned long) __init_begin;
561
562 printk(KERN_INFO "Memory: %luk/%luk available (%luk code, %luk reserved, "
563 "%luk data, %luk init)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT - 10),
564 num_physpages << (PAGE_SHIFT - 10), codesize >> 10,
565 reserved_pages << (PAGE_SHIFT - 10), datasize >> 10, initsize >> 10);
566
567 /*
568 * Allow for enough (cached) page table pages so that we can map the entire memory
569	 * at least once. Each task also needs a couple of page-table pages, so add in a
570 * fudge factor for that (don't use "threads-max" here; that would be wrong!).
571 * Don't allow the cache to be more than 10% of total memory, though.
572 */
573# define NUM_TASKS 500 /* typical number of tasks */
574 num_pgt_pages = nr_free_pages() / PTRS_PER_PGD + NUM_TASKS;
575 if (num_pgt_pages > nr_free_pages() / 10)
576 num_pgt_pages = nr_free_pages() / 10;
577 if (num_pgt_pages > (u64) pgt_cache_water[1])
578 pgt_cache_water[1] = num_pgt_pages;
579
580 /*
581	 * For fsyscall entry points with no light-weight handler, use the ordinary
582 * (heavy-weight) handler, but mark it by setting bit 0, so the fsyscall entry
583 * code can tell them apart.
584 */
585 for (i = 0; i < NR_syscalls; ++i) {
586 extern unsigned long fsyscall_table[NR_syscalls];
587 extern unsigned long sys_call_table[NR_syscalls];
588
589 if (!fsyscall_table[i] || nolwsys)
590 fsyscall_table[i] = sys_call_table[i] | 1;
591 }
592 setup_gate();
593
594#ifdef CONFIG_IA32_SUPPORT
595 ia32_mem_init();
596#endif
597}
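
Editorial aside: the watermark heuristic in mem_init() is easier to see with numbers plugged in. The sketch below assumes 16 KB pages (so PTRS_PER_PGD is 2048) and a made-up amount of free memory, and reproduces only the arithmetic, not the kernel interfaces.

/* Illustrative only: the pgt_cache_water[1] sizing heuristic from mem_init(),
 * with assumed values for the page size and the number of free pages. */
#include <stdio.h>

#define PAGE_SHIFT	14				/* hypothetical 16 KB pages */
#define PTRS_PER_PGD	(1UL << (PAGE_SHIFT - 3))	/* 2048 eight-byte entries per pgd page */
#define NUM_TASKS	500				/* typical number of tasks */

int main(void)
{
	unsigned long nr_free = 1UL << 20;	/* made-up: ~16 GB worth of free 16 KB pages */
	unsigned long water_high = 50;		/* default pgt_cache_water[1] */
	unsigned long num_pgt_pages;

	num_pgt_pages = nr_free / PTRS_PER_PGD + NUM_TASKS;	/* 512 + 500 = 1012 */
	if (num_pgt_pages > nr_free / 10)	/* cap the cache at 10% of memory */
		num_pgt_pages = nr_free / 10;
	if (num_pgt_pages > water_high)
		water_high = num_pgt_pages;

	printf("high watermark: %lu page-table pages\n", water_high);	/* 1012 */
	return 0;
}
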
diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c
new file mode 100644
index 000000000000..77118bbf3d8b
--- /dev/null
+++ b/arch/ia64/mm/numa.c
@@ -0,0 +1,49 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * This file contains NUMA specific variables and functions which can
7 * be split away from DISCONTIGMEM and are used on NUMA machines with
8 * contiguous memory.
9 *
10 * 2002/08/07 Erich Focht <efocht@ess.nec.de>
11 */
12
13#include <linux/config.h>
14#include <linux/cpu.h>
15#include <linux/kernel.h>
16#include <linux/mm.h>
17#include <linux/node.h>
18#include <linux/init.h>
19#include <linux/bootmem.h>
20#include <asm/mmzone.h>
21#include <asm/numa.h>
22
23
24/*
25 * The following structures are usually initialized by ACPI or
26 * similar mechanisms and describe the NUMA characteristics of the machine.
27 */
28int num_node_memblks;
29struct node_memblk_s node_memblk[NR_NODE_MEMBLKS];
30struct node_cpuid_s node_cpuid[NR_CPUS];
31/*
32 * This is a matrix with "distances" between nodes, they should be
33 * proportional to the memory access latency ratios.
34 */
35u8 numa_slit[MAX_NUMNODES * MAX_NUMNODES];
36
37/* Identify which cnode a physical address resides on */
38int
39paddr_to_nid(unsigned long paddr)
40{
41 int i;
42
43 for (i = 0; i < num_node_memblks; i++)
44 if (paddr >= node_memblk[i].start_paddr &&
45 paddr < node_memblk[i].start_paddr + node_memblk[i].size)
46 break;
47
48 return (i < num_node_memblks) ? node_memblk[i].nid : (num_node_memblks ? -1 : 0);
49}
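
Editorial aside: both tables described above are consumed by simple lookups — paddr_to_nid() is the linear search just shown, and node distances are read out of the flat numa_slit[] matrix with row-major indexing (sketched here as from * MAX_NUMNODES + to). A small standalone sketch with a made-up two-node layout:

/* Illustrative only: made-up memory-block table and SLIT matrix for two nodes. */
#include <stdio.h>

#define MAX_NUMNODES	2

struct memblk { unsigned long long start, size; int nid; };

static const struct memblk node_memblk[] = {
	{ 0x000000000ULL, 0x080000000ULL, 0 },	/* node 0: first 2 GB */
	{ 0x100000000ULL, 0x080000000ULL, 1 },	/* node 1: 2 GB starting at 4 GB */
};

/* latency ratios, row = from-node, column = to-node (10 == local) */
static const unsigned char numa_slit[MAX_NUMNODES * MAX_NUMNODES] = {
	10, 20,
	20, 10,
};

static int paddr_to_nid(unsigned long long paddr)
{
	for (unsigned i = 0; i < sizeof(node_memblk) / sizeof(node_memblk[0]); i++)
		if (paddr >= node_memblk[i].start &&
		    paddr <  node_memblk[i].start + node_memblk[i].size)
			return node_memblk[i].nid;
	return -1;				/* address not covered by any block */
}

int main(void)
{
	printf("0x120000000 is on node %d\n", paddr_to_nid(0x120000000ULL));	/* 1 */
	printf("distance(0, 1) = %d\n", numa_slit[0 * MAX_NUMNODES + 1]);	/* 20 */
	return 0;
}
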
diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c
new file mode 100644
index 000000000000..464557e4ed82
--- /dev/null
+++ b/arch/ia64/mm/tlb.c
@@ -0,0 +1,190 @@
1/*
2 * TLB support routines.
3 *
4 * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com>
6 *
7 * 08/02/00 A. Mallick <asit.k.mallick@intel.com>
8 * Modified RID allocation for SMP
9 * Goutham Rao <goutham.rao@intel.com>
10 * IPI based ptc implementation and A-step IPI implementation.
11 */
12#include <linux/config.h>
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/kernel.h>
16#include <linux/sched.h>
17#include <linux/smp.h>
18#include <linux/mm.h>
19
20#include <asm/delay.h>
21#include <asm/mmu_context.h>
22#include <asm/pgalloc.h>
23#include <asm/pal.h>
24#include <asm/tlbflush.h>
25
26static struct {
27 unsigned long mask; /* mask of supported purge page-sizes */
28 unsigned long max_bits; /* log2() of largest supported purge page-size */
29} purge;
30
31struct ia64_ctx ia64_ctx = {
32 .lock = SPIN_LOCK_UNLOCKED,
33 .next = 1,
34 .limit = (1 << 15) - 1, /* start out with the safe (architected) limit */
35 .max_ctx = ~0U
36};
37
38DEFINE_PER_CPU(u8, ia64_need_tlb_flush);
39
40/*
41 * Acquire the ia64_ctx.lock before calling this function!
42 */
43void
44wrap_mmu_context (struct mm_struct *mm)
45{
46 unsigned long tsk_context, max_ctx = ia64_ctx.max_ctx;
47 struct task_struct *tsk;
48 int i;
49
50 if (ia64_ctx.next > max_ctx)
51 ia64_ctx.next = 300; /* skip daemons */
52 ia64_ctx.limit = max_ctx + 1;
53
54 /*
55	 * Scan all tasks' mm->context and set a proper safe range
56 */
57
58 read_lock(&tasklist_lock);
59 repeat:
60 for_each_process(tsk) {
61 if (!tsk->mm)
62 continue;
63 tsk_context = tsk->mm->context;
64 if (tsk_context == ia64_ctx.next) {
65 if (++ia64_ctx.next >= ia64_ctx.limit) {
66 /* empty range: reset the range limit and start over */
67 if (ia64_ctx.next > max_ctx)
68 ia64_ctx.next = 300;
69 ia64_ctx.limit = max_ctx + 1;
70 goto repeat;
71 }
72 }
73 if ((tsk_context > ia64_ctx.next) && (tsk_context < ia64_ctx.limit))
74 ia64_ctx.limit = tsk_context;
75 }
76 read_unlock(&tasklist_lock);
77	/* can't call flush_tlb_all() here because of a race condition with the O(1) scheduler [EF] */
78 {
79 int cpu = get_cpu(); /* prevent preemption/migration */
80 for (i = 0; i < NR_CPUS; ++i)
81 if (cpu_online(i) && (i != cpu))
82 per_cpu(ia64_need_tlb_flush, i) = 1;
83 put_cpu();
84 }
85 local_flush_tlb_all();
86}
87
88void
89ia64_global_tlb_purge (unsigned long start, unsigned long end, unsigned long nbits)
90{
91 static DEFINE_SPINLOCK(ptcg_lock);
92
93 /* HW requires global serialization of ptc.ga. */
94 spin_lock(&ptcg_lock);
95 {
96 do {
97 /*
98 * Flush ALAT entries also.
99 */
100 ia64_ptcga(start, (nbits<<2));
101 ia64_srlz_i();
102 start += (1UL << nbits);
103 } while (start < end);
104 }
105 spin_unlock(&ptcg_lock);
106}
107
108void
109local_flush_tlb_all (void)
110{
111 unsigned long i, j, flags, count0, count1, stride0, stride1, addr;
112
113 addr = local_cpu_data->ptce_base;
114 count0 = local_cpu_data->ptce_count[0];
115 count1 = local_cpu_data->ptce_count[1];
116 stride0 = local_cpu_data->ptce_stride[0];
117 stride1 = local_cpu_data->ptce_stride[1];
118
119 local_irq_save(flags);
120 for (i = 0; i < count0; ++i) {
121 for (j = 0; j < count1; ++j) {
122 ia64_ptce(addr);
123 addr += stride1;
124 }
125 addr += stride0;
126 }
127 local_irq_restore(flags);
128 ia64_srlz_i(); /* srlz.i implies srlz.d */
129}
130
131void
132flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long end)
133{
134 struct mm_struct *mm = vma->vm_mm;
135 unsigned long size = end - start;
136 unsigned long nbits;
137
138 if (mm != current->active_mm) {
139 /* this does happen, but perhaps it's not worth optimizing for? */
140#ifdef CONFIG_SMP
141 flush_tlb_all();
142#else
143 mm->context = 0;
144#endif
145 return;
146 }
147
148 nbits = ia64_fls(size + 0xfff);
149 while (unlikely (((1UL << nbits) & purge.mask) == 0) && (nbits < purge.max_bits))
150 ++nbits;
151 if (nbits > purge.max_bits)
152 nbits = purge.max_bits;
153 start &= ~((1UL << nbits) - 1);
154
155# ifdef CONFIG_SMP
156 platform_global_tlb_purge(start, end, nbits);
157# else
158 do {
159 ia64_ptcl(start, (nbits<<2));
160 start += (1UL << nbits);
161 } while (start < end);
162# endif
163
164 ia64_srlz_i(); /* srlz.i implies srlz.d */
165}
166EXPORT_SYMBOL(flush_tlb_range);
167
168void __devinit
169ia64_tlb_init (void)
170{
171 ia64_ptce_info_t ptce_info;
172 unsigned long tr_pgbits;
173 long status;
174
175 if ((status = ia64_pal_vm_page_size(&tr_pgbits, &purge.mask)) != 0) {
176		printk(KERN_ERR "PAL_VM_PAGE_SIZE failed with status=%ld; "
177		       "defaulting to architected purge page-sizes.\n", status);
178 purge.mask = 0x115557000UL;
179 }
180 purge.max_bits = ia64_fls(purge.mask);
181
182 ia64_get_ptce(&ptce_info);
183 local_cpu_data->ptce_base = ptce_info.base;
184 local_cpu_data->ptce_count[0] = ptce_info.count[0];
185 local_cpu_data->ptce_count[1] = ptce_info.count[1];
186 local_cpu_data->ptce_stride[0] = ptce_info.stride[0];
187 local_cpu_data->ptce_stride[1] = ptce_info.stride[1];
188
189	local_flush_tlb_all();	/* nuke leftovers from bootstrapping... */
190}
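
Editorial aside: the architected fallback mask set above is just a bitmap of supported purge page sizes, and flush_tlb_range() earlier in this file rounds each flush up to one of them. A userspace sketch that decodes the mask and redoes that rounding for one made-up flush size (fls_ull() is a local stand-in for ia64_fls()):

/* Illustrative only: decodes the fallback purge mask and repeats the
 * page-size rounding done in flush_tlb_range(). */
#include <stdio.h>

static int fls_ull(unsigned long long x)	/* bit number of the most significant set bit */
{
	int n = -1;
	while (x) {
		x >>= 1;
		n++;
	}
	return n;
}

int main(void)
{
	unsigned long long mask = 0x115557000ULL;	/* architected purge page-size mask */
	int max_bits = fls_ull(mask);			/* 32: largest purge size is 4 GB */
	unsigned long long size = 0x28000;		/* made-up flush size: 160 KB */
	int nbits;

	printf("supported purge page sizes:");
	for (int b = 0; b <= max_bits; b++)
		if (mask & (1ULL << b))
			printf(" 2^%d", b);		/* 12 13 14 16 18 20 22 24 26 28 32 */
	printf("\n");

	nbits = fls_ull(size + 0xfff);			/* 17 */
	while (((1ULL << nbits) & mask) == 0 && nbits < max_bits)
		++nbits;				/* 2^17 unsupported -> round up to 2^18 */
	if (nbits > max_bits)
		nbits = max_bits;
	printf("flush of %#llx bytes purges in 2^%d-byte steps\n", size, nbits);
	return 0;
}
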