aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorChristoph Lameter <clameter@sgi.com>2007-10-16 04:24:13 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-10-16 12:42:51 -0400
commit8f6aac419bd590f535fb110875a51f7db2b62b5b (patch)
tree64e73e9f7a4b5a68648a2b4b16e66307c3d8d3cf /mm
parent540557b9439ec19668553830c90222f9fb0c2e95 (diff)
Generic Virtual Memmap support for SPARSEMEM
SPARSEMEM is a pretty nice framework that unifies quite a bit of code over all the arches. It would be great if it could be the default so that we can get rid of various forms of DISCONTIG and other variations on memory maps. So far what has hindered this are the additional lookups that SPARSEMEM introduces for virt_to_page and page_address. This goes so far that the code to do this has to be kept in a separate function and cannot be used inline. This patch introduces a virtual memmap mode for SPARSEMEM, in which the memmap is mapped into a virtually contiguous area, with only the active sections physically backed. This allows virt_to_page, page_address and cohorts to become simple shift/add operations. No page flag fields, no table lookups, nothing involving memory is required. The two key operations pfn_to_page and page_to_pfn become: #define __pfn_to_page(pfn) (vmemmap + (pfn)) #define __page_to_pfn(page) ((page) - vmemmap) By having a virtual mapping for the memmap we allow simple access without wasting physical memory. As kernel memory is typically already mapped 1:1 this introduces no additional overhead. The virtual mapping must be big enough to allow a struct page to be allocated and mapped for all valid physical pages. This will make a virtual memmap difficult to use on 32 bit platforms that support 36 address bits. However, if there is enough virtual space available and the arch already maps its 1-1 kernel space using TLBs (f.e. true of IA64 and x86_64) then this technique makes SPARSEMEM lookups even more efficient than CONFIG_FLATMEM. FLATMEM needs to read the contents of the mem_map variable to get the start of the memmap and then add the offset to the required entry. vmemmap is a constant to which we can simply add the offset. This patch has the potential to allow us to make SPARSEMEM the default (and even the only) option for most systems. It should be optimal on UP, SMP and NUMA on most platforms. 
Then we may even be able to remove the other memory models: FLATMEM, DISCONTIG etc. [apw@shadowen.org: config cleanups, resplit code etc] [kamezawa.hiroyu@jp.fujitsu.com: Fix sparsemem_vmemmap init] [apw@shadowen.org: vmemmap: remove excess debugging] [apw@shadowen.org: simplify initialisation code and reduce duplication] [apw@shadowen.org: pull out the vmemmap code into its own file] Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andy Whitcroft <apw@shadowen.org> Acked-by: Mel Gorman <mel@csn.ul.ie> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Andi Kleen <ak@suse.de> Cc: "David S. Miller" <davem@davemloft.net> Cc: Paul Mackerras <paulus@samba.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/Makefile1
-rw-r--r--mm/sparse-vmemmap.c181
-rw-r--r--mm/sparse.c21
3 files changed, 199 insertions, 4 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 245e33ab00c..d28f63e05b4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -18,6 +18,7 @@ obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
18obj-$(CONFIG_HUGETLBFS) += hugetlb.o 18obj-$(CONFIG_HUGETLBFS) += hugetlb.o
19obj-$(CONFIG_NUMA) += mempolicy.o 19obj-$(CONFIG_NUMA) += mempolicy.o
20obj-$(CONFIG_SPARSEMEM) += sparse.o 20obj-$(CONFIG_SPARSEMEM) += sparse.o
21obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
21obj-$(CONFIG_SHMEM) += shmem.o 22obj-$(CONFIG_SHMEM) += shmem.o
22obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o 23obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
23obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 24obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
new file mode 100644
index 00000000000..7bb7a4b96d7
--- /dev/null
+++ b/mm/sparse-vmemmap.c
@@ -0,0 +1,181 @@
1/*
2 * Virtual Memory Map support
3 *
4 * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>.
5 *
6 * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
7 * virt_to_page, page_address() to be implemented as a base offset
8 * calculation without memory access.
9 *
10 * However, virtual mappings need a page table and TLBs. Many Linux
11 * architectures already map their physical space using 1-1 mappings
12 * via TLBs. For those arches the virtual memory map is essentially
13 * for free if we use the same page size as the 1-1 mappings. In that
14 * case the overhead consists of a few additional pages that are
15 * allocated to create a view of memory for vmemmap.
16 *
17 * Special Kconfig settings:
18 *
19 * CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP
20 *
21 * The architecture has its own functions to populate the memory
22 * map and provides a vmemmap_populate function.
23 *
24 * CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP_PMD
25 *
26 * The architecture provides functions to populate the pmd level
27 * of the vmemmap mappings. Allowing mappings using large pages
28 * where available.
29 *
30 * If neither are set then PAGE_SIZE mappings are generated which
31 * require one PTE/TLB per PAGE_SIZE chunk of the virtual memory map.
32 */
33#include <linux/mm.h>
34#include <linux/mmzone.h>
35#include <linux/bootmem.h>
36#include <linux/highmem.h>
37#include <linux/module.h>
38#include <linux/spinlock.h>
39#include <linux/vmalloc.h>
40#include <asm/dma.h>
41#include <asm/pgalloc.h>
42#include <asm/pgtable.h>
43
44/*
45 * Allocate a block of memory to be used to back the virtual memory map
46 * or to back the page tables that are used to create the mapping.
47 * Uses the main allocators if they are available, else bootmem.
48 */
49void * __meminit vmemmap_alloc_block(unsigned long size, int node)
50{
51 /* If the main allocator is up use that, fallback to bootmem. */
52 if (slab_is_available()) {
53 struct page *page = alloc_pages_node(node,
54 GFP_KERNEL | __GFP_ZERO, get_order(size));
55 if (page)
56 return page_address(page);
57 return NULL;
58 } else
59 return __alloc_bootmem_node(NODE_DATA(node), size, size,
60 __pa(MAX_DMA_ADDRESS));
61}
62
63#ifndef CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP
64void __meminit vmemmap_verify(pte_t *pte, int node,
65 unsigned long start, unsigned long end)
66{
67 unsigned long pfn = pte_pfn(*pte);
68 int actual_node = early_pfn_to_nid(pfn);
69
70 if (actual_node != node)
71 printk(KERN_WARNING "[%lx-%lx] potential offnode "
72 "page_structs\n", start, end - 1);
73}
74
75#ifndef CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP_PMD
76static int __meminit vmemmap_populate_pte(pmd_t *pmd, unsigned long addr,
77 unsigned long end, int node)
78{
79 pte_t *pte;
80
81 for (pte = pte_offset_kernel(pmd, addr); addr < end;
82 pte++, addr += PAGE_SIZE)
83 if (pte_none(*pte)) {
84 pte_t entry;
85 void *p = vmemmap_alloc_block(PAGE_SIZE, node);
86 if (!p)
87 return -ENOMEM;
88
89 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
90 set_pte(pte, entry);
91
92 } else
93 vmemmap_verify(pte, node, addr + PAGE_SIZE, end);
94
95 return 0;
96}
97
98int __meminit vmemmap_populate_pmd(pud_t *pud, unsigned long addr,
99 unsigned long end, int node)
100{
101 pmd_t *pmd;
102 int error = 0;
103 unsigned long next;
104
105 for (pmd = pmd_offset(pud, addr); addr < end && !error;
106 pmd++, addr = next) {
107 if (pmd_none(*pmd)) {
108 void *p = vmemmap_alloc_block(PAGE_SIZE, node);
109 if (!p)
110 return -ENOMEM;
111
112 pmd_populate_kernel(&init_mm, pmd, p);
113 } else
114 vmemmap_verify((pte_t *)pmd, node,
115 pmd_addr_end(addr, end), end);
116 next = pmd_addr_end(addr, end);
117 error = vmemmap_populate_pte(pmd, addr, next, node);
118 }
119 return error;
120}
121#endif /* CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP_PMD */
122
123static int __meminit vmemmap_populate_pud(pgd_t *pgd, unsigned long addr,
124 unsigned long end, int node)
125{
126 pud_t *pud;
127 int error = 0;
128 unsigned long next;
129
130 for (pud = pud_offset(pgd, addr); addr < end && !error;
131 pud++, addr = next) {
132 if (pud_none(*pud)) {
133 void *p = vmemmap_alloc_block(PAGE_SIZE, node);
134 if (!p)
135 return -ENOMEM;
136
137 pud_populate(&init_mm, pud, p);
138 }
139 next = pud_addr_end(addr, end);
140 error = vmemmap_populate_pmd(pud, addr, next, node);
141 }
142 return error;
143}
144
145int __meminit vmemmap_populate(struct page *start_page,
146 unsigned long nr, int node)
147{
148 pgd_t *pgd;
149 unsigned long addr = (unsigned long)start_page;
150 unsigned long end = (unsigned long)(start_page + nr);
151 unsigned long next;
152 int error = 0;
153
154 printk(KERN_DEBUG "[%lx-%lx] Virtual memory section"
155 " (%ld pages) node %d\n", addr, end - 1, nr, node);
156
157 for (pgd = pgd_offset_k(addr); addr < end && !error;
158 pgd++, addr = next) {
159 if (pgd_none(*pgd)) {
160 void *p = vmemmap_alloc_block(PAGE_SIZE, node);
161 if (!p)
162 return -ENOMEM;
163
164 pgd_populate(&init_mm, pgd, p);
165 }
166 next = pgd_addr_end(addr,end);
167 error = vmemmap_populate_pud(pgd, addr, next, node);
168 }
169 return error;
170}
171#endif /* !CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP */
172
173struct page __init *sparse_early_mem_map_populate(unsigned long pnum, int nid)
174{
175 struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION);
176 int error = vmemmap_populate(map, PAGES_PER_SECTION, nid);
177 if (error)
178 return NULL;
179
180 return map;
181}
diff --git a/mm/sparse.c b/mm/sparse.c
index 54f3940406c..52843a76fee 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -9,6 +9,8 @@
9#include <linux/spinlock.h> 9#include <linux/spinlock.h>
10#include <linux/vmalloc.h> 10#include <linux/vmalloc.h>
11#include <asm/dma.h> 11#include <asm/dma.h>
12#include <asm/pgalloc.h>
13#include <asm/pgtable.h>
12 14
13/* 15/*
14 * Permanent SPARSEMEM data: 16 * Permanent SPARSEMEM data:
@@ -222,11 +224,10 @@ void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
222 return NULL; 224 return NULL;
223} 225}
224 226
225static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 227#ifndef CONFIG_SPARSEMEM_VMEMMAP
228struct page __init *sparse_early_mem_map_populate(unsigned long pnum, int nid)
226{ 229{
227 struct page *map; 230 struct page *map;
228 struct mem_section *ms = __nr_to_section(pnum);
229 int nid = sparse_early_nid(ms);
230 231
231 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); 232 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
232 if (map) 233 if (map)
@@ -239,10 +240,22 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
239 240
240 map = alloc_bootmem_node(NODE_DATA(nid), 241 map = alloc_bootmem_node(NODE_DATA(nid),
241 sizeof(struct page) * PAGES_PER_SECTION); 242 sizeof(struct page) * PAGES_PER_SECTION);
243 return map;
244}
245#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
246
247struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
248{
249 struct page *map;
250 struct mem_section *ms = __nr_to_section(pnum);
251 int nid = sparse_early_nid(ms);
252
253 map = sparse_early_mem_map_populate(pnum, nid);
242 if (map) 254 if (map)
243 return map; 255 return map;
244 256
245 printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); 257 printk(KERN_ERR "%s: sparsemem memory map backing failed "
258 "some memory will not be available.\n", __FUNCTION__);
246 ms->section_mem_map = 0; 259 ms->section_mem_map = 0;
247 return NULL; 260 return NULL;
248} 261}