author    Andi Kleen <ak@suse.de>  2005-11-05 11:25:53 -0500
committer Linus Torvalds <torvalds@g5.osdl.org>  2005-11-14 22:55:13 -0500
commit    a2f1b424900715ed9d1699c3bb88a434a2b42bc0 (patch)
tree      8ef440f840656365166ff2d71aa445c224c53546
parent    56720367cd89ef5265f39da2d674c5b92cd4cd87 (diff)
[PATCH] x86_64: Add 4GB DMA32 zone
Add a new 4GB GFP_DMA32 zone between the GFP_DMA and GFP_NORMAL zones.

As a bit of historical background: when the x86-64 port was originally designed we discussed whether to use a 16MB DMA zone like i386, a 4GB DMA zone like IA64, or both. Having both was ruled out at the time because it was early in 2.4, when the VM was still quite shaky and had serious trouble dealing with even one DMA zone. We settled on the 16MB DMA zone mainly because we worried about older sound cards and the floppy.

But this has caused problems ever since, because device drivers had trouble getting enough DMA-able memory. These days the VM works much better and the wide use of NUMA has proven it can deal with many zones successfully.

So this patch adds both zones.

This helps drivers that need a lot of memory below 4GB because their hardware cannot address more (graphics drivers - proprietary and free ones - video frame buffer drivers, sound drivers etc.). Previously they could only use the IOMMU plus the 16MB GFP_DMA zone, which was not enough memory.

Another common problem is hardware that has full memory addressing for >4GB but lacks it for some control structures in memory (like transmit rings or other metadata). Such drivers tended to allocate memory in the 16MB GFP_DMA zone or the IOMMU/swiotlb using pci_alloc_consistent, but that can tie up a lot of precious 16MB GFP_DMA/IOMMU/swiotlb memory (even on AMD systems the IOMMU tends to be quite small), especially if you have many devices. With the new zone, pci_alloc_consistent can just put this stuff into memory below 4GB, which works better.

One open argument was whether the zone should be 4GB or 2GB. The main motivation for 2GB would be an unnamed, not so unpopular hardware RAID controller (mostly found in older machines from a particular four-letter company) that has a strange 2GB restriction in firmware. But that one works OK with swiotlb/IOMMU anyway, so it doesn't really need GFP_DMA32. I chose 4GB to be compatible with IA64 and because it seems to be the most common restriction.

The new zone is so far added only for x86-64. For other architectures that don't set up this new zone nothing changes. Architectures can set a compatibility define in Kconfig, CONFIG_DMA_IS_DMA32, that will define GFP_DMA32 as GFP_DMA. Otherwise it's a nop, because on 32-bit architectures it's normally not needed: GFP_NORMAL (=0) is DMA-able enough.

One remaining problem is that GFP_DMA means different things on different architectures. E.g. some drivers used to have "#ifdef ia64: use GFP_DMA (trusting it to be 4GB); #elif __x86_64__: use other hacks like the swiotlb because 16MB is not enough; ...". This was quite ugly and is now obsolete. These drivers should now be converted to use GFP_DMA32 unconditionally (I haven't done this yet), or better yet only use pci_alloc_consistent/dma_alloc_coherent, which will use GFP_DMA32 transparently.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
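For illustration, the conversion described in the last paragraph looks roughly like the hedged sketch below. It is not code from this patch: the function name mydrv_alloc_ring, the constant RING_BYTES, and the surrounding driver are hypothetical.

        /* Hypothetical driver fragment; mydrv_alloc_ring() and RING_BYTES
         * are made-up names, dev is the driver's struct device. */
        #include <linux/dma-mapping.h>
        #include <linux/gfp.h>

        #define RING_BYTES 4096

        static void *mydrv_alloc_ring(struct device *dev, dma_addr_t *dma)
        {
                /*
                 * Obsolete per-arch hack this patch makes unnecessary:
                 *   #ifdef __ia64__                    // GFP_DMA is 4GB there
                 *           buf = (void *)__get_free_page(GFP_DMA);
                 *   #elif defined(__x86_64__)
                 *           ... bounce through the IOMMU/swiotlb instead ...
                 *   #endif
                 *
                 * New: allocate below 4GB unconditionally:
                 *   buf = (void *)__get_free_page(GFP_KERNEL | GFP_DMA32);
                 *
                 * Best: let the DMA API put it in ZONE_DMA32 transparently.
                 */
                return dma_alloc_coherent(dev, RING_BYTES, dma, GFP_KERNEL);
        }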
-rw-r--r--  arch/x86_64/mm/init.c        65
-rw-r--r--  arch/x86_64/mm/numa.c        25
-rw-r--r--  include/asm-x86_64/dma.h     11
-rw-r--r--  include/asm-x86_64/proto.h    2
-rw-r--r--  include/linux/gfp.h          11
-rw-r--r--  include/linux/mmzone.h       16
-rw-r--r--  mm/page_alloc.c              15
7 files changed, 90 insertions, 55 deletions
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index e60a1a848de8..a1ad4cc423a7 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -318,32 +318,51 @@ void zap_low_mappings(void)
 	flush_tlb_all();
 }
 
+/* Compute zone sizes for the DMA and DMA32 zones in a node. */
+__init void
+size_zones(unsigned long *z, unsigned long *h,
+	   unsigned long start_pfn, unsigned long end_pfn)
+{
+	int i;
+	unsigned long w;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		z[i] = 0;
+
+	if (start_pfn < MAX_DMA_PFN)
+		z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
+	if (start_pfn < MAX_DMA32_PFN) {
+		unsigned long dma32_pfn = MAX_DMA32_PFN;
+		if (dma32_pfn > end_pfn)
+			dma32_pfn = end_pfn;
+		z[ZONE_DMA32] = dma32_pfn - start_pfn;
+	}
+	z[ZONE_NORMAL] = end_pfn - start_pfn;
+
+	/* Remove lower zones from higher ones. */
+	w = 0;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		if (z[i])
+			z[i] -= w;
+		w += z[i];
+	}
+
+	/* Compute holes */
+	w = 0;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		unsigned long s = w;
+		w += z[i];
+		h[i] = e820_hole_size(s, w);
+	}
+}
+
 #ifndef CONFIG_NUMA
 void __init paging_init(void)
 {
-	{
-		unsigned long zones_size[MAX_NR_ZONES];
-		unsigned long holes[MAX_NR_ZONES];
-		unsigned int max_dma;
-
-		memset(zones_size, 0, sizeof(zones_size));
-		memset(holes, 0, sizeof(holes));
-
-		max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-
-		if (end_pfn < max_dma) {
-			zones_size[ZONE_DMA] = end_pfn;
-			holes[ZONE_DMA] = e820_hole_size(0, end_pfn);
-		} else {
-			zones_size[ZONE_DMA] = max_dma;
-			holes[ZONE_DMA] = e820_hole_size(0, max_dma);
-			zones_size[ZONE_NORMAL] = end_pfn - max_dma;
-			holes[ZONE_NORMAL] = e820_hole_size(max_dma, end_pfn);
-		}
-		free_area_init_node(0, NODE_DATA(0), zones_size,
-			__pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
-	}
-	return;
+	unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
+	size_zones(zones, holes, 0, end_pfn);
+	free_area_init_node(0, NODE_DATA(0), zones,
+			    __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
 }
 #endif
 
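The only subtle step in size_zones() above is the "remove lower zones" loop: after the first pass each z[i] holds the whole span from start_pfn up to that zone's ceiling, and the running total w of the lower zones is then subtracted to leave each zone's own width. Below is a minimal user-space sketch of the same arithmetic, with the x86-64 constants hard-coded, 4K pages, e820 holes ignored, and a 64-bit host assumed; it is an illustration, not kernel code.

        #include <stdio.h>

        #define MAX_NR_ZONES    4
        #define ZONE_DMA        0
        #define ZONE_DMA32      1
        #define ZONE_NORMAL     2
        #define MAX_DMA_PFN     ((16UL * 1024 * 1024) >> 12)            /* 4096 */
        #define MAX_DMA32_PFN   ((4UL * 1024 * 1024 * 1024) >> 12)      /* 1048576 */

        static void size_zones(unsigned long *z,
                               unsigned long start_pfn, unsigned long end_pfn)
        {
                unsigned long w = 0;
                int i;

                for (i = 0; i < MAX_NR_ZONES; i++)
                        z[i] = 0;

                /* Pass 1: each entry spans from start_pfn to its zone ceiling. */
                if (start_pfn < MAX_DMA_PFN)
                        z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
                if (start_pfn < MAX_DMA32_PFN) {
                        unsigned long dma32_pfn = MAX_DMA32_PFN;
                        if (dma32_pfn > end_pfn)
                                dma32_pfn = end_pfn;
                        z[ZONE_DMA32] = dma32_pfn - start_pfn;
                }
                z[ZONE_NORMAL] = end_pfn - start_pfn;

                /* Pass 2: subtract the cumulative width of the lower zones,
                 * skipping zones that are empty for this node. */
                for (i = 0; i < MAX_NR_ZONES; i++) {
                        if (z[i])
                                z[i] -= w;
                        w += z[i];
                }
        }

        int main(void)
        {
                unsigned long z[MAX_NR_ZONES];

                size_zones(z, 0, 2097152);      /* node 0 covering 0..8GB */
                printf("DMA=%lu DMA32=%lu Normal=%lu HighMem=%lu\n",
                       z[ZONE_DMA], z[ZONE_DMA32], z[ZONE_NORMAL], z[3]);
                return 0;
        }

For a node covering PFNs 0-2097152 (8GB) this prints DMA=4096 DMA32=1044480 Normal=1048576 HighMem=0, i.e. 16MB, 16MB-4GB, and 4GB-8GB respectively.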
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 214803821001..18e86e2eac2d 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -132,29 +132,14 @@ void __init setup_node_zones(int nodeid)
 	unsigned long start_pfn, end_pfn;
 	unsigned long zones[MAX_NR_ZONES];
 	unsigned long holes[MAX_NR_ZONES];
-	unsigned long dma_end_pfn;
 
-	memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
-	memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES);
+	start_pfn = node_start_pfn(nodeid);
+	end_pfn = node_end_pfn(nodeid);
 
-	start_pfn = node_start_pfn(nodeid);
-	end_pfn = node_end_pfn(nodeid);
+	Dprintk(KERN_INFO "setting up node %d %lx-%lx\n",
+		nodeid, start_pfn, end_pfn);
 
-	Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
-
-	/* All nodes > 0 have a zero length zone DMA */
-	dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-	if (start_pfn < dma_end_pfn) {
-		zones[ZONE_DMA] = dma_end_pfn - start_pfn;
-		holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn);
-		zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
-		holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn);
-
-	} else {
-		zones[ZONE_NORMAL] = end_pfn - start_pfn;
-		holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn);
-	}
-
+	size_zones(zones, holes, start_pfn, end_pfn);
 	free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
 			    start_pfn, holes);
 }
diff --git a/include/asm-x86_64/dma.h b/include/asm-x86_64/dma.h
index 16fa3a064d0c..6f2a817b6a7c 100644
--- a/include/asm-x86_64/dma.h
+++ b/include/asm-x86_64/dma.h
@@ -72,8 +72,15 @@
 
 #define MAX_DMA_CHANNELS	8
 
-/* The maximum address that we can perform a DMA transfer to on this platform */
-#define MAX_DMA_ADDRESS	(PAGE_OFFSET+0x1000000)
+
+/* 16MB ISA DMA zone */
+#define MAX_DMA_PFN	((16*1024*1024) >> PAGE_SHIFT)
+
+/* 4GB broken PCI/AGP hardware bus master zone */
+#define MAX_DMA32_PFN	((4UL*1024*1024*1024) >> PAGE_SHIFT)
+
+/* Compat define for old dma zone */
+#define MAX_DMA_ADDRESS	((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
 
 /* 8237 DMA controllers */
 #define IO_DMA1_BASE	0x00	/* 8 bit slave DMA, channels 0..3 */
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index dbb37b0adb43..c251152a0658 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -22,6 +22,8 @@ extern void mtrr_bp_init(void);
 #define mtrr_bp_init() do {} while (0)
 #endif
 extern void init_memory_mapping(unsigned long start, unsigned long end);
+extern void size_zones(unsigned long *z, unsigned long *h,
+			unsigned long start_pfn, unsigned long end_pfn);
 
 extern void system_call(void);
 extern int kernel_syscall(void);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c3779432a723..4351e6bb5a79 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -14,6 +14,13 @@ struct vm_area_struct;
 /* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */
 #define __GFP_DMA	((__force gfp_t)0x01u)
 #define __GFP_HIGHMEM	((__force gfp_t)0x02u)
+#ifdef CONFIG_DMA_IS_DMA32
+#define __GFP_DMA32	((__force gfp_t)0x01)	/* ZONE_DMA is ZONE_DMA32 */
+#elif BITS_PER_LONG < 64
+#define __GFP_DMA32	((__force gfp_t)0x00)	/* ZONE_NORMAL is ZONE_DMA32 */
+#else
+#define __GFP_DMA32	((__force gfp_t)0x04)	/* Has own ZONE_DMA32 */
+#endif
 
 /*
  * Action modifiers - doesn't change the zoning
@@ -64,6 +71,10 @@ struct vm_area_struct;
 
 #define GFP_DMA		__GFP_DMA
 
+/* 4GB DMA on some platforms */
+#define GFP_DMA32	__GFP_DMA32
+
+
 #define gfp_zone(mask) ((__force int)((mask) & (__force gfp_t)GFP_ZONEMASK))
 
 /*
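As a quick illustration of how the new modifier selects a zonelist through gfp_zone() on a 64-bit build, here is a standalone sketch. The mask values are copied from the hunks above; the assumption that GFP_ZONEMASK is wide enough to cover the 0x04 bit is mine, not something this hunk shows.

        #include <stdio.h>

        /* Values from the 64-bit branch of the #ifdef above. */
        #define __GFP_DMA       0x01u
        #define __GFP_HIGHMEM   0x02u
        #define __GFP_DMA32     0x04u
        #define GFP_ZONEMASK    0x07u   /* assumption: covers the 0x04 bit */

        #define gfp_zone(mask)  ((int)((mask) & GFP_ZONEMASK))

        int main(void)
        {
                /* GFP_KERNEL carries no zone bits, so it selects zonelist 0,
                 * whose highest zone is ZONE_NORMAL; the modifiers pick others. */
                printf("plain     -> zonelist %d\n", gfp_zone(0));             /* 0 */
                printf("GFP_DMA   -> zonelist %d\n", gfp_zone(__GFP_DMA));     /* 1 */
                printf("GFP_DMA32 -> zonelist %d\n", gfp_zone(__GFP_DMA32));   /* 4 */
                return 0;
        }

On a CONFIG_DMA_IS_DMA32 architecture GFP_DMA32 aliases GFP_DMA, and on other 32-bit architectures it is 0 and therefore a nop, exactly as the commit message says.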
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f5fa3082fd6a..da7a829f8561 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -71,10 +71,11 @@ struct per_cpu_pageset {
 #endif
 
 #define ZONE_DMA		0
-#define ZONE_NORMAL		1
-#define ZONE_HIGHMEM		2
+#define ZONE_DMA32		1
+#define ZONE_NORMAL		2
+#define ZONE_HIGHMEM		3
 
-#define MAX_NR_ZONES		3	/* Sync this with ZONES_SHIFT */
+#define MAX_NR_ZONES		4	/* Sync this with ZONES_SHIFT */
 #define ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
 
 
@@ -108,9 +109,10 @@ struct per_cpu_pageset {
 
 /*
  * On machines where it is needed (eg PCs) we divide physical memory
- * into multiple physical zones. On a PC we have 3 zones:
+ * into multiple physical zones. On a PC we have 4 zones:
  *
  * ZONE_DMA	  < 16 MB	ISA DMA capable memory
+ * ZONE_DMA32	     0 MB	Empty
  * ZONE_NORMAL	16-896 MB	direct mapped by the kernel
  * ZONE_HIGHMEM	 > 896 MB	only page cache and user processes
  */
@@ -455,10 +457,10 @@ extern struct pglist_data contig_page_data;
 
 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
 /*
- * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
- * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes.
+ * with 32 bit page->flags field, we reserve 9 bits for node/zone info.
+ * there are 4 zones (3 bits) and this leaves 9-3=6 bits for nodes.
  */
-#define FLAGS_RESERVED		8
+#define FLAGS_RESERVED		9
 
 #elif BITS_PER_LONG == 64
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2dbdd98426fd..9b43511dbefd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -60,8 +60,11 @@ long nr_swap_pages;
  * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
  * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
  * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
+ *
+ * TBD: should special case ZONE_DMA32 machines here - in those we normally
+ * don't need any ZONE_NORMAL reservation
  */
-int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
 
 EXPORT_SYMBOL(totalram_pages);
 EXPORT_SYMBOL(nr_swap_pages);
@@ -73,7 +76,7 @@ EXPORT_SYMBOL(nr_swap_pages);
 struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
 EXPORT_SYMBOL(zone_table);
 
-static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
 unsigned long __initdata nr_kernel_pages;
@@ -1442,6 +1445,10 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli
 		zone = pgdat->node_zones + ZONE_NORMAL;
 		if (zone->present_pages)
 			zonelist->zones[j++] = zone;
+	case ZONE_DMA32:
+		zone = pgdat->node_zones + ZONE_DMA32;
+		if (zone->present_pages)
+			zonelist->zones[j++] = zone;
 	case ZONE_DMA:
 		zone = pgdat->node_zones + ZONE_DMA;
 		if (zone->present_pages)
@@ -1456,6 +1463,8 @@ static inline int highest_zone(int zone_bits)
 	int res = ZONE_NORMAL;
 	if (zone_bits & (__force int)__GFP_HIGHMEM)
 		res = ZONE_HIGHMEM;
+	if (zone_bits & (__force int)__GFP_DMA32)
+		res = ZONE_DMA32;
 	if (zone_bits & (__force int)__GFP_DMA)
 		res = ZONE_DMA;
 	return res;
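The check order in highest_zone() above is deliberate: when several zone bits are set, the lowest zone wins, __GFP_DMA over __GFP_DMA32 over __GFP_HIGHMEM. A standalone sketch of that precedence, with the zone and mask constants copied from this patch (64-bit values):

        #include <stdio.h>

        #define ZONE_DMA        0
        #define ZONE_DMA32      1
        #define ZONE_NORMAL     2
        #define ZONE_HIGHMEM    3

        #define __GFP_DMA       0x01
        #define __GFP_HIGHMEM   0x02
        #define __GFP_DMA32     0x04

        /* Mirror of highest_zone() above: later checks override earlier ones. */
        static int highest_zone(int zone_bits)
        {
                int res = ZONE_NORMAL;

                if (zone_bits & __GFP_HIGHMEM)
                        res = ZONE_HIGHMEM;
                if (zone_bits & __GFP_DMA32)
                        res = ZONE_DMA32;
                if (zone_bits & __GFP_DMA)
                        res = ZONE_DMA;
                return res;
        }

        int main(void)
        {
                printf("%d\n", highest_zone(__GFP_DMA32));              /* 1: ZONE_DMA32 */
                printf("%d\n", highest_zone(__GFP_DMA | __GFP_DMA32));  /* 0: ZONE_DMA   */
                return 0;
        }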
@@ -1976,7 +1985,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		if (zholes_size)
 			realsize -= zholes_size[j];
 
-		if (j == ZONE_DMA || j == ZONE_NORMAL)
+		if (j < ZONE_HIGHMEM)
 			nr_kernel_pages += realsize;
 		nr_all_pages += realsize;
 