Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |    9
-rw-r--r--  mm/Makefile          |    3
-rw-r--r--  mm/allocpercpu.c     |   26
-rw-r--r--  mm/bootmem.c         |  948
-rw-r--r--  mm/bounce.c          |    2
-rw-r--r--  mm/filemap.c         |  432
-rw-r--r--  mm/filemap_xip.c     |   70
-rw-r--r--  mm/fremap.c          |    3
-rw-r--r--  mm/highmem.c         |    6
-rw-r--r--  mm/hugetlb.c         | 1681
-rw-r--r--  mm/internal.h        |   61
-rw-r--r--  mm/madvise.c         |    4
-rw-r--r--  mm/memcontrol.c      |  385
-rw-r--r--  mm/memory.c          |  397
-rw-r--r--  mm/memory_hotplug.c  |   80
-rw-r--r--  mm/mempolicy.c       |   16
-rw-r--r--  mm/migrate.c         |   65
-rw-r--r--  mm/mlock.c           |    2
-rw-r--r--  mm/mm_init.c         |  152
-rw-r--r--  mm/mmap.c            |  189
-rw-r--r--  mm/mmu_notifier.c    |  277
-rw-r--r--  mm/mmzone.c          |    2
-rw-r--r--  mm/mprotect.c        |   21
-rw-r--r--  mm/mremap.c          |    6
-rw-r--r--  mm/nommu.c           |   25
-rw-r--r--  mm/oom_kill.c        |    6
-rw-r--r--  mm/page-writeback.c  |   25
-rw-r--r--  mm/page_alloc.c      |  288
-rw-r--r--  mm/page_isolation.c  |   13
-rw-r--r--  mm/pdflush.c         |    4
-rw-r--r--  mm/quicklist.c       |    9
-rw-r--r--  mm/readahead.c       |    6
-rw-r--r--  mm/rmap.c            |   69
-rw-r--r--  mm/shmem.c           |  106
-rw-r--r--  mm/shmem_acl.c       |    2
-rw-r--r--  mm/slab.c            |   35
-rw-r--r--  mm/slob.c            |   28
-rw-r--r--  mm/slub.c            |  148
-rw-r--r--  mm/sparse-vmemmap.c  |    2
-rw-r--r--  mm/sparse.c          |  116
-rw-r--r--  mm/swap.c            |   17
-rw-r--r--  mm/swap_state.c      |   40
-rw-r--r--  mm/swapfile.c        |   65
-rw-r--r--  mm/tiny-shmem.c      |   26
-rw-r--r--  mm/truncate.c        |   16
-rw-r--r--  mm/util.c            |   70
-rw-r--r--  mm/vmalloc.c         |   26
-rw-r--r--  mm/vmscan.c          |   93
-rw-r--r--  mm/vmstat.c          |   22
49 files changed, 4336 insertions, 1758 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 3aa819d628c1..0bd9c2dbb2a0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -129,7 +129,7 @@ config MEMORY_HOTPLUG
129 bool "Allow for memory hot-add" 129 bool "Allow for memory hot-add"
130 depends on SPARSEMEM || X86_64_ACPI_NUMA 130 depends on SPARSEMEM || X86_64_ACPI_NUMA
131 depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG 131 depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG
132 depends on (IA64 || X86 || PPC64 || SUPERH) 132 depends on (IA64 || X86 || PPC64 || SUPERH || S390)
133 133
134comment "Memory hotplug is currently incompatible with Software Suspend" 134comment "Memory hotplug is currently incompatible with Software Suspend"
135 depends on SPARSEMEM && HOTPLUG && HIBERNATION 135 depends on SPARSEMEM && HOTPLUG && HIBERNATION
@@ -174,7 +174,7 @@ config SPLIT_PTLOCK_CPUS
174config MIGRATION 174config MIGRATION
175 bool "Page migration" 175 bool "Page migration"
176 def_bool y 176 def_bool y
177 depends on NUMA 177 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
178 help 178 help
179 Allows the migration of the physical location of pages of processes 179 Allows the migration of the physical location of pages of processes
180 while the virtual addresses are not changed. This is useful for 180 while the virtual addresses are not changed. This is useful for
@@ -199,9 +199,12 @@ config BOUNCE
199config NR_QUICK 199config NR_QUICK
200 int 200 int
201 depends on QUICKLIST 201 depends on QUICKLIST
202 default "2" if SUPERH 202 default "2" if SUPERH || AVR32
203 default "1" 203 default "1"
204 204
205config VIRT_TO_BUS 205config VIRT_TO_BUS
206 def_bool y 206 def_bool y
207 depends on !ARCH_NO_VIRT_TO_BUS 207 depends on !ARCH_NO_VIRT_TO_BUS
208
209config MMU_NOTIFIER
210 bool
diff --git a/mm/Makefile b/mm/Makefile
index 18c143b3c46c..da4ccf015aea 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o pdflush.o \ 11 maccess.o page_alloc.o page-writeback.o pdflush.o \
12 readahead.o swap.o truncate.o vmscan.o \ 12 readahead.o swap.o truncate.o vmscan.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o $(mmu-y) 14 page_isolation.o mm_init.o $(mmu-y)
15 15
16obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o 16obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
17obj-$(CONFIG_BOUNCE) += bounce.o 17obj-$(CONFIG_BOUNCE) += bounce.o
@@ -25,6 +25,7 @@ obj-$(CONFIG_SHMEM) += shmem.o
25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o 25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
26obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 26obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
27obj-$(CONFIG_SLOB) += slob.o 27obj-$(CONFIG_SLOB) += slob.o
28obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
28obj-$(CONFIG_SLAB) += slab.o 29obj-$(CONFIG_SLAB) += slab.o
29obj-$(CONFIG_SLUB) += slub.o 30obj-$(CONFIG_SLUB) += slub.o
30obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 31obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
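
The Kconfig and Makefile hunks above wire in the new mm/mmu_notifier.c, which lets secondary-MMU users (hypervisors and device drivers that shadow page tables) learn when a process's mappings change. The notifier code itself is not shown in this section; the following is only a rough, hedged sketch of how a client of this era would attach to it, assuming the mmu_notifier_register()/mmu_notifier_ops interface as merged here. All demo_* names are made up for illustration.

/*
 * Hedged sketch, not part of this patch: a minimal CONFIG_MMU_NOTIFIER
 * client.  The demo_* names are illustrative; the ops and register call
 * are assumed to match the mmu_notifier interface of this era.
 */
#include <linux/mmu_notifier.h>
#include <linux/mm.h>

static void demo_invalidate_page(struct mmu_notifier *mn,
				 struct mm_struct *mm, unsigned long address)
{
	/* Drop any shadow mapping of 'address' held by the device/hypervisor. */
}

static void demo_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* The address space is going away; tear down all shadow state. */
}

static const struct mmu_notifier_ops demo_ops = {
	.invalidate_page = demo_invalidate_page,
	.release	 = demo_release,
};

static struct mmu_notifier demo_notifier = {
	.ops = &demo_ops,
};

/* Attach the notifier to a task's address space. */
static int demo_attach(struct mm_struct *mm)
{
	return mmu_notifier_register(&demo_notifier, mm);
}
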
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index f4026bae6eed..4297bc41bfd2 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * linux/mm/allocpercpu.c 2 * linux/mm/allocpercpu.c
3 * 3 *
4 * Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com> 4 * Separated from slab.c August 11, 2006 Christoph Lameter
5 */ 5 */
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/module.h> 7#include <linux/module.h>
@@ -18,27 +18,28 @@
18 * Depopulating per-cpu data for a cpu going offline would be a typical 18 * Depopulating per-cpu data for a cpu going offline would be a typical
19 * use case. You need to register a cpu hotplug handler for that purpose. 19 * use case. You need to register a cpu hotplug handler for that purpose.
20 */ 20 */
21void percpu_depopulate(void *__pdata, int cpu) 21static void percpu_depopulate(void *__pdata, int cpu)
22{ 22{
23 struct percpu_data *pdata = __percpu_disguise(__pdata); 23 struct percpu_data *pdata = __percpu_disguise(__pdata);
24 24
25 kfree(pdata->ptrs[cpu]); 25 kfree(pdata->ptrs[cpu]);
26 pdata->ptrs[cpu] = NULL; 26 pdata->ptrs[cpu] = NULL;
27} 27}
28EXPORT_SYMBOL_GPL(percpu_depopulate);
29 28
30/** 29/**
31 * percpu_depopulate_mask - depopulate per-cpu data for some cpu's 30 * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
32 * @__pdata: per-cpu data to depopulate 31 * @__pdata: per-cpu data to depopulate
33 * @mask: depopulate per-cpu data for cpu's selected through mask bits 32 * @mask: depopulate per-cpu data for cpu's selected through mask bits
34 */ 33 */
35void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) 34static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
36{ 35{
37 int cpu; 36 int cpu;
38 for_each_cpu_mask(cpu, *mask) 37 for_each_cpu_mask_nr(cpu, *mask)
39 percpu_depopulate(__pdata, cpu); 38 percpu_depopulate(__pdata, cpu);
40} 39}
41EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); 40
41#define percpu_depopulate_mask(__pdata, mask) \
42 __percpu_depopulate_mask((__pdata), &(mask))
42 43
43/** 44/**
44 * percpu_populate - populate per-cpu data for given cpu 45 * percpu_populate - populate per-cpu data for given cpu
@@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
51 * use case. You need to register a cpu hotplug handler for that purpose. 52 * use case. You need to register a cpu hotplug handler for that purpose.
52 * Per-cpu object is populated with zeroed buffer. 53 * Per-cpu object is populated with zeroed buffer.
53 */ 54 */
54void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) 55static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
55{ 56{
56 struct percpu_data *pdata = __percpu_disguise(__pdata); 57 struct percpu_data *pdata = __percpu_disguise(__pdata);
57 int node = cpu_to_node(cpu); 58 int node = cpu_to_node(cpu);
@@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
68 pdata->ptrs[cpu] = kzalloc(size, gfp); 69 pdata->ptrs[cpu] = kzalloc(size, gfp);
69 return pdata->ptrs[cpu]; 70 return pdata->ptrs[cpu];
70} 71}
71EXPORT_SYMBOL_GPL(percpu_populate);
72 72
73/** 73/**
74 * percpu_populate_mask - populate per-cpu data for more cpu's 74 * percpu_populate_mask - populate per-cpu data for more cpu's
@@ -79,14 +79,14 @@ EXPORT_SYMBOL_GPL(percpu_populate);
79 * 79 *
80 * Per-cpu objects are populated with zeroed buffers. 80 * Per-cpu objects are populated with zeroed buffers.
81 */ 81 */
82int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, 82static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
83 cpumask_t *mask) 83 cpumask_t *mask)
84{ 84{
85 cpumask_t populated; 85 cpumask_t populated;
86 int cpu; 86 int cpu;
87 87
88 cpus_clear(populated); 88 cpus_clear(populated);
89 for_each_cpu_mask(cpu, *mask) 89 for_each_cpu_mask_nr(cpu, *mask)
90 if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { 90 if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
91 __percpu_depopulate_mask(__pdata, &populated); 91 __percpu_depopulate_mask(__pdata, &populated);
92 return -ENOMEM; 92 return -ENOMEM;
@@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
94 cpu_set(cpu, populated); 94 cpu_set(cpu, populated);
95 return 0; 95 return 0;
96} 96}
97EXPORT_SYMBOL_GPL(__percpu_populate_mask); 97
98#define percpu_populate_mask(__pdata, size, gfp, mask) \
99 __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
98 100
99/** 101/**
100 * percpu_alloc_mask - initial setup of per-cpu data 102 * percpu_alloc_mask - initial setup of per-cpu data
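
The allocpercpu.c hunks above unexport percpu_depopulate(), __percpu_depopulate_mask(), percpu_populate() and __percpu_populate_mask(), turning them into internal helpers; callers are expected to stay on the higher-level wrappers. Below is a hedged, module-style sketch of that usage, assuming the alloc_percpu()/per_cpu_ptr()/free_percpu() wrappers of this era; the hit_counter type and demo_* names are made up for illustration.

/*
 * Hedged sketch, not part of this patch: consuming the per-cpu allocator
 * only through the wrappers that remain public after this change.
 */
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/errno.h>

struct hit_counter {
	unsigned long hits;
};

static struct hit_counter *demo_counters;	/* disguised per-cpu pointer */

static void demo_hit(void)
{
	int cpu = get_cpu();	/* stay on this CPU while touching our copy */

	per_cpu_ptr(demo_counters, cpu)->hits++;
	put_cpu();
}

static int __init demo_init(void)
{
	demo_counters = alloc_percpu(struct hit_counter);	/* zeroed per CPU */
	if (!demo_counters)
		return -ENOMEM;
	demo_hit();
	return 0;
}

static void __exit demo_exit(void)
{
	free_percpu(demo_counters);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
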
diff --git a/mm/bootmem.c b/mm/bootmem.c
index e8fb927392b9..ad8eec6e44a8 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * linux/mm/bootmem.c 2 * bootmem - A boot-time physical memory allocator and configurator
3 * 3 *
4 * Copyright (C) 1999 Ingo Molnar 4 * Copyright (C) 1999 Ingo Molnar
5 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 5 * 1999 Kanoj Sarcar, SGI
6 * 2008 Johannes Weiner
6 * 7 *
7 * simple boot-time physical memory area allocator and 8 * Access to this subsystem has to be serialized externally (which is true
8 * free memory collector. It's used to deal with reserved 9 * for the boot process anyway).
9 * system memory and memory holes as well.
10 */ 10 */
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/pfn.h> 12#include <linux/pfn.h>
@@ -19,15 +19,10 @@
19 19
20#include "internal.h" 20#include "internal.h"
21 21
22/*
23 * Access to this subsystem has to be serialized externally. (this is
24 * true for the boot process anyway)
25 */
26unsigned long max_low_pfn; 22unsigned long max_low_pfn;
27unsigned long min_low_pfn; 23unsigned long min_low_pfn;
28unsigned long max_pfn; 24unsigned long max_pfn;
29 25
30static LIST_HEAD(bdata_list);
31#ifdef CONFIG_CRASH_DUMP 26#ifdef CONFIG_CRASH_DUMP
32/* 27/*
33 * If we have booted due to a crash, max_pfn will be a very low value. We need 28 * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -36,63 +31,72 @@ static LIST_HEAD(bdata_list);
36unsigned long saved_max_pfn; 31unsigned long saved_max_pfn;
37#endif 32#endif
38 33
39/* return the number of _pages_ that will be allocated for the boot bitmap */ 34bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
40unsigned long __init bootmem_bootmap_pages(unsigned long pages) 35
36static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
37
38static int bootmem_debug;
39
40static int __init bootmem_debug_setup(char *buf)
41{ 41{
42 unsigned long mapsize; 42 bootmem_debug = 1;
43 return 0;
44}
45early_param("bootmem_debug", bootmem_debug_setup);
43 46
44 mapsize = (pages+7)/8; 47#define bdebug(fmt, args...) ({ \
45 mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; 48 if (unlikely(bootmem_debug)) \
46 mapsize >>= PAGE_SHIFT; 49 printk(KERN_INFO \
50 "bootmem::%s " fmt, \
51 __FUNCTION__, ## args); \
52})
47 53
48 return mapsize; 54static unsigned long __init bootmap_bytes(unsigned long pages)
55{
56 unsigned long bytes = (pages + 7) / 8;
57
58 return ALIGN(bytes, sizeof(long));
49} 59}
50 60
51/* 61/**
52 * link bdata in order 62 * bootmem_bootmap_pages - calculate bitmap size in pages
63 * @pages: number of pages the bitmap has to represent
53 */ 64 */
54static void __init link_bootmem(bootmem_data_t *bdata) 65unsigned long __init bootmem_bootmap_pages(unsigned long pages)
55{ 66{
56 bootmem_data_t *ent; 67 unsigned long bytes = bootmap_bytes(pages);
57 68
58 if (list_empty(&bdata_list)) { 69 return PAGE_ALIGN(bytes) >> PAGE_SHIFT;
59 list_add(&bdata->list, &bdata_list);
60 return;
61 }
62 /* insert in order */
63 list_for_each_entry(ent, &bdata_list, list) {
64 if (bdata->node_boot_start < ent->node_boot_start) {
65 list_add_tail(&bdata->list, &ent->list);
66 return;
67 }
68 }
69 list_add_tail(&bdata->list, &bdata_list);
70} 70}
71 71
72/* 72/*
73 * Given an initialised bdata, it returns the size of the boot bitmap 73 * link bdata in order
74 */ 74 */
75static unsigned long __init get_mapsize(bootmem_data_t *bdata) 75static void __init link_bootmem(bootmem_data_t *bdata)
76{ 76{
77 unsigned long mapsize; 77 struct list_head *iter;
78 unsigned long start = PFN_DOWN(bdata->node_boot_start);
79 unsigned long end = bdata->node_low_pfn;
80 78
81 mapsize = ((end - start) + 7) / 8; 79 list_for_each(iter, &bdata_list) {
82 return ALIGN(mapsize, sizeof(long)); 80 bootmem_data_t *ent;
81
82 ent = list_entry(iter, bootmem_data_t, list);
83 if (bdata->node_min_pfn < ent->node_min_pfn)
84 break;
85 }
86 list_add_tail(&bdata->list, iter);
83} 87}
84 88
85/* 89/*
86 * Called once to set up the allocator itself. 90 * Called once to set up the allocator itself.
87 */ 91 */
88static unsigned long __init init_bootmem_core(pg_data_t *pgdat, 92static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
89 unsigned long mapstart, unsigned long start, unsigned long end) 93 unsigned long mapstart, unsigned long start, unsigned long end)
90{ 94{
91 bootmem_data_t *bdata = pgdat->bdata;
92 unsigned long mapsize; 95 unsigned long mapsize;
93 96
97 mminit_validate_memmodel_limits(&start, &end);
94 bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); 98 bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
95 bdata->node_boot_start = PFN_PHYS(start); 99 bdata->node_min_pfn = start;
96 bdata->node_low_pfn = end; 100 bdata->node_low_pfn = end;
97 link_bootmem(bdata); 101 link_bootmem(bdata);
98 102
@@ -100,427 +104,484 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
100 * Initially all pages are reserved - setup_arch() has to 104 * Initially all pages are reserved - setup_arch() has to
101 * register free RAM areas explicitly. 105 * register free RAM areas explicitly.
102 */ 106 */
103 mapsize = get_mapsize(bdata); 107 mapsize = bootmap_bytes(end - start);
104 memset(bdata->node_bootmem_map, 0xff, mapsize); 108 memset(bdata->node_bootmem_map, 0xff, mapsize);
105 109
110 bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
111 bdata - bootmem_node_data, start, mapstart, end, mapsize);
112
106 return mapsize; 113 return mapsize;
107} 114}
108 115
109/* 116/**
110 * Marks a particular physical memory range as unallocatable. Usable RAM 117 * init_bootmem_node - register a node as boot memory
111 * might be used for boot-time allocations - or it might get added 118 * @pgdat: node to register
112 * to the free page pool later on. 119 * @freepfn: pfn where the bitmap for this node is to be placed
120 * @startpfn: first pfn on the node
121 * @endpfn: first pfn after the node
122 *
123 * Returns the number of bytes needed to hold the bitmap for this node.
113 */ 124 */
114static int __init can_reserve_bootmem_core(bootmem_data_t *bdata, 125unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
115 unsigned long addr, unsigned long size, int flags) 126 unsigned long startpfn, unsigned long endpfn)
116{ 127{
117 unsigned long sidx, eidx; 128 return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
118 unsigned long i; 129}
119 130
120 BUG_ON(!size); 131/**
132 * init_bootmem - register boot memory
133 * @start: pfn where the bitmap is to be placed
134 * @pages: number of available physical pages
135 *
136 * Returns the number of bytes needed to hold the bitmap.
137 */
138unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
139{
140 max_low_pfn = pages;
141 min_low_pfn = start;
142 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
143}
121 144
122 /* out of range, don't hold other */ 145static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
123 if (addr + size < bdata->node_boot_start || 146{
124 PFN_DOWN(addr) > bdata->node_low_pfn) 147 int aligned;
148 struct page *page;
149 unsigned long start, end, pages, count = 0;
150
151 if (!bdata->node_bootmem_map)
125 return 0; 152 return 0;
126 153
154 start = bdata->node_min_pfn;
155 end = bdata->node_low_pfn;
156
127 /* 157 /*
128 * Round up to index to the range. 158 * If the start is aligned to the machines wordsize, we might
159 * be able to free pages in bulks of that order.
129 */ 160 */
130 if (addr > bdata->node_boot_start) 161 aligned = !(start & (BITS_PER_LONG - 1));
131 sidx= PFN_DOWN(addr - bdata->node_boot_start);
132 else
133 sidx = 0;
134 162
135 eidx = PFN_UP(addr + size - bdata->node_boot_start); 163 bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
136 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 164 bdata - bootmem_node_data, start, end, aligned);
137 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
138 165
139 for (i = sidx; i < eidx; i++) { 166 while (start < end) {
140 if (test_bit(i, bdata->node_bootmem_map)) { 167 unsigned long *map, idx, vec;
141 if (flags & BOOTMEM_EXCLUSIVE) 168
142 return -EBUSY; 169 map = bdata->node_bootmem_map;
170 idx = start - bdata->node_min_pfn;
171 vec = ~map[idx / BITS_PER_LONG];
172
173 if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
174 int order = ilog2(BITS_PER_LONG);
175
176 __free_pages_bootmem(pfn_to_page(start), order);
177 count += BITS_PER_LONG;
178 } else {
179 unsigned long off = 0;
180
181 while (vec && off < BITS_PER_LONG) {
182 if (vec & 1) {
183 page = pfn_to_page(start + off);
184 __free_pages_bootmem(page, 0);
185 count++;
186 }
187 vec >>= 1;
188 off++;
189 }
143 } 190 }
191 start += BITS_PER_LONG;
144 } 192 }
145 193
146 return 0; 194 page = virt_to_page(bdata->node_bootmem_map);
195 pages = bdata->node_low_pfn - bdata->node_min_pfn;
196 pages = bootmem_bootmap_pages(pages);
197 count += pages;
198 while (pages--)
199 __free_pages_bootmem(page++, 0);
200
201 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
147 202
203 return count;
148} 204}
149 205
150static void __init reserve_bootmem_core(bootmem_data_t *bdata, 206/**
151 unsigned long addr, unsigned long size, int flags) 207 * free_all_bootmem_node - release a node's free pages to the buddy allocator
208 * @pgdat: node to be released
209 *
210 * Returns the number of pages actually released.
211 */
212unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
152{ 213{
153 unsigned long sidx, eidx; 214 register_page_bootmem_info_node(pgdat);
154 unsigned long i; 215 return free_all_bootmem_core(pgdat->bdata);
216}
155 217
156 BUG_ON(!size); 218/**
219 * free_all_bootmem - release free pages to the buddy allocator
220 *
221 * Returns the number of pages actually released.
222 */
223unsigned long __init free_all_bootmem(void)
224{
225 return free_all_bootmem_core(NODE_DATA(0)->bdata);
226}
227
228static void __init __free(bootmem_data_t *bdata,
229 unsigned long sidx, unsigned long eidx)
230{
231 unsigned long idx;
157 232
158 /* out of range */ 233 bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
159 if (addr + size < bdata->node_boot_start || 234 sidx + bdata->node_min_pfn,
160 PFN_DOWN(addr) > bdata->node_low_pfn) 235 eidx + bdata->node_min_pfn);
161 return;
162 236
163 /* 237 if (bdata->hint_idx > sidx)
164 * Round up to index to the range. 238 bdata->hint_idx = sidx;
165 */
166 if (addr > bdata->node_boot_start)
167 sidx= PFN_DOWN(addr - bdata->node_boot_start);
168 else
169 sidx = 0;
170 239
171 eidx = PFN_UP(addr + size - bdata->node_boot_start); 240 for (idx = sidx; idx < eidx; idx++)
172 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 241 if (!test_and_clear_bit(idx, bdata->node_bootmem_map))
173 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); 242 BUG();
243}
174 244
175 for (i = sidx; i < eidx; i++) { 245static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
176 if (test_and_set_bit(i, bdata->node_bootmem_map)) { 246 unsigned long eidx, int flags)
177#ifdef CONFIG_DEBUG_BOOTMEM 247{
178 printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); 248 unsigned long idx;
179#endif 249 int exclusive = flags & BOOTMEM_EXCLUSIVE;
250
251 bdebug("nid=%td start=%lx end=%lx flags=%x\n",
252 bdata - bootmem_node_data,
253 sidx + bdata->node_min_pfn,
254 eidx + bdata->node_min_pfn,
255 flags);
256
257 for (idx = sidx; idx < eidx; idx++)
258 if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
259 if (exclusive) {
260 __free(bdata, sidx, idx);
261 return -EBUSY;
262 }
263 bdebug("silent double reserve of PFN %lx\n",
264 idx + bdata->node_min_pfn);
180 } 265 }
181 } 266 return 0;
182} 267}
183 268
184static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, 269static int __init mark_bootmem_node(bootmem_data_t *bdata,
185 unsigned long size) 270 unsigned long start, unsigned long end,
271 int reserve, int flags)
186{ 272{
187 unsigned long sidx, eidx; 273 unsigned long sidx, eidx;
188 unsigned long i;
189 274
190 BUG_ON(!size); 275 bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
276 bdata - bootmem_node_data, start, end, reserve, flags);
191 277
192 /* out range */ 278 BUG_ON(start < bdata->node_min_pfn);
193 if (addr + size < bdata->node_boot_start || 279 BUG_ON(end > bdata->node_low_pfn);
194 PFN_DOWN(addr) > bdata->node_low_pfn)
195 return;
196 /*
197 * round down end of usable mem, partially free pages are
198 * considered reserved.
199 */
200 280
201 if (addr >= bdata->node_boot_start && addr < bdata->last_success) 281 sidx = start - bdata->node_min_pfn;
202 bdata->last_success = addr; 282 eidx = end - bdata->node_min_pfn;
203 283
204 /* 284 if (reserve)
205 * Round up to index to the range. 285 return __reserve(bdata, sidx, eidx, flags);
206 */
207 if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start))
208 sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
209 else 286 else
210 sidx = 0; 287 __free(bdata, sidx, eidx);
288 return 0;
289}
211 290
212 eidx = PFN_DOWN(addr + size - bdata->node_boot_start); 291static int __init mark_bootmem(unsigned long start, unsigned long end,
213 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 292 int reserve, int flags)
214 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); 293{
294 unsigned long pos;
295 bootmem_data_t *bdata;
215 296
216 for (i = sidx; i < eidx; i++) { 297 pos = start;
217 if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) 298 list_for_each_entry(bdata, &bdata_list, list) {
218 BUG(); 299 int err;
300 unsigned long max;
301
302 if (pos < bdata->node_min_pfn ||
303 pos >= bdata->node_low_pfn) {
304 BUG_ON(pos != start);
305 continue;
306 }
307
308 max = min(bdata->node_low_pfn, end);
309
310 err = mark_bootmem_node(bdata, pos, max, reserve, flags);
311 if (reserve && err) {
312 mark_bootmem(start, pos, 0, 0);
313 return err;
314 }
315
316 if (max == end)
317 return 0;
318 pos = bdata->node_low_pfn;
219 } 319 }
320 BUG();
220} 321}
221 322
222/* 323/**
223 * We 'merge' subsequent allocations to save space. We might 'lose' 324 * free_bootmem_node - mark a page range as usable
224 * some fraction of a page if allocations cannot be satisfied due to 325 * @pgdat: node the range resides on
225 * size constraints on boxes where there is physical RAM space 326 * @physaddr: starting address of the range
226 * fragmentation - in these cases (mostly large memory boxes) this 327 * @size: size of the range in bytes
227 * is not a problem.
228 * 328 *
229 * On low memory boxes we get it right in 100% of the cases. 329 * Partial pages will be considered reserved and left as they are.
230 * 330 *
231 * alignment has to be a power of 2 value. 331 * The range must reside completely on the specified node.
232 *
233 * NOTE: This function is _not_ reentrant.
234 */ 332 */
235void * __init 333void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
236__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, 334 unsigned long size)
237 unsigned long align, unsigned long goal, unsigned long limit)
238{ 335{
239 unsigned long areasize, preferred; 336 unsigned long start, end;
240 unsigned long i, start = 0, incr, eidx, end_pfn;
241 void *ret;
242 unsigned long node_boot_start;
243 void *node_bootmem_map;
244
245 if (!size) {
246 printk("__alloc_bootmem_core(): zero-sized request\n");
247 BUG();
248 }
249 BUG_ON(align & (align-1));
250 337
251 /* on nodes without memory - bootmem_map is NULL */ 338 start = PFN_UP(physaddr);
252 if (!bdata->node_bootmem_map) 339 end = PFN_DOWN(physaddr + size);
253 return NULL;
254 340
255 /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */ 341 mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
256 node_boot_start = bdata->node_boot_start; 342}
257 node_bootmem_map = bdata->node_bootmem_map;
258 if (align) {
259 node_boot_start = ALIGN(bdata->node_boot_start, align);
260 if (node_boot_start > bdata->node_boot_start)
261 node_bootmem_map = (unsigned long *)bdata->node_bootmem_map +
262 PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG;
263 }
264
265 if (limit && node_boot_start >= limit)
266 return NULL;
267
268 end_pfn = bdata->node_low_pfn;
269 limit = PFN_DOWN(limit);
270 if (limit && end_pfn > limit)
271 end_pfn = limit;
272 343
273 eidx = end_pfn - PFN_DOWN(node_boot_start); 344/**
345 * free_bootmem - mark a page range as usable
346 * @addr: starting address of the range
347 * @size: size of the range in bytes
348 *
349 * Partial pages will be considered reserved and left as they are.
350 *
351 * The range must be contiguous but may span node boundaries.
352 */
353void __init free_bootmem(unsigned long addr, unsigned long size)
354{
355 unsigned long start, end;
274 356
275 /* 357 start = PFN_UP(addr);
276 * We try to allocate bootmem pages above 'goal' 358 end = PFN_DOWN(addr + size);
277 * first, then we try to allocate lower pages.
278 */
279 preferred = 0;
280 if (goal && PFN_DOWN(goal) < end_pfn) {
281 if (goal > node_boot_start)
282 preferred = goal - node_boot_start;
283
284 if (bdata->last_success > node_boot_start &&
285 bdata->last_success - node_boot_start >= preferred)
286 if (!limit || (limit && limit > bdata->last_success))
287 preferred = bdata->last_success - node_boot_start;
288 }
289 359
290 preferred = PFN_DOWN(ALIGN(preferred, align)); 360 mark_bootmem(start, end, 0, 0);
291 areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; 361}
292 incr = align >> PAGE_SHIFT ? : 1;
293 362
294restart_scan: 363/**
295 for (i = preferred; i < eidx;) { 364 * reserve_bootmem_node - mark a page range as reserved
296 unsigned long j; 365 * @pgdat: node the range resides on
366 * @physaddr: starting address of the range
367 * @size: size of the range in bytes
368 * @flags: reservation flags (see linux/bootmem.h)
369 *
370 * Partial pages will be reserved.
371 *
372 * The range must reside completely on the specified node.
373 */
374int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
375 unsigned long size, int flags)
376{
377 unsigned long start, end;
297 378
298 i = find_next_zero_bit(node_bootmem_map, eidx, i); 379 start = PFN_DOWN(physaddr);
299 i = ALIGN(i, incr); 380 end = PFN_UP(physaddr + size);
300 if (i >= eidx)
301 break;
302 if (test_bit(i, node_bootmem_map)) {
303 i += incr;
304 continue;
305 }
306 for (j = i + 1; j < i + areasize; ++j) {
307 if (j >= eidx)
308 goto fail_block;
309 if (test_bit(j, node_bootmem_map))
310 goto fail_block;
311 }
312 start = i;
313 goto found;
314 fail_block:
315 i = ALIGN(j, incr);
316 if (i == j)
317 i += incr;
318 }
319 381
320 if (preferred > 0) { 382 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
321 preferred = 0; 383}
322 goto restart_scan;
323 }
324 return NULL;
325 384
326found: 385#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
327 bdata->last_success = PFN_PHYS(start) + node_boot_start; 386/**
328 BUG_ON(start >= eidx); 387 * reserve_bootmem - mark a page range as usable
388 * @addr: starting address of the range
389 * @size: size of the range in bytes
390 * @flags: reservation flags (see linux/bootmem.h)
391 *
392 * Partial pages will be reserved.
393 *
394 * The range must be contiguous but may span node boundaries.
395 */
396int __init reserve_bootmem(unsigned long addr, unsigned long size,
397 int flags)
398{
399 unsigned long start, end;
329 400
330 /* 401 start = PFN_DOWN(addr);
331 * Is the next page of the previous allocation-end the start 402 end = PFN_UP(addr + size);
332 * of this allocation's buffer? If yes then we can 'merge'
333 * the previous partial page with this allocation.
334 */
335 if (align < PAGE_SIZE &&
336 bdata->last_offset && bdata->last_pos+1 == start) {
337 unsigned long offset, remaining_size;
338 offset = ALIGN(bdata->last_offset, align);
339 BUG_ON(offset > PAGE_SIZE);
340 remaining_size = PAGE_SIZE - offset;
341 if (size < remaining_size) {
342 areasize = 0;
343 /* last_pos unchanged */
344 bdata->last_offset = offset + size;
345 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
346 offset + node_boot_start);
347 } else {
348 remaining_size = size - remaining_size;
349 areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
350 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
351 offset + node_boot_start);
352 bdata->last_pos = start + areasize - 1;
353 bdata->last_offset = remaining_size;
354 }
355 bdata->last_offset &= ~PAGE_MASK;
356 } else {
357 bdata->last_pos = start + areasize - 1;
358 bdata->last_offset = size & ~PAGE_MASK;
359 ret = phys_to_virt(start * PAGE_SIZE + node_boot_start);
360 }
361 403
362 /* 404 return mark_bootmem(start, end, 1, flags);
363 * Reserve the area now:
364 */
365 for (i = start; i < start + areasize; i++)
366 if (unlikely(test_and_set_bit(i, node_bootmem_map)))
367 BUG();
368 memset(ret, 0, size);
369 return ret;
370} 405}
406#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
371 407
372static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) 408static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
409 unsigned long step)
373{ 410{
374 struct page *page; 411 unsigned long base = bdata->node_min_pfn;
375 unsigned long pfn;
376 bootmem_data_t *bdata = pgdat->bdata;
377 unsigned long i, count, total = 0;
378 unsigned long idx;
379 unsigned long *map;
380 int gofast = 0;
381
382 BUG_ON(!bdata->node_bootmem_map);
383
384 count = 0;
385 /* first extant page of the node */
386 pfn = PFN_DOWN(bdata->node_boot_start);
387 idx = bdata->node_low_pfn - pfn;
388 map = bdata->node_bootmem_map;
389 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
390 if (bdata->node_boot_start == 0 ||
391 ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
392 gofast = 1;
393 for (i = 0; i < idx; ) {
394 unsigned long v = ~map[i / BITS_PER_LONG];
395
396 if (gofast && v == ~0UL) {
397 int order;
398
399 page = pfn_to_page(pfn);
400 count += BITS_PER_LONG;
401 order = ffs(BITS_PER_LONG) - 1;
402 __free_pages_bootmem(page, order);
403 i += BITS_PER_LONG;
404 page += BITS_PER_LONG;
405 } else if (v) {
406 unsigned long m;
407
408 page = pfn_to_page(pfn);
409 for (m = 1; m && i < idx; m<<=1, page++, i++) {
410 if (v & m) {
411 count++;
412 __free_pages_bootmem(page, 0);
413 }
414 }
415 } else {
416 i += BITS_PER_LONG;
417 }
418 pfn += BITS_PER_LONG;
419 }
420 total += count;
421 412
422 /* 413 /*
423 * Now free the allocator bitmap itself, it's not 414 * Align the index with respect to the node start so that the
424 * needed anymore: 415 * combination of both satisfies the requested alignment.
425 */ 416 */
426 page = virt_to_page(bdata->node_bootmem_map);
427 count = 0;
428 idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
429 for (i = 0; i < idx; i++, page++) {
430 __free_pages_bootmem(page, 0);
431 count++;
432 }
433 total += count;
434 bdata->node_bootmem_map = NULL;
435 417
436 return total; 418 return ALIGN(base + idx, step) - base;
437} 419}
438 420
439unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, 421static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
440 unsigned long startpfn, unsigned long endpfn) 422 unsigned long align)
441{ 423{
442 return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); 424 unsigned long base = PFN_PHYS(bdata->node_min_pfn);
443}
444 425
445void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 426 /* Same as align_idx for byte offsets */
446 unsigned long size, int flags)
447{
448 int ret;
449 427
450 ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); 428 return ALIGN(base + off, align) - base;
451 if (ret < 0)
452 return;
453 reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
454} 429}
455 430
456void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 431static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
457 unsigned long size) 432 unsigned long size, unsigned long align,
433 unsigned long goal, unsigned long limit)
458{ 434{
459 free_bootmem_core(pgdat->bdata, physaddr, size); 435 unsigned long fallback = 0;
460} 436 unsigned long min, max, start, sidx, midx, step;
461 437
462unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 438 BUG_ON(!size);
463{ 439 BUG_ON(align & (align - 1));
464 register_page_bootmem_info_node(pgdat); 440 BUG_ON(limit && goal + size > limit);
465 return free_all_bootmem_core(pgdat);
466}
467 441
468unsigned long __init init_bootmem(unsigned long start, unsigned long pages) 442 if (!bdata->node_bootmem_map)
469{ 443 return NULL;
470 max_low_pfn = pages;
471 min_low_pfn = start;
472 return init_bootmem_core(NODE_DATA(0), start, 0, pages);
473}
474 444
475#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 445 bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
476int __init reserve_bootmem(unsigned long addr, unsigned long size, 446 bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
477 int flags) 447 align, goal, limit);
478{
479 bootmem_data_t *bdata;
480 int ret;
481 448
482 list_for_each_entry(bdata, &bdata_list, list) { 449 min = bdata->node_min_pfn;
483 ret = can_reserve_bootmem_core(bdata, addr, size, flags); 450 max = bdata->node_low_pfn;
484 if (ret < 0) 451
485 return ret; 452 goal >>= PAGE_SHIFT;
453 limit >>= PAGE_SHIFT;
454
455 if (limit && max > limit)
456 max = limit;
457 if (max <= min)
458 return NULL;
459
460 step = max(align >> PAGE_SHIFT, 1UL);
461
462 if (goal && min < goal && goal < max)
463 start = ALIGN(goal, step);
464 else
465 start = ALIGN(min, step);
466
467 sidx = start - bdata->node_min_pfn;
468 midx = max - bdata->node_min_pfn;
469
470 if (bdata->hint_idx > sidx) {
471 /*
472 * Handle the valid case of sidx being zero and still
473 * catch the fallback below.
474 */
475 fallback = sidx + 1;
476 sidx = align_idx(bdata, bdata->hint_idx, step);
486 } 477 }
487 list_for_each_entry(bdata, &bdata_list, list)
488 reserve_bootmem_core(bdata, addr, size, flags);
489 478
490 return 0; 479 while (1) {
491} 480 int merge;
492#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 481 void *region;
482 unsigned long eidx, i, start_off, end_off;
483find_block:
484 sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
485 sidx = align_idx(bdata, sidx, step);
486 eidx = sidx + PFN_UP(size);
493 487
494void __init free_bootmem(unsigned long addr, unsigned long size) 488 if (sidx >= midx || eidx > midx)
495{ 489 break;
496 bootmem_data_t *bdata;
497 list_for_each_entry(bdata, &bdata_list, list)
498 free_bootmem_core(bdata, addr, size);
499}
500 490
501unsigned long __init free_all_bootmem(void) 491 for (i = sidx; i < eidx; i++)
502{ 492 if (test_bit(i, bdata->node_bootmem_map)) {
503 return free_all_bootmem_core(NODE_DATA(0)); 493 sidx = align_idx(bdata, i, step);
494 if (sidx == i)
495 sidx += step;
496 goto find_block;
497 }
498
499 if (bdata->last_end_off & (PAGE_SIZE - 1) &&
500 PFN_DOWN(bdata->last_end_off) + 1 == sidx)
501 start_off = align_off(bdata, bdata->last_end_off, align);
502 else
503 start_off = PFN_PHYS(sidx);
504
505 merge = PFN_DOWN(start_off) < sidx;
506 end_off = start_off + size;
507
508 bdata->last_end_off = end_off;
509 bdata->hint_idx = PFN_UP(end_off);
510
511 /*
512 * Reserve the area now:
513 */
514 if (__reserve(bdata, PFN_DOWN(start_off) + merge,
515 PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
516 BUG();
517
518 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
519 start_off);
520 memset(region, 0, size);
521 return region;
522 }
523
524 if (fallback) {
525 sidx = align_idx(bdata, fallback - 1, step);
526 fallback = 0;
527 goto find_block;
528 }
529
530 return NULL;
504} 531}
505 532
506void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, 533static void * __init ___alloc_bootmem_nopanic(unsigned long size,
507 unsigned long goal) 534 unsigned long align,
535 unsigned long goal,
536 unsigned long limit)
508{ 537{
509 bootmem_data_t *bdata; 538 bootmem_data_t *bdata;
510 void *ptr;
511 539
540restart:
512 list_for_each_entry(bdata, &bdata_list, list) { 541 list_for_each_entry(bdata, &bdata_list, list) {
513 ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); 542 void *region;
514 if (ptr) 543
515 return ptr; 544 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
545 continue;
546 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
547 break;
548
549 region = alloc_bootmem_core(bdata, size, align, goal, limit);
550 if (region)
551 return region;
552 }
553
554 if (goal) {
555 goal = 0;
556 goto restart;
516 } 557 }
558
517 return NULL; 559 return NULL;
518} 560}
519 561
520void * __init __alloc_bootmem(unsigned long size, unsigned long align, 562/**
521 unsigned long goal) 563 * __alloc_bootmem_nopanic - allocate boot memory without panicking
564 * @size: size of the request in bytes
565 * @align: alignment of the region
566 * @goal: preferred starting address of the region
567 *
568 * The goal is dropped if it can not be satisfied and the allocation will
569 * fall back to memory below @goal.
570 *
571 * Allocation may happen on any node in the system.
572 *
573 * Returns NULL on failure.
574 */
575void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
576 unsigned long goal)
522{ 577{
523 void *mem = __alloc_bootmem_nopanic(size,align,goal); 578 return ___alloc_bootmem_nopanic(size, align, goal, 0);
579}
580
581static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
582 unsigned long goal, unsigned long limit)
583{
584 void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
524 585
525 if (mem) 586 if (mem)
526 return mem; 587 return mem;
@@ -532,78 +593,135 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
532 return NULL; 593 return NULL;
533} 594}
534 595
596/**
597 * __alloc_bootmem - allocate boot memory
598 * @size: size of the request in bytes
599 * @align: alignment of the region
600 * @goal: preferred starting address of the region
601 *
602 * The goal is dropped if it can not be satisfied and the allocation will
603 * fall back to memory below @goal.
604 *
605 * Allocation may happen on any node in the system.
606 *
607 * The function panics if the request can not be satisfied.
608 */
609void * __init __alloc_bootmem(unsigned long size, unsigned long align,
610 unsigned long goal)
611{
612 return ___alloc_bootmem(size, align, goal, 0);
613}
535 614
536void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 615static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
537 unsigned long align, unsigned long goal) 616 unsigned long size, unsigned long align,
617 unsigned long goal, unsigned long limit)
538{ 618{
539 void *ptr; 619 void *ptr;
540 620
541 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 621 ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
542 if (ptr) 622 if (ptr)
543 return ptr; 623 return ptr;
544 624
545 return __alloc_bootmem(size, align, goal); 625 return ___alloc_bootmem(size, align, goal, limit);
626}
627
628/**
629 * __alloc_bootmem_node - allocate boot memory from a specific node
630 * @pgdat: node to allocate from
631 * @size: size of the request in bytes
632 * @align: alignment of the region
633 * @goal: preferred starting address of the region
634 *
635 * The goal is dropped if it can not be satisfied and the allocation will
636 * fall back to memory below @goal.
637 *
638 * Allocation may fall back to any node in the system if the specified node
639 * can not hold the requested memory.
640 *
641 * The function panics if the request can not be satisfied.
642 */
643void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
644 unsigned long align, unsigned long goal)
645{
646 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
546} 647}
547 648
548#ifdef CONFIG_SPARSEMEM 649#ifdef CONFIG_SPARSEMEM
650/**
651 * alloc_bootmem_section - allocate boot memory from a specific section
652 * @size: size of the request in bytes
653 * @section_nr: sparse map section to allocate from
654 *
655 * Return NULL on failure.
656 */
549void * __init alloc_bootmem_section(unsigned long size, 657void * __init alloc_bootmem_section(unsigned long size,
550 unsigned long section_nr) 658 unsigned long section_nr)
551{ 659{
552 void *ptr; 660 bootmem_data_t *bdata;
553 unsigned long limit, goal, start_nr, end_nr, pfn; 661 unsigned long pfn, goal, limit;
554 struct pglist_data *pgdat;
555 662
556 pfn = section_nr_to_pfn(section_nr); 663 pfn = section_nr_to_pfn(section_nr);
557 goal = PFN_PHYS(pfn); 664 goal = pfn << PAGE_SHIFT;
558 limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1; 665 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
559 pgdat = NODE_DATA(early_pfn_to_nid(pfn)); 666 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
560 ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal,
561 limit);
562 667
563 if (!ptr) 668 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
564 return NULL; 669}
670#endif
565 671
566 start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr))); 672void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
567 end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size)); 673 unsigned long align, unsigned long goal)
568 if (start_nr != section_nr || end_nr != section_nr) { 674{
569 printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n", 675 void *ptr;
570 section_nr);
571 free_bootmem_core(pgdat->bdata, __pa(ptr), size);
572 ptr = NULL;
573 }
574 676
575 return ptr; 677 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
678 if (ptr)
679 return ptr;
680
681 return __alloc_bootmem_nopanic(size, align, goal);
576} 682}
577#endif
578 683
579#ifndef ARCH_LOW_ADDRESS_LIMIT 684#ifndef ARCH_LOW_ADDRESS_LIMIT
580#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL 685#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
581#endif 686#endif
582 687
688/**
689 * __alloc_bootmem_low - allocate low boot memory
690 * @size: size of the request in bytes
691 * @align: alignment of the region
692 * @goal: preferred starting address of the region
693 *
694 * The goal is dropped if it can not be satisfied and the allocation will
695 * fall back to memory below @goal.
696 *
697 * Allocation may happen on any node in the system.
698 *
699 * The function panics if the request can not be satisfied.
700 */
583void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, 701void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
584 unsigned long goal) 702 unsigned long goal)
585{ 703{
586 bootmem_data_t *bdata; 704 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
587 void *ptr;
588
589 list_for_each_entry(bdata, &bdata_list, list) {
590 ptr = __alloc_bootmem_core(bdata, size, align, goal,
591 ARCH_LOW_ADDRESS_LIMIT);
592 if (ptr)
593 return ptr;
594 }
595
596 /*
597 * Whoops, we cannot satisfy the allocation request.
598 */
599 printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
600 panic("Out of low memory");
601 return NULL;
602} 705}
603 706
707/**
708 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
709 * @pgdat: node to allocate from
710 * @size: size of the request in bytes
711 * @align: alignment of the region
712 * @goal: preferred starting address of the region
713 *
714 * The goal is dropped if it can not be satisfied and the allocation will
715 * fall back to memory below @goal.
716 *
717 * Allocation may fall back to any node in the system if the specified node
718 * can not hold the requested memory.
719 *
720 * The function panics if the request can not be satisfied.
721 */
604void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 722void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
605 unsigned long align, unsigned long goal) 723 unsigned long align, unsigned long goal)
606{ 724{
607 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, 725 return ___alloc_bootmem_node(pgdat->bdata, size, align,
608 ARCH_LOW_ADDRESS_LIMIT); 726 goal, ARCH_LOW_ADDRESS_LIMIT);
609} 727}
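
The rewritten reservation path above works on a bitmap with one bit per page frame: __reserve() sets bits and, for BOOTMEM_EXCLUSIVE requests, rolls back and returns -EBUSY on the first bit that is already set, while plain reservations only log a silent double reserve; __free() clears bits and pulls the allocation hint back. The following is a standalone userspace model of that bitmap discipline, with made-up names and a plain bool array standing in for the kernel's bitmap helpers.

/*
 * Standalone model (not kernel code) of the __reserve()/__free() bitmap
 * discipline: exclusive reservations roll back and fail on the first
 * frame that is already taken, non-exclusive ones tolerate overlaps.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define NFRAMES 64

static bool frame_map[NFRAMES];		/* stands in for node_bootmem_map */

static void model_free(unsigned long sidx, unsigned long eidx)
{
	unsigned long idx;

	for (idx = sidx; idx < eidx; idx++)
		frame_map[idx] = false;
}

static int model_reserve(unsigned long sidx, unsigned long eidx, bool exclusive)
{
	unsigned long idx;

	for (idx = sidx; idx < eidx; idx++) {
		if (frame_map[idx]) {		/* frame already reserved */
			if (exclusive) {
				model_free(sidx, idx);	/* undo this request */
				return -EBUSY;
			}
			continue;		/* silent double reserve */
		}
		frame_map[idx] = true;
	}
	return 0;
}

int main(void)
{
	printf("reserve 4..8        -> %d\n", model_reserve(4, 8, false));
	printf("exclusive 6..10     -> %d\n", model_reserve(6, 10, true));
	printf("non-exclusive 6..10 -> %d\n", model_reserve(6, 10, false));
	return 0;
}
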
diff --git a/mm/bounce.c b/mm/bounce.c
index b6d2d0f1019b..06722c403058 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
267 /* 267 /*
268 * Data-less bio, nothing to bounce 268 * Data-less bio, nothing to bounce
269 */ 269 */
270 if (bio_empty_barrier(*bio_orig)) 270 if (!bio_has_data(*bio_orig))
271 return; 271 return;
272 272
273 /* 273 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6a7d34874f..876bc595d0f8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,9 +42,6 @@
42 42
43#include <asm/mman.h> 43#include <asm/mman.h>
44 44
45static ssize_t
46generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
47 loff_t offset, unsigned long nr_segs);
48 45
49/* 46/*
50 * Shared mappings implemented 30.11.1994. It's not fully working yet, 47 * Shared mappings implemented 30.11.1994. It's not fully working yet,
@@ -112,13 +109,13 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
112/* 109/*
113 * Remove a page from the page cache and free it. Caller has to make 110 * Remove a page from the page cache and free it. Caller has to make
114 * sure the page is locked and that nobody else uses it - or that usage 111 * sure the page is locked and that nobody else uses it - or that usage
115 * is safe. The caller must hold a write_lock on the mapping's tree_lock. 112 * is safe. The caller must hold the mapping's tree_lock.
116 */ 113 */
117void __remove_from_page_cache(struct page *page) 114void __remove_from_page_cache(struct page *page)
118{ 115{
119 struct address_space *mapping = page->mapping; 116 struct address_space *mapping = page->mapping;
120 117
121 mem_cgroup_uncharge_page(page); 118 mem_cgroup_uncharge_cache_page(page);
122 radix_tree_delete(&mapping->page_tree, page->index); 119 radix_tree_delete(&mapping->page_tree, page->index);
123 page->mapping = NULL; 120 page->mapping = NULL;
124 mapping->nrpages--; 121 mapping->nrpages--;
@@ -144,9 +141,9 @@ void remove_from_page_cache(struct page *page)
144 141
145 BUG_ON(!PageLocked(page)); 142 BUG_ON(!PageLocked(page));
146 143
147 write_lock_irq(&mapping->tree_lock); 144 spin_lock_irq(&mapping->tree_lock);
148 __remove_from_page_cache(page); 145 __remove_from_page_cache(page);
149 write_unlock_irq(&mapping->tree_lock); 146 spin_unlock_irq(&mapping->tree_lock);
150} 147}
151 148
152static int sync_page(void *word) 149static int sync_page(void *word)
@@ -236,11 +233,12 @@ int filemap_fdatawrite(struct address_space *mapping)
236} 233}
237EXPORT_SYMBOL(filemap_fdatawrite); 234EXPORT_SYMBOL(filemap_fdatawrite);
238 235
239static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, 236int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
240 loff_t end) 237 loff_t end)
241{ 238{
242 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); 239 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
243} 240}
241EXPORT_SYMBOL(filemap_fdatawrite_range);
244 242
245/** 243/**
246 * filemap_flush - mostly a non-blocking flush 244 * filemap_flush - mostly a non-blocking flush
@@ -444,48 +442,52 @@ int filemap_write_and_wait_range(struct address_space *mapping,
444} 442}
445 443
446/** 444/**
447 * add_to_page_cache - add newly allocated pagecache pages 445 * add_to_page_cache_locked - add a locked page to the pagecache
448 * @page: page to add 446 * @page: page to add
449 * @mapping: the page's address_space 447 * @mapping: the page's address_space
450 * @offset: page index 448 * @offset: page index
451 * @gfp_mask: page allocation mode 449 * @gfp_mask: page allocation mode
452 * 450 *
453 * This function is used to add newly allocated pagecache pages; 451 * This function is used to add a page to the pagecache. It must be locked.
454 * the page is new, so we can just run SetPageLocked() against it.
455 * The other page state flags were set by rmqueue().
456 *
457 * This function does not add the page to the LRU. The caller must do that. 452 * This function does not add the page to the LRU. The caller must do that.
458 */ 453 */
459int add_to_page_cache(struct page *page, struct address_space *mapping, 454int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
460 pgoff_t offset, gfp_t gfp_mask) 455 pgoff_t offset, gfp_t gfp_mask)
461{ 456{
462 int error = mem_cgroup_cache_charge(page, current->mm, 457 int error;
458
459 VM_BUG_ON(!PageLocked(page));
460
461 error = mem_cgroup_cache_charge(page, current->mm,
463 gfp_mask & ~__GFP_HIGHMEM); 462 gfp_mask & ~__GFP_HIGHMEM);
464 if (error) 463 if (error)
465 goto out; 464 goto out;
466 465
467 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 466 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
468 if (error == 0) { 467 if (error == 0) {
469 write_lock_irq(&mapping->tree_lock); 468 page_cache_get(page);
469 page->mapping = mapping;
470 page->index = offset;
471
472 spin_lock_irq(&mapping->tree_lock);
470 error = radix_tree_insert(&mapping->page_tree, offset, page); 473 error = radix_tree_insert(&mapping->page_tree, offset, page);
471 if (!error) { 474 if (likely(!error)) {
472 page_cache_get(page);
473 SetPageLocked(page);
474 page->mapping = mapping;
475 page->index = offset;
476 mapping->nrpages++; 475 mapping->nrpages++;
477 __inc_zone_page_state(page, NR_FILE_PAGES); 476 __inc_zone_page_state(page, NR_FILE_PAGES);
478 } else 477 } else {
479 mem_cgroup_uncharge_page(page); 478 page->mapping = NULL;
479 mem_cgroup_uncharge_cache_page(page);
480 page_cache_release(page);
481 }
480 482
481 write_unlock_irq(&mapping->tree_lock); 483 spin_unlock_irq(&mapping->tree_lock);
482 radix_tree_preload_end(); 484 radix_tree_preload_end();
483 } else 485 } else
484 mem_cgroup_uncharge_page(page); 486 mem_cgroup_uncharge_cache_page(page);
485out: 487out:
486 return error; 488 return error;
487} 489}
488EXPORT_SYMBOL(add_to_page_cache); 490EXPORT_SYMBOL(add_to_page_cache_locked);
489 491
490int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 492int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
491 pgoff_t offset, gfp_t gfp_mask) 493 pgoff_t offset, gfp_t gfp_mask)
@@ -556,14 +558,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
556 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. 558 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
557 * 559 *
558 * The first mb is necessary to safely close the critical section opened by the 560 * The first mb is necessary to safely close the critical section opened by the
559 * TestSetPageLocked(), the second mb is necessary to enforce ordering between 561 * test_and_set_bit() to lock the page; the second mb is necessary to enforce
560 * the clear_bit and the read of the waitqueue (to avoid SMP races with a 562 * ordering between the clear_bit and the read of the waitqueue (to avoid SMP
561 * parallel wait_on_page_locked()). 563 * races with a parallel wait_on_page_locked()).
562 */ 564 */
563void unlock_page(struct page *page) 565void unlock_page(struct page *page)
564{ 566{
565 smp_mb__before_clear_bit(); 567 smp_mb__before_clear_bit();
566 if (!TestClearPageLocked(page)) 568 if (!test_and_clear_bit(PG_locked, &page->flags))
567 BUG(); 569 BUG();
568 smp_mb__after_clear_bit(); 570 smp_mb__after_clear_bit();
569 wake_up_page(page, PG_locked); 571 wake_up_page(page, PG_locked);
@@ -635,15 +637,35 @@ void __lock_page_nosync(struct page *page)
635 * Is there a pagecache struct page at the given (mapping, offset) tuple? 637 * Is there a pagecache struct page at the given (mapping, offset) tuple?
636 * If yes, increment its refcount and return it; if no, return NULL. 638 * If yes, increment its refcount and return it; if no, return NULL.
637 */ 639 */
638struct page * find_get_page(struct address_space *mapping, pgoff_t offset) 640struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
639{ 641{
642 void **pagep;
640 struct page *page; 643 struct page *page;
641 644
642 read_lock_irq(&mapping->tree_lock); 645 rcu_read_lock();
643 page = radix_tree_lookup(&mapping->page_tree, offset); 646repeat:
644 if (page) 647 page = NULL;
645 page_cache_get(page); 648 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
646 read_unlock_irq(&mapping->tree_lock); 649 if (pagep) {
650 page = radix_tree_deref_slot(pagep);
651 if (unlikely(!page || page == RADIX_TREE_RETRY))
652 goto repeat;
653
654 if (!page_cache_get_speculative(page))
655 goto repeat;
656
657 /*
658 * Has the page moved?
659 * This is part of the lockless pagecache protocol. See
660 * include/linux/pagemap.h for details.
661 */
662 if (unlikely(page != *pagep)) {
663 page_cache_release(page);
664 goto repeat;
665 }
666 }
667 rcu_read_unlock();
668
647 return page; 669 return page;
648} 670}
649EXPORT_SYMBOL(find_get_page); 671EXPORT_SYMBOL(find_get_page);
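
The new find_get_page() above follows the lockless pagecache protocol: look up the slot under rcu_read_lock(), take a speculative reference that only succeeds on a live page, then recheck that the slot still points at the same page and retry otherwise. The sketch below is a standalone C11 model of the same pattern, not kernel code; in the kernel, RCU keeps the slot and struct page readable during the window, which the model sidesteps by never freeing the object, and all names are made up.

/*
 * Standalone model of the speculative-lookup pattern: load the slot, take
 * a reference that only succeeds while the object is live, then recheck
 * the slot and retry if it moved.
 */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct object {
	atomic_int refcount;		/* 0 means the object is being freed */
};

static _Atomic(struct object *) slot;	/* stands in for a radix tree slot */

/* Analogue of page_cache_get_speculative(): fails once refcount hit zero. */
static int get_speculative(struct object *obj)
{
	int old = atomic_load(&obj->refcount);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&obj->refcount, &old, old + 1))
			return 1;
	}
	return 0;
}

static void put_object(struct object *obj)
{
	atomic_fetch_sub(&obj->refcount, 1);	/* real code would free at 0 */
}

/* Analogue of the new find_get_page() loop. */
static struct object *lookup(void)
{
	struct object *obj;

repeat:
	obj = atomic_load(&slot);
	if (!obj)
		return NULL;
	if (!get_speculative(obj))
		goto repeat;			/* object was dying; look again */
	if (obj != atomic_load(&slot)) {
		put_object(obj);		/* slot was reused; drop and retry */
		goto repeat;
	}
	return obj;				/* our reference keeps it live */
}

int main(void)
{
	static struct object page = { .refcount = ATOMIC_VAR_INIT(1) };
	struct object *found;

	atomic_store(&slot, &page);
	found = lookup();
	printf("refcount after lookup: %d\n",
	       found ? atomic_load(&found->refcount) : 0);
	if (found)
		put_object(found);
	return 0;
}
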
@@ -658,32 +680,22 @@ EXPORT_SYMBOL(find_get_page);
658 * 680 *
659 * Returns zero if the page was not present. find_lock_page() may sleep. 681 * Returns zero if the page was not present. find_lock_page() may sleep.
660 */ 682 */
661struct page *find_lock_page(struct address_space *mapping, 683struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
662 pgoff_t offset)
663{ 684{
664 struct page *page; 685 struct page *page;
665 686
666repeat: 687repeat:
667 read_lock_irq(&mapping->tree_lock); 688 page = find_get_page(mapping, offset);
668 page = radix_tree_lookup(&mapping->page_tree, offset);
669 if (page) { 689 if (page) {
670 page_cache_get(page); 690 lock_page(page);
671 if (TestSetPageLocked(page)) { 691 /* Has the page been truncated? */
672 read_unlock_irq(&mapping->tree_lock); 692 if (unlikely(page->mapping != mapping)) {
673 __lock_page(page); 693 unlock_page(page);
674 694 page_cache_release(page);
675 /* Has the page been truncated while we slept? */ 695 goto repeat;
676 if (unlikely(page->mapping != mapping)) {
677 unlock_page(page);
678 page_cache_release(page);
679 goto repeat;
680 }
681 VM_BUG_ON(page->index != offset);
682 goto out;
683 } 696 }
697 VM_BUG_ON(page->index != offset);
684 } 698 }
685 read_unlock_irq(&mapping->tree_lock);
686out:
687 return page; 699 return page;
688} 700}
689EXPORT_SYMBOL(find_lock_page); 701EXPORT_SYMBOL(find_lock_page);
@@ -749,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
749{ 761{
750 unsigned int i; 762 unsigned int i;
751 unsigned int ret; 763 unsigned int ret;
764 unsigned int nr_found;
765
766 rcu_read_lock();
767restart:
768 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
769 (void ***)pages, start, nr_pages);
770 ret = 0;
771 for (i = 0; i < nr_found; i++) {
772 struct page *page;
773repeat:
774 page = radix_tree_deref_slot((void **)pages[i]);
775 if (unlikely(!page))
776 continue;
777 /*
778 * this can only trigger if nr_found == 1, making livelock
779 * a non issue.
780 */
781 if (unlikely(page == RADIX_TREE_RETRY))
782 goto restart;
752 783
753 read_lock_irq(&mapping->tree_lock); 784 if (!page_cache_get_speculative(page))
754 ret = radix_tree_gang_lookup(&mapping->page_tree, 785 goto repeat;
755 (void **)pages, start, nr_pages); 786
756 for (i = 0; i < ret; i++) 787 /* Has the page moved? */
757 page_cache_get(pages[i]); 788 if (unlikely(page != *((void **)pages[i]))) {
758 read_unlock_irq(&mapping->tree_lock); 789 page_cache_release(page);
790 goto repeat;
791 }
792
793 pages[ret] = page;
794 ret++;
795 }
796 rcu_read_unlock();
759 return ret; 797 return ret;
760} 798}
761 799
@@ -776,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
776{ 814{
777 unsigned int i; 815 unsigned int i;
778 unsigned int ret; 816 unsigned int ret;
817 unsigned int nr_found;
818
819 rcu_read_lock();
820restart:
821 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
822 (void ***)pages, index, nr_pages);
823 ret = 0;
824 for (i = 0; i < nr_found; i++) {
825 struct page *page;
826repeat:
827 page = radix_tree_deref_slot((void **)pages[i]);
828 if (unlikely(!page))
829 continue;
830 /*
831 * this can only trigger if nr_found == 1, making livelock
832 * a non-issue.
833 */
834 if (unlikely(page == RADIX_TREE_RETRY))
835 goto restart;
779 836
780 read_lock_irq(&mapping->tree_lock); 837 if (page->mapping == NULL || page->index != index)
781 ret = radix_tree_gang_lookup(&mapping->page_tree,
782 (void **)pages, index, nr_pages);
783 for (i = 0; i < ret; i++) {
784 if (pages[i]->mapping == NULL || pages[i]->index != index)
785 break; 838 break;
786 839
787 page_cache_get(pages[i]); 840 if (!page_cache_get_speculative(page))
841 goto repeat;
842
843 /* Has the page moved? */
844 if (unlikely(page != *((void **)pages[i]))) {
845 page_cache_release(page);
846 goto repeat;
847 }
848
849 pages[ret] = page;
850 ret++;
788 index++; 851 index++;
789 } 852 }
790 read_unlock_irq(&mapping->tree_lock); 853 rcu_read_unlock();
791 return i; 854 return ret;
792} 855}
793EXPORT_SYMBOL(find_get_pages_contig); 856EXPORT_SYMBOL(find_get_pages_contig);
794 857
@@ -808,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
808{ 871{
809 unsigned int i; 872 unsigned int i;
810 unsigned int ret; 873 unsigned int ret;
874 unsigned int nr_found;
875
876 rcu_read_lock();
877restart:
878 nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
879 (void ***)pages, *index, nr_pages, tag);
880 ret = 0;
881 for (i = 0; i < nr_found; i++) {
882 struct page *page;
883repeat:
884 page = radix_tree_deref_slot((void **)pages[i]);
885 if (unlikely(!page))
886 continue;
887 /*
888 * this can only trigger if nr_found == 1, making livelock
889 * a non-issue.
890 */
891 if (unlikely(page == RADIX_TREE_RETRY))
892 goto restart;
893
894 if (!page_cache_get_speculative(page))
895 goto repeat;
896
897 /* Has the page moved? */
898 if (unlikely(page != *((void **)pages[i]))) {
899 page_cache_release(page);
900 goto repeat;
901 }
902
903 pages[ret] = page;
904 ret++;
905 }
906 rcu_read_unlock();
811 907
812 read_lock_irq(&mapping->tree_lock);
813 ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
814 (void **)pages, *index, nr_pages, tag);
815 for (i = 0; i < ret; i++)
816 page_cache_get(pages[i]);
817 if (ret) 908 if (ret)
818 *index = pages[ret - 1]->index + 1; 909 *index = pages[ret - 1]->index + 1;
819 read_unlock_irq(&mapping->tree_lock); 910
820 return ret; 911 return ret;
821} 912}
822EXPORT_SYMBOL(find_get_pages_tag); 913EXPORT_SYMBOL(find_get_pages_tag);
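
All three gang lookups above follow the same shape: an RCU-protected radix_tree_gang_lookup_*_slot(), a per-slot deref that restarts on RADIX_TREE_RETRY, a speculative reference, and a slot re-check. From a caller's point of view nothing changes: each returned page still carries one reference that must be dropped. A hedged sketch of a typical caller loop (real code normally goes through pagevec_lookup(); the batch size and helper name here are arbitrary):

#include <linux/pagemap.h>

/* Walk every page currently cached in a mapping, 16 at a time. */
static void walk_mapping_sketch(struct address_space *mapping)
{
	struct page *pages[16];
	pgoff_t index = 0;
	unsigned int nr, i;

	while ((nr = find_get_pages(mapping, index, 16, pages)) != 0) {
		for (i = 0; i < nr; i++) {
			/* pages[i] is pinned here, but not locked */
			index = pages[i]->index + 1;
			page_cache_release(pages[i]);	/* drop the gang ref */
		}
	}
}
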
@@ -840,7 +931,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
840 struct page *page = find_get_page(mapping, index); 931 struct page *page = find_get_page(mapping, index);
841 932
842 if (page) { 933 if (page) {
843 if (!TestSetPageLocked(page)) 934 if (trylock_page(page))
844 return page; 935 return page;
845 page_cache_release(page); 936 page_cache_release(page);
846 return NULL; 937 return NULL;
@@ -932,8 +1023,17 @@ find_page:
932 ra, filp, page, 1023 ra, filp, page,
933 index, last_index - index); 1024 index, last_index - index);
934 } 1025 }
935 if (!PageUptodate(page)) 1026 if (!PageUptodate(page)) {
936 goto page_not_up_to_date; 1027 if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
1028 !mapping->a_ops->is_partially_uptodate)
1029 goto page_not_up_to_date;
1030 if (!trylock_page(page))
1031 goto page_not_up_to_date;
1032 if (!mapping->a_ops->is_partially_uptodate(page,
1033 desc, offset))
1034 goto page_not_up_to_date_locked;
1035 unlock_page(page);
1036 }
937page_ok: 1037page_ok:
938 /* 1038 /*
939 * i_size must be checked after we know the page is Uptodate. 1039 * i_size must be checked after we know the page is Uptodate.
@@ -1003,6 +1103,7 @@ page_not_up_to_date:
1003 if (lock_page_killable(page)) 1103 if (lock_page_killable(page))
1004 goto readpage_eio; 1104 goto readpage_eio;
1005 1105
1106page_not_up_to_date_locked:
1006 /* Did it get truncated before we got the lock? */ 1107 /* Did it get truncated before we got the lock? */
1007 if (!page->mapping) { 1108 if (!page->mapping) {
1008 unlock_page(page); 1109 unlock_page(page);
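
The block added above gives filesystems with blocksize smaller than the page size a way to satisfy a read from a page whose PG_uptodate bit is clear, as long as the blocks covering the requested bytes are valid; the buffer-head implementation introduced alongside this hook is block_is_partially_uptodate() in fs/buffer.c. The hook's obligations, sketched with an illustrative (non-existent) filesystem callback:

#include <linux/fs.h>

/*
 * Called from do_generic_file_read() with the page locked (via the
 * trylock_page() above).  Must return non-zero only if the bytes the
 * descriptor wants, starting at 'from' within this page, are valid even
 * though the page as a whole is not marked uptodate.
 */
static int examplefs_is_partially_uptodate(struct page *page,
					   read_descriptor_t *desc,
					   unsigned long from)
{
	/*
	 * A real implementation would walk the page's buffers and check
	 * each one overlapping [from, from + desc->count).  Declining
	 * here simply makes the caller fall back to ->readpage().
	 */
	return 0;
}
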
@@ -1199,42 +1300,41 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1199 1300
1200 mapping = filp->f_mapping; 1301 mapping = filp->f_mapping;
1201 inode = mapping->host; 1302 inode = mapping->host;
1202 retval = 0;
1203 if (!count) 1303 if (!count)
1204 goto out; /* skip atime */ 1304 goto out; /* skip atime */
1205 size = i_size_read(inode); 1305 size = i_size_read(inode);
1206 if (pos < size) { 1306 if (pos < size) {
1207 retval = generic_file_direct_IO(READ, iocb, 1307 retval = filemap_write_and_wait(mapping);
1208 iov, pos, nr_segs); 1308 if (!retval) {
1309 retval = mapping->a_ops->direct_IO(READ, iocb,
1310 iov, pos, nr_segs);
1311 }
1209 if (retval > 0) 1312 if (retval > 0)
1210 *ppos = pos + retval; 1313 *ppos = pos + retval;
1211 } 1314 if (retval) {
1212 if (likely(retval != 0)) { 1315 file_accessed(filp);
1213 file_accessed(filp); 1316 goto out;
1214 goto out; 1317 }
1215 } 1318 }
1216 } 1319 }
1217 1320
1218 retval = 0; 1321 for (seg = 0; seg < nr_segs; seg++) {
1219 if (count) { 1322 read_descriptor_t desc;
1220 for (seg = 0; seg < nr_segs; seg++) {
1221 read_descriptor_t desc;
1222 1323
1223 desc.written = 0; 1324 desc.written = 0;
1224 desc.arg.buf = iov[seg].iov_base; 1325 desc.arg.buf = iov[seg].iov_base;
1225 desc.count = iov[seg].iov_len; 1326 desc.count = iov[seg].iov_len;
1226 if (desc.count == 0) 1327 if (desc.count == 0)
1227 continue; 1328 continue;
1228 desc.error = 0; 1329 desc.error = 0;
1229 do_generic_file_read(filp,ppos,&desc,file_read_actor); 1330 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1230 retval += desc.written; 1331 retval += desc.written;
1231 if (desc.error) { 1332 if (desc.error) {
1232 retval = retval ?: desc.error; 1333 retval = retval ?: desc.error;
1233 break; 1334 break;
1234 }
1235 if (desc.count > 0)
1236 break;
1237 } 1335 }
1336 if (desc.count > 0)
1337 break;
1238 } 1338 }
1239out: 1339out:
1240 return retval; 1340 return retval;
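
With generic_file_direct_IO() gone (it is removed at the end of this file's diff), the O_DIRECT read path above now does its own ordering: flush and wait on any dirty pagecache for the file, then hand the iovec straight to ->direct_IO(); only if the direct read returns nothing does control fall through to the buffered loop below. A condensed sketch of just that ordering (helper name illustrative, error handling trimmed):

#include <linux/fs.h>

static ssize_t direct_read_sketch(struct kiocb *iocb, const struct iovec *iov,
				  unsigned long nr_segs, loff_t pos)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	ssize_t ret;

	/* Dirty pagecache must reach disk before we read around it. */
	ret = filemap_write_and_wait(mapping);
	if (ret)
		return ret;
	return mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs);
}
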
@@ -1668,8 +1768,9 @@ static int __remove_suid(struct dentry *dentry, int kill)
1668 return notify_change(dentry, &newattrs); 1768 return notify_change(dentry, &newattrs);
1669} 1769}
1670 1770
1671int remove_suid(struct dentry *dentry) 1771int file_remove_suid(struct file *file)
1672{ 1772{
1773 struct dentry *dentry = file->f_path.dentry;
1673 int killsuid = should_remove_suid(dentry); 1774 int killsuid = should_remove_suid(dentry);
1674 int killpriv = security_inode_need_killpriv(dentry); 1775 int killpriv = security_inode_need_killpriv(dentry);
1675 int error = 0; 1776 int error = 0;
@@ -1683,7 +1784,7 @@ int remove_suid(struct dentry *dentry)
1683 1784
1684 return error; 1785 return error;
1685} 1786}
1686EXPORT_SYMBOL(remove_suid); 1787EXPORT_SYMBOL(file_remove_suid);
1687 1788
1688static size_t __iovec_copy_from_user_inatomic(char *vaddr, 1789static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1689 const struct iovec *iov, size_t base, size_t bytes) 1790 const struct iovec *iov, size_t base, size_t bytes)
@@ -1778,7 +1879,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
1778 * The !iov->iov_len check ensures we skip over unlikely 1879 * The !iov->iov_len check ensures we skip over unlikely
1779 * zero-length segments (without overruning the iovec). 1880 * zero-length segments (without overruning the iovec).
1780 */ 1881 */
1781 while (bytes || unlikely(!iov->iov_len && i->count)) { 1882 while (bytes || unlikely(i->count && !iov->iov_len)) {
1782 int copy; 1883 int copy;
1783 1884
1784 copy = min(bytes, iov->iov_len - base); 1885 copy = min(bytes, iov->iov_len - base);
@@ -2003,11 +2104,62 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2003 struct address_space *mapping = file->f_mapping; 2104 struct address_space *mapping = file->f_mapping;
2004 struct inode *inode = mapping->host; 2105 struct inode *inode = mapping->host;
2005 ssize_t written; 2106 ssize_t written;
2107 size_t write_len;
2108 pgoff_t end;
2006 2109
2007 if (count != ocount) 2110 if (count != ocount)
2008 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); 2111 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2009 2112
2010 written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); 2113 /*
2114 * Unmap all mmappings of the file up-front.
2115 *
2116 * This will cause any pte dirty bits to be propagated into the
2117 * pageframes for the subsequent filemap_write_and_wait().
2118 */
2119 write_len = iov_length(iov, *nr_segs);
2120 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
2121 if (mapping_mapped(mapping))
2122 unmap_mapping_range(mapping, pos, write_len, 0);
2123
2124 written = filemap_write_and_wait(mapping);
2125 if (written)
2126 goto out;
2127
2128 /*
2129 * After a write we want buffered reads to be sure to go to disk to get
2130 * the new data. We invalidate clean cached page from the region we're
2131 * about to write. We do this *before* the write so that we can return
2132 * without clobbering -EIOCBQUEUED from ->direct_IO().
2133 */
2134 if (mapping->nrpages) {
2135 written = invalidate_inode_pages2_range(mapping,
2136 pos >> PAGE_CACHE_SHIFT, end);
2137 /*
2138 * If a page can not be invalidated, return 0 to fall back
2139 * to buffered write.
2140 */
2141 if (written) {
2142 if (written == -EBUSY)
2143 return 0;
2144 goto out;
2145 }
2146 }
2147
2148 written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
2149
2150 /*
2151 * Finally, try again to invalidate clean pages which might have been
2152 * cached by non-direct readahead, or faulted in by get_user_pages()
2153 * if the source of the write was an mmap'ed region of the file
2154 * we're writing. Either one is a pretty crazy thing to do,
2155 * so we don't support it 100%. If this invalidation
2156 * fails, tough, the write still worked...
2157 */
2158 if (mapping->nrpages) {
2159 invalidate_inode_pages2_range(mapping,
2160 pos >> PAGE_CACHE_SHIFT, end);
2161 }
2162
2011 if (written > 0) { 2163 if (written > 0) {
2012 loff_t end = pos + written; 2164 loff_t end = pos + written;
2013 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2165 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
@@ -2023,6 +2175,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2023 * i_mutex is held, which protects generic_osync_inode() from 2175 * i_mutex is held, which protects generic_osync_inode() from
2024 * livelocking. AIO O_DIRECT ops attempt to sync metadata here. 2176 * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
2025 */ 2177 */
2178out:
2026 if ((written >= 0 || written == -EIOCBQUEUED) && 2179 if ((written >= 0 || written == -EIOCBQUEUED) &&
2027 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2180 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2028 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); 2181 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
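
generic_file_direct_write() now open-codes the pagecache shootdown that generic_file_direct_IO() used to do. The order matters: unmap file mmaps first so pte dirty bits reach the pageframes, write back and wait, invalidate the target range (where -EBUSY means "fall back to a buffered write" by returning 0), issue ->direct_IO(), then invalidate once more to catch pages that raced in. A condensed sketch of that sequence (i_size update and -EIOCBQUEUED handling trimmed; helper name illustrative):

#include <linux/fs.h>
#include <linux/pagemap.h>

static ssize_t direct_write_sketch(struct kiocb *iocb, const struct iovec *iov,
				   unsigned long nr_segs, loff_t pos)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	size_t len = iov_length(iov, nr_segs);
	pgoff_t end = (pos + len - 1) >> PAGE_CACHE_SHIFT;
	ssize_t ret;

	if (mapping_mapped(mapping))			/* 1: propagate pte dirty bits */
		unmap_mapping_range(mapping, pos, len, 0);

	ret = filemap_write_and_wait(mapping);		/* 2: flush and wait */
	if (ret)
		return ret;

	if (mapping->nrpages) {				/* 3: drop clean cached pages */
		ret = invalidate_inode_pages2_range(mapping,
					pos >> PAGE_CACHE_SHIFT, end);
		if (ret)
			return ret == -EBUSY ? 0 : ret;	/* 0: use buffered write */
	}

	ret = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, nr_segs);

	if (mapping->nrpages)				/* 4: drop pages that raced in */
		invalidate_inode_pages2_range(mapping,
					pos >> PAGE_CACHE_SHIFT, end);
	return ret;
}
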
@@ -2394,7 +2547,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2394 if (count == 0) 2547 if (count == 0)
2395 goto out; 2548 goto out;
2396 2549
2397 err = remove_suid(file->f_path.dentry); 2550 err = file_remove_suid(file);
2398 if (err) 2551 if (err)
2399 goto out; 2552 goto out;
2400 2553
@@ -2510,66 +2663,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2510} 2663}
2511EXPORT_SYMBOL(generic_file_aio_write); 2664EXPORT_SYMBOL(generic_file_aio_write);
2512 2665
2513/*
2514 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
2515 * went wrong during pagecache shootdown.
2516 */
2517static ssize_t
2518generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2519 loff_t offset, unsigned long nr_segs)
2520{
2521 struct file *file = iocb->ki_filp;
2522 struct address_space *mapping = file->f_mapping;
2523 ssize_t retval;
2524 size_t write_len;
2525 pgoff_t end = 0; /* silence gcc */
2526
2527 /*
2528 * If it's a write, unmap all mmappings of the file up-front. This
2529 * will cause any pte dirty bits to be propagated into the pageframes
2530 * for the subsequent filemap_write_and_wait().
2531 */
2532 if (rw == WRITE) {
2533 write_len = iov_length(iov, nr_segs);
2534 end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
2535 if (mapping_mapped(mapping))
2536 unmap_mapping_range(mapping, offset, write_len, 0);
2537 }
2538
2539 retval = filemap_write_and_wait(mapping);
2540 if (retval)
2541 goto out;
2542
2543 /*
2544 * After a write we want buffered reads to be sure to go to disk to get
2545 * the new data. We invalidate clean cached page from the region we're
2546 * about to write. We do this *before* the write so that we can return
2547 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
2548 */
2549 if (rw == WRITE && mapping->nrpages) {
2550 retval = invalidate_inode_pages2_range(mapping,
2551 offset >> PAGE_CACHE_SHIFT, end);
2552 if (retval)
2553 goto out;
2554 }
2555
2556 retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
2557
2558 /*
2559 * Finally, try again to invalidate clean pages which might have been
2560 * cached by non-direct readahead, or faulted in by get_user_pages()
2561 * if the source of the write was an mmap'ed region of the file
2562 * we're writing. Either one is a pretty crazy thing to do,
2563 * so we don't support it 100%. If this invalidation
2564 * fails, tough, the write still worked...
2565 */
2566 if (rw == WRITE && mapping->nrpages) {
2567 invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end);
2568 }
2569out:
2570 return retval;
2571}
2572
2573/** 2666/**
2574 * try_to_release_page() - release old fs-specific metadata on a page 2667 * try_to_release_page() - release old fs-specific metadata on a page
2575 * 2668 *
@@ -2581,9 +2674,8 @@ out:
2581 * Otherwise return zero. 2674 * Otherwise return zero.
2582 * 2675 *
2583 * The @gfp_mask argument specifies whether I/O may be performed to release 2676 * The @gfp_mask argument specifies whether I/O may be performed to release
2584 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). 2677 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
2585 * 2678 *
2586 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
2587 */ 2679 */
2588int try_to_release_page(struct page *page, gfp_t gfp_mask) 2680int try_to_release_page(struct page *page, gfp_t gfp_mask)
2589{ 2681{
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 3e744abcce9d..b5167dfb2f2d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -13,7 +13,10 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/uio.h> 14#include <linux/uio.h>
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/mmu_notifier.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/seqlock.h>
19#include <linux/mutex.h>
17#include <asm/tlbflush.h> 20#include <asm/tlbflush.h>
18#include <asm/io.h> 21#include <asm/io.h>
19 22
@@ -21,22 +24,18 @@
21 * We do use our own empty page to avoid interference with other users 24 * We do use our own empty page to avoid interference with other users
22 * of ZERO_PAGE(), such as /dev/zero 25 * of ZERO_PAGE(), such as /dev/zero
23 */ 26 */
27static DEFINE_MUTEX(xip_sparse_mutex);
28static seqcount_t xip_sparse_seq = SEQCNT_ZERO;
24static struct page *__xip_sparse_page; 29static struct page *__xip_sparse_page;
25 30
31/* called under xip_sparse_mutex */
26static struct page *xip_sparse_page(void) 32static struct page *xip_sparse_page(void)
27{ 33{
28 if (!__xip_sparse_page) { 34 if (!__xip_sparse_page) {
29 struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); 35 struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
30 36
31 if (page) { 37 if (page)
32 static DEFINE_SPINLOCK(xip_alloc_lock); 38 __xip_sparse_page = page;
33 spin_lock(&xip_alloc_lock);
34 if (!__xip_sparse_page)
35 __xip_sparse_page = page;
36 else
37 __free_page(page);
38 spin_unlock(&xip_alloc_lock);
39 }
40 } 39 }
41 return __xip_sparse_page; 40 return __xip_sparse_page;
42} 41}
@@ -173,22 +172,27 @@ __xip_unmap (struct address_space * mapping,
173 pte_t pteval; 172 pte_t pteval;
174 spinlock_t *ptl; 173 spinlock_t *ptl;
175 struct page *page; 174 struct page *page;
175 unsigned count;
176 int locked = 0;
177
178 count = read_seqcount_begin(&xip_sparse_seq);
176 179
177 page = __xip_sparse_page; 180 page = __xip_sparse_page;
178 if (!page) 181 if (!page)
179 return; 182 return;
180 183
184retry:
181 spin_lock(&mapping->i_mmap_lock); 185 spin_lock(&mapping->i_mmap_lock);
182 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 186 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
183 mm = vma->vm_mm; 187 mm = vma->vm_mm;
184 address = vma->vm_start + 188 address = vma->vm_start +
185 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 189 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
186 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 190 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
187 pte = page_check_address(page, mm, address, &ptl); 191 pte = page_check_address(page, mm, address, &ptl, 1);
188 if (pte) { 192 if (pte) {
189 /* Nuke the page table entry. */ 193 /* Nuke the page table entry. */
190 flush_cache_page(vma, address, pte_pfn(*pte)); 194 flush_cache_page(vma, address, pte_pfn(*pte));
191 pteval = ptep_clear_flush(vma, address, pte); 195 pteval = ptep_clear_flush_notify(vma, address, pte);
192 page_remove_rmap(page, vma); 196 page_remove_rmap(page, vma);
193 dec_mm_counter(mm, file_rss); 197 dec_mm_counter(mm, file_rss);
194 BUG_ON(pte_dirty(pteval)); 198 BUG_ON(pte_dirty(pteval));
@@ -197,6 +201,14 @@ __xip_unmap (struct address_space * mapping,
197 } 201 }
198 } 202 }
199 spin_unlock(&mapping->i_mmap_lock); 203 spin_unlock(&mapping->i_mmap_lock);
204
205 if (locked) {
206 mutex_unlock(&xip_sparse_mutex);
207 } else if (read_seqcount_retry(&xip_sparse_seq, count)) {
208 mutex_lock(&xip_sparse_mutex);
209 locked = 1;
210 goto retry;
211 }
200} 212}
201 213
202/* 214/*
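
The retry logic added to __xip_unmap() above is an optimistic-read pattern: do the unmap pass without the mutex, then check xip_sparse_seq; if the fault path (which bumps the seqcount under xip_sparse_mutex) raced with us, take the mutex and repeat exactly once, now fully serialized. Stripped of the XIP specifics, the pattern looks roughly like this (do_pass(), the seqcount and the mutex are stand-ins for the real state):

#include <linux/mutex.h>
#include <linux/seqlock.h>

static seqcount_t sketch_seq = SEQCNT_ZERO;
static DEFINE_MUTEX(sketch_mutex);

static void do_pass(void)
{
	/* stand-in for the prio-tree walk in __xip_unmap() */
}

static void optimistic_then_locked(void)
{
	unsigned int seq;
	int locked = 0;

	seq = read_seqcount_begin(&sketch_seq);
retry:
	do_pass();				/* may have raced with a writer */

	if (locked) {
		mutex_unlock(&sketch_mutex);	/* second pass was serialized */
	} else if (read_seqcount_retry(&sketch_seq, seq)) {
		mutex_lock(&sketch_mutex);	/* writer raced: redo under lock */
		locked = 1;
		goto retry;
	}
}

The writer side, as in xip_file_fault() below, takes the mutex and brackets its update with write_seqcount_begin()/write_seqcount_end().
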
@@ -217,7 +229,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
217 int error; 229 int error;
218 230
219 /* XXX: are VM_FAULT_ codes OK? */ 231 /* XXX: are VM_FAULT_ codes OK? */
220 232again:
221 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 233 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
222 if (vmf->pgoff >= size) 234 if (vmf->pgoff >= size)
223 return VM_FAULT_SIGBUS; 235 return VM_FAULT_SIGBUS;
@@ -236,8 +248,10 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
236 int err; 248 int err;
237 249
238 /* maybe shared writable, allocate new block */ 250 /* maybe shared writable, allocate new block */
251 mutex_lock(&xip_sparse_mutex);
239 error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, 252 error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
240 &xip_mem, &xip_pfn); 253 &xip_mem, &xip_pfn);
254 mutex_unlock(&xip_sparse_mutex);
241 if (error) 255 if (error)
242 return VM_FAULT_SIGBUS; 256 return VM_FAULT_SIGBUS;
243 /* unmap sparse mappings at pgoff from all other vmas */ 257 /* unmap sparse mappings at pgoff from all other vmas */
@@ -251,14 +265,34 @@ found:
251 BUG_ON(err); 265 BUG_ON(err);
252 return VM_FAULT_NOPAGE; 266 return VM_FAULT_NOPAGE;
253 } else { 267 } else {
268 int err, ret = VM_FAULT_OOM;
269
270 mutex_lock(&xip_sparse_mutex);
271 write_seqcount_begin(&xip_sparse_seq);
272 error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
273 &xip_mem, &xip_pfn);
274 if (unlikely(!error)) {
275 write_seqcount_end(&xip_sparse_seq);
276 mutex_unlock(&xip_sparse_mutex);
277 goto again;
278 }
279 if (error != -ENODATA)
280 goto out;
254 /* not shared and writable, use xip_sparse_page() */ 281 /* not shared and writable, use xip_sparse_page() */
255 page = xip_sparse_page(); 282 page = xip_sparse_page();
256 if (!page) 283 if (!page)
257 return VM_FAULT_OOM; 284 goto out;
285 err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
286 page);
287 if (err == -ENOMEM)
288 goto out;
258 289
259 page_cache_get(page); 290 ret = VM_FAULT_NOPAGE;
260 vmf->page = page; 291out:
261 return 0; 292 write_seqcount_end(&xip_sparse_seq);
293 mutex_unlock(&xip_sparse_mutex);
294
295 return ret;
262 } 296 }
263} 297}
264 298
@@ -307,8 +341,10 @@ __xip_file_write(struct file *filp, const char __user *buf,
307 &xip_mem, &xip_pfn); 341 &xip_mem, &xip_pfn);
308 if (status == -ENODATA) { 342 if (status == -ENODATA) {
309 /* we allocate a new page unmap it */ 343 /* we allocate a new page unmap it */
344 mutex_lock(&xip_sparse_mutex);
310 status = a_ops->get_xip_mem(mapping, index, 1, 345 status = a_ops->get_xip_mem(mapping, index, 1,
311 &xip_mem, &xip_pfn); 346 &xip_mem, &xip_pfn);
347 mutex_unlock(&xip_sparse_mutex);
312 if (!status) 348 if (!status)
313 /* unmap page at pgoff from all other vmas */ 349 /* unmap page at pgoff from all other vmas */
314 __xip_unmap(mapping, index); 350 __xip_unmap(mapping, index);
@@ -380,7 +416,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
380 if (count == 0) 416 if (count == 0)
381 goto out_backing; 417 goto out_backing;
382 418
383 ret = remove_suid(filp->f_path.dentry); 419 ret = file_remove_suid(filp);
384 if (ret) 420 if (ret)
385 goto out_backing; 421 goto out_backing;
386 422
diff --git a/mm/fremap.c b/mm/fremap.c
index 07a9c82ce1a3..7881638e4a12 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -15,6 +15,7 @@
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/mmu_notifier.h>
18 19
19#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
20#include <asm/cacheflush.h> 21#include <asm/cacheflush.h>
@@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
214 spin_unlock(&mapping->i_mmap_lock); 215 spin_unlock(&mapping->i_mmap_lock);
215 } 216 }
216 217
218 mmu_notifier_invalidate_range_start(mm, start, start + size);
217 err = populate_range(mm, vma, start, size, pgoff); 219 err = populate_range(mm, vma, start, size, pgoff);
220 mmu_notifier_invalidate_range_end(mm, start, start + size);
218 if (!err && !(flags & MAP_NONBLOCK)) { 221 if (!err && !(flags & MAP_NONBLOCK)) {
219 if (unlikely(has_write_lock)) { 222 if (unlikely(has_write_lock)) {
220 downgrade_write(&mm->mmap_sem); 223 downgrade_write(&mm->mmap_sem);
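
populate_range() installs new PTEs, so any secondary MMU that mirrors this address space (KVM shadow page tables and similar users of the new MMU-notifier infrastructure) has to be told to drop its translations for the affected range. The convention, used above and throughout this series, is to bracket the page-table update. A generic sketch (update_ptes() is a stand-in for the real work):

#include <linux/mmu_notifier.h>

static int update_ptes(struct mm_struct *mm,
		       unsigned long start, unsigned long end)
{
	/* stand-in for populate_range() or any other PTE rewrite */
	return 0;
}

static int change_range(struct mm_struct *mm,
			unsigned long start, unsigned long end)
{
	int err;

	mmu_notifier_invalidate_range_start(mm, start, end);
	err = update_ptes(mm, start, end);
	mmu_notifier_invalidate_range_end(mm, start, end);
	return err;
}
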
diff --git a/mm/highmem.c b/mm/highmem.c
index 7da4a7b6af11..b36b83b920ff 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -40,6 +40,7 @@
40#ifdef CONFIG_HIGHMEM 40#ifdef CONFIG_HIGHMEM
41 41
42unsigned long totalhigh_pages __read_mostly; 42unsigned long totalhigh_pages __read_mostly;
43EXPORT_SYMBOL(totalhigh_pages);
43 44
44unsigned int nr_free_highpages (void) 45unsigned int nr_free_highpages (void)
45{ 46{
@@ -69,6 +70,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
69static void flush_all_zero_pkmaps(void) 70static void flush_all_zero_pkmaps(void)
70{ 71{
71 int i; 72 int i;
73 int need_flush = 0;
72 74
73 flush_cache_kmaps(); 75 flush_cache_kmaps();
74 76
@@ -100,8 +102,10 @@ static void flush_all_zero_pkmaps(void)
100 &pkmap_page_table[i]); 102 &pkmap_page_table[i]);
101 103
102 set_page_address(page, NULL); 104 set_page_address(page, NULL);
105 need_flush = 1;
103 } 106 }
104 flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); 107 if (need_flush)
108 flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
105} 109}
106 110
107/** 111/**
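
The flush_all_zero_pkmaps() change is a small batching optimisation: tear down every unused pkmap entry first, remember whether anything was actually cleared, and only then pay for one global kernel TLB flush. The shape of it, with clear_one_pkmap() standing in for the pte_clear()/set_page_address() work:

#include <asm/tlbflush.h>

static int clear_one_pkmap(int i)
{
	/* stand-in: returns 1 if it tore down a mapping, 0 otherwise */
	return 0;
}

static void teardown_pkmaps_sketch(void)
{
	int i, need_flush = 0;

	for (i = 0; i < LAST_PKMAP; i++)
		need_flush |= clear_one_pkmap(i);

	if (need_flush)		/* flush once, and only if something changed */
		flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
}
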
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ab171274ef21..67a71191136e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -9,43 +9,357 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/sysctl.h> 10#include <linux/sysctl.h>
11#include <linux/highmem.h> 11#include <linux/highmem.h>
12#include <linux/mmu_notifier.h>
12#include <linux/nodemask.h> 13#include <linux/nodemask.h>
13#include <linux/pagemap.h> 14#include <linux/pagemap.h>
14#include <linux/mempolicy.h> 15#include <linux/mempolicy.h>
15#include <linux/cpuset.h> 16#include <linux/cpuset.h>
16#include <linux/mutex.h> 17#include <linux/mutex.h>
18#include <linux/bootmem.h>
19#include <linux/sysfs.h>
17 20
18#include <asm/page.h> 21#include <asm/page.h>
19#include <asm/pgtable.h> 22#include <asm/pgtable.h>
23#include <asm/io.h>
20 24
21#include <linux/hugetlb.h> 25#include <linux/hugetlb.h>
22#include "internal.h" 26#include "internal.h"
23 27
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 28const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26static unsigned long surplus_huge_pages;
27static unsigned long nr_overcommit_huge_pages;
28unsigned long max_huge_pages;
29unsigned long sysctl_overcommit_huge_pages;
30static struct list_head hugepage_freelists[MAX_NUMNODES];
31static unsigned int nr_huge_pages_node[MAX_NUMNODES];
32static unsigned int free_huge_pages_node[MAX_NUMNODES];
33static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
34static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 29static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35unsigned long hugepages_treat_as_movable; 30unsigned long hugepages_treat_as_movable;
36static int hugetlb_next_nid; 31
32static int max_hstate;
33unsigned int default_hstate_idx;
34struct hstate hstates[HUGE_MAX_HSTATE];
35
36__initdata LIST_HEAD(huge_boot_pages);
37
38/* for command line parsing */
39static struct hstate * __initdata parsed_hstate;
40static unsigned long __initdata default_hstate_max_huge_pages;
41static unsigned long __initdata default_hstate_size;
42
43#define for_each_hstate(h) \
44 for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
37 45
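
From here on, the hugetlb counters that used to be file-scope globals are accessed through a per-page-size struct hstate, one per supported huge page size. Its definition lives in include/linux/hugetlb.h and is not part of this diff; the fields this patch reads and writes through h-> imply roughly the following shape (an inferred sketch, not the authoritative layout):

#include <linux/list.h>
#include <linux/numa.h>

struct hstate_sketch {
	int hugetlb_next_nid;		/* round-robin node for fresh pages */
	unsigned int order;		/* huge page = PAGE_SIZE << order */
	unsigned long nr_huge_pages;
	unsigned long free_huge_pages;
	unsigned long resv_huge_pages;
	unsigned long surplus_huge_pages;
	unsigned long nr_overcommit_huge_pages;
	struct list_head hugepage_freelists[MAX_NUMNODES];
	unsigned int nr_huge_pages_node[MAX_NUMNODES];
	unsigned int free_huge_pages_node[MAX_NUMNODES];
	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
};
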
38/* 46/*
39 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 47 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
40 */ 48 */
41static DEFINE_SPINLOCK(hugetlb_lock); 49static DEFINE_SPINLOCK(hugetlb_lock);
42 50
43static void clear_huge_page(struct page *page, unsigned long addr) 51/*
52 * Region tracking -- allows tracking of reservations and instantiated pages
53 * across the pages in a mapping.
54 *
55 * The region data structures are protected by a combination of the mmap_sem
56 * and the hugetlb_instantion_mutex. To access or modify a region the caller
57 * must either hold the mmap_sem for write, or the mmap_sem for read and
58 * the hugetlb_instantiation mutex:
59 *
60 * down_write(&mm->mmap_sem);
61 * or
62 * down_read(&mm->mmap_sem);
63 * mutex_lock(&hugetlb_instantiation_mutex);
64 */
65struct file_region {
66 struct list_head link;
67 long from;
68 long to;
69};
70
71static long region_add(struct list_head *head, long f, long t)
72{
73 struct file_region *rg, *nrg, *trg;
74
75 /* Locate the region we are either in or before. */
76 list_for_each_entry(rg, head, link)
77 if (f <= rg->to)
78 break;
79
80 /* Round our left edge to the current segment if it encloses us. */
81 if (f > rg->from)
82 f = rg->from;
83
84 /* Check for and consume any regions we now overlap with. */
85 nrg = rg;
86 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
87 if (&rg->link == head)
88 break;
89 if (rg->from > t)
90 break;
91
92 /* If this area reaches higher then extend our area to
93 * include it completely. If this is not the first area
94 * which we intend to reuse, free it. */
95 if (rg->to > t)
96 t = rg->to;
97 if (rg != nrg) {
98 list_del(&rg->link);
99 kfree(rg);
100 }
101 }
102 nrg->from = f;
103 nrg->to = t;
104 return 0;
105}
106
107static long region_chg(struct list_head *head, long f, long t)
108{
109 struct file_region *rg, *nrg;
110 long chg = 0;
111
112 /* Locate the region we are before or in. */
113 list_for_each_entry(rg, head, link)
114 if (f <= rg->to)
115 break;
116
117 /* If we are below the current region then a new region is required.
118 * Subtle, allocate a new region at the position but make it zero
119 * size such that we can guarantee to record the reservation. */
120 if (&rg->link == head || t < rg->from) {
121 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
122 if (!nrg)
123 return -ENOMEM;
124 nrg->from = f;
125 nrg->to = f;
126 INIT_LIST_HEAD(&nrg->link);
127 list_add(&nrg->link, rg->link.prev);
128
129 return t - f;
130 }
131
132 /* Round our left edge to the current segment if it encloses us. */
133 if (f > rg->from)
134 f = rg->from;
135 chg = t - f;
136
137 /* Check for and consume any regions we now overlap with. */
138 list_for_each_entry(rg, rg->link.prev, link) {
139 if (&rg->link == head)
140 break;
141 if (rg->from > t)
142 return chg;
143
144 /* We overlap with this area, if it extends further than
145 * us then we must extend ourselves. Account for its
146 * existing reservation. */
147 if (rg->to > t) {
148 chg += rg->to - t;
149 t = rg->to;
150 }
151 chg -= rg->to - rg->from;
152 }
153 return chg;
154}
155
156static long region_truncate(struct list_head *head, long end)
157{
158 struct file_region *rg, *trg;
159 long chg = 0;
160
161 /* Locate the region we are either in or before. */
162 list_for_each_entry(rg, head, link)
163 if (end <= rg->to)
164 break;
165 if (&rg->link == head)
166 return 0;
167
168 /* If we are in the middle of a region then adjust it. */
169 if (end > rg->from) {
170 chg = rg->to - end;
171 rg->to = end;
172 rg = list_entry(rg->link.next, typeof(*rg), link);
173 }
174
175 /* Drop any remaining regions. */
176 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
177 if (&rg->link == head)
178 break;
179 chg += rg->to - rg->from;
180 list_del(&rg->link);
181 kfree(rg);
182 }
183 return chg;
184}
185
186static long region_count(struct list_head *head, long f, long t)
187{
188 struct file_region *rg;
189 long chg = 0;
190
191 /* Locate each segment we overlap with, and count that overlap. */
192 list_for_each_entry(rg, head, link) {
193 int seg_from;
194 int seg_to;
195
196 if (rg->to <= f)
197 continue;
198 if (rg->from >= t)
199 break;
200
201 seg_from = max(rg->from, f);
202 seg_to = min(rg->to, t);
203
204 chg += seg_to - seg_from;
205 }
206
207 return chg;
208}
209
210/*
211 * Convert the address within this vma to the page offset within
212 * the mapping, in pagecache page units; huge pages here.
213 */
214static pgoff_t vma_hugecache_offset(struct hstate *h,
215 struct vm_area_struct *vma, unsigned long address)
216{
217 return ((address - vma->vm_start) >> huge_page_shift(h)) +
218 (vma->vm_pgoff >> huge_page_order(h));
219}
220
221/*
222 * Flags for MAP_PRIVATE reservations. These are stored in the bottom
223 * bits of the reservation map pointer, which are always clear due to
224 * alignment.
225 */
226#define HPAGE_RESV_OWNER (1UL << 0)
227#define HPAGE_RESV_UNMAPPED (1UL << 1)
228#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
229
230/*
231 * These helpers are used to track how many pages are reserved for
232 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
233 * is guaranteed to have their future faults succeed.
234 *
235 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
236 * the reserve counters are updated with the hugetlb_lock held. It is safe
237 * to reset the VMA at fork() time as it is not in use yet and there is no
238 * chance of the global counters getting corrupted as a result of the values.
239 *
240 * The private mapping reservation is represented in a subtly different
241 * manner to a shared mapping. A shared mapping has a region map associated
242 * with the underlying file, this region map represents the backing file
243 * pages which have ever had a reservation assigned; this persists even
244 * after the page is instantiated. A private mapping has a region map
245 * associated with the original mmap which is attached to all VMAs which
246 * reference it, this region map represents those offsets which have consumed
247 * reservation ie. where pages have been instantiated.
248 */
249static unsigned long get_vma_private_data(struct vm_area_struct *vma)
250{
251 return (unsigned long)vma->vm_private_data;
252}
253
254static void set_vma_private_data(struct vm_area_struct *vma,
255 unsigned long value)
256{
257 vma->vm_private_data = (void *)value;
258}
259
260struct resv_map {
261 struct kref refs;
262 struct list_head regions;
263};
264
265struct resv_map *resv_map_alloc(void)
266{
267 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
268 if (!resv_map)
269 return NULL;
270
271 kref_init(&resv_map->refs);
272 INIT_LIST_HEAD(&resv_map->regions);
273
274 return resv_map;
275}
276
277void resv_map_release(struct kref *ref)
278{
279 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
280
281 /* Clear out any active regions before we release the map. */
282 region_truncate(&resv_map->regions, 0);
283 kfree(resv_map);
284}
285
286static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
287{
288 VM_BUG_ON(!is_vm_hugetlb_page(vma));
289 if (!(vma->vm_flags & VM_SHARED))
290 return (struct resv_map *)(get_vma_private_data(vma) &
291 ~HPAGE_RESV_MASK);
292 return 0;
293}
294
295static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
296{
297 VM_BUG_ON(!is_vm_hugetlb_page(vma));
298 VM_BUG_ON(vma->vm_flags & VM_SHARED);
299
300 set_vma_private_data(vma, (get_vma_private_data(vma) &
301 HPAGE_RESV_MASK) | (unsigned long)map);
302}
303
304static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
305{
306 VM_BUG_ON(!is_vm_hugetlb_page(vma));
307 VM_BUG_ON(vma->vm_flags & VM_SHARED);
308
309 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
310}
311
312static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
313{
314 VM_BUG_ON(!is_vm_hugetlb_page(vma));
315
316 return (get_vma_private_data(vma) & flag) != 0;
317}
318
319/* Decrement the reserved pages in the hugepage pool by one */
320static void decrement_hugepage_resv_vma(struct hstate *h,
321 struct vm_area_struct *vma)
322{
323 if (vma->vm_flags & VM_NORESERVE)
324 return;
325
326 if (vma->vm_flags & VM_SHARED) {
327 /* Shared mappings always use reserves */
328 h->resv_huge_pages--;
329 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
330 /*
331 * Only the process that called mmap() has reserves for
332 * private mappings.
333 */
334 h->resv_huge_pages--;
335 }
336}
337
338/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
339void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
340{
341 VM_BUG_ON(!is_vm_hugetlb_page(vma));
342 if (!(vma->vm_flags & VM_SHARED))
343 vma->vm_private_data = (void *)0;
344}
345
346/* Returns true if the VMA has associated reserve pages */
347static int vma_has_reserves(struct vm_area_struct *vma)
348{
349 if (vma->vm_flags & VM_SHARED)
350 return 1;
351 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
352 return 1;
353 return 0;
354}
355
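
For private mappings the reservation map pointer and the two HPAGE_RESV_* flag bits share vma->vm_private_data: a kmalloc'ed resv_map is at least word-aligned, so its two low address bits are guaranteed clear and can carry the flags, which is what the mask-based helpers above rely on. Spelled out as plain bit arithmetic (illustrative helper names):

static unsigned long pack_resv(struct resv_map *map, unsigned long flags)
{
	return (unsigned long)map | (flags & HPAGE_RESV_MASK);
}

static struct resv_map *unpack_resv_map(unsigned long priv)
{
	return (struct resv_map *)(priv & ~HPAGE_RESV_MASK);
}

static int resv_flag_set(unsigned long priv, unsigned long flag)
{
	return (priv & flag) != 0;
}

vma_resv_map(), set_vma_resv_map() and is_vma_resv_set() above are exactly these operations applied to vma->vm_private_data.
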
356static void clear_huge_page(struct page *page,
357 unsigned long addr, unsigned long sz)
44{ 358{
45 int i; 359 int i;
46 360
47 might_sleep(); 361 might_sleep();
48 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { 362 for (i = 0; i < sz/PAGE_SIZE; i++) {
49 cond_resched(); 363 cond_resched();
50 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 364 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
51 } 365 }
@@ -55,42 +369,44 @@ static void copy_huge_page(struct page *dst, struct page *src,
55 unsigned long addr, struct vm_area_struct *vma) 369 unsigned long addr, struct vm_area_struct *vma)
56{ 370{
57 int i; 371 int i;
372 struct hstate *h = hstate_vma(vma);
58 373
59 might_sleep(); 374 might_sleep();
60 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { 375 for (i = 0; i < pages_per_huge_page(h); i++) {
61 cond_resched(); 376 cond_resched();
62 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 377 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
63 } 378 }
64} 379}
65 380
66static void enqueue_huge_page(struct page *page) 381static void enqueue_huge_page(struct hstate *h, struct page *page)
67{ 382{
68 int nid = page_to_nid(page); 383 int nid = page_to_nid(page);
69 list_add(&page->lru, &hugepage_freelists[nid]); 384 list_add(&page->lru, &h->hugepage_freelists[nid]);
70 free_huge_pages++; 385 h->free_huge_pages++;
71 free_huge_pages_node[nid]++; 386 h->free_huge_pages_node[nid]++;
72} 387}
73 388
74static struct page *dequeue_huge_page(void) 389static struct page *dequeue_huge_page(struct hstate *h)
75{ 390{
76 int nid; 391 int nid;
77 struct page *page = NULL; 392 struct page *page = NULL;
78 393
79 for (nid = 0; nid < MAX_NUMNODES; ++nid) { 394 for (nid = 0; nid < MAX_NUMNODES; ++nid) {
80 if (!list_empty(&hugepage_freelists[nid])) { 395 if (!list_empty(&h->hugepage_freelists[nid])) {
81 page = list_entry(hugepage_freelists[nid].next, 396 page = list_entry(h->hugepage_freelists[nid].next,
82 struct page, lru); 397 struct page, lru);
83 list_del(&page->lru); 398 list_del(&page->lru);
84 free_huge_pages--; 399 h->free_huge_pages--;
85 free_huge_pages_node[nid]--; 400 h->free_huge_pages_node[nid]--;
86 break; 401 break;
87 } 402 }
88 } 403 }
89 return page; 404 return page;
90} 405}
91 406
92static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, 407static struct page *dequeue_huge_page_vma(struct hstate *h,
93 unsigned long address) 408 struct vm_area_struct *vma,
409 unsigned long address, int avoid_reserve)
94{ 410{
95 int nid; 411 int nid;
96 struct page *page = NULL; 412 struct page *page = NULL;
@@ -101,18 +417,33 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
101 struct zone *zone; 417 struct zone *zone;
102 struct zoneref *z; 418 struct zoneref *z;
103 419
420 /*
421 * A child process with MAP_PRIVATE mappings created by its parent
422 * has no page reserves. This check ensures that reservations are
423 * not "stolen". The child may still get SIGKILLed
424 */
425 if (!vma_has_reserves(vma) &&
426 h->free_huge_pages - h->resv_huge_pages == 0)
427 return NULL;
428
429 /* If reserves cannot be used, ensure enough pages are in the pool */
430 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
431 return NULL;
432
104 for_each_zone_zonelist_nodemask(zone, z, zonelist, 433 for_each_zone_zonelist_nodemask(zone, z, zonelist,
105 MAX_NR_ZONES - 1, nodemask) { 434 MAX_NR_ZONES - 1, nodemask) {
106 nid = zone_to_nid(zone); 435 nid = zone_to_nid(zone);
107 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && 436 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
108 !list_empty(&hugepage_freelists[nid])) { 437 !list_empty(&h->hugepage_freelists[nid])) {
109 page = list_entry(hugepage_freelists[nid].next, 438 page = list_entry(h->hugepage_freelists[nid].next,
110 struct page, lru); 439 struct page, lru);
111 list_del(&page->lru); 440 list_del(&page->lru);
112 free_huge_pages--; 441 h->free_huge_pages--;
113 free_huge_pages_node[nid]--; 442 h->free_huge_pages_node[nid]--;
114 if (vma && vma->vm_flags & VM_MAYSHARE) 443
115 resv_huge_pages--; 444 if (!avoid_reserve)
445 decrement_hugepage_resv_vma(h, vma);
446
116 break; 447 break;
117 } 448 }
118 } 449 }
@@ -120,12 +451,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
120 return page; 451 return page;
121} 452}
122 453
123static void update_and_free_page(struct page *page) 454static void update_and_free_page(struct hstate *h, struct page *page)
124{ 455{
125 int i; 456 int i;
126 nr_huge_pages--; 457
127 nr_huge_pages_node[page_to_nid(page)]--; 458 h->nr_huge_pages--;
128 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { 459 h->nr_huge_pages_node[page_to_nid(page)]--;
460 for (i = 0; i < pages_per_huge_page(h); i++) {
129 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 461 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
130 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 462 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
131 1 << PG_private | 1<< PG_writeback); 463 1 << PG_private | 1<< PG_writeback);
@@ -133,11 +465,27 @@ static void update_and_free_page(struct page *page)
133 set_compound_page_dtor(page, NULL); 465 set_compound_page_dtor(page, NULL);
134 set_page_refcounted(page); 466 set_page_refcounted(page);
135 arch_release_hugepage(page); 467 arch_release_hugepage(page);
136 __free_pages(page, HUGETLB_PAGE_ORDER); 468 __free_pages(page, huge_page_order(h));
469}
470
471struct hstate *size_to_hstate(unsigned long size)
472{
473 struct hstate *h;
474
475 for_each_hstate(h) {
476 if (huge_page_size(h) == size)
477 return h;
478 }
479 return NULL;
137} 480}
138 481
139static void free_huge_page(struct page *page) 482static void free_huge_page(struct page *page)
140{ 483{
484 /*
485 * Can't pass hstate in here because it is called from the
486 * compound page destructor.
487 */
488 struct hstate *h = page_hstate(page);
141 int nid = page_to_nid(page); 489 int nid = page_to_nid(page);
142 struct address_space *mapping; 490 struct address_space *mapping;
143 491
@@ -147,12 +495,12 @@ static void free_huge_page(struct page *page)
147 INIT_LIST_HEAD(&page->lru); 495 INIT_LIST_HEAD(&page->lru);
148 496
149 spin_lock(&hugetlb_lock); 497 spin_lock(&hugetlb_lock);
150 if (surplus_huge_pages_node[nid]) { 498 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
151 update_and_free_page(page); 499 update_and_free_page(h, page);
152 surplus_huge_pages--; 500 h->surplus_huge_pages--;
153 surplus_huge_pages_node[nid]--; 501 h->surplus_huge_pages_node[nid]--;
154 } else { 502 } else {
155 enqueue_huge_page(page); 503 enqueue_huge_page(h, page);
156 } 504 }
157 spin_unlock(&hugetlb_lock); 505 spin_unlock(&hugetlb_lock);
158 if (mapping) 506 if (mapping)
@@ -164,7 +512,7 @@ static void free_huge_page(struct page *page)
164 * balanced by operating on them in a round-robin fashion. 512 * balanced by operating on them in a round-robin fashion.
165 * Returns 1 if an adjustment was made. 513 * Returns 1 if an adjustment was made.
166 */ 514 */
167static int adjust_pool_surplus(int delta) 515static int adjust_pool_surplus(struct hstate *h, int delta)
168{ 516{
169 static int prev_nid; 517 static int prev_nid;
170 int nid = prev_nid; 518 int nid = prev_nid;
@@ -177,15 +525,15 @@ static int adjust_pool_surplus(int delta)
177 nid = first_node(node_online_map); 525 nid = first_node(node_online_map);
178 526
179 /* To shrink on this node, there must be a surplus page */ 527 /* To shrink on this node, there must be a surplus page */
180 if (delta < 0 && !surplus_huge_pages_node[nid]) 528 if (delta < 0 && !h->surplus_huge_pages_node[nid])
181 continue; 529 continue;
182 /* Surplus cannot exceed the total number of pages */ 530 /* Surplus cannot exceed the total number of pages */
183 if (delta > 0 && surplus_huge_pages_node[nid] >= 531 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
184 nr_huge_pages_node[nid]) 532 h->nr_huge_pages_node[nid])
185 continue; 533 continue;
186 534
187 surplus_huge_pages += delta; 535 h->surplus_huge_pages += delta;
188 surplus_huge_pages_node[nid] += delta; 536 h->surplus_huge_pages_node[nid] += delta;
189 ret = 1; 537 ret = 1;
190 break; 538 break;
191 } while (nid != prev_nid); 539 } while (nid != prev_nid);
@@ -194,59 +542,74 @@ static int adjust_pool_surplus(int delta)
194 return ret; 542 return ret;
195} 543}
196 544
197static struct page *alloc_fresh_huge_page_node(int nid) 545static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
546{
547 set_compound_page_dtor(page, free_huge_page);
548 spin_lock(&hugetlb_lock);
549 h->nr_huge_pages++;
550 h->nr_huge_pages_node[nid]++;
551 spin_unlock(&hugetlb_lock);
552 put_page(page); /* free it into the hugepage allocator */
553}
554
555static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
198{ 556{
199 struct page *page; 557 struct page *page;
200 558
559 if (h->order >= MAX_ORDER)
560 return NULL;
561
201 page = alloc_pages_node(nid, 562 page = alloc_pages_node(nid,
202 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 563 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
203 __GFP_REPEAT|__GFP_NOWARN, 564 __GFP_REPEAT|__GFP_NOWARN,
204 HUGETLB_PAGE_ORDER); 565 huge_page_order(h));
205 if (page) { 566 if (page) {
206 if (arch_prepare_hugepage(page)) { 567 if (arch_prepare_hugepage(page)) {
207 __free_pages(page, HUGETLB_PAGE_ORDER); 568 __free_pages(page, huge_page_order(h));
208 return NULL; 569 return NULL;
209 } 570 }
210 set_compound_page_dtor(page, free_huge_page); 571 prep_new_huge_page(h, page, nid);
211 spin_lock(&hugetlb_lock);
212 nr_huge_pages++;
213 nr_huge_pages_node[nid]++;
214 spin_unlock(&hugetlb_lock);
215 put_page(page); /* free it into the hugepage allocator */
216 } 572 }
217 573
218 return page; 574 return page;
219} 575}
220 576
221static int alloc_fresh_huge_page(void) 577/*
578 * Use a helper variable to find the next node and then
579 * copy it back to hugetlb_next_nid afterwards:
580 * otherwise there's a window in which a racer might
581 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
582 * But we don't need to use a spin_lock here: it really
583 * doesn't matter if occasionally a racer chooses the
584 * same nid as we do. Move nid forward in the mask even
585 * if we just successfully allocated a hugepage so that
586 * the next caller gets hugepages on the next node.
587 */
588static int hstate_next_node(struct hstate *h)
589{
590 int next_nid;
591 next_nid = next_node(h->hugetlb_next_nid, node_online_map);
592 if (next_nid == MAX_NUMNODES)
593 next_nid = first_node(node_online_map);
594 h->hugetlb_next_nid = next_nid;
595 return next_nid;
596}
597
598static int alloc_fresh_huge_page(struct hstate *h)
222{ 599{
223 struct page *page; 600 struct page *page;
224 int start_nid; 601 int start_nid;
225 int next_nid; 602 int next_nid;
226 int ret = 0; 603 int ret = 0;
227 604
228 start_nid = hugetlb_next_nid; 605 start_nid = h->hugetlb_next_nid;
229 606
230 do { 607 do {
231 page = alloc_fresh_huge_page_node(hugetlb_next_nid); 608 page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
232 if (page) 609 if (page)
233 ret = 1; 610 ret = 1;
234 /* 611 next_nid = hstate_next_node(h);
235 * Use a helper variable to find the next node and then 612 } while (!page && h->hugetlb_next_nid != start_nid);
236 * copy it back to hugetlb_next_nid afterwards:
237 * otherwise there's a window in which a racer might
238 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
239 * But we don't need to use a spin_lock here: it really
240 * doesn't matter if occasionally a racer chooses the
241 * same nid as we do. Move nid forward in the mask even
242 * if we just successfully allocated a hugepage so that
243 * the next caller gets hugepages on the next node.
244 */
245 next_nid = next_node(hugetlb_next_nid, node_online_map);
246 if (next_nid == MAX_NUMNODES)
247 next_nid = first_node(node_online_map);
248 hugetlb_next_nid = next_nid;
249 } while (!page && hugetlb_next_nid != start_nid);
250 613
251 if (ret) 614 if (ret)
252 count_vm_event(HTLB_BUDDY_PGALLOC); 615 count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -256,12 +619,15 @@ static int alloc_fresh_huge_page(void)
256 return ret; 619 return ret;
257} 620}
258 621
259static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, 622static struct page *alloc_buddy_huge_page(struct hstate *h,
260 unsigned long address) 623 struct vm_area_struct *vma, unsigned long address)
261{ 624{
262 struct page *page; 625 struct page *page;
263 unsigned int nid; 626 unsigned int nid;
264 627
628 if (h->order >= MAX_ORDER)
629 return NULL;
630
265 /* 631 /*
266 * Assume we will successfully allocate the surplus page to 632 * Assume we will successfully allocate the surplus page to
267 * prevent racing processes from causing the surplus to exceed 633 * prevent racing processes from causing the surplus to exceed
@@ -286,18 +652,23 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
286 * per-node value is checked there. 652 * per-node value is checked there.
287 */ 653 */
288 spin_lock(&hugetlb_lock); 654 spin_lock(&hugetlb_lock);
289 if (surplus_huge_pages >= nr_overcommit_huge_pages) { 655 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
290 spin_unlock(&hugetlb_lock); 656 spin_unlock(&hugetlb_lock);
291 return NULL; 657 return NULL;
292 } else { 658 } else {
293 nr_huge_pages++; 659 h->nr_huge_pages++;
294 surplus_huge_pages++; 660 h->surplus_huge_pages++;
295 } 661 }
296 spin_unlock(&hugetlb_lock); 662 spin_unlock(&hugetlb_lock);
297 663
298 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 664 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
299 __GFP_REPEAT|__GFP_NOWARN, 665 __GFP_REPEAT|__GFP_NOWARN,
300 HUGETLB_PAGE_ORDER); 666 huge_page_order(h));
667
668 if (page && arch_prepare_hugepage(page)) {
669 __free_pages(page, huge_page_order(h));
670 return NULL;
671 }
301 672
302 spin_lock(&hugetlb_lock); 673 spin_lock(&hugetlb_lock);
303 if (page) { 674 if (page) {
@@ -312,12 +683,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
312 /* 683 /*
313 * We incremented the global counters already 684 * We incremented the global counters already
314 */ 685 */
315 nr_huge_pages_node[nid]++; 686 h->nr_huge_pages_node[nid]++;
316 surplus_huge_pages_node[nid]++; 687 h->surplus_huge_pages_node[nid]++;
317 __count_vm_event(HTLB_BUDDY_PGALLOC); 688 __count_vm_event(HTLB_BUDDY_PGALLOC);
318 } else { 689 } else {
319 nr_huge_pages--; 690 h->nr_huge_pages--;
320 surplus_huge_pages--; 691 h->surplus_huge_pages--;
321 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 692 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
322 } 693 }
323 spin_unlock(&hugetlb_lock); 694 spin_unlock(&hugetlb_lock);
@@ -329,16 +700,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
329 * Increase the hugetlb pool such that it can accomodate a reservation 700 * Increase the hugetlb pool such that it can accomodate a reservation
330 * of size 'delta'. 701 * of size 'delta'.
331 */ 702 */
332static int gather_surplus_pages(int delta) 703static int gather_surplus_pages(struct hstate *h, int delta)
333{ 704{
334 struct list_head surplus_list; 705 struct list_head surplus_list;
335 struct page *page, *tmp; 706 struct page *page, *tmp;
336 int ret, i; 707 int ret, i;
337 int needed, allocated; 708 int needed, allocated;
338 709
339 needed = (resv_huge_pages + delta) - free_huge_pages; 710 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
340 if (needed <= 0) { 711 if (needed <= 0) {
341 resv_huge_pages += delta; 712 h->resv_huge_pages += delta;
342 return 0; 713 return 0;
343 } 714 }
344 715
@@ -349,7 +720,7 @@ static int gather_surplus_pages(int delta)
349retry: 720retry:
350 spin_unlock(&hugetlb_lock); 721 spin_unlock(&hugetlb_lock);
351 for (i = 0; i < needed; i++) { 722 for (i = 0; i < needed; i++) {
352 page = alloc_buddy_huge_page(NULL, 0); 723 page = alloc_buddy_huge_page(h, NULL, 0);
353 if (!page) { 724 if (!page) {
354 /* 725 /*
355 * We were not able to allocate enough pages to 726 * We were not able to allocate enough pages to
@@ -370,7 +741,8 @@ retry:
370 * because either resv_huge_pages or free_huge_pages may have changed. 741 * because either resv_huge_pages or free_huge_pages may have changed.
371 */ 742 */
372 spin_lock(&hugetlb_lock); 743 spin_lock(&hugetlb_lock);
373 needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); 744 needed = (h->resv_huge_pages + delta) -
745 (h->free_huge_pages + allocated);
374 if (needed > 0) 746 if (needed > 0)
375 goto retry; 747 goto retry;
376 748
@@ -383,7 +755,7 @@ retry:
383 * before they are reserved. 755 * before they are reserved.
384 */ 756 */
385 needed += allocated; 757 needed += allocated;
386 resv_huge_pages += delta; 758 h->resv_huge_pages += delta;
387 ret = 0; 759 ret = 0;
388free: 760free:
389 /* Free the needed pages to the hugetlb pool */ 761 /* Free the needed pages to the hugetlb pool */
@@ -391,7 +763,7 @@ free:
391 if ((--needed) < 0) 763 if ((--needed) < 0)
392 break; 764 break;
393 list_del(&page->lru); 765 list_del(&page->lru);
394 enqueue_huge_page(page); 766 enqueue_huge_page(h, page);
395 } 767 }
396 768
397 /* Free unnecessary surplus pages to the buddy allocator */ 769 /* Free unnecessary surplus pages to the buddy allocator */
@@ -419,7 +791,8 @@ free:
419 * allocated to satisfy the reservation must be explicitly freed if they were 791 * allocated to satisfy the reservation must be explicitly freed if they were
420 * never used. 792 * never used.
421 */ 793 */
422static void return_unused_surplus_pages(unsigned long unused_resv_pages) 794static void return_unused_surplus_pages(struct hstate *h,
795 unsigned long unused_resv_pages)
423{ 796{
424 static int nid = -1; 797 static int nid = -1;
425 struct page *page; 798 struct page *page;
@@ -434,157 +807,269 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
434 unsigned long remaining_iterations = num_online_nodes(); 807 unsigned long remaining_iterations = num_online_nodes();
435 808
436 /* Uncommit the reservation */ 809 /* Uncommit the reservation */
437 resv_huge_pages -= unused_resv_pages; 810 h->resv_huge_pages -= unused_resv_pages;
438 811
439 nr_pages = min(unused_resv_pages, surplus_huge_pages); 812 /* Cannot return gigantic pages currently */
813 if (h->order >= MAX_ORDER)
814 return;
815
816 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
440 817
441 while (remaining_iterations-- && nr_pages) { 818 while (remaining_iterations-- && nr_pages) {
442 nid = next_node(nid, node_online_map); 819 nid = next_node(nid, node_online_map);
443 if (nid == MAX_NUMNODES) 820 if (nid == MAX_NUMNODES)
444 nid = first_node(node_online_map); 821 nid = first_node(node_online_map);
445 822
446 if (!surplus_huge_pages_node[nid]) 823 if (!h->surplus_huge_pages_node[nid])
447 continue; 824 continue;
448 825
449 if (!list_empty(&hugepage_freelists[nid])) { 826 if (!list_empty(&h->hugepage_freelists[nid])) {
450 page = list_entry(hugepage_freelists[nid].next, 827 page = list_entry(h->hugepage_freelists[nid].next,
451 struct page, lru); 828 struct page, lru);
452 list_del(&page->lru); 829 list_del(&page->lru);
453 update_and_free_page(page); 830 update_and_free_page(h, page);
454 free_huge_pages--; 831 h->free_huge_pages--;
455 free_huge_pages_node[nid]--; 832 h->free_huge_pages_node[nid]--;
456 surplus_huge_pages--; 833 h->surplus_huge_pages--;
457 surplus_huge_pages_node[nid]--; 834 h->surplus_huge_pages_node[nid]--;
458 nr_pages--; 835 nr_pages--;
459 remaining_iterations = num_online_nodes(); 836 remaining_iterations = num_online_nodes();
460 } 837 }
461 } 838 }
462} 839}
463 840
841/*
842 * Determine if the huge page at addr within the vma has an associated
843 * reservation. Where it does not we will need to logically increase
844 * reservation and actually increase quota before an allocation can occur.
845 * Where any new reservation would be required the reservation change is
846 * prepared, but not committed. Once the page has been quota'd, allocated
847 * and instantiated, the change should be committed via vma_commit_reservation.
848 * No action is required on failure.
849 */
850static int vma_needs_reservation(struct hstate *h,
851 struct vm_area_struct *vma, unsigned long addr)
852{
853 struct address_space *mapping = vma->vm_file->f_mapping;
854 struct inode *inode = mapping->host;
855
856 if (vma->vm_flags & VM_SHARED) {
857 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
858 return region_chg(&inode->i_mapping->private_list,
859 idx, idx + 1);
860
861 } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
862 return 1;
863
864 } else {
865 int err;
866 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
867 struct resv_map *reservations = vma_resv_map(vma);
464 868
465static struct page *alloc_huge_page_shared(struct vm_area_struct *vma, 869 err = region_chg(&reservations->regions, idx, idx + 1);
466 unsigned long addr) 870 if (err < 0)
871 return err;
872 return 0;
873 }
874}
875static void vma_commit_reservation(struct hstate *h,
876 struct vm_area_struct *vma, unsigned long addr)
467{ 877{
468 struct page *page; 878 struct address_space *mapping = vma->vm_file->f_mapping;
879 struct inode *inode = mapping->host;
469 880
470 spin_lock(&hugetlb_lock); 881 if (vma->vm_flags & VM_SHARED) {
471 page = dequeue_huge_page_vma(vma, addr); 882 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
472 spin_unlock(&hugetlb_lock); 883 region_add(&inode->i_mapping->private_list, idx, idx + 1);
473 return page ? page : ERR_PTR(-VM_FAULT_OOM); 884
885 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
886 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
887 struct resv_map *reservations = vma_resv_map(vma);
888
889 /* Mark this page used in the map. */
890 region_add(&reservations->regions, idx, idx + 1);
891 }
474} 892}
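The two helpers above form a prepare/commit pair. A minimal caller-side sketch follows, condensed from alloc_huge_page() below rather than new kernel code; dequeue_or_alloc_page() is a hypothetical stand-in for the dequeue/alloc_buddy_huge_page() path, and inode is assumed to be mapping->host:

	chg = vma_needs_reservation(h, vma, addr);    /* prepare: may grow the region map */
	if (chg < 0)
		return ERR_PTR(chg);                  /* region bookkeeping allocation failed */
	if (chg && hugetlb_get_quota(inode->i_mapping, chg))
		return ERR_PTR(-ENOSPC);              /* hugetlbfs quota exhausted */

	page = dequeue_or_alloc_page(h, vma, addr);   /* hypothetical allocation step */

	vma_commit_reservation(h, vma, addr);         /* commit: record the reservation as used */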
475 893
476static struct page *alloc_huge_page_private(struct vm_area_struct *vma, 894static struct page *alloc_huge_page(struct vm_area_struct *vma,
477 unsigned long addr) 895 unsigned long addr, int avoid_reserve)
478{ 896{
479 struct page *page = NULL; 897 struct hstate *h = hstate_vma(vma);
898 struct page *page;
899 struct address_space *mapping = vma->vm_file->f_mapping;
900 struct inode *inode = mapping->host;
901 unsigned int chg;
480 902
481 if (hugetlb_get_quota(vma->vm_file->f_mapping, 1)) 903 /*
482 return ERR_PTR(-VM_FAULT_SIGBUS); 904 * Processes that did not create the mapping will have no reserves and
905 * will not have accounted against quota. Check that the quota can be
 906 * made before satisfying the allocation.
907 * MAP_NORESERVE mappings may also need pages and quota allocated
908 * if no reserve mapping overlaps.
909 */
910 chg = vma_needs_reservation(h, vma, addr);
911 if (chg < 0)
912 return ERR_PTR(chg);
913 if (chg)
914 if (hugetlb_get_quota(inode->i_mapping, chg))
915 return ERR_PTR(-ENOSPC);
483 916
484 spin_lock(&hugetlb_lock); 917 spin_lock(&hugetlb_lock);
485 if (free_huge_pages > resv_huge_pages) 918 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
486 page = dequeue_huge_page_vma(vma, addr);
487 spin_unlock(&hugetlb_lock); 919 spin_unlock(&hugetlb_lock);
920
488 if (!page) { 921 if (!page) {
489 page = alloc_buddy_huge_page(vma, addr); 922 page = alloc_buddy_huge_page(h, vma, addr);
490 if (!page) { 923 if (!page) {
491 hugetlb_put_quota(vma->vm_file->f_mapping, 1); 924 hugetlb_put_quota(inode->i_mapping, chg);
492 return ERR_PTR(-VM_FAULT_OOM); 925 return ERR_PTR(-VM_FAULT_OOM);
493 } 926 }
494 } 927 }
928
929 set_page_refcounted(page);
930 set_page_private(page, (unsigned long) mapping);
931
932 vma_commit_reservation(h, vma, addr);
933
495 return page; 934 return page;
496} 935}
497 936
498static struct page *alloc_huge_page(struct vm_area_struct *vma, 937__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
499 unsigned long addr)
500{ 938{
501 struct page *page; 939 struct huge_bootmem_page *m;
502 struct address_space *mapping = vma->vm_file->f_mapping; 940 int nr_nodes = nodes_weight(node_online_map);
503 941
504 if (vma->vm_flags & VM_MAYSHARE) 942 while (nr_nodes) {
505 page = alloc_huge_page_shared(vma, addr); 943 void *addr;
506 else 944
507 page = alloc_huge_page_private(vma, addr); 945 addr = __alloc_bootmem_node_nopanic(
946 NODE_DATA(h->hugetlb_next_nid),
947 huge_page_size(h), huge_page_size(h), 0);
508 948
509 if (!IS_ERR(page)) { 949 if (addr) {
510 set_page_refcounted(page); 950 /*
511 set_page_private(page, (unsigned long) mapping); 951 * Use the beginning of the huge page to store the
952 * huge_bootmem_page struct (until gather_bootmem
953 * puts them into the mem_map).
954 */
955 m = addr;
956 if (m)
957 goto found;
958 }
959 hstate_next_node(h);
960 nr_nodes--;
512 } 961 }
513 return page; 962 return 0;
963
964found:
965 BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
966 /* Put them into a private list first because mem_map is not up yet */
967 list_add(&m->list, &huge_boot_pages);
968 m->hstate = h;
969 return 1;
514} 970}
515 971
516static int __init hugetlb_init(void) 972/* Put bootmem huge pages into the standard lists after mem_map is up */
973static void __init gather_bootmem_prealloc(void)
517{ 974{
518 unsigned long i; 975 struct huge_bootmem_page *m;
519 976
520 if (HPAGE_SHIFT == 0) 977 list_for_each_entry(m, &huge_boot_pages, list) {
521 return 0; 978 struct page *page = virt_to_page(m);
522 979 struct hstate *h = m->hstate;
523 for (i = 0; i < MAX_NUMNODES; ++i) 980 __ClearPageReserved(page);
524 INIT_LIST_HEAD(&hugepage_freelists[i]); 981 WARN_ON(page_count(page) != 1);
982 prep_compound_page(page, h->order);
983 prep_new_huge_page(h, page, page_to_nid(page));
984 }
985}
525 986
526 hugetlb_next_nid = first_node(node_online_map); 987static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
988{
989 unsigned long i;
527 990
528 for (i = 0; i < max_huge_pages; ++i) { 991 for (i = 0; i < h->max_huge_pages; ++i) {
529 if (!alloc_fresh_huge_page()) 992 if (h->order >= MAX_ORDER) {
993 if (!alloc_bootmem_huge_page(h))
994 break;
995 } else if (!alloc_fresh_huge_page(h))
530 break; 996 break;
531 } 997 }
532 max_huge_pages = free_huge_pages = nr_huge_pages = i; 998 h->max_huge_pages = i;
533 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
534 return 0;
535} 999}
536module_init(hugetlb_init);
537 1000
538static int __init hugetlb_setup(char *s) 1001static void __init hugetlb_init_hstates(void)
539{ 1002{
540 if (sscanf(s, "%lu", &max_huge_pages) <= 0) 1003 struct hstate *h;
541 max_huge_pages = 0; 1004
542 return 1; 1005 for_each_hstate(h) {
1006 /* oversize hugepages were init'ed in early boot */
1007 if (h->order < MAX_ORDER)
1008 hugetlb_hstate_alloc_pages(h);
1009 }
543} 1010}
544__setup("hugepages=", hugetlb_setup);
545 1011
546static unsigned int cpuset_mems_nr(unsigned int *array) 1012static char * __init memfmt(char *buf, unsigned long n)
547{ 1013{
548 int node; 1014 if (n >= (1UL << 30))
549 unsigned int nr = 0; 1015 sprintf(buf, "%lu GB", n >> 30);
550 1016 else if (n >= (1UL << 20))
551 for_each_node_mask(node, cpuset_current_mems_allowed) 1017 sprintf(buf, "%lu MB", n >> 20);
552 nr += array[node]; 1018 else
1019 sprintf(buf, "%lu KB", n >> 10);
1020 return buf;
1021}
553 1022
554 return nr; 1023static void __init report_hugepages(void)
1024{
1025 struct hstate *h;
1026
1027 for_each_hstate(h) {
1028 char buf[32];
1029 printk(KERN_INFO "HugeTLB registered %s page size, "
1030 "pre-allocated %ld pages\n",
1031 memfmt(buf, huge_page_size(h)),
1032 h->free_huge_pages);
1033 }
555} 1034}
556 1035
557#ifdef CONFIG_SYSCTL
558#ifdef CONFIG_HIGHMEM 1036#ifdef CONFIG_HIGHMEM
559static void try_to_free_low(unsigned long count) 1037static void try_to_free_low(struct hstate *h, unsigned long count)
560{ 1038{
561 int i; 1039 int i;
562 1040
1041 if (h->order >= MAX_ORDER)
1042 return;
1043
563 for (i = 0; i < MAX_NUMNODES; ++i) { 1044 for (i = 0; i < MAX_NUMNODES; ++i) {
564 struct page *page, *next; 1045 struct page *page, *next;
565 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 1046 struct list_head *freel = &h->hugepage_freelists[i];
566 if (count >= nr_huge_pages) 1047 list_for_each_entry_safe(page, next, freel, lru) {
1048 if (count >= h->nr_huge_pages)
567 return; 1049 return;
568 if (PageHighMem(page)) 1050 if (PageHighMem(page))
569 continue; 1051 continue;
570 list_del(&page->lru); 1052 list_del(&page->lru);
571 update_and_free_page(page); 1053 update_and_free_page(h, page);
572 free_huge_pages--; 1054 h->free_huge_pages--;
573 free_huge_pages_node[page_to_nid(page)]--; 1055 h->free_huge_pages_node[page_to_nid(page)]--;
574 } 1056 }
575 } 1057 }
576} 1058}
577#else 1059#else
578static inline void try_to_free_low(unsigned long count) 1060static inline void try_to_free_low(struct hstate *h, unsigned long count)
579{ 1061{
580} 1062}
581#endif 1063#endif
582 1064
583#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) 1065#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
584static unsigned long set_max_huge_pages(unsigned long count) 1066static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
585{ 1067{
586 unsigned long min_count, ret; 1068 unsigned long min_count, ret;
587 1069
1070 if (h->order >= MAX_ORDER)
1071 return h->max_huge_pages;
1072
588 /* 1073 /*
589 * Increase the pool size 1074 * Increase the pool size
590 * First take pages out of surplus state. Then make up the 1075 * First take pages out of surplus state. Then make up the
@@ -597,20 +1082,19 @@ static unsigned long set_max_huge_pages(unsigned long count)
597 * within all the constraints specified by the sysctls. 1082 * within all the constraints specified by the sysctls.
598 */ 1083 */
599 spin_lock(&hugetlb_lock); 1084 spin_lock(&hugetlb_lock);
600 while (surplus_huge_pages && count > persistent_huge_pages) { 1085 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
601 if (!adjust_pool_surplus(-1)) 1086 if (!adjust_pool_surplus(h, -1))
602 break; 1087 break;
603 } 1088 }
604 1089
605 while (count > persistent_huge_pages) { 1090 while (count > persistent_huge_pages(h)) {
606 int ret;
607 /* 1091 /*
608 * If this allocation races such that we no longer need the 1092 * If this allocation races such that we no longer need the
609 * page, free_huge_page will handle it by freeing the page 1093 * page, free_huge_page will handle it by freeing the page
610 * and reducing the surplus. 1094 * and reducing the surplus.
611 */ 1095 */
612 spin_unlock(&hugetlb_lock); 1096 spin_unlock(&hugetlb_lock);
613 ret = alloc_fresh_huge_page(); 1097 ret = alloc_fresh_huge_page(h);
614 spin_lock(&hugetlb_lock); 1098 spin_lock(&hugetlb_lock);
615 if (!ret) 1099 if (!ret)
616 goto out; 1100 goto out;
@@ -632,31 +1116,305 @@ static unsigned long set_max_huge_pages(unsigned long count)
632 * and won't grow the pool anywhere else. Not until one of the 1116 * and won't grow the pool anywhere else. Not until one of the
633 * sysctls are changed, or the surplus pages go out of use. 1117 * sysctls are changed, or the surplus pages go out of use.
634 */ 1118 */
635 min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; 1119 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
636 min_count = max(count, min_count); 1120 min_count = max(count, min_count);
637 try_to_free_low(min_count); 1121 try_to_free_low(h, min_count);
638 while (min_count < persistent_huge_pages) { 1122 while (min_count < persistent_huge_pages(h)) {
639 struct page *page = dequeue_huge_page(); 1123 struct page *page = dequeue_huge_page(h);
640 if (!page) 1124 if (!page)
641 break; 1125 break;
642 update_and_free_page(page); 1126 update_and_free_page(h, page);
643 } 1127 }
644 while (count < persistent_huge_pages) { 1128 while (count < persistent_huge_pages(h)) {
645 if (!adjust_pool_surplus(1)) 1129 if (!adjust_pool_surplus(h, 1))
646 break; 1130 break;
647 } 1131 }
648out: 1132out:
649 ret = persistent_huge_pages; 1133 ret = persistent_huge_pages(h);
650 spin_unlock(&hugetlb_lock); 1134 spin_unlock(&hugetlb_lock);
651 return ret; 1135 return ret;
652} 1136}
653 1137
1138#define HSTATE_ATTR_RO(_name) \
1139 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1140
1141#define HSTATE_ATTR(_name) \
1142 static struct kobj_attribute _name##_attr = \
1143 __ATTR(_name, 0644, _name##_show, _name##_store)
1144
1145static struct kobject *hugepages_kobj;
1146static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1147
1148static struct hstate *kobj_to_hstate(struct kobject *kobj)
1149{
1150 int i;
1151 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1152 if (hstate_kobjs[i] == kobj)
1153 return &hstates[i];
1154 BUG();
1155 return NULL;
1156}
1157
1158static ssize_t nr_hugepages_show(struct kobject *kobj,
1159 struct kobj_attribute *attr, char *buf)
1160{
1161 struct hstate *h = kobj_to_hstate(kobj);
1162 return sprintf(buf, "%lu\n", h->nr_huge_pages);
1163}
1164static ssize_t nr_hugepages_store(struct kobject *kobj,
1165 struct kobj_attribute *attr, const char *buf, size_t count)
1166{
1167 int err;
1168 unsigned long input;
1169 struct hstate *h = kobj_to_hstate(kobj);
1170
1171 err = strict_strtoul(buf, 10, &input);
1172 if (err)
1173 return 0;
1174
1175 h->max_huge_pages = set_max_huge_pages(h, input);
1176
1177 return count;
1178}
1179HSTATE_ATTR(nr_hugepages);
1180
1181static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1182 struct kobj_attribute *attr, char *buf)
1183{
1184 struct hstate *h = kobj_to_hstate(kobj);
1185 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1186}
1187static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1188 struct kobj_attribute *attr, const char *buf, size_t count)
1189{
1190 int err;
1191 unsigned long input;
1192 struct hstate *h = kobj_to_hstate(kobj);
1193
1194 err = strict_strtoul(buf, 10, &input);
1195 if (err)
1196 return 0;
1197
1198 spin_lock(&hugetlb_lock);
1199 h->nr_overcommit_huge_pages = input;
1200 spin_unlock(&hugetlb_lock);
1201
1202 return count;
1203}
1204HSTATE_ATTR(nr_overcommit_hugepages);
1205
1206static ssize_t free_hugepages_show(struct kobject *kobj,
1207 struct kobj_attribute *attr, char *buf)
1208{
1209 struct hstate *h = kobj_to_hstate(kobj);
1210 return sprintf(buf, "%lu\n", h->free_huge_pages);
1211}
1212HSTATE_ATTR_RO(free_hugepages);
1213
1214static ssize_t resv_hugepages_show(struct kobject *kobj,
1215 struct kobj_attribute *attr, char *buf)
1216{
1217 struct hstate *h = kobj_to_hstate(kobj);
1218 return sprintf(buf, "%lu\n", h->resv_huge_pages);
1219}
1220HSTATE_ATTR_RO(resv_hugepages);
1221
1222static ssize_t surplus_hugepages_show(struct kobject *kobj,
1223 struct kobj_attribute *attr, char *buf)
1224{
1225 struct hstate *h = kobj_to_hstate(kobj);
1226 return sprintf(buf, "%lu\n", h->surplus_huge_pages);
1227}
1228HSTATE_ATTR_RO(surplus_hugepages);
1229
1230static struct attribute *hstate_attrs[] = {
1231 &nr_hugepages_attr.attr,
1232 &nr_overcommit_hugepages_attr.attr,
1233 &free_hugepages_attr.attr,
1234 &resv_hugepages_attr.attr,
1235 &surplus_hugepages_attr.attr,
1236 NULL,
1237};
1238
1239static struct attribute_group hstate_attr_group = {
1240 .attrs = hstate_attrs,
1241};
1242
1243static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
1244{
1245 int retval;
1246
1247 hstate_kobjs[h - hstates] = kobject_create_and_add(h->name,
1248 hugepages_kobj);
1249 if (!hstate_kobjs[h - hstates])
1250 return -ENOMEM;
1251
1252 retval = sysfs_create_group(hstate_kobjs[h - hstates],
1253 &hstate_attr_group);
1254 if (retval)
1255 kobject_put(hstate_kobjs[h - hstates]);
1256
1257 return retval;
1258}
1259
1260static void __init hugetlb_sysfs_init(void)
1261{
1262 struct hstate *h;
1263 int err;
1264
1265 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
1266 if (!hugepages_kobj)
1267 return;
1268
1269 for_each_hstate(h) {
1270 err = hugetlb_sysfs_add_hstate(h);
1271 if (err)
1272 printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
1273 h->name);
1274 }
1275}
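Once registered, each hstate's attributes appear under the mm kobject, i.e. /sys/kernel/mm/hugepages/<name>/, where <name> comes from the "hugepages-%lukB" format used in hugetlb_add_hstate() below (hugepages-2048kB on a typical x86 box; the size component is architecture dependent and the exact path here is an assumption, not part of this patch). A small userspace sketch reading the new file:

	#include <stdio.h>

	int main(void)
	{
		/* Illustrative path; adjust the size directory for your architecture. */
		const char *path =
			"/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages";
		FILE *f = fopen(path, "r");
		unsigned long nr;

		if (!f) {
			perror("fopen");
			return 1;
		}
		if (fscanf(f, "%lu", &nr) == 1)
			printf("nr_hugepages = %lu\n", nr);
		fclose(f);
		return 0;
	}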
1276
1277static void __exit hugetlb_exit(void)
1278{
1279 struct hstate *h;
1280
1281 for_each_hstate(h) {
1282 kobject_put(hstate_kobjs[h - hstates]);
1283 }
1284
1285 kobject_put(hugepages_kobj);
1286}
1287module_exit(hugetlb_exit);
1288
1289static int __init hugetlb_init(void)
1290{
 1291 /* Some platforms decide whether they support huge pages at boot
 1292 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
 1293 * there is no such support.
1294 */
1295 if (HPAGE_SHIFT == 0)
1296 return 0;
1297
1298 if (!size_to_hstate(default_hstate_size)) {
1299 default_hstate_size = HPAGE_SIZE;
1300 if (!size_to_hstate(default_hstate_size))
1301 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1302 }
1303 default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
1304 if (default_hstate_max_huge_pages)
1305 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1306
1307 hugetlb_init_hstates();
1308
1309 gather_bootmem_prealloc();
1310
1311 report_hugepages();
1312
1313 hugetlb_sysfs_init();
1314
1315 return 0;
1316}
1317module_init(hugetlb_init);
1318
1319/* Should be called on processing a hugepagesz=... option */
1320void __init hugetlb_add_hstate(unsigned order)
1321{
1322 struct hstate *h;
1323 unsigned long i;
1324
1325 if (size_to_hstate(PAGE_SIZE << order)) {
1326 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1327 return;
1328 }
1329 BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
1330 BUG_ON(order == 0);
1331 h = &hstates[max_hstate++];
1332 h->order = order;
1333 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1334 h->nr_huge_pages = 0;
1335 h->free_huge_pages = 0;
1336 for (i = 0; i < MAX_NUMNODES; ++i)
1337 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1338 h->hugetlb_next_nid = first_node(node_online_map);
1339 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1340 huge_page_size(h)/1024);
1341
1342 parsed_hstate = h;
1343}
1344
1345static int __init hugetlb_nrpages_setup(char *s)
1346{
1347 unsigned long *mhp;
1348 static unsigned long *last_mhp;
1349
1350 /*
1351 * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
1352 * so this hugepages= parameter goes to the "default hstate".
1353 */
1354 if (!max_hstate)
1355 mhp = &default_hstate_max_huge_pages;
1356 else
1357 mhp = &parsed_hstate->max_huge_pages;
1358
1359 if (mhp == last_mhp) {
1360 printk(KERN_WARNING "hugepages= specified twice without "
1361 "interleaving hugepagesz=, ignoring\n");
1362 return 1;
1363 }
1364
1365 if (sscanf(s, "%lu", mhp) <= 0)
1366 *mhp = 0;
1367
1368 /*
1369 * Global state is always initialized later in hugetlb_init.
1370 * But we need to allocate >= MAX_ORDER hstates here early to still
1371 * use the bootmem allocator.
1372 */
1373 if (max_hstate && parsed_hstate->order >= MAX_ORDER)
1374 hugetlb_hstate_alloc_pages(parsed_hstate);
1375
1376 last_mhp = mhp;
1377
1378 return 1;
1379}
1380__setup("hugepages=", hugetlb_nrpages_setup);
1381
1382static int __init hugetlb_default_setup(char *s)
1383{
1384 default_hstate_size = memparse(s, &s);
1385 return 1;
1386}
1387__setup("default_hugepagesz=", hugetlb_default_setup);
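Taken together, the three handlers above make ordering on the kernel command line significant: a hugepages= count binds to the most recent hugepagesz=, and if no hugepagesz= has been seen yet it configures the default hstate. As a purely illustrative example (the 2M and 1G sizes are placeholders and only valid where the architecture actually registers those hstates), a line such as

	hugepagesz=2M hugepages=512 hugepagesz=1G hugepages=4 default_hugepagesz=2M

asks for 512 two-megabyte and 4 one-gigabyte pages; counts for page sizes at or above MAX_ORDER are allocated from bootmem immediately at parse time, as the comment in hugetlb_nrpages_setup() notes.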
1388
1389static unsigned int cpuset_mems_nr(unsigned int *array)
1390{
1391 int node;
1392 unsigned int nr = 0;
1393
1394 for_each_node_mask(node, cpuset_current_mems_allowed)
1395 nr += array[node];
1396
1397 return nr;
1398}
1399
1400#ifdef CONFIG_SYSCTL
654int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1401int hugetlb_sysctl_handler(struct ctl_table *table, int write,
655 struct file *file, void __user *buffer, 1402 struct file *file, void __user *buffer,
656 size_t *length, loff_t *ppos) 1403 size_t *length, loff_t *ppos)
657{ 1404{
1405 struct hstate *h = &default_hstate;
1406 unsigned long tmp;
1407
1408 if (!write)
1409 tmp = h->max_huge_pages;
1410
1411 table->data = &tmp;
1412 table->maxlen = sizeof(unsigned long);
658 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1413 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
659 max_huge_pages = set_max_huge_pages(max_huge_pages); 1414
1415 if (write)
1416 h->max_huge_pages = set_max_huge_pages(h, tmp);
1417
660 return 0; 1418 return 0;
661} 1419}
662 1420
@@ -676,10 +1434,22 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
676 struct file *file, void __user *buffer, 1434 struct file *file, void __user *buffer,
677 size_t *length, loff_t *ppos) 1435 size_t *length, loff_t *ppos)
678{ 1436{
1437 struct hstate *h = &default_hstate;
1438 unsigned long tmp;
1439
1440 if (!write)
1441 tmp = h->nr_overcommit_huge_pages;
1442
1443 table->data = &tmp;
1444 table->maxlen = sizeof(unsigned long);
679 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1445 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
680 spin_lock(&hugetlb_lock); 1446
681 nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; 1447 if (write) {
682 spin_unlock(&hugetlb_lock); 1448 spin_lock(&hugetlb_lock);
1449 h->nr_overcommit_huge_pages = tmp;
1450 spin_unlock(&hugetlb_lock);
1451 }
1452
683 return 0; 1453 return 0;
684} 1454}
685 1455
@@ -687,34 +1457,118 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
687 1457
688int hugetlb_report_meminfo(char *buf) 1458int hugetlb_report_meminfo(char *buf)
689{ 1459{
1460 struct hstate *h = &default_hstate;
690 return sprintf(buf, 1461 return sprintf(buf,
691 "HugePages_Total: %5lu\n" 1462 "HugePages_Total: %5lu\n"
692 "HugePages_Free: %5lu\n" 1463 "HugePages_Free: %5lu\n"
693 "HugePages_Rsvd: %5lu\n" 1464 "HugePages_Rsvd: %5lu\n"
694 "HugePages_Surp: %5lu\n" 1465 "HugePages_Surp: %5lu\n"
695 "Hugepagesize: %5lu kB\n", 1466 "Hugepagesize: %5lu kB\n",
696 nr_huge_pages, 1467 h->nr_huge_pages,
697 free_huge_pages, 1468 h->free_huge_pages,
698 resv_huge_pages, 1469 h->resv_huge_pages,
699 surplus_huge_pages, 1470 h->surplus_huge_pages,
700 HPAGE_SIZE/1024); 1471 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
701} 1472}
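Rendered with the format strings above and the default 2 MB hstate on x86, the /proc/meminfo section would look roughly like the following (values are illustrative only):

	HugePages_Total:      64
	HugePages_Free:       32
	HugePages_Rsvd:       16
	HugePages_Surp:        0
	Hugepagesize:       2048 kB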
702 1473
703int hugetlb_report_node_meminfo(int nid, char *buf) 1474int hugetlb_report_node_meminfo(int nid, char *buf)
704{ 1475{
1476 struct hstate *h = &default_hstate;
705 return sprintf(buf, 1477 return sprintf(buf,
706 "Node %d HugePages_Total: %5u\n" 1478 "Node %d HugePages_Total: %5u\n"
707 "Node %d HugePages_Free: %5u\n" 1479 "Node %d HugePages_Free: %5u\n"
708 "Node %d HugePages_Surp: %5u\n", 1480 "Node %d HugePages_Surp: %5u\n",
709 nid, nr_huge_pages_node[nid], 1481 nid, h->nr_huge_pages_node[nid],
710 nid, free_huge_pages_node[nid], 1482 nid, h->free_huge_pages_node[nid],
711 nid, surplus_huge_pages_node[nid]); 1483 nid, h->surplus_huge_pages_node[nid]);
712} 1484}
713 1485
714/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 1486/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
715unsigned long hugetlb_total_pages(void) 1487unsigned long hugetlb_total_pages(void)
716{ 1488{
717 return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); 1489 struct hstate *h = &default_hstate;
1490 return h->nr_huge_pages * pages_per_huge_page(h);
1491}
1492
1493static int hugetlb_acct_memory(struct hstate *h, long delta)
1494{
1495 int ret = -ENOMEM;
1496
1497 spin_lock(&hugetlb_lock);
1498 /*
1499 * When cpuset is configured, it breaks the strict hugetlb page
1500 * reservation as the accounting is done on a global variable. Such
1501 * reservation is completely rubbish in the presence of cpuset because
1502 * the reservation is not checked against page availability for the
1503 * current cpuset. Application can still potentially OOM'ed by kernel
1504 * with lack of free htlb page in cpuset that the task is in.
1505 * Attempt to enforce strict accounting with cpuset is almost
1506 * impossible (or too ugly) because cpuset is too fluid that
1507 * task or memory node can be dynamically moved between cpusets.
1508 *
1509 * The change of semantics for shared hugetlb mapping with cpuset is
1510 * undesirable. However, in order to preserve some of the semantics,
1511 * we fall back to check against current free page availability as
1512 * a best attempt and hopefully to minimize the impact of changing
1513 * semantics that cpuset has.
1514 */
1515 if (delta > 0) {
1516 if (gather_surplus_pages(h, delta) < 0)
1517 goto out;
1518
1519 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
1520 return_unused_surplus_pages(h, delta);
1521 goto out;
1522 }
1523 }
1524
1525 ret = 0;
1526 if (delta < 0)
1527 return_unused_surplus_pages(h, (unsigned long) -delta);
1528
1529out:
1530 spin_unlock(&hugetlb_lock);
1531 return ret;
1532}
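Concretely (numbers illustrative): a request to charge delta = 4 pages first calls gather_surplus_pages(); even if that succeeds, when the nodes allowed by the current cpuset hold only, say, 2 free huge pages, the cpuset_mems_nr() test fails, the freshly gathered surplus is handed straight back via return_unused_surplus_pages(), and the caller sees -ENOMEM. This is the best-effort cpuset check described in the comment above.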
1533
1534static void hugetlb_vm_op_open(struct vm_area_struct *vma)
1535{
1536 struct resv_map *reservations = vma_resv_map(vma);
1537
1538 /*
 1539 * This new VMA should share its sibling's reservation map if present.
1540 * The VMA will only ever have a valid reservation map pointer where
1541 * it is being copied for another still existing VMA. As that VMA
 1542 * has a reference to the reservation map, it cannot disappear until
1543 * after this open call completes. It is therefore safe to take a
1544 * new reference here without additional locking.
1545 */
1546 if (reservations)
1547 kref_get(&reservations->refs);
1548}
1549
1550static void hugetlb_vm_op_close(struct vm_area_struct *vma)
1551{
1552 struct hstate *h = hstate_vma(vma);
1553 struct resv_map *reservations = vma_resv_map(vma);
1554 unsigned long reserve;
1555 unsigned long start;
1556 unsigned long end;
1557
1558 if (reservations) {
1559 start = vma_hugecache_offset(h, vma, vma->vm_start);
1560 end = vma_hugecache_offset(h, vma, vma->vm_end);
1561
1562 reserve = (end - start) -
1563 region_count(&reservations->regions, start, end);
1564
1565 kref_put(&reservations->refs, resv_map_release);
1566
1567 if (reserve) {
1568 hugetlb_acct_memory(h, -reserve);
1569 hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
1570 }
1571 }
718} 1572}
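For example (numbers illustrative), if a private mapping that owned its reserves covered 8 huge pages and the region map shows that 3 of them were actually faulted in, then reserve = 8 - 3 = 5: five never-consumed reservations are uncharged through hugetlb_acct_memory() and five units of quota are handed back as the VMA is closed.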
719 1573
720/* 1574/*
@@ -731,6 +1585,8 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
731 1585
732struct vm_operations_struct hugetlb_vm_ops = { 1586struct vm_operations_struct hugetlb_vm_ops = {
733 .fault = hugetlb_vm_op_fault, 1587 .fault = hugetlb_vm_op_fault,
1588 .open = hugetlb_vm_op_open,
1589 .close = hugetlb_vm_op_close,
734}; 1590};
735 1591
736static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 1592static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -769,14 +1625,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
769 struct page *ptepage; 1625 struct page *ptepage;
770 unsigned long addr; 1626 unsigned long addr;
771 int cow; 1627 int cow;
1628 struct hstate *h = hstate_vma(vma);
1629 unsigned long sz = huge_page_size(h);
772 1630
773 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 1631 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
774 1632
775 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 1633 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
776 src_pte = huge_pte_offset(src, addr); 1634 src_pte = huge_pte_offset(src, addr);
777 if (!src_pte) 1635 if (!src_pte)
778 continue; 1636 continue;
779 dst_pte = huge_pte_alloc(dst, addr); 1637 dst_pte = huge_pte_alloc(dst, addr, sz);
780 if (!dst_pte) 1638 if (!dst_pte)
781 goto nomem; 1639 goto nomem;
782 1640
@@ -804,7 +1662,7 @@ nomem:
804} 1662}
805 1663
806void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 1664void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
807 unsigned long end) 1665 unsigned long end, struct page *ref_page)
808{ 1666{
809 struct mm_struct *mm = vma->vm_mm; 1667 struct mm_struct *mm = vma->vm_mm;
810 unsigned long address; 1668 unsigned long address;
@@ -812,6 +1670,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
812 pte_t pte; 1670 pte_t pte;
813 struct page *page; 1671 struct page *page;
814 struct page *tmp; 1672 struct page *tmp;
1673 struct hstate *h = hstate_vma(vma);
1674 unsigned long sz = huge_page_size(h);
1675
815 /* 1676 /*
816 * A page gathering list, protected by per file i_mmap_lock. The 1677 * A page gathering list, protected by per file i_mmap_lock. The
817 * lock is used to avoid list corruption from multiple unmapping 1678 * lock is used to avoid list corruption from multiple unmapping
@@ -820,11 +1681,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
820 LIST_HEAD(page_list); 1681 LIST_HEAD(page_list);
821 1682
822 WARN_ON(!is_vm_hugetlb_page(vma)); 1683 WARN_ON(!is_vm_hugetlb_page(vma));
823 BUG_ON(start & ~HPAGE_MASK); 1684 BUG_ON(start & ~huge_page_mask(h));
824 BUG_ON(end & ~HPAGE_MASK); 1685 BUG_ON(end & ~huge_page_mask(h));
825 1686
1687 mmu_notifier_invalidate_range_start(mm, start, end);
826 spin_lock(&mm->page_table_lock); 1688 spin_lock(&mm->page_table_lock);
827 for (address = start; address < end; address += HPAGE_SIZE) { 1689 for (address = start; address < end; address += sz) {
828 ptep = huge_pte_offset(mm, address); 1690 ptep = huge_pte_offset(mm, address);
829 if (!ptep) 1691 if (!ptep)
830 continue; 1692 continue;
@@ -832,6 +1694,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
832 if (huge_pmd_unshare(mm, &address, ptep)) 1694 if (huge_pmd_unshare(mm, &address, ptep))
833 continue; 1695 continue;
834 1696
1697 /*
1698 * If a reference page is supplied, it is because a specific
1699 * page is being unmapped, not a range. Ensure the page we
1700 * are about to unmap is the actual page of interest.
1701 */
1702 if (ref_page) {
1703 pte = huge_ptep_get(ptep);
1704 if (huge_pte_none(pte))
1705 continue;
1706 page = pte_page(pte);
1707 if (page != ref_page)
1708 continue;
1709
1710 /*
1711 * Mark the VMA as having unmapped its page so that
1712 * future faults in this VMA will fail rather than
1713 * looking like data was lost
1714 */
1715 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
1716 }
1717
835 pte = huge_ptep_get_and_clear(mm, address, ptep); 1718 pte = huge_ptep_get_and_clear(mm, address, ptep);
836 if (huge_pte_none(pte)) 1719 if (huge_pte_none(pte))
837 continue; 1720 continue;
@@ -843,6 +1726,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
843 } 1726 }
844 spin_unlock(&mm->page_table_lock); 1727 spin_unlock(&mm->page_table_lock);
845 flush_tlb_range(vma, start, end); 1728 flush_tlb_range(vma, start, end);
1729 mmu_notifier_invalidate_range_end(mm, start, end);
846 list_for_each_entry_safe(page, tmp, &page_list, lru) { 1730 list_for_each_entry_safe(page, tmp, &page_list, lru) {
847 list_del(&page->lru); 1731 list_del(&page->lru);
848 put_page(page); 1732 put_page(page);
@@ -850,31 +1734,71 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
850} 1734}
851 1735
852void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 1736void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
853 unsigned long end) 1737 unsigned long end, struct page *ref_page)
854{ 1738{
1739 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1740 __unmap_hugepage_range(vma, start, end, ref_page);
1741 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
1742}
1743
1744/*
1745 * This is called when the original mapper is failing to COW a MAP_PRIVATE
 1746 * mapping it owns the reserve page for. The intention is to unmap the page
1747 * from other VMAs and let the children be SIGKILLed if they are faulting the
1748 * same region.
1749 */
1750int unmap_ref_private(struct mm_struct *mm,
1751 struct vm_area_struct *vma,
1752 struct page *page,
1753 unsigned long address)
1754{
1755 struct vm_area_struct *iter_vma;
1756 struct address_space *mapping;
1757 struct prio_tree_iter iter;
1758 pgoff_t pgoff;
1759
855 /* 1760 /*
856 * It is undesirable to test vma->vm_file as it should be non-null 1761 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
857 * for valid hugetlb area. However, vm_file will be NULL in the error 1762 * from page cache lookup which is in HPAGE_SIZE units.
858 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
859 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
860 * to clean up. Since no pte has actually been setup, it is safe to
861 * do nothing in this case.
862 */ 1763 */
863 if (vma->vm_file) { 1764 address = address & huge_page_mask(hstate_vma(vma));
864 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 1765 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
865 __unmap_hugepage_range(vma, start, end); 1766 + (vma->vm_pgoff >> PAGE_SHIFT);
866 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 1767 mapping = (struct address_space *)page_private(page);
1768
1769 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1770 /* Do not unmap the current VMA */
1771 if (iter_vma == vma)
1772 continue;
1773
1774 /*
1775 * Unmap the page from other VMAs without their own reserves.
1776 * They get marked to be SIGKILLed if they fault in these
1777 * areas. This is because a future no-page fault on this VMA
1778 * could insert a zeroed page instead of the data existing
1779 * from the time of fork. This would look like data corruption
1780 */
1781 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
1782 unmap_hugepage_range(iter_vma,
1783 address, address + HPAGE_SIZE,
1784 page);
867 } 1785 }
1786
1787 return 1;
868} 1788}
869 1789
870static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 1790static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
871 unsigned long address, pte_t *ptep, pte_t pte) 1791 unsigned long address, pte_t *ptep, pte_t pte,
1792 struct page *pagecache_page)
872{ 1793{
1794 struct hstate *h = hstate_vma(vma);
873 struct page *old_page, *new_page; 1795 struct page *old_page, *new_page;
874 int avoidcopy; 1796 int avoidcopy;
1797 int outside_reserve = 0;
875 1798
876 old_page = pte_page(pte); 1799 old_page = pte_page(pte);
877 1800
1801retry_avoidcopy:
878 /* If no-one else is actually using this page, avoid the copy 1802 /* If no-one else is actually using this page, avoid the copy
879 * and just make the page writable */ 1803 * and just make the page writable */
880 avoidcopy = (page_count(old_page) == 1); 1804 avoidcopy = (page_count(old_page) == 1);
@@ -883,11 +1807,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
883 return 0; 1807 return 0;
884 } 1808 }
885 1809
1810 /*
1811 * If the process that created a MAP_PRIVATE mapping is about to
1812 * perform a COW due to a shared page count, attempt to satisfy
1813 * the allocation without using the existing reserves. The pagecache
1814 * page is used to determine if the reserve at this address was
1815 * consumed or not. If reserves were used, a partial faulted mapping
1816 * at the time of fork() could consume its reserves on COW instead
1817 * of the full address range.
1818 */
1819 if (!(vma->vm_flags & VM_SHARED) &&
1820 is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
1821 old_page != pagecache_page)
1822 outside_reserve = 1;
1823
886 page_cache_get(old_page); 1824 page_cache_get(old_page);
887 new_page = alloc_huge_page(vma, address); 1825 new_page = alloc_huge_page(vma, address, outside_reserve);
888 1826
889 if (IS_ERR(new_page)) { 1827 if (IS_ERR(new_page)) {
890 page_cache_release(old_page); 1828 page_cache_release(old_page);
1829
1830 /*
1831 * If a process owning a MAP_PRIVATE mapping fails to COW,
1832 * it is due to references held by a child and an insufficient
 1833 * huge page pool. To guarantee the original mapper's
1834 * reliability, unmap the page from child processes. The child
1835 * may get SIGKILLed if it later faults.
1836 */
1837 if (outside_reserve) {
1838 BUG_ON(huge_pte_none(pte));
1839 if (unmap_ref_private(mm, vma, old_page, address)) {
1840 BUG_ON(page_count(old_page) != 1);
1841 BUG_ON(huge_pte_none(pte));
1842 goto retry_avoidcopy;
1843 }
1844 WARN_ON_ONCE(1);
1845 }
1846
891 return -PTR_ERR(new_page); 1847 return -PTR_ERR(new_page);
892 } 1848 }
893 1849
@@ -896,7 +1852,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
896 __SetPageUptodate(new_page); 1852 __SetPageUptodate(new_page);
897 spin_lock(&mm->page_table_lock); 1853 spin_lock(&mm->page_table_lock);
898 1854
899 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 1855 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
900 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 1856 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
901 /* Break COW */ 1857 /* Break COW */
902 huge_ptep_clear_flush(vma, address, ptep); 1858 huge_ptep_clear_flush(vma, address, ptep);
@@ -910,19 +1866,44 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
910 return 0; 1866 return 0;
911} 1867}
912 1868
1869/* Return the pagecache page at a given address within a VMA */
1870static struct page *hugetlbfs_pagecache_page(struct hstate *h,
1871 struct vm_area_struct *vma, unsigned long address)
1872{
1873 struct address_space *mapping;
1874 pgoff_t idx;
1875
1876 mapping = vma->vm_file->f_mapping;
1877 idx = vma_hugecache_offset(h, vma, address);
1878
1879 return find_lock_page(mapping, idx);
1880}
1881
913static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 1882static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
914 unsigned long address, pte_t *ptep, int write_access) 1883 unsigned long address, pte_t *ptep, int write_access)
915{ 1884{
1885 struct hstate *h = hstate_vma(vma);
916 int ret = VM_FAULT_SIGBUS; 1886 int ret = VM_FAULT_SIGBUS;
917 unsigned long idx; 1887 pgoff_t idx;
918 unsigned long size; 1888 unsigned long size;
919 struct page *page; 1889 struct page *page;
920 struct address_space *mapping; 1890 struct address_space *mapping;
921 pte_t new_pte; 1891 pte_t new_pte;
922 1892
1893 /*
1894 * Currently, we are forced to kill the process in the event the
1895 * original mapper has unmapped pages from the child due to a failed
 1896 * COW. Warn that such a situation has occurred as it may not be obvious.
1897 */
1898 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
1899 printk(KERN_WARNING
1900 "PID %d killed due to inadequate hugepage pool\n",
1901 current->pid);
1902 return ret;
1903 }
1904
923 mapping = vma->vm_file->f_mapping; 1905 mapping = vma->vm_file->f_mapping;
924 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 1906 idx = vma_hugecache_offset(h, vma, address);
925 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
926 1907
927 /* 1908 /*
928 * Use page lock to guard against racing truncation 1909 * Use page lock to guard against racing truncation
@@ -931,15 +1912,15 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
931retry: 1912retry:
932 page = find_lock_page(mapping, idx); 1913 page = find_lock_page(mapping, idx);
933 if (!page) { 1914 if (!page) {
934 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1915 size = i_size_read(mapping->host) >> huge_page_shift(h);
935 if (idx >= size) 1916 if (idx >= size)
936 goto out; 1917 goto out;
937 page = alloc_huge_page(vma, address); 1918 page = alloc_huge_page(vma, address, 0);
938 if (IS_ERR(page)) { 1919 if (IS_ERR(page)) {
939 ret = -PTR_ERR(page); 1920 ret = -PTR_ERR(page);
940 goto out; 1921 goto out;
941 } 1922 }
942 clear_huge_page(page, address); 1923 clear_huge_page(page, address, huge_page_size(h));
943 __SetPageUptodate(page); 1924 __SetPageUptodate(page);
944 1925
945 if (vma->vm_flags & VM_SHARED) { 1926 if (vma->vm_flags & VM_SHARED) {
@@ -955,14 +1936,26 @@ retry:
955 } 1936 }
956 1937
957 spin_lock(&inode->i_lock); 1938 spin_lock(&inode->i_lock);
958 inode->i_blocks += BLOCKS_PER_HUGEPAGE; 1939 inode->i_blocks += blocks_per_huge_page(h);
959 spin_unlock(&inode->i_lock); 1940 spin_unlock(&inode->i_lock);
960 } else 1941 } else
961 lock_page(page); 1942 lock_page(page);
962 } 1943 }
963 1944
1945 /*
1946 * If we are going to COW a private mapping later, we examine the
1947 * pending reservations for this page now. This will ensure that
1948 * any allocations necessary to record that reservation occur outside
1949 * the spinlock.
1950 */
1951 if (write_access && !(vma->vm_flags & VM_SHARED))
1952 if (vma_needs_reservation(h, vma, address) < 0) {
1953 ret = VM_FAULT_OOM;
1954 goto backout_unlocked;
1955 }
1956
964 spin_lock(&mm->page_table_lock); 1957 spin_lock(&mm->page_table_lock);
965 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1958 size = i_size_read(mapping->host) >> huge_page_shift(h);
966 if (idx >= size) 1959 if (idx >= size)
967 goto backout; 1960 goto backout;
968 1961
@@ -976,7 +1969,7 @@ retry:
976 1969
977 if (write_access && !(vma->vm_flags & VM_SHARED)) { 1970 if (write_access && !(vma->vm_flags & VM_SHARED)) {
978 /* Optimization, do the COW without a second fault */ 1971 /* Optimization, do the COW without a second fault */
979 ret = hugetlb_cow(mm, vma, address, ptep, new_pte); 1972 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
980 } 1973 }
981 1974
982 spin_unlock(&mm->page_table_lock); 1975 spin_unlock(&mm->page_table_lock);
@@ -986,6 +1979,7 @@ out:
986 1979
987backout: 1980backout:
988 spin_unlock(&mm->page_table_lock); 1981 spin_unlock(&mm->page_table_lock);
1982backout_unlocked:
989 unlock_page(page); 1983 unlock_page(page);
990 put_page(page); 1984 put_page(page);
991 goto out; 1985 goto out;
@@ -997,9 +1991,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
997 pte_t *ptep; 1991 pte_t *ptep;
998 pte_t entry; 1992 pte_t entry;
999 int ret; 1993 int ret;
1994 struct page *pagecache_page = NULL;
1000 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 1995 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
1996 struct hstate *h = hstate_vma(vma);
1001 1997
1002 ptep = huge_pte_alloc(mm, address); 1998 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
1003 if (!ptep) 1999 if (!ptep)
1004 return VM_FAULT_OOM; 2000 return VM_FAULT_OOM;
1005 2001
@@ -1012,23 +2008,58 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
1012 entry = huge_ptep_get(ptep); 2008 entry = huge_ptep_get(ptep);
1013 if (huge_pte_none(entry)) { 2009 if (huge_pte_none(entry)) {
1014 ret = hugetlb_no_page(mm, vma, address, ptep, write_access); 2010 ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
1015 mutex_unlock(&hugetlb_instantiation_mutex); 2011 goto out_unlock;
1016 return ret;
1017 } 2012 }
1018 2013
1019 ret = 0; 2014 ret = 0;
1020 2015
2016 /*
2017 * If we are going to COW the mapping later, we examine the pending
2018 * reservations for this page now. This will ensure that any
2019 * allocations necessary to record that reservation occur outside the
2020 * spinlock. For private mappings, we also lookup the pagecache
2021 * page now as it is used to determine if a reservation has been
2022 * consumed.
2023 */
2024 if (write_access && !pte_write(entry)) {
2025 if (vma_needs_reservation(h, vma, address) < 0) {
2026 ret = VM_FAULT_OOM;
2027 goto out_unlock;
2028 }
2029
2030 if (!(vma->vm_flags & VM_SHARED))
2031 pagecache_page = hugetlbfs_pagecache_page(h,
2032 vma, address);
2033 }
2034
1021 spin_lock(&mm->page_table_lock); 2035 spin_lock(&mm->page_table_lock);
1022 /* Check for a racing update before calling hugetlb_cow */ 2036 /* Check for a racing update before calling hugetlb_cow */
1023 if (likely(pte_same(entry, huge_ptep_get(ptep)))) 2037 if (likely(pte_same(entry, huge_ptep_get(ptep))))
1024 if (write_access && !pte_write(entry)) 2038 if (write_access && !pte_write(entry))
1025 ret = hugetlb_cow(mm, vma, address, ptep, entry); 2039 ret = hugetlb_cow(mm, vma, address, ptep, entry,
2040 pagecache_page);
1026 spin_unlock(&mm->page_table_lock); 2041 spin_unlock(&mm->page_table_lock);
2042
2043 if (pagecache_page) {
2044 unlock_page(pagecache_page);
2045 put_page(pagecache_page);
2046 }
2047
2048out_unlock:
1027 mutex_unlock(&hugetlb_instantiation_mutex); 2049 mutex_unlock(&hugetlb_instantiation_mutex);
1028 2050
1029 return ret; 2051 return ret;
1030} 2052}
1031 2053
2054/* Can be overriden by architectures */
2055__attribute__((weak)) struct page *
2056follow_huge_pud(struct mm_struct *mm, unsigned long address,
2057 pud_t *pud, int write)
2058{
2059 BUG();
2060 return NULL;
2061}
2062
1032int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2063int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1033 struct page **pages, struct vm_area_struct **vmas, 2064 struct page **pages, struct vm_area_struct **vmas,
1034 unsigned long *position, int *length, int i, 2065 unsigned long *position, int *length, int i,
@@ -1037,6 +2068,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1037 unsigned long pfn_offset; 2068 unsigned long pfn_offset;
1038 unsigned long vaddr = *position; 2069 unsigned long vaddr = *position;
1039 int remainder = *length; 2070 int remainder = *length;
2071 struct hstate *h = hstate_vma(vma);
1040 2072
1041 spin_lock(&mm->page_table_lock); 2073 spin_lock(&mm->page_table_lock);
1042 while (vaddr < vma->vm_end && remainder) { 2074 while (vaddr < vma->vm_end && remainder) {
@@ -1048,7 +2080,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 1048 * each hugepage. We have to make sure we get the 2080 * each hugepage. We have to make sure we get the
1049 * first, for the page indexing below to work. 2081 * first, for the page indexing below to work.
1050 */ 2082 */
1051 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); 2083 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
1052 2084
1053 if (!pte || huge_pte_none(huge_ptep_get(pte)) || 2085 if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
1054 (write && !pte_write(huge_ptep_get(pte)))) { 2086 (write && !pte_write(huge_ptep_get(pte)))) {
@@ -1066,7 +2098,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1066 break; 2098 break;
1067 } 2099 }
1068 2100
1069 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; 2101 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
1070 page = pte_page(huge_ptep_get(pte)); 2102 page = pte_page(huge_ptep_get(pte));
1071same_page: 2103same_page:
1072 if (pages) { 2104 if (pages) {
@@ -1082,7 +2114,7 @@ same_page:
1082 --remainder; 2114 --remainder;
1083 ++i; 2115 ++i;
1084 if (vaddr < vma->vm_end && remainder && 2116 if (vaddr < vma->vm_end && remainder &&
1085 pfn_offset < HPAGE_SIZE/PAGE_SIZE) { 2117 pfn_offset < pages_per_huge_page(h)) {
1086 /* 2118 /*
1087 * We use pfn_offset to avoid touching the pageframes 2119 * We use pfn_offset to avoid touching the pageframes
1088 * of this compound page. 2120 * of this compound page.
@@ -1104,13 +2136,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1104 unsigned long start = address; 2136 unsigned long start = address;
1105 pte_t *ptep; 2137 pte_t *ptep;
1106 pte_t pte; 2138 pte_t pte;
2139 struct hstate *h = hstate_vma(vma);
1107 2140
1108 BUG_ON(address >= end); 2141 BUG_ON(address >= end);
1109 flush_cache_range(vma, address, end); 2142 flush_cache_range(vma, address, end);
1110 2143
1111 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2144 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1112 spin_lock(&mm->page_table_lock); 2145 spin_lock(&mm->page_table_lock);
1113 for (; address < end; address += HPAGE_SIZE) { 2146 for (; address < end; address += huge_page_size(h)) {
1114 ptep = huge_pte_offset(mm, address); 2147 ptep = huge_pte_offset(mm, address);
1115 if (!ptep) 2148 if (!ptep)
1116 continue; 2149 continue;
@@ -1128,195 +2161,59 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1128 flush_tlb_range(vma, start, end); 2161 flush_tlb_range(vma, start, end);
1129} 2162}
1130 2163
1131struct file_region { 2164int hugetlb_reserve_pages(struct inode *inode,
1132 struct list_head link; 2165 long from, long to,
1133 long from; 2166 struct vm_area_struct *vma)
1134 long to;
1135};
1136
1137static long region_add(struct list_head *head, long f, long t)
1138{
1139 struct file_region *rg, *nrg, *trg;
1140
1141 /* Locate the region we are either in or before. */
1142 list_for_each_entry(rg, head, link)
1143 if (f <= rg->to)
1144 break;
1145
1146 /* Round our left edge to the current segment if it encloses us. */
1147 if (f > rg->from)
1148 f = rg->from;
1149
1150 /* Check for and consume any regions we now overlap with. */
1151 nrg = rg;
1152 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1153 if (&rg->link == head)
1154 break;
1155 if (rg->from > t)
1156 break;
1157
1158 /* If this area reaches higher then extend our area to
1159 * include it completely. If this is not the first area
1160 * which we intend to reuse, free it. */
1161 if (rg->to > t)
1162 t = rg->to;
1163 if (rg != nrg) {
1164 list_del(&rg->link);
1165 kfree(rg);
1166 }
1167 }
1168 nrg->from = f;
1169 nrg->to = t;
1170 return 0;
1171}
1172
1173static long region_chg(struct list_head *head, long f, long t)
1174{ 2167{
1175 struct file_region *rg, *nrg; 2168 long ret, chg;
1176 long chg = 0; 2169 struct hstate *h = hstate_inode(inode);
1177
1178 /* Locate the region we are before or in. */
1179 list_for_each_entry(rg, head, link)
1180 if (f <= rg->to)
1181 break;
1182
1183 /* If we are below the current region then a new region is required.
1184 * Subtle, allocate a new region at the position but make it zero
1185 * size such that we can guarantee to record the reservation. */
1186 if (&rg->link == head || t < rg->from) {
1187 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
1188 if (!nrg)
1189 return -ENOMEM;
1190 nrg->from = f;
1191 nrg->to = f;
1192 INIT_LIST_HEAD(&nrg->link);
1193 list_add(&nrg->link, rg->link.prev);
1194
1195 return t - f;
1196 }
1197
1198 /* Round our left edge to the current segment if it encloses us. */
1199 if (f > rg->from)
1200 f = rg->from;
1201 chg = t - f;
1202
1203 /* Check for and consume any regions we now overlap with. */
1204 list_for_each_entry(rg, rg->link.prev, link) {
1205 if (&rg->link == head)
1206 break;
1207 if (rg->from > t)
1208 return chg;
1209
 1210 /* We overlap with this area, if it extends further than
1211 * us then we must extend ourselves. Account for its
1212 * existing reservation. */
1213 if (rg->to > t) {
1214 chg += rg->to - t;
1215 t = rg->to;
1216 }
1217 chg -= rg->to - rg->from;
1218 }
1219 return chg;
1220}
1221
1222static long region_truncate(struct list_head *head, long end)
1223{
1224 struct file_region *rg, *trg;
1225 long chg = 0;
1226 2170
1227 /* Locate the region we are either in or before. */ 2171 if (vma && vma->vm_flags & VM_NORESERVE)
1228 list_for_each_entry(rg, head, link)
1229 if (end <= rg->to)
1230 break;
1231 if (&rg->link == head)
1232 return 0; 2172 return 0;
1233 2173
1234 /* If we are in the middle of a region then adjust it. */
1235 if (end > rg->from) {
1236 chg = rg->to - end;
1237 rg->to = end;
1238 rg = list_entry(rg->link.next, typeof(*rg), link);
1239 }
1240
1241 /* Drop any remaining regions. */
1242 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1243 if (&rg->link == head)
1244 break;
1245 chg += rg->to - rg->from;
1246 list_del(&rg->link);
1247 kfree(rg);
1248 }
1249 return chg;
1250}
1251
1252static int hugetlb_acct_memory(long delta)
1253{
1254 int ret = -ENOMEM;
1255
1256 spin_lock(&hugetlb_lock);
1257 /* 2174 /*
1258 * When cpuset is configured, it breaks the strict hugetlb page 2175 * Shared mappings base their reservation on the number of pages that
1259 * reservation as the accounting is done on a global variable. Such 2176 * are already allocated on behalf of the file. Private mappings need
1260 * reservation is completely rubbish in the presence of cpuset because 2177 * to reserve the full area even if read-only as mprotect() may be
1261 * the reservation is not checked against page availability for the 2178 * called to make the mapping read-write. Assume !vma is a shm mapping
1262 * current cpuset. Application can still potentially OOM'ed by kernel
1263 * with lack of free htlb page in cpuset that the task is in.
1264 * Attempt to enforce strict accounting with cpuset is almost
1265 * impossible (or too ugly) because cpuset is too fluid that
1266 * task or memory node can be dynamically moved between cpusets.
1267 *
1268 * The change of semantics for shared hugetlb mapping with cpuset is
1269 * undesirable. However, in order to preserve some of the semantics,
1270 * we fall back to check against current free page availability as
1271 * a best attempt and hopefully to minimize the impact of changing
1272 * semantics that cpuset has.
1273 */ 2179 */
1274 if (delta > 0) { 2180 if (!vma || vma->vm_flags & VM_SHARED)
1275 if (gather_surplus_pages(delta) < 0) 2181 chg = region_chg(&inode->i_mapping->private_list, from, to);
1276 goto out; 2182 else {
1277 2183 struct resv_map *resv_map = resv_map_alloc();
1278 if (delta > cpuset_mems_nr(free_huge_pages_node)) { 2184 if (!resv_map)
1279 return_unused_surplus_pages(delta); 2185 return -ENOMEM;
1280 goto out;
1281 }
1282 }
1283
1284 ret = 0;
1285 if (delta < 0)
1286 return_unused_surplus_pages((unsigned long) -delta);
1287 2186
1288out: 2187 chg = to - from;
1289 spin_unlock(&hugetlb_lock);
1290 return ret;
1291}
1292 2188
1293int hugetlb_reserve_pages(struct inode *inode, long from, long to) 2189 set_vma_resv_map(vma, resv_map);
1294{ 2190 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
1295 long ret, chg; 2191 }
1296 2192
1297 chg = region_chg(&inode->i_mapping->private_list, from, to);
1298 if (chg < 0) 2193 if (chg < 0)
1299 return chg; 2194 return chg;
1300 2195
1301 if (hugetlb_get_quota(inode->i_mapping, chg)) 2196 if (hugetlb_get_quota(inode->i_mapping, chg))
1302 return -ENOSPC; 2197 return -ENOSPC;
1303 ret = hugetlb_acct_memory(chg); 2198 ret = hugetlb_acct_memory(h, chg);
1304 if (ret < 0) { 2199 if (ret < 0) {
1305 hugetlb_put_quota(inode->i_mapping, chg); 2200 hugetlb_put_quota(inode->i_mapping, chg);
1306 return ret; 2201 return ret;
1307 } 2202 }
1308 region_add(&inode->i_mapping->private_list, from, to); 2203 if (!vma || vma->vm_flags & VM_SHARED)
2204 region_add(&inode->i_mapping->private_list, from, to);
1309 return 0; 2205 return 0;
1310} 2206}
1311 2207
1312void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 2208void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
1313{ 2209{
2210 struct hstate *h = hstate_inode(inode);
1314 long chg = region_truncate(&inode->i_mapping->private_list, offset); 2211 long chg = region_truncate(&inode->i_mapping->private_list, offset);
1315 2212
1316 spin_lock(&inode->i_lock); 2213 spin_lock(&inode->i_lock);
1317 inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; 2214 inode->i_blocks -= blocks_per_huge_page(h);
1318 spin_unlock(&inode->i_lock); 2215 spin_unlock(&inode->i_lock);
1319 2216
1320 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2217 hugetlb_put_quota(inode->i_mapping, (chg - freed));
1321 hugetlb_acct_memory(-(chg - freed)); 2218 hugetlb_acct_memory(h, -(chg - freed));
1322} 2219}
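Roughly worked through: if truncation removes 10 reserved huge pages from the region map (chg = 10) but only 6 of them had actually been instantiated and freed (freed = 6), the 4 never-used reservations are what get their quota returned and their accounting dropped here; the quota for the 6 real pages is released when the pages themselves are freed.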
diff --git a/mm/internal.h b/mm/internal.h
index 0034e947e4bc..1f43f7416972 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,6 +13,11 @@
13 13
14#include <linux/mm.h> 14#include <linux/mm.h>
15 15
16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
17 unsigned long floor, unsigned long ceiling);
18
19extern void prep_compound_page(struct page *page, unsigned long order);
20
16static inline void set_page_count(struct page *page, int v) 21static inline void set_page_count(struct page *page, int v)
17{ 22{
18 atomic_set(&page->_count, v); 23 atomic_set(&page->_count, v);
@@ -59,4 +64,60 @@ static inline unsigned long page_order(struct page *page)
59#define __paginginit __init 64#define __paginginit __init
60#endif 65#endif
61 66
67/* Memory initialisation debug and verification */
68enum mminit_level {
69 MMINIT_WARNING,
70 MMINIT_VERIFY,
71 MMINIT_TRACE
72};
73
74#ifdef CONFIG_DEBUG_MEMORY_INIT
75
76extern int mminit_loglevel;
77
78#define mminit_dprintk(level, prefix, fmt, arg...) \
79do { \
80 if (level < mminit_loglevel) { \
81 printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
82 printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
83 } \
84} while (0)
85
86extern void mminit_verify_pageflags_layout(void);
87extern void mminit_verify_page_links(struct page *page,
88 enum zone_type zone, unsigned long nid, unsigned long pfn);
89extern void mminit_verify_zonelist(void);
90
91#else
92
93static inline void mminit_dprintk(enum mminit_level level,
94 const char *prefix, const char *fmt, ...)
95{
96}
97
98static inline void mminit_verify_pageflags_layout(void)
99{
100}
101
102static inline void mminit_verify_page_links(struct page *page,
103 enum zone_type zone, unsigned long nid, unsigned long pfn)
104{
105}
106
107static inline void mminit_verify_zonelist(void)
108{
109}
110#endif /* CONFIG_DEBUG_MEMORY_INIT */
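A hypothetical call site for the new debug macro, shown only to illustrate its shape; the "zonelist" prefix and the message are made up, and nid/pgdat are assumed locals. The message is emitted only when mminit_loglevel is greater than MMINIT_TRACE:

	mminit_dprintk(MMINIT_TRACE, "zonelist",
		       "node %d has %lu present pages\n",
		       nid, pgdat->node_present_pages);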
111
112/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
113#if defined(CONFIG_SPARSEMEM)
114extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
115 unsigned long *end_pfn);
116#else
117static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
118 unsigned long *end_pfn)
119{
120}
121#endif /* CONFIG_SPARSEMEM */
122
62#endif 123#endif
diff --git a/mm/madvise.c b/mm/madvise.c
index 23a0ec3e0ea0..f9349c18a1b5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -132,10 +132,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
132 * Application no longer needs these pages. If the pages are dirty, 132 * Application no longer needs these pages. If the pages are dirty,
133 * it's OK to just throw them away. The app will be more careful about 133 * it's OK to just throw them away. The app will be more careful about
134 * data it wants to keep. Be sure to free swap resources too. The 134 * data it wants to keep. Be sure to free swap resources too. The
135 * zap_page_range call sets things up for refill_inactive to actually free 135 * zap_page_range call sets things up for shrink_active_list to actually free
136 * these pages later if no one else has touched them in the meantime, 136 * these pages later if no one else has touched them in the meantime,
137 * although we could add these pages to a global reuse list for 137 * although we could add these pages to a global reuse list for
138 * refill_inactive to pick up before reclaiming other pages. 138 * shrink_active_list to pick up before reclaiming other pages.
139 * 139 *
140 * NB: This interface discards data rather than pushes it out to swap, 140 * NB: This interface discards data rather than pushes it out to swap,
141 * as some implementations do. This has performance implications for 141 * as some implementations do. This has performance implications for
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e46451e1d9b7..36896f3eb7f5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,9 +35,9 @@
35 35
36#include <asm/uaccess.h> 36#include <asm/uaccess.h>
37 37
38struct cgroup_subsys mem_cgroup_subsys; 38struct cgroup_subsys mem_cgroup_subsys __read_mostly;
39static const int MEM_CGROUP_RECLAIM_RETRIES = 5; 39static struct kmem_cache *page_cgroup_cache __read_mostly;
40static struct kmem_cache *page_cgroup_cache; 40#define MEM_CGROUP_RECLAIM_RETRIES 5
41 41
42/* 42/*
43 * Statistics for memory cgroup. 43 * Statistics for memory cgroup.
@@ -166,7 +166,6 @@ struct page_cgroup {
166 struct list_head lru; /* per cgroup LRU list */ 166 struct list_head lru; /* per cgroup LRU list */
167 struct page *page; 167 struct page *page;
168 struct mem_cgroup *mem_cgroup; 168 struct mem_cgroup *mem_cgroup;
169 int ref_cnt; /* cached, mapped, migrating */
170 int flags; 169 int flags;
171}; 170};
172#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ 171#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
@@ -185,6 +184,7 @@ static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
185enum charge_type { 184enum charge_type {
186 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 185 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
187 MEM_CGROUP_CHARGE_TYPE_MAPPED, 186 MEM_CGROUP_CHARGE_TYPE_MAPPED,
187 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
188}; 188};
189 189
190/* 190/*
@@ -250,6 +250,14 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
250 250
251struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 251struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
252{ 252{
253 /*
254 * mm_update_next_owner() may clear mm->owner to NULL
255 * if it races with swapoff, page migration, etc.
256 * So this can be called with p == NULL.
257 */
258 if (unlikely(!p))
259 return NULL;
260
253 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 261 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
254 struct mem_cgroup, css); 262 struct mem_cgroup, css);
255} 263}
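
Because mm->owner can be cleared while a charge is in flight, every caller that maps an mm to its memory cgroup now has to tolerate a NULL result from mem_cgroup_from_task(). The calling pattern the rest of this patch uses looks like the following sketch (taken out of context, not compilable on its own):

	struct mem_cgroup *mem;

	rcu_read_lock();
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!mem)) {		/* owner already cleared: nothing to charge */
		rcu_read_unlock();
		return 0;
	}
	css_get(&mem->css);		/* pin the cgroup before leaving the RCU section */
	rcu_read_unlock();
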
@@ -296,7 +304,7 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
296 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; 304 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
297 305
298 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); 306 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
299 list_del_init(&pc->lru); 307 list_del(&pc->lru);
300} 308}
301 309
302static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, 310static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
@@ -354,6 +362,9 @@ void mem_cgroup_move_lists(struct page *page, bool active)
354 struct mem_cgroup_per_zone *mz; 362 struct mem_cgroup_per_zone *mz;
355 unsigned long flags; 363 unsigned long flags;
356 364
365 if (mem_cgroup_subsys.disabled)
366 return;
367
357 /* 368 /*
358 * We cannot lock_page_cgroup while holding zone's lru_lock, 369 * We cannot lock_page_cgroup while holding zone's lru_lock,
359 * because other holders of lock_page_cgroup can be interrupted 370 * because other holders of lock_page_cgroup can be interrupted
@@ -524,7 +535,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
524 * < 0 if the cgroup is over its limit 535 * < 0 if the cgroup is over its limit
525 */ 536 */
526static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 537static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
527 gfp_t gfp_mask, enum charge_type ctype) 538 gfp_t gfp_mask, enum charge_type ctype,
539 struct mem_cgroup *memcg)
528{ 540{
529 struct mem_cgroup *mem; 541 struct mem_cgroup *mem;
530 struct page_cgroup *pc; 542 struct page_cgroup *pc;
@@ -532,35 +544,8 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
532 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 544 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
533 struct mem_cgroup_per_zone *mz; 545 struct mem_cgroup_per_zone *mz;
534 546
535 if (mem_cgroup_subsys.disabled) 547 pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
536 return 0; 548 if (unlikely(pc == NULL))
537
538 /*
539 * Should page_cgroup's go to their own slab?
540 * One could optimize the performance of the charging routine
541 * by saving a bit in the page_flags and using it as a lock
542 * to see if the cgroup page already has a page_cgroup associated
543 * with it
544 */
545retry:
546 lock_page_cgroup(page);
547 pc = page_get_page_cgroup(page);
548 /*
549 * The page_cgroup exists and
550 * the page has already been accounted.
551 */
552 if (pc) {
553 VM_BUG_ON(pc->page != page);
554 VM_BUG_ON(pc->ref_cnt <= 0);
555
556 pc->ref_cnt++;
557 unlock_page_cgroup(page);
558 goto done;
559 }
560 unlock_page_cgroup(page);
561
562 pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask);
563 if (pc == NULL)
564 goto err; 549 goto err;
565 550
566 /* 551 /*
@@ -569,16 +554,23 @@ retry:
569 * thread group leader migrates. It's possible that mm is not 554 * thread group leader migrates. It's possible that mm is not
570 * set, if so charge the init_mm (happens for pagecache usage). 555 * set, if so charge the init_mm (happens for pagecache usage).
571 */ 556 */
572 if (!mm) 557 if (likely(!memcg)) {
573 mm = &init_mm; 558 rcu_read_lock();
574 559 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
575 rcu_read_lock(); 560 if (unlikely(!mem)) {
576 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 561 rcu_read_unlock();
577 /* 562 kmem_cache_free(page_cgroup_cache, pc);
578 * For every charge from the cgroup, increment reference count 563 return 0;
579 */ 564 }
580 css_get(&mem->css); 565 /*
581 rcu_read_unlock(); 566 * For every charge from the cgroup, increment reference count
567 */
568 css_get(&mem->css);
569 rcu_read_unlock();
570 } else {
571 mem = memcg;
572 css_get(&memcg->css);
573 }
582 574
583 while (res_counter_charge(&mem->res, PAGE_SIZE)) { 575 while (res_counter_charge(&mem->res, PAGE_SIZE)) {
584 if (!(gfp_mask & __GFP_WAIT)) 576 if (!(gfp_mask & __GFP_WAIT))
@@ -603,25 +595,24 @@ retry:
603 } 595 }
604 } 596 }
605 597
606 pc->ref_cnt = 1;
607 pc->mem_cgroup = mem; 598 pc->mem_cgroup = mem;
608 pc->page = page; 599 pc->page = page;
609 pc->flags = PAGE_CGROUP_FLAG_ACTIVE; 600 /*
601 * If a page is accounted as a page cache, insert to inactive list.
602 * If anon, insert to active list.
603 */
610 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) 604 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
611 pc->flags = PAGE_CGROUP_FLAG_CACHE; 605 pc->flags = PAGE_CGROUP_FLAG_CACHE;
606 else
607 pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
612 608
613 lock_page_cgroup(page); 609 lock_page_cgroup(page);
614 if (page_get_page_cgroup(page)) { 610 if (unlikely(page_get_page_cgroup(page))) {
615 unlock_page_cgroup(page); 611 unlock_page_cgroup(page);
616 /*
617 * Another charge has been added to this page already.
618 * We take lock_page_cgroup(page) again and read
619 * page->cgroup, increment refcnt.... just retry is OK.
620 */
621 res_counter_uncharge(&mem->res, PAGE_SIZE); 612 res_counter_uncharge(&mem->res, PAGE_SIZE);
622 css_put(&mem->css); 613 css_put(&mem->css);
623 kmem_cache_free(page_cgroup_cache, pc); 614 kmem_cache_free(page_cgroup_cache, pc);
624 goto retry; 615 goto done;
625 } 616 }
626 page_assign_page_cgroup(page, pc); 617 page_assign_page_cgroup(page, pc);
627 618
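
The reworked charge path above is an optimistic insert: the page_cgroup is allocated and the res_counter charged without holding the page's cgroup lock, and only the final assignment is made under lock_page_cgroup(); if another thread got there first, the charge is rolled back rather than retried. A small runnable userspace analogue of that shape (the names are made up; a pthread mutex stands in for the page-cgroup bit lock):

#include <pthread.h>
#include <stdlib.h>

struct page_cgroup { int flags; };

static struct page_cgroup *slot;	/* stands in for page->page_cgroup */
static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns 0 on success, including the "someone else already charged it" case. */
static int charge(void)
{
	struct page_cgroup *pc = malloc(sizeof(*pc));	/* optimistic allocation */

	if (!pc)
		return -1;
	/* ... the res_counter charge would happen here, still unlocked ... */
	pthread_mutex_lock(&slot_lock);
	if (slot) {			/* lost the race: undo, do not retry */
		pthread_mutex_unlock(&slot_lock);
		free(pc);
		return 0;
	}
	slot = pc;			/* won the race: publish the page_cgroup */
	pthread_mutex_unlock(&slot_lock);
	return 0;
}

int main(void)
{
	int ret = charge();

	ret |= charge();		/* second call takes the rollback path */
	return ret;
}
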
@@ -642,24 +633,65 @@ err:
642 633
643int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) 634int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
644{ 635{
636 if (mem_cgroup_subsys.disabled)
637 return 0;
638
639 /*
640 * If already mapped, we don't have to account.
641 * If page cache, page->mapping has address_space.
642 * But page->mapping may have out-of-use anon_vma pointer,
 643 * detect it with a PageAnon() check. A newly mapped anonymous page's page->mapping
644 * is NULL.
645 */
646 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
647 return 0;
648 if (unlikely(!mm))
649 mm = &init_mm;
645 return mem_cgroup_charge_common(page, mm, gfp_mask, 650 return mem_cgroup_charge_common(page, mm, gfp_mask,
646 MEM_CGROUP_CHARGE_TYPE_MAPPED); 651 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
647} 652}
648 653
649int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 654int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
650 gfp_t gfp_mask) 655 gfp_t gfp_mask)
651{ 656{
652 if (!mm) 657 if (mem_cgroup_subsys.disabled)
658 return 0;
659
660 /*
 661 * Corner case handling. This is usually called from add_to_page_cache(),
 662 * but some filesystems (shmem) precharge the page before calling it
 663 * and then call add_to_page_cache() with GFP_NOWAIT.
 664 *
 665 * In the GFP_NOWAIT case the page may already be pre-charged before
 666 * add_to_page_cache() runs (see shmem.c), so check for that here and
 667 * avoid charging it twice. (It works but costs a bit more.)
668 */
669 if (!(gfp_mask & __GFP_WAIT)) {
670 struct page_cgroup *pc;
671
672 lock_page_cgroup(page);
673 pc = page_get_page_cgroup(page);
674 if (pc) {
675 VM_BUG_ON(pc->page != page);
676 VM_BUG_ON(!pc->mem_cgroup);
677 unlock_page_cgroup(page);
678 return 0;
679 }
680 unlock_page_cgroup(page);
681 }
682
683 if (unlikely(!mm))
653 mm = &init_mm; 684 mm = &init_mm;
685
654 return mem_cgroup_charge_common(page, mm, gfp_mask, 686 return mem_cgroup_charge_common(page, mm, gfp_mask,
655 MEM_CGROUP_CHARGE_TYPE_CACHE); 687 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
656} 688}
657 689
658/* 690/*
659 * Uncharging is always a welcome operation, we never complain, simply 691 * uncharge if !page_mapped(page)
660 * uncharge.
661 */ 692 */
662void mem_cgroup_uncharge_page(struct page *page) 693static void
694__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
663{ 695{
664 struct page_cgroup *pc; 696 struct page_cgroup *pc;
665 struct mem_cgroup *mem; 697 struct mem_cgroup *mem;
@@ -674,98 +706,158 @@ void mem_cgroup_uncharge_page(struct page *page)
674 */ 706 */
675 lock_page_cgroup(page); 707 lock_page_cgroup(page);
676 pc = page_get_page_cgroup(page); 708 pc = page_get_page_cgroup(page);
677 if (!pc) 709 if (unlikely(!pc))
678 goto unlock; 710 goto unlock;
679 711
680 VM_BUG_ON(pc->page != page); 712 VM_BUG_ON(pc->page != page);
681 VM_BUG_ON(pc->ref_cnt <= 0);
682 713
683 if (--(pc->ref_cnt) == 0) { 714 if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
684 mz = page_cgroup_zoneinfo(pc); 715 && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
685 spin_lock_irqsave(&mz->lru_lock, flags); 716 || page_mapped(page)))
686 __mem_cgroup_remove_list(mz, pc); 717 goto unlock;
687 spin_unlock_irqrestore(&mz->lru_lock, flags);
688 718
689 page_assign_page_cgroup(page, NULL); 719 mz = page_cgroup_zoneinfo(pc);
690 unlock_page_cgroup(page); 720 spin_lock_irqsave(&mz->lru_lock, flags);
721 __mem_cgroup_remove_list(mz, pc);
722 spin_unlock_irqrestore(&mz->lru_lock, flags);
691 723
692 mem = pc->mem_cgroup; 724 page_assign_page_cgroup(page, NULL);
693 res_counter_uncharge(&mem->res, PAGE_SIZE); 725 unlock_page_cgroup(page);
694 css_put(&mem->css);
695 726
696 kmem_cache_free(page_cgroup_cache, pc); 727 mem = pc->mem_cgroup;
697 return; 728 res_counter_uncharge(&mem->res, PAGE_SIZE);
698 } 729 css_put(&mem->css);
699 730
731 kmem_cache_free(page_cgroup_cache, pc);
732 return;
700unlock: 733unlock:
701 unlock_page_cgroup(page); 734 unlock_page_cgroup(page);
702} 735}
703 736
737void mem_cgroup_uncharge_page(struct page *page)
738{
739 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
740}
741
742void mem_cgroup_uncharge_cache_page(struct page *page)
743{
744 VM_BUG_ON(page_mapped(page));
745 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
746}
747
704/* 748/*
705 * Returns non-zero if a page (under migration) has valid page_cgroup member. 749 * Before starting migration, account against new page.
706 * Refcnt of page_cgroup is incremented.
707 */ 750 */
708int mem_cgroup_prepare_migration(struct page *page) 751int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
709{ 752{
710 struct page_cgroup *pc; 753 struct page_cgroup *pc;
754 struct mem_cgroup *mem = NULL;
755 enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
756 int ret = 0;
711 757
712 if (mem_cgroup_subsys.disabled) 758 if (mem_cgroup_subsys.disabled)
713 return 0; 759 return 0;
714 760
715 lock_page_cgroup(page); 761 lock_page_cgroup(page);
716 pc = page_get_page_cgroup(page); 762 pc = page_get_page_cgroup(page);
717 if (pc) 763 if (pc) {
718 pc->ref_cnt++; 764 mem = pc->mem_cgroup;
765 css_get(&mem->css);
766 if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
767 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
768 }
719 unlock_page_cgroup(page); 769 unlock_page_cgroup(page);
720 return pc != NULL; 770 if (mem) {
771 ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
772 ctype, mem);
773 css_put(&mem->css);
774 }
775 return ret;
721} 776}
722 777
723void mem_cgroup_end_migration(struct page *page) 778/* remove redundant charge if migration failed*/
779void mem_cgroup_end_migration(struct page *newpage)
724{ 780{
725 mem_cgroup_uncharge_page(page); 781 /*
 782 * On success, page->mapping is not NULL.
 783 * Special rollback care is necessary when
 784 * 1. migration failed (newpage->mapping is cleared in this case), or
 785 * 2. the newpage was moved but never remapped again because the task
 786 * exited and the newpage is obsolete. In this case the new page
 787 * may be swapcache, so we just call mem_cgroup_uncharge_page()
 788 * unconditionally to avoid a mess; the page_cgroup is removed only
 789 * if it is unnecessary. File cache pages are still on the radix-tree
 790 * and need no special care.
791 */
792 if (!newpage->mapping)
793 __mem_cgroup_uncharge_common(newpage,
794 MEM_CGROUP_CHARGE_TYPE_FORCE);
795 else if (PageAnon(newpage))
796 mem_cgroup_uncharge_page(newpage);
726} 797}
727 798
728/* 799/*
729 * We know both *page* and *newpage* are now not-on-LRU and PG_locked. 800 * A call to try to shrink memory usage under specified resource controller.
730 * And no race with uncharge() routines because page_cgroup for *page* 801 * This is typically used for page reclaiming for shmem for reducing side
731 * has extra one reference by mem_cgroup_prepare_migration. 802 * effect of page allocation from shmem, which is used by some mem_cgroup.
732 */ 803 */
733void mem_cgroup_page_migration(struct page *page, struct page *newpage) 804int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
734{ 805{
735 struct page_cgroup *pc; 806 struct mem_cgroup *mem;
736 struct mem_cgroup_per_zone *mz; 807 int progress = 0;
737 unsigned long flags; 808 int retry = MEM_CGROUP_RECLAIM_RETRIES;
738 809
739 lock_page_cgroup(page); 810 if (mem_cgroup_subsys.disabled)
740 pc = page_get_page_cgroup(page); 811 return 0;
741 if (!pc) { 812 if (!mm)
742 unlock_page_cgroup(page); 813 return 0;
743 return; 814
815 rcu_read_lock();
816 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
817 if (unlikely(!mem)) {
818 rcu_read_unlock();
819 return 0;
744 } 820 }
821 css_get(&mem->css);
822 rcu_read_unlock();
745 823
746 mz = page_cgroup_zoneinfo(pc); 824 do {
747 spin_lock_irqsave(&mz->lru_lock, flags); 825 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
748 __mem_cgroup_remove_list(mz, pc); 826 progress += res_counter_check_under_limit(&mem->res);
749 spin_unlock_irqrestore(&mz->lru_lock, flags); 827 } while (!progress && --retry);
750 828
751 page_assign_page_cgroup(page, NULL); 829 css_put(&mem->css);
752 unlock_page_cgroup(page); 830 if (!retry)
831 return -ENOMEM;
832 return 0;
833}
753 834
754 pc->page = newpage; 835int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
755 lock_page_cgroup(newpage); 836{
756 page_assign_page_cgroup(newpage, pc);
757 837
758 mz = page_cgroup_zoneinfo(pc); 838 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
759 spin_lock_irqsave(&mz->lru_lock, flags); 839 int progress;
760 __mem_cgroup_add_list(mz, pc); 840 int ret = 0;
761 spin_unlock_irqrestore(&mz->lru_lock, flags);
762 841
763 unlock_page_cgroup(newpage); 842 while (res_counter_set_limit(&memcg->res, val)) {
843 if (signal_pending(current)) {
844 ret = -EINTR;
845 break;
846 }
847 if (!retry_count) {
848 ret = -EBUSY;
849 break;
850 }
851 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL);
852 if (!progress)
853 retry_count--;
854 }
855 return ret;
764} 856}
765 857
858
766/* 859/*
767 * This routine traverse page_cgroup in given list and drop them all. 860 * This routine traverse page_cgroup in given list and drop them all.
768 * This routine ignores page_cgroup->ref_cnt.
769 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 861 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
770 */ 862 */
771#define FORCE_UNCHARGE_BATCH (128) 863#define FORCE_UNCHARGE_BATCH (128)
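
mem_cgroup_resize_limit() above retries the limit update a bounded number of times, reclaiming between attempts and only consuming a retry when reclaim made no progress. Its control flow, reduced to a runnable toy in which try_set_limit() and try_shrink() are stand-ins rather than kernel functions (the signal_pending() early exit is omitted):

#include <stdio.h>

#define RECLAIM_RETRIES 5

static long usage = 800;	/* pages currently charged (toy number) */

/* Fails (returns nonzero) while usage is still above the requested limit. */
static int try_set_limit(long limit) { return usage > limit; }

/* One reclaim pass: returns how much was freed, 0 meaning "no progress". */
static long try_shrink(void)
{
	long freed = usage >= 300 ? 300 : usage;

	usage -= freed;
	return freed;
}

static int resize_limit(long limit)
{
	int retries = RECLAIM_RETRIES;

	while (try_set_limit(limit)) {
		if (!retries)
			return -1;	/* the kernel returns -EBUSY here */
		if (!try_shrink())	/* only a fruitless pass burns a retry */
			retries--;
	}
	return 0;
}

int main(void)
{
	int ret = resize_limit(500);

	printf("resize returned %d, usage is now %ld\n", ret, usage);
	return 0;
}
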
@@ -790,12 +882,20 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
790 page = pc->page; 882 page = pc->page;
791 get_page(page); 883 get_page(page);
792 spin_unlock_irqrestore(&mz->lru_lock, flags); 884 spin_unlock_irqrestore(&mz->lru_lock, flags);
793 mem_cgroup_uncharge_page(page); 885 /*
794 put_page(page); 886 * Check if this page is on LRU. !LRU page can be found
795 if (--count <= 0) { 887 * if it's under page migration.
796 count = FORCE_UNCHARGE_BATCH; 888 */
889 if (PageLRU(page)) {
890 __mem_cgroup_uncharge_common(page,
891 MEM_CGROUP_CHARGE_TYPE_FORCE);
892 put_page(page);
893 if (--count <= 0) {
894 count = FORCE_UNCHARGE_BATCH;
895 cond_resched();
896 }
897 } else
797 cond_resched(); 898 cond_resched();
798 }
799 spin_lock_irqsave(&mz->lru_lock, flags); 899 spin_lock_irqsave(&mz->lru_lock, flags);
800 } 900 }
801 spin_unlock_irqrestore(&mz->lru_lock, flags); 901 spin_unlock_irqrestore(&mz->lru_lock, flags);
@@ -810,9 +910,6 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
810 int ret = -EBUSY; 910 int ret = -EBUSY;
811 int node, zid; 911 int node, zid;
812 912
813 if (mem_cgroup_subsys.disabled)
814 return 0;
815
816 css_get(&mem->css); 913 css_get(&mem->css);
817 /* 914 /*
818 * page reclaim code (kswapd etc..) will move pages between 915 * page reclaim code (kswapd etc..) will move pages between
@@ -838,32 +935,34 @@ out:
838 return ret; 935 return ret;
839} 936}
840 937
841static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
842{
843 *tmp = memparse(buf, &buf);
844 if (*buf != '\0')
845 return -EINVAL;
846
847 /*
848 * Round up the value to the closest page size
849 */
850 *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
851 return 0;
852}
853
854static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 938static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
855{ 939{
856 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, 940 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
857 cft->private); 941 cft->private);
858} 942}
859 943/*
860static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 944 * The user of this function is...
861 struct file *file, const char __user *userbuf, 945 * RES_LIMIT.
862 size_t nbytes, loff_t *ppos) 946 */
947static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
948 const char *buffer)
863{ 949{
864 return res_counter_write(&mem_cgroup_from_cont(cont)->res, 950 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
865 cft->private, userbuf, nbytes, ppos, 951 unsigned long long val;
866 mem_cgroup_write_strategy); 952 int ret;
953
954 switch (cft->private) {
955 case RES_LIMIT:
956 /* This function does all necessary parse...reuse it */
957 ret = res_counter_memparse_write_strategy(buffer, &val);
958 if (!ret)
959 ret = mem_cgroup_resize_limit(memcg, val);
960 break;
961 default:
962 ret = -EINVAL; /* should be BUG() ? */
963 break;
964 }
965 return ret;
867} 966}
868 967
869static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 968static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
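
The new write handler no longer rounds a raw byte count itself; res_counter_memparse_write_strategy() parses human-readable sizes and the rounding to a whole page happens behind it. A self-contained sketch of that kind of memparse-style parsing plus page rounding (this shows the idea, not the kernel implementation):

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096ULL

/* Parse "4096", "32M", "1G", ... and round the result up to a whole page. */
static unsigned long long parse_limit(const char *s)
{
	char *end;
	unsigned long long val = strtoull(s, &end, 10);

	switch (*end) {
	case 'g': case 'G': val <<= 10;	/* fall through */
	case 'm': case 'M': val <<= 10;	/* fall through */
	case 'k': case 'K': val <<= 10; break;
	default: break;
	}
	return (val + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

int main(void)
{
	printf("%llu\n", parse_limit("32M"));	/* 33554432 */
	printf("%llu\n", parse_limit("5000"));	/* 8192, rounded up to pages */
	return 0;
}

Echoing a value such as 32M into memory.limit_in_bytes then goes through mem_cgroup_write() and mem_cgroup_resize_limit() above.
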
@@ -940,7 +1039,7 @@ static struct cftype mem_cgroup_files[] = {
940 { 1039 {
941 .name = "limit_in_bytes", 1040 .name = "limit_in_bytes",
942 .private = RES_LIMIT, 1041 .private = RES_LIMIT,
943 .write = mem_cgroup_write, 1042 .write_string = mem_cgroup_write,
944 .read_u64 = mem_cgroup_read, 1043 .read_u64 = mem_cgroup_read,
945 }, 1044 },
946 { 1045 {
@@ -1070,8 +1169,6 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1070static int mem_cgroup_populate(struct cgroup_subsys *ss, 1169static int mem_cgroup_populate(struct cgroup_subsys *ss,
1071 struct cgroup *cont) 1170 struct cgroup *cont)
1072{ 1171{
1073 if (mem_cgroup_subsys.disabled)
1074 return 0;
1075 return cgroup_add_files(cont, ss, mem_cgroup_files, 1172 return cgroup_add_files(cont, ss, mem_cgroup_files,
1076 ARRAY_SIZE(mem_cgroup_files)); 1173 ARRAY_SIZE(mem_cgroup_files));
1077} 1174}
@@ -1084,9 +1181,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1084 struct mm_struct *mm; 1181 struct mm_struct *mm;
1085 struct mem_cgroup *mem, *old_mem; 1182 struct mem_cgroup *mem, *old_mem;
1086 1183
1087 if (mem_cgroup_subsys.disabled)
1088 return;
1089
1090 mm = get_task_mm(p); 1184 mm = get_task_mm(p);
1091 if (mm == NULL) 1185 if (mm == NULL)
1092 return; 1186 return;
@@ -1094,9 +1188,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1094 mem = mem_cgroup_from_cont(cont); 1188 mem = mem_cgroup_from_cont(cont);
1095 old_mem = mem_cgroup_from_cont(old_cont); 1189 old_mem = mem_cgroup_from_cont(old_cont);
1096 1190
1097 if (mem == old_mem)
1098 goto out;
1099
1100 /* 1191 /*
1101 * Only thread group leaders are allowed to migrate, the mm_struct is 1192 * Only thread group leaders are allowed to migrate, the mm_struct is
1102 * in effect owned by the leader 1193 * in effect owned by the leader
diff --git a/mm/memory.c b/mm/memory.c
index 19e0ae9beecb..1002f473f497 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -51,6 +51,7 @@
51#include <linux/init.h> 51#include <linux/init.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/memcontrol.h> 53#include <linux/memcontrol.h>
54#include <linux/mmu_notifier.h>
54 55
55#include <asm/pgalloc.h> 56#include <asm/pgalloc.h>
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
@@ -61,6 +62,8 @@
61#include <linux/swapops.h> 62#include <linux/swapops.h>
62#include <linux/elf.h> 63#include <linux/elf.h>
63 64
65#include "internal.h"
66
64#ifndef CONFIG_NEED_MULTIPLE_NODES 67#ifndef CONFIG_NEED_MULTIPLE_NODES
65/* use the per-pgdat data instead for discontigmem - mbligh */ 68/* use the per-pgdat data instead for discontigmem - mbligh */
66unsigned long max_mapnr; 69unsigned long max_mapnr;
@@ -211,7 +214,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
211 * 214 *
212 * Must be called with pagetable lock held. 215 * Must be called with pagetable lock held.
213 */ 216 */
214void free_pgd_range(struct mmu_gather **tlb, 217void free_pgd_range(struct mmu_gather *tlb,
215 unsigned long addr, unsigned long end, 218 unsigned long addr, unsigned long end,
216 unsigned long floor, unsigned long ceiling) 219 unsigned long floor, unsigned long ceiling)
217{ 220{
@@ -262,16 +265,16 @@ void free_pgd_range(struct mmu_gather **tlb,
262 return; 265 return;
263 266
264 start = addr; 267 start = addr;
265 pgd = pgd_offset((*tlb)->mm, addr); 268 pgd = pgd_offset(tlb->mm, addr);
266 do { 269 do {
267 next = pgd_addr_end(addr, end); 270 next = pgd_addr_end(addr, end);
268 if (pgd_none_or_clear_bad(pgd)) 271 if (pgd_none_or_clear_bad(pgd))
269 continue; 272 continue;
270 free_pud_range(*tlb, pgd, addr, next, floor, ceiling); 273 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
271 } while (pgd++, addr = next, addr != end); 274 } while (pgd++, addr = next, addr != end);
272} 275}
273 276
274void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, 277void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
275 unsigned long floor, unsigned long ceiling) 278 unsigned long floor, unsigned long ceiling)
276{ 279{
277 while (vma) { 280 while (vma) {
@@ -372,7 +375,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
372 * 375 *
373 * The calling function must still handle the error. 376 * The calling function must still handle the error.
374 */ 377 */
375void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) 378static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
379 unsigned long vaddr)
376{ 380{
377 printk(KERN_ERR "Bad pte = %08llx, process = %s, " 381 printk(KERN_ERR "Bad pte = %08llx, process = %s, "
378 "vm_flags = %lx, vaddr = %lx\n", 382 "vm_flags = %lx, vaddr = %lx\n",
@@ -649,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
649 unsigned long next; 653 unsigned long next;
650 unsigned long addr = vma->vm_start; 654 unsigned long addr = vma->vm_start;
651 unsigned long end = vma->vm_end; 655 unsigned long end = vma->vm_end;
656 int ret;
652 657
653 /* 658 /*
654 * Don't copy ptes where a page fault will fill them correctly. 659 * Don't copy ptes where a page fault will fill them correctly.
@@ -664,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
664 if (is_vm_hugetlb_page(vma)) 669 if (is_vm_hugetlb_page(vma))
665 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 670 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
666 671
672 /*
673 * We need to invalidate the secondary MMU mappings only when
674 * there could be a permission downgrade on the ptes of the
675 * parent mm. And a permission downgrade will only happen if
676 * is_cow_mapping() returns true.
677 */
678 if (is_cow_mapping(vma->vm_flags))
679 mmu_notifier_invalidate_range_start(src_mm, addr, end);
680
681 ret = 0;
667 dst_pgd = pgd_offset(dst_mm, addr); 682 dst_pgd = pgd_offset(dst_mm, addr);
668 src_pgd = pgd_offset(src_mm, addr); 683 src_pgd = pgd_offset(src_mm, addr);
669 do { 684 do {
670 next = pgd_addr_end(addr, end); 685 next = pgd_addr_end(addr, end);
671 if (pgd_none_or_clear_bad(src_pgd)) 686 if (pgd_none_or_clear_bad(src_pgd))
672 continue; 687 continue;
673 if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, 688 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
674 vma, addr, next)) 689 vma, addr, next))) {
675 return -ENOMEM; 690 ret = -ENOMEM;
691 break;
692 }
676 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 693 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
677 return 0; 694
695 if (is_cow_mapping(vma->vm_flags))
696 mmu_notifier_invalidate_range_end(src_mm,
697 vma->vm_start, end);
698 return ret;
678} 699}
679 700
680static unsigned long zap_pte_range(struct mmu_gather *tlb, 701static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -878,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
878 unsigned long start = start_addr; 899 unsigned long start = start_addr;
879 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; 900 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
880 int fullmm = (*tlbp)->fullmm; 901 int fullmm = (*tlbp)->fullmm;
902 struct mm_struct *mm = vma->vm_mm;
881 903
904 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
882 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { 905 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
883 unsigned long end; 906 unsigned long end;
884 907
@@ -899,9 +922,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
899 } 922 }
900 923
901 if (unlikely(is_vm_hugetlb_page(vma))) { 924 if (unlikely(is_vm_hugetlb_page(vma))) {
902 unmap_hugepage_range(vma, start, end); 925 /*
903 zap_work -= (end - start) / 926 * It is undesirable to test vma->vm_file as it
904 (HPAGE_SIZE / PAGE_SIZE); 927 * should be non-null for valid hugetlb area.
928 * However, vm_file will be NULL in the error
929 * cleanup path of do_mmap_pgoff. When
930 * hugetlbfs ->mmap method fails,
931 * do_mmap_pgoff() nullifies vma->vm_file
932 * before calling this function to clean up.
933 * Since no pte has actually been setup, it is
934 * safe to do nothing in this case.
935 */
936 if (vma->vm_file) {
937 unmap_hugepage_range(vma, start, end, NULL);
938 zap_work -= (end - start) /
939 pages_per_huge_page(hstate_vma(vma));
940 }
941
905 start = end; 942 start = end;
906 } else 943 } else
907 start = unmap_page_range(*tlbp, vma, 944 start = unmap_page_range(*tlbp, vma,
@@ -929,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
929 } 966 }
930 } 967 }
931out: 968out:
969 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
932 return start; /* which is now the end (or restart) address */ 970 return start; /* which is now the end (or restart) address */
933} 971}
934 972
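
The point of the two new calls in unmap_vmas() is that the whole teardown is bracketed: secondary MMUs (device or hypervisor page tables registered through an mmu_notifier) are told before the primary PTEs start going away and again once the range is gone. Reduced to its skeleton (a sketch, not compilable on its own):

	mmu_notifier_invalidate_range_start(mm, start, end);
	/*
	 * ... clear and flush the primary PTEs covering [start, end);
	 * secondary mappings must not be re-established until the
	 * matching _end call below ...
	 */
	mmu_notifier_invalidate_range_end(mm, start, end);
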
@@ -956,6 +994,29 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
956 return end; 994 return end;
957} 995}
958 996
997/**
998 * zap_vma_ptes - remove ptes mapping the vma
999 * @vma: vm_area_struct holding ptes to be zapped
1000 * @address: starting address of pages to zap
1001 * @size: number of bytes to zap
1002 *
1003 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
1004 *
1005 * The entire address range must be fully contained within the vma.
1006 *
1007 * Returns 0 if successful.
1008 */
1009int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1010 unsigned long size)
1011{
1012 if (address < vma->vm_start || address + size > vma->vm_end ||
1013 !(vma->vm_flags & VM_PFNMAP))
1014 return -1;
1015 zap_page_range(vma, address, size, NULL);
1016 return 0;
1017}
1018EXPORT_SYMBOL_GPL(zap_vma_ptes);
1019
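
zap_vma_ptes() is exported for drivers that manage VM_PFNMAP mappings and need to drop the PTEs of a range they previously inserted, for instance before revoking a device mapping. A hypothetical caller (my_drv_revoke_mapping() is illustrative, not an existing function) might look like:

	/* Drop every PTE covering the driver's mapping; the vma must be VM_PFNMAP. */
	static int my_drv_revoke_mapping(struct vm_area_struct *vma)
	{
		return zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
	}
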
959/* 1020/*
960 * Do a quick page-table lookup for a single page. 1021 * Do a quick page-table lookup for a single page.
961 */ 1022 */
@@ -982,34 +1043,37 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
982 goto no_page_table; 1043 goto no_page_table;
983 1044
984 pud = pud_offset(pgd, address); 1045 pud = pud_offset(pgd, address);
985 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 1046 if (pud_none(*pud))
1047 goto no_page_table;
1048 if (pud_huge(*pud)) {
1049 BUG_ON(flags & FOLL_GET);
1050 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1051 goto out;
1052 }
1053 if (unlikely(pud_bad(*pud)))
986 goto no_page_table; 1054 goto no_page_table;
987 1055
988 pmd = pmd_offset(pud, address); 1056 pmd = pmd_offset(pud, address);
989 if (pmd_none(*pmd)) 1057 if (pmd_none(*pmd))
990 goto no_page_table; 1058 goto no_page_table;
991
992 if (pmd_huge(*pmd)) { 1059 if (pmd_huge(*pmd)) {
993 BUG_ON(flags & FOLL_GET); 1060 BUG_ON(flags & FOLL_GET);
994 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1061 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
995 goto out; 1062 goto out;
996 } 1063 }
997
998 if (unlikely(pmd_bad(*pmd))) 1064 if (unlikely(pmd_bad(*pmd)))
999 goto no_page_table; 1065 goto no_page_table;
1000 1066
1001 ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 1067 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1002 if (!ptep)
1003 goto out;
1004 1068
1005 pte = *ptep; 1069 pte = *ptep;
1006 if (!pte_present(pte)) 1070 if (!pte_present(pte))
1007 goto unlock; 1071 goto no_page;
1008 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1072 if ((flags & FOLL_WRITE) && !pte_write(pte))
1009 goto unlock; 1073 goto unlock;
1010 page = vm_normal_page(vma, address, pte); 1074 page = vm_normal_page(vma, address, pte);
1011 if (unlikely(!page)) 1075 if (unlikely(!page))
1012 goto unlock; 1076 goto bad_page;
1013 1077
1014 if (flags & FOLL_GET) 1078 if (flags & FOLL_GET)
1015 get_page(page); 1079 get_page(page);
@@ -1024,6 +1088,15 @@ unlock:
1024out: 1088out:
1025 return page; 1089 return page;
1026 1090
1091bad_page:
1092 pte_unmap_unlock(ptep, ptl);
1093 return ERR_PTR(-EFAULT);
1094
1095no_page:
1096 pte_unmap_unlock(ptep, ptl);
1097 if (!pte_none(pte))
1098 return page;
1099 /* Fall through to ZERO_PAGE handling */
1027no_page_table: 1100no_page_table:
1028 /* 1101 /*
1029 * When core dumping an enormous anonymous area that nobody 1102 * When core dumping an enormous anonymous area that nobody
@@ -1038,6 +1111,24 @@ no_page_table:
1038 return page; 1111 return page;
1039} 1112}
1040 1113
1114/* Can we do the FOLL_ANON optimization? */
1115static inline int use_zero_page(struct vm_area_struct *vma)
1116{
1117 /*
1118 * We don't want to optimize FOLL_ANON for make_pages_present()
1119 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
1120 * we want to get the page from the page tables to make sure
1121 * that we serialize and update with any other user of that
1122 * mapping.
1123 */
1124 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1125 return 0;
1126 /*
1127 * And if we have a fault routine, it's not an anonymous region.
1128 */
1129 return !vma->vm_ops || !vma->vm_ops->fault;
1130}
1131
1041int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1132int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1042 unsigned long start, int len, int write, int force, 1133 unsigned long start, int len, int write, int force,
1043 struct page **pages, struct vm_area_struct **vmas) 1134 struct page **pages, struct vm_area_struct **vmas)
@@ -1112,8 +1203,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1112 foll_flags = FOLL_TOUCH; 1203 foll_flags = FOLL_TOUCH;
1113 if (pages) 1204 if (pages)
1114 foll_flags |= FOLL_GET; 1205 foll_flags |= FOLL_GET;
1115 if (!write && !(vma->vm_flags & VM_LOCKED) && 1206 if (!write && use_zero_page(vma))
1116 (!vma->vm_ops || !vma->vm_ops->fault))
1117 foll_flags |= FOLL_ANON; 1207 foll_flags |= FOLL_ANON;
1118 1208
1119 do { 1209 do {
@@ -1125,7 +1215,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1125 * be processed until returning to user space. 1215 * be processed until returning to user space.
1126 */ 1216 */
1127 if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) 1217 if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
1128 return -ENOMEM; 1218 return i ? i : -ENOMEM;
1129 1219
1130 if (write) 1220 if (write)
1131 foll_flags |= FOLL_WRITE; 1221 foll_flags |= FOLL_WRITE;
@@ -1159,6 +1249,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1159 1249
1160 cond_resched(); 1250 cond_resched();
1161 } 1251 }
1252 if (IS_ERR(page))
1253 return i ? i : PTR_ERR(page);
1162 if (pages) { 1254 if (pages) {
1163 pages[i] = page; 1255 pages[i] = page;
1164 1256
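
With these changes get_user_pages() returns a short count once at least one page has been pinned and an error only when nothing was pinned, and follow_page() failures are reported through ERR_PTR values handled above. Callers must therefore release exactly what was pinned; a hedged sketch of the calling convention (assuming mm and start are in scope):

	struct page *pages[16];
	int i, got;

	down_read(&mm->mmap_sem);
	got = get_user_pages(current, mm, start, 16, 1 /* write */, 0 /* force */,
			     pages, NULL);
	up_read(&mm->mmap_sem);

	if (got < 0)
		return got;			/* nothing was pinned at all */
	for (i = 0; i < got; i++) {
		/* ... use pages[i] ... */
		page_cache_release(pages[i]);	/* drop the reference FOLL_GET took */
	}
	/* got may be smaller than 16: the rest of the range was not faultable. */
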
@@ -1310,6 +1402,11 @@ out:
1310 * 1402 *
1311 * This function should only be called from a vm_ops->fault handler, and 1403 * This function should only be called from a vm_ops->fault handler, and
1312 * in that case the handler should return NULL. 1404 * in that case the handler should return NULL.
1405 *
1406 * vma cannot be a COW mapping.
1407 *
1408 * As this is called only for pages that do not currently exist, we
1409 * do not need to flush old virtual caches or the TLB.
1313 */ 1410 */
1314int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1411int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1315 unsigned long pfn) 1412 unsigned long pfn)
@@ -1520,6 +1617,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1520 unsigned long next; 1617 unsigned long next;
1521 int err; 1618 int err;
1522 1619
1620 BUG_ON(pud_huge(*pud));
1621
1523 pmd = pmd_alloc(mm, pud, addr); 1622 pmd = pmd_alloc(mm, pud, addr);
1524 if (!pmd) 1623 if (!pmd)
1525 return -ENOMEM; 1624 return -ENOMEM;
@@ -1561,10 +1660,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1561{ 1660{
1562 pgd_t *pgd; 1661 pgd_t *pgd;
1563 unsigned long next; 1662 unsigned long next;
1564 unsigned long end = addr + size; 1663 unsigned long start = addr, end = addr + size;
1565 int err; 1664 int err;
1566 1665
1567 BUG_ON(addr >= end); 1666 BUG_ON(addr >= end);
1667 mmu_notifier_invalidate_range_start(mm, start, end);
1568 pgd = pgd_offset(mm, addr); 1668 pgd = pgd_offset(mm, addr);
1569 do { 1669 do {
1570 next = pgd_addr_end(addr, end); 1670 next = pgd_addr_end(addr, end);
@@ -1572,6 +1672,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1572 if (err) 1672 if (err)
1573 break; 1673 break;
1574 } while (pgd++, addr = next, addr != end); 1674 } while (pgd++, addr = next, addr != end);
1675 mmu_notifier_invalidate_range_end(mm, start, end);
1575 return err; 1676 return err;
1576} 1677}
1577EXPORT_SYMBOL_GPL(apply_to_page_range); 1678EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1669,15 +1770,26 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1669 struct page *dirty_page = NULL; 1770 struct page *dirty_page = NULL;
1670 1771
1671 old_page = vm_normal_page(vma, address, orig_pte); 1772 old_page = vm_normal_page(vma, address, orig_pte);
1672 if (!old_page) 1773 if (!old_page) {
1774 /*
1775 * VM_MIXEDMAP !pfn_valid() case
1776 *
1777 * We should not cow pages in a shared writeable mapping.
1778 * Just mark the pages writable as we can't do any dirty
1779 * accounting on raw pfn maps.
1780 */
1781 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
1782 (VM_WRITE|VM_SHARED))
1783 goto reuse;
1673 goto gotten; 1784 goto gotten;
1785 }
1674 1786
1675 /* 1787 /*
1676 * Take out anonymous pages first, anonymous shared vmas are 1788 * Take out anonymous pages first, anonymous shared vmas are
1677 * not dirty accountable. 1789 * not dirty accountable.
1678 */ 1790 */
1679 if (PageAnon(old_page)) { 1791 if (PageAnon(old_page)) {
1680 if (!TestSetPageLocked(old_page)) { 1792 if (trylock_page(old_page)) {
1681 reuse = can_share_swap_page(old_page); 1793 reuse = can_share_swap_page(old_page);
1682 unlock_page(old_page); 1794 unlock_page(old_page);
1683 } 1795 }
@@ -1723,6 +1835,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1723 } 1835 }
1724 1836
1725 if (reuse) { 1837 if (reuse) {
1838reuse:
1726 flush_cache_page(vma, address, pte_pfn(orig_pte)); 1839 flush_cache_page(vma, address, pte_pfn(orig_pte));
1727 entry = pte_mkyoung(orig_pte); 1840 entry = pte_mkyoung(orig_pte);
1728 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1841 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -1757,7 +1870,6 @@ gotten:
1757 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 1870 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1758 if (likely(pte_same(*page_table, orig_pte))) { 1871 if (likely(pte_same(*page_table, orig_pte))) {
1759 if (old_page) { 1872 if (old_page) {
1760 page_remove_rmap(old_page, vma);
1761 if (!PageAnon(old_page)) { 1873 if (!PageAnon(old_page)) {
1762 dec_mm_counter(mm, file_rss); 1874 dec_mm_counter(mm, file_rss);
1763 inc_mm_counter(mm, anon_rss); 1875 inc_mm_counter(mm, anon_rss);
@@ -1773,12 +1885,38 @@ gotten:
1773 * seen in the presence of one thread doing SMC and another 1885 * seen in the presence of one thread doing SMC and another
1774 * thread doing COW. 1886 * thread doing COW.
1775 */ 1887 */
1776 ptep_clear_flush(vma, address, page_table); 1888 ptep_clear_flush_notify(vma, address, page_table);
1777 set_pte_at(mm, address, page_table, entry); 1889 set_pte_at(mm, address, page_table, entry);
1778 update_mmu_cache(vma, address, entry); 1890 update_mmu_cache(vma, address, entry);
1779 lru_cache_add_active(new_page); 1891 lru_cache_add_active(new_page);
1780 page_add_new_anon_rmap(new_page, vma, address); 1892 page_add_new_anon_rmap(new_page, vma, address);
1781 1893
1894 if (old_page) {
1895 /*
1896 * Only after switching the pte to the new page may
1897 * we remove the mapcount here. Otherwise another
1898 * process may come and find the rmap count decremented
1899 * before the pte is switched to the new page, and
1900 * "reuse" the old page writing into it while our pte
1901 * here still points into it and can be read by other
1902 * threads.
1903 *
1904 * The critical issue is to order this
1905 * page_remove_rmap with the ptp_clear_flush above.
1906 * Those stores are ordered by (if nothing else,)
1907 * the barrier present in the atomic_add_negative
1908 * in page_remove_rmap.
1909 *
1910 * Then the TLB flush in ptep_clear_flush ensures that
1911 * no process can access the old page before the
1912 * decremented mapcount is visible. And the old page
1913 * cannot be reused until after the decremented
1914 * mapcount is visible. So transitively, TLBs to
1915 * old page will be flushed before it can be reused.
1916 */
1917 page_remove_rmap(old_page, vma);
1918 }
1919
1782 /* Free the old page.. */ 1920 /* Free the old page.. */
1783 new_page = old_page; 1921 new_page = old_page;
1784 ret |= VM_FAULT_WRITE; 1922 ret |= VM_FAULT_WRITE;
@@ -2436,59 +2574,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2436 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 2574 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2437} 2575}
2438 2576
2439
2440/*
2441 * do_no_pfn() tries to create a new page mapping for a page without
2442 * a struct_page backing it
2443 *
2444 * As this is called only for pages that do not currently exist, we
2445 * do not need to flush old virtual caches or the TLB.
2446 *
2447 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2448 * but allow concurrent faults), and pte mapped but not yet locked.
2449 * We return with mmap_sem still held, but pte unmapped and unlocked.
2450 *
2451 * It is expected that the ->nopfn handler always returns the same pfn
2452 * for a given virtual mapping.
2453 *
2454 * Mark this `noinline' to prevent it from bloating the main pagefault code.
2455 */
2456static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
2457 unsigned long address, pte_t *page_table, pmd_t *pmd,
2458 int write_access)
2459{
2460 spinlock_t *ptl;
2461 pte_t entry;
2462 unsigned long pfn;
2463
2464 pte_unmap(page_table);
2465 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2466 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2467
2468 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
2469
2470 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2471
2472 if (unlikely(pfn == NOPFN_OOM))
2473 return VM_FAULT_OOM;
2474 else if (unlikely(pfn == NOPFN_SIGBUS))
2475 return VM_FAULT_SIGBUS;
2476 else if (unlikely(pfn == NOPFN_REFAULT))
2477 return 0;
2478
2479 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2480
2481 /* Only go through if we didn't race with anybody else... */
2482 if (pte_none(*page_table)) {
2483 entry = pfn_pte(pfn, vma->vm_page_prot);
2484 if (write_access)
2485 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2486 set_pte_at(mm, address, page_table, entry);
2487 }
2488 pte_unmap_unlock(page_table, ptl);
2489 return 0;
2490}
2491
2492/* 2577/*
2493 * Fault of a previously existing named mapping. Repopulate the pte 2578 * Fault of a previously existing named mapping. Repopulate the pte
2494 * from the encoded file_pte if possible. This enables swappable 2579 * from the encoded file_pte if possible. This enables swappable
@@ -2549,9 +2634,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2549 if (likely(vma->vm_ops->fault)) 2634 if (likely(vma->vm_ops->fault))
2550 return do_linear_fault(mm, vma, address, 2635 return do_linear_fault(mm, vma, address,
2551 pte, pmd, write_access, entry); 2636 pte, pmd, write_access, entry);
2552 if (unlikely(vma->vm_ops->nopfn))
2553 return do_no_pfn(mm, vma, address, pte,
2554 pmd, write_access);
2555 } 2637 }
2556 return do_anonymous_page(mm, vma, address, 2638 return do_anonymous_page(mm, vma, address,
2557 pte, pmd, write_access); 2639 pte, pmd, write_access);
@@ -2683,16 +2765,26 @@ int make_pages_present(unsigned long addr, unsigned long end)
2683 2765
2684 vma = find_vma(current->mm, addr); 2766 vma = find_vma(current->mm, addr);
2685 if (!vma) 2767 if (!vma)
2686 return -1; 2768 return -ENOMEM;
2687 write = (vma->vm_flags & VM_WRITE) != 0; 2769 write = (vma->vm_flags & VM_WRITE) != 0;
2688 BUG_ON(addr >= end); 2770 BUG_ON(addr >= end);
2689 BUG_ON(end > vma->vm_end); 2771 BUG_ON(end > vma->vm_end);
2690 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; 2772 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
2691 ret = get_user_pages(current, current->mm, addr, 2773 ret = get_user_pages(current, current->mm, addr,
2692 len, write, 0, NULL, NULL); 2774 len, write, 0, NULL, NULL);
2693 if (ret < 0) 2775 if (ret < 0) {
2776 /*
 2777 SUS requires somewhat odd return values from mlock():
 2778 - an invalid address must produce ENOMEM,
 2779 - running out of memory must produce EAGAIN.
2780 */
2781 if (ret == -EFAULT)
2782 ret = -ENOMEM;
2783 else if (ret == -ENOMEM)
2784 ret = -EAGAIN;
2694 return ret; 2785 return ret;
2695 return ret == len ? 0 : -1; 2786 }
2787 return ret == len ? 0 : -ENOMEM;
2696} 2788}
2697 2789
2698#if !defined(__HAVE_ARCH_GATE_AREA) 2790#if !defined(__HAVE_ARCH_GATE_AREA)
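
The errno translation in make_pages_present() is small but load-bearing for mlock() semantics: SUS wants ENOMEM for a bad address and EAGAIN for memory pressure. As a runnable illustration of just that mapping (userspace analogue, not kernel code):

#include <errno.h>
#include <stdio.h>

/* Translate internal fault results into mlock()'s documented errnos. */
static int mlock_errno(int ret)
{
	if (ret == -EFAULT)
		return -ENOMEM;		/* invalid address range */
	if (ret == -ENOMEM)
		return -EAGAIN;		/* could not lock: memory pressure */
	return ret;
}

int main(void)
{
	printf("%d %d\n", mlock_errno(-EFAULT), mlock_errno(-ENOMEM));
	return 0;
}
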
@@ -2739,6 +2831,86 @@ int in_gate_area_no_task(unsigned long addr)
2739 2831
2740#endif /* __HAVE_ARCH_GATE_AREA */ 2832#endif /* __HAVE_ARCH_GATE_AREA */
2741 2833
2834#ifdef CONFIG_HAVE_IOREMAP_PROT
2835static resource_size_t follow_phys(struct vm_area_struct *vma,
2836 unsigned long address, unsigned int flags,
2837 unsigned long *prot)
2838{
2839 pgd_t *pgd;
2840 pud_t *pud;
2841 pmd_t *pmd;
2842 pte_t *ptep, pte;
2843 spinlock_t *ptl;
2844 resource_size_t phys_addr = 0;
2845 struct mm_struct *mm = vma->vm_mm;
2846
2847 VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
2848
2849 pgd = pgd_offset(mm, address);
2850 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
2851 goto no_page_table;
2852
2853 pud = pud_offset(pgd, address);
2854 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
2855 goto no_page_table;
2856
2857 pmd = pmd_offset(pud, address);
2858 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
2859 goto no_page_table;
2860
2861 /* We cannot handle huge page PFN maps. Luckily they don't exist. */
2862 if (pmd_huge(*pmd))
2863 goto no_page_table;
2864
2865 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
2866 if (!ptep)
2867 goto out;
2868
2869 pte = *ptep;
2870 if (!pte_present(pte))
2871 goto unlock;
2872 if ((flags & FOLL_WRITE) && !pte_write(pte))
2873 goto unlock;
2874 phys_addr = pte_pfn(pte);
2875 phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
2876
2877 *prot = pgprot_val(pte_pgprot(pte));
2878
2879unlock:
2880 pte_unmap_unlock(ptep, ptl);
2881out:
2882 return phys_addr;
2883no_page_table:
2884 return 0;
2885}
2886
2887int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
2888 void *buf, int len, int write)
2889{
2890 resource_size_t phys_addr;
2891 unsigned long prot = 0;
2892 void *maddr;
2893 int offset = addr & (PAGE_SIZE-1);
2894
2895 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
2896 return -EINVAL;
2897
2898 phys_addr = follow_phys(vma, addr, write, &prot);
2899
2900 if (!phys_addr)
2901 return -EINVAL;
2902
2903 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
2904 if (write)
2905 memcpy_toio(maddr + offset, buf, len);
2906 else
2907 memcpy_fromio(buf, maddr + offset, len);
2908 iounmap(maddr);
2909
2910 return len;
2911}
2912#endif
2913
2742/* 2914/*
2743 * Access another process' address space. 2915 * Access another process' address space.
2744 * Source/target buffer must be kernel space, 2916 * Source/target buffer must be kernel space,
@@ -2748,7 +2920,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2748{ 2920{
2749 struct mm_struct *mm; 2921 struct mm_struct *mm;
2750 struct vm_area_struct *vma; 2922 struct vm_area_struct *vma;
2751 struct page *page;
2752 void *old_buf = buf; 2923 void *old_buf = buf;
2753 2924
2754 mm = get_task_mm(tsk); 2925 mm = get_task_mm(tsk);
@@ -2760,28 +2931,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2760 while (len) { 2931 while (len) {
2761 int bytes, ret, offset; 2932 int bytes, ret, offset;
2762 void *maddr; 2933 void *maddr;
2934 struct page *page = NULL;
2763 2935
2764 ret = get_user_pages(tsk, mm, addr, 1, 2936 ret = get_user_pages(tsk, mm, addr, 1,
2765 write, 1, &page, &vma); 2937 write, 1, &page, &vma);
2766 if (ret <= 0) 2938 if (ret <= 0) {
2767 break; 2939 /*
2768 2940 * Check if this is a VM_IO | VM_PFNMAP VMA, which
2769 bytes = len; 2941 * we can access using slightly different code.
2770 offset = addr & (PAGE_SIZE-1); 2942 */
2771 if (bytes > PAGE_SIZE-offset) 2943#ifdef CONFIG_HAVE_IOREMAP_PROT
2772 bytes = PAGE_SIZE-offset; 2944 vma = find_vma(mm, addr);
2773 2945 if (!vma)
2774 maddr = kmap(page); 2946 break;
2775 if (write) { 2947 if (vma->vm_ops && vma->vm_ops->access)
2776 copy_to_user_page(vma, page, addr, 2948 ret = vma->vm_ops->access(vma, addr, buf,
2777 maddr + offset, buf, bytes); 2949 len, write);
2778 set_page_dirty_lock(page); 2950 if (ret <= 0)
2951#endif
2952 break;
2953 bytes = ret;
2779 } else { 2954 } else {
2780 copy_from_user_page(vma, page, addr, 2955 bytes = len;
2781 buf, maddr + offset, bytes); 2956 offset = addr & (PAGE_SIZE-1);
2957 if (bytes > PAGE_SIZE-offset)
2958 bytes = PAGE_SIZE-offset;
2959
2960 maddr = kmap(page);
2961 if (write) {
2962 copy_to_user_page(vma, page, addr,
2963 maddr + offset, buf, bytes);
2964 set_page_dirty_lock(page);
2965 } else {
2966 copy_from_user_page(vma, page, addr,
2967 buf, maddr + offset, bytes);
2968 }
2969 kunmap(page);
2970 page_cache_release(page);
2782 } 2971 }
2783 kunmap(page);
2784 page_cache_release(page);
2785 len -= bytes; 2972 len -= bytes;
2786 buf += bytes; 2973 buf += bytes;
2787 addr += bytes; 2974 addr += bytes;
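
The fallback path added to access_process_vm() lets ptrace and friends read or write VM_IO/VM_PFNMAP areas through vma->vm_ops->access instead of failing outright. A driver opts in by providing that hook; a minimal hedged sketch (the my_drv_* names are illustrative; only generic_access_phys is real, and only under CONFIG_HAVE_IOREMAP_PROT):

	static struct vm_operations_struct my_drv_vm_ops = {
		.access = generic_access_phys,	/* valid only for VM_IO | VM_PFNMAP vmas */
	};

	static int my_drv_mmap(struct file *file, struct vm_area_struct *vma)
	{
		vma->vm_flags |= VM_IO | VM_PFNMAP;
		vma->vm_ops = &my_drv_vm_ops;
		/* ... remap_pfn_range() of the device aperture would go here ... */
		return 0;
	}
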
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 833f854eabe5..89fee2dcb039 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -62,9 +62,9 @@ static void release_memory_resource(struct resource *res)
62 62
63#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 63#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
64#ifndef CONFIG_SPARSEMEM_VMEMMAP 64#ifndef CONFIG_SPARSEMEM_VMEMMAP
65static void get_page_bootmem(unsigned long info, struct page *page, int magic) 65static void get_page_bootmem(unsigned long info, struct page *page, int type)
66{ 66{
67 atomic_set(&page->_mapcount, magic); 67 atomic_set(&page->_mapcount, type);
68 SetPagePrivate(page); 68 SetPagePrivate(page);
69 set_page_private(page, info); 69 set_page_private(page, info);
70 atomic_inc(&page->_count); 70 atomic_inc(&page->_count);
@@ -72,10 +72,10 @@ static void get_page_bootmem(unsigned long info, struct page *page, int magic)
72 72
73void put_page_bootmem(struct page *page) 73void put_page_bootmem(struct page *page)
74{ 74{
75 int magic; 75 int type;
76 76
77 magic = atomic_read(&page->_mapcount); 77 type = atomic_read(&page->_mapcount);
78 BUG_ON(magic >= -1); 78 BUG_ON(type >= -1);
79 79
80 if (atomic_dec_return(&page->_count) == 1) { 80 if (atomic_dec_return(&page->_count) == 1) {
81 ClearPagePrivate(page); 81 ClearPagePrivate(page);
@@ -86,7 +86,7 @@ void put_page_bootmem(struct page *page)
86 86
87} 87}
88 88
89void register_page_bootmem_info_section(unsigned long start_pfn) 89static void register_page_bootmem_info_section(unsigned long start_pfn)
90{ 90{
91 unsigned long *usemap, mapsize, section_nr, i; 91 unsigned long *usemap, mapsize, section_nr, i;
92 struct mem_section *ms; 92 struct mem_section *ms;
@@ -119,7 +119,7 @@ void register_page_bootmem_info_section(unsigned long start_pfn)
119 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 119 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
120 120
121 for (i = 0; i < mapsize; i++, page++) 121 for (i = 0; i < mapsize; i++, page++)
122 get_page_bootmem(section_nr, page, MIX_INFO); 122 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
123 123
124} 124}
125 125
@@ -429,7 +429,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
429 429
430 if (need_zonelists_rebuild) 430 if (need_zonelists_rebuild)
431 build_all_zonelists(); 431 build_all_zonelists();
432 vm_total_pages = nr_free_pagecache_pages(); 432 else
433 vm_total_pages = nr_free_pagecache_pages();
434
433 writeback_set_ratelimit(); 435 writeback_set_ratelimit();
434 436
435 if (onlined_pages) 437 if (onlined_pages)
@@ -455,7 +457,7 @@ static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
455 /* we can use NODE_DATA(nid) from here */ 457 /* we can use NODE_DATA(nid) from here */
456 458
457 /* init node's zones as empty zones, we don't have any present pages.*/ 459 /* init node's zones as empty zones, we don't have any present pages.*/
458 free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); 460 free_area_init_node(nid, zones_size, start_pfn, zholes_size);
459 461
460 return pgdat; 462 return pgdat;
461} 463}
@@ -521,6 +523,66 @@ EXPORT_SYMBOL_GPL(add_memory);
521 523
522#ifdef CONFIG_MEMORY_HOTREMOVE 524#ifdef CONFIG_MEMORY_HOTREMOVE
523/* 525/*
526 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
527 * set and the size of the free page is given by page_order(). Using this,
528 * the function determines if the pageblock contains only free pages.
 529 * Due to buddy constraints, a free page at least the size of a pageblock will
 530 * be located at the start of the pageblock.
531 */
532static inline int pageblock_free(struct page *page)
533{
534 return PageBuddy(page) && page_order(page) >= pageblock_order;
535}
536
537/* Return the start of the next active pageblock after a given page */
538static struct page *next_active_pageblock(struct page *page)
539{
540 int pageblocks_stride;
541
542 /* Ensure the starting page is pageblock-aligned */
543 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
544
545 /* Move forward by at least 1 * pageblock_nr_pages */
546 pageblocks_stride = 1;
547
548 /* If the entire pageblock is free, move to the end of free page */
549 if (pageblock_free(page))
550 pageblocks_stride += page_order(page) - pageblock_order;
551
552 return page + (pageblocks_stride * pageblock_nr_pages);
553}
554
555/* Checks if this range of memory is likely to be hot-removable. */
556int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
557{
558 int type;
559 struct page *page = pfn_to_page(start_pfn);
560 struct page *end_page = page + nr_pages;
561
562 /* Check the starting page of each pageblock within the range */
563 for (; page < end_page; page = next_active_pageblock(page)) {
564 type = get_pageblock_migratetype(page);
565
566 /*
567 * A pageblock containing MOVABLE or free pages is considered
568 * removable
569 */
570 if (type != MIGRATE_MOVABLE && !pageblock_free(page))
571 return 0;
572
573 /*
574 * A pageblock starting with a PageReserved page is not
575 * considered removable.
576 */
577 if (PageReserved(page))
578 return 0;
579 }
580
581 /* All pageblocks in the memory block are likely to be hot-removable */
582 return 1;
583}
584
585/*
 524 * Confirm that all pages in the range [start, end) belong to the same zone. 586 * Confirm that all pages in the range [start, end) belong to the same zone.
525 */ 587 */
526static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) 588static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
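
is_mem_section_removable() walks the memory block one pageblock at a time but, when a pageblock starts a larger free buddy page, skips ahead in a single stride, exactly as next_active_pageblock() computes it above. A toy, runnable model of that stride logic (block indices stand in for pageblock-aligned pfns; orders are measured relative to pageblock_order):

#include <stdio.h>

#define NBLOCKS 8
#define PAGEBLOCK_ORDER 0	/* orders below are relative to one pageblock */

/* Order of the free buddy page starting at each block, or -1 if it is not free. */
static int free_order[NBLOCKS] = { -1, 2, -1, -1, -1, -1, 1, -1 };

int main(void)
{
	int block = 0;

	while (block < NBLOCKS) {
		int stride = 1;		/* always advance by at least one pageblock */

		if (free_order[block] >= PAGEBLOCK_ORDER)	/* pageblock_free() */
			stride += free_order[block] - PAGEBLOCK_ORDER;

		printf("checked block %d, advancing by %d block(s)\n", block, stride);
		block += stride;
	}
	return 0;
}
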
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a37a5034f63d..83369058ec13 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -729,7 +729,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
729 } else { 729 } else {
730 *policy = pol == &default_policy ? MPOL_DEFAULT : 730 *policy = pol == &default_policy ? MPOL_DEFAULT :
731 pol->mode; 731 pol->mode;
732 *policy |= pol->flags; 732 /*
733 * Internal mempolicy flags must be masked off before exposing
734 * the policy to userspace.
735 */
736 *policy |= (pol->flags & MPOL_MODE_FLAGS);
733 } 737 }
734 738
735 if (vma) { 739 if (vma) {
@@ -799,7 +803,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
799int do_migrate_pages(struct mm_struct *mm, 803int do_migrate_pages(struct mm_struct *mm,
800 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 804 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
801{ 805{
802 LIST_HEAD(pagelist);
803 int busy = 0; 806 int busy = 0;
804 int err = 0; 807 int err = 0;
805 nodemask_t tmp; 808 nodemask_t tmp;
@@ -1477,7 +1480,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1477 1480
1478 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { 1481 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1479 zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1482 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1480 HPAGE_SHIFT), gfp_flags); 1483 huge_page_shift(hstate_vma(vma))), gfp_flags);
1481 } else { 1484 } else {
1482 zl = policy_zonelist(gfp_flags, *mpol); 1485 zl = policy_zonelist(gfp_flags, *mpol);
1483 if ((*mpol)->mode == MPOL_BIND) 1486 if ((*mpol)->mode == MPOL_BIND)
@@ -2216,9 +2219,12 @@ static void check_huge_range(struct vm_area_struct *vma,
2216{ 2219{
2217 unsigned long addr; 2220 unsigned long addr;
2218 struct page *page; 2221 struct page *page;
2222 struct hstate *h = hstate_vma(vma);
2223 unsigned long sz = huge_page_size(h);
2219 2224
2220 for (addr = start; addr < end; addr += HPAGE_SIZE) { 2225 for (addr = start; addr < end; addr += sz) {
2221 pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); 2226 pte_t *ptep = huge_pte_offset(vma->vm_mm,
2227 addr & huge_page_mask(h));
2222 pte_t pte; 2228 pte_t pte;
2223 2229
2224 if (!ptep) 2230 if (!ptep)
diff --git a/mm/migrate.c b/mm/migrate.c
index 449d77d409f5..2a80136b23bb 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -9,7 +9,7 @@
9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> 9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
10 * Hirokazu Takahashi <taka@valinux.co.jp> 10 * Hirokazu Takahashi <taka@valinux.co.jp>
11 * Dave Hansen <haveblue@us.ibm.com> 11 * Dave Hansen <haveblue@us.ibm.com>
12 * Christoph Lameter <clameter@sgi.com> 12 * Christoph Lameter
13 */ 13 */
14 14
15#include <linux/migrate.h> 15#include <linux/migrate.h>
@@ -30,6 +30,7 @@
30#include <linux/vmalloc.h> 30#include <linux/vmalloc.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/syscalls.h>
33 34
34#include "internal.h" 35#include "internal.h"
35 36
@@ -284,7 +285,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
284 285
285 page = migration_entry_to_page(entry); 286 page = migration_entry_to_page(entry);
286 287
287 get_page(page); 288 /*
 289 * Once the radix-tree replacement step of page migration has started,
 290 * page_count *must* be zero. And we don't want to call wait_on_page_locked()
 291 * against a page without get_page().
 292 * So we use get_page_unless_zero() here. Even if it fails, the page
 293 * fault will occur again.
294 */
295 if (!get_page_unless_zero(page))
296 goto out;
288 pte_unmap_unlock(ptep, ptl); 297 pte_unmap_unlock(ptep, ptl);
289 wait_on_page_locked(page); 298 wait_on_page_locked(page);
290 put_page(page); 299 put_page(page);
@@ -304,6 +313,7 @@ out:
304static int migrate_page_move_mapping(struct address_space *mapping, 313static int migrate_page_move_mapping(struct address_space *mapping,
305 struct page *newpage, struct page *page) 314 struct page *newpage, struct page *page)
306{ 315{
316 int expected_count;
307 void **pslot; 317 void **pslot;
308 318
309 if (!mapping) { 319 if (!mapping) {
@@ -313,14 +323,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
313 return 0; 323 return 0;
314 } 324 }
315 325
316 write_lock_irq(&mapping->tree_lock); 326 spin_lock_irq(&mapping->tree_lock);
317 327
318 pslot = radix_tree_lookup_slot(&mapping->page_tree, 328 pslot = radix_tree_lookup_slot(&mapping->page_tree,
319 page_index(page)); 329 page_index(page));
320 330
321 if (page_count(page) != 2 + !!PagePrivate(page) || 331 expected_count = 2 + !!PagePrivate(page);
332 if (page_count(page) != expected_count ||
322 (struct page *)radix_tree_deref_slot(pslot) != page) { 333 (struct page *)radix_tree_deref_slot(pslot) != page) {
323 write_unlock_irq(&mapping->tree_lock); 334 spin_unlock_irq(&mapping->tree_lock);
335 return -EAGAIN;
336 }
337
338 if (!page_freeze_refs(page, expected_count)) {
339 spin_unlock_irq(&mapping->tree_lock);
324 return -EAGAIN; 340 return -EAGAIN;
325 } 341 }
326 342
@@ -337,6 +353,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
337 353
338 radix_tree_replace_slot(pslot, newpage); 354 radix_tree_replace_slot(pslot, newpage);
339 355
356 page_unfreeze_refs(page, expected_count);
340 /* 357 /*
341 * Drop cache reference from old page. 358 * Drop cache reference from old page.
342 * We know this isn't the last reference. 359 * We know this isn't the last reference.
@@ -356,7 +373,9 @@ static int migrate_page_move_mapping(struct address_space *mapping,
356 __dec_zone_page_state(page, NR_FILE_PAGES); 373 __dec_zone_page_state(page, NR_FILE_PAGES);
357 __inc_zone_page_state(newpage, NR_FILE_PAGES); 374 __inc_zone_page_state(newpage, NR_FILE_PAGES);
358 375
359 write_unlock_irq(&mapping->tree_lock); 376 spin_unlock_irq(&mapping->tree_lock);
377 if (!PageSwapCache(newpage))
378 mem_cgroup_uncharge_cache_page(page);
360 379
361 return 0; 380 return 0;
362} 381}
@@ -586,7 +605,7 @@ static int move_to_new_page(struct page *newpage, struct page *page)
586 * establishing additional references. We are the only one 605 * establishing additional references. We are the only one
587 * holding a reference to the new page at this point. 606 * holding a reference to the new page at this point.
588 */ 607 */
589 if (TestSetPageLocked(newpage)) 608 if (!trylock_page(newpage))
590 BUG(); 609 BUG();
591 610
592 /* Prepare mapping for the new page.*/ 611 /* Prepare mapping for the new page.*/
@@ -610,7 +629,6 @@ static int move_to_new_page(struct page *newpage, struct page *page)
610 rc = fallback_migrate_page(mapping, newpage, page); 629 rc = fallback_migrate_page(mapping, newpage, page);
611 630
612 if (!rc) { 631 if (!rc) {
613 mem_cgroup_page_migration(page, newpage);
614 remove_migration_ptes(page, newpage); 632 remove_migration_ptes(page, newpage);
615 } else 633 } else
616 newpage->mapping = NULL; 634 newpage->mapping = NULL;
@@ -640,8 +658,16 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
640 /* page was freed from under us. So we are done. */ 658 /* page was freed from under us. So we are done. */
641 goto move_newpage; 659 goto move_newpage;
642 660
661 charge = mem_cgroup_prepare_migration(page, newpage);
662 if (charge == -ENOMEM) {
663 rc = -ENOMEM;
664 goto move_newpage;
665 }
666 /* prepare cgroup just returns 0 or -ENOMEM */
667 BUG_ON(charge);
668
643 rc = -EAGAIN; 669 rc = -EAGAIN;
644 if (TestSetPageLocked(page)) { 670 if (!trylock_page(page)) {
645 if (!force) 671 if (!force)
646 goto move_newpage; 672 goto move_newpage;
647 lock_page(page); 673 lock_page(page);
@@ -691,19 +717,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
691 goto rcu_unlock; 717 goto rcu_unlock;
692 } 718 }
693 719
694 charge = mem_cgroup_prepare_migration(page);
695 /* Establish migration ptes or remove ptes */ 720 /* Establish migration ptes or remove ptes */
696 try_to_unmap(page, 1); 721 try_to_unmap(page, 1);
697 722
698 if (!page_mapped(page)) 723 if (!page_mapped(page))
699 rc = move_to_new_page(newpage, page); 724 rc = move_to_new_page(newpage, page);
700 725
701 if (rc) { 726 if (rc)
702 remove_migration_ptes(page, page); 727 remove_migration_ptes(page, page);
703 if (charge)
704 mem_cgroup_end_migration(page);
705 } else if (charge)
706 mem_cgroup_end_migration(newpage);
707rcu_unlock: 728rcu_unlock:
708 if (rcu_locked) 729 if (rcu_locked)
709 rcu_read_unlock(); 730 rcu_read_unlock();
@@ -724,6 +745,8 @@ unlock:
724 } 745 }
725 746
726move_newpage: 747move_newpage:
748 if (!charge)
749 mem_cgroup_end_migration(newpage);
727 /* 750 /*
728 * Move the new page to the LRU. If migration was not successful 751 * Move the new page to the LRU. If migration was not successful
729 * then this will free the page. 752 * then this will free the page.
@@ -865,6 +888,11 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
865 goto set_status; 888 goto set_status;
866 889
867 page = follow_page(vma, pp->addr, FOLL_GET); 890 page = follow_page(vma, pp->addr, FOLL_GET);
891
892 err = PTR_ERR(page);
893 if (IS_ERR(page))
894 goto set_status;
895
868 err = -ENOENT; 896 err = -ENOENT;
869 if (!page) 897 if (!page)
870 goto set_status; 898 goto set_status;
@@ -928,6 +956,11 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
928 goto set_status; 956 goto set_status;
929 957
930 page = follow_page(vma, pm->addr, 0); 958 page = follow_page(vma, pm->addr, 0);
959
960 err = PTR_ERR(page);
961 if (IS_ERR(page))
962 goto set_status;
963
931 err = -ENOENT; 964 err = -ENOENT;
932 /* Use PageReserved to check for zero page */ 965 /* Use PageReserved to check for zero page */
933 if (!page || PageReserved(page)) 966 if (!page || PageReserved(page))
@@ -1060,7 +1093,6 @@ out2:
1060 mmput(mm); 1093 mmput(mm);
1061 return err; 1094 return err;
1062} 1095}
1063#endif
1064 1096
1065/* 1097/*
1066 * Call migration functions in the vma_ops that may prepare 1098 * Call migration functions in the vma_ops that may prepare
@@ -1082,3 +1114,4 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1082 } 1114 }
1083 return err; 1115 return err;
1084} 1116}
1117#endif
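The page_freeze_refs()/page_unfreeze_refs() pair used in migrate_page_move_mapping() comes from the speculative page-reference work elsewhere in this series; conceptually (a sketch of the idea under that assumption, not the exact merged helpers, and with sketch_* names invented here) the protocol amounts to:

	/* Conceptual sketch of the refcount freeze relied on above. */
	static inline int sketch_freeze_refs(struct page *page, int count)
	{
		/* Succeeds only if exactly 'count' references exist; dropping
		 * the count to zero makes a concurrent get_page_unless_zero()
		 * fail. */
		return atomic_cmpxchg(&page->_count, count, 0) == count;
	}

	static inline void sketch_unfreeze_refs(struct page *page, int count)
	{
		/* Re-publish the references once the radix tree slot has been
		 * switched over to the new page. */
		atomic_set(&page->_count, count);
	}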
diff --git a/mm/mlock.c b/mm/mlock.c
index 7b2656055d6a..01fbe93eff5c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -78,8 +78,6 @@ success:
78 78
79 mm->locked_vm -= pages; 79 mm->locked_vm -= pages;
80out: 80out:
81 if (ret == -ENOMEM)
82 ret = -EAGAIN;
83 return ret; 81 return ret;
84} 82}
85 83
diff --git a/mm/mm_init.c b/mm/mm_init.c
new file mode 100644
index 000000000000..4e0e26591dfa
--- /dev/null
+++ b/mm/mm_init.c
@@ -0,0 +1,152 @@
1/*
2 * mm_init.c - Memory initialisation verification and debugging
3 *
4 * Copyright 2008 IBM Corporation, 2008
5 * Author Mel Gorman <mel@csn.ul.ie>
6 *
7 */
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <linux/kobject.h>
11#include <linux/module.h>
12#include "internal.h"
13
14#ifdef CONFIG_DEBUG_MEMORY_INIT
15int mminit_loglevel;
16
17#ifndef SECTIONS_SHIFT
18#define SECTIONS_SHIFT 0
19#endif
20
21/* The zonelists are simply reported, validation is manual. */
22void mminit_verify_zonelist(void)
23{
24 int nid;
25
26 if (mminit_loglevel < MMINIT_VERIFY)
27 return;
28
29 for_each_online_node(nid) {
30 pg_data_t *pgdat = NODE_DATA(nid);
31 struct zone *zone;
32 struct zoneref *z;
33 struct zonelist *zonelist;
34 int i, listid, zoneid;
35
36 BUG_ON(MAX_ZONELISTS > 2);
37 for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
38
39 /* Identify the zone and nodelist */
40 zoneid = i % MAX_NR_ZONES;
41 listid = i / MAX_NR_ZONES;
42 zonelist = &pgdat->node_zonelists[listid];
43 zone = &pgdat->node_zones[zoneid];
44 if (!populated_zone(zone))
45 continue;
46
47 /* Print information about the zonelist */
48 printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
49 listid > 0 ? "thisnode" : "general", nid,
50 zone->name);
51
52 /* Iterate the zonelist */
53 for_each_zone_zonelist(zone, z, zonelist, zoneid) {
54#ifdef CONFIG_NUMA
55 printk(KERN_CONT "%d:%s ",
56 zone->node, zone->name);
57#else
58 printk(KERN_CONT "0:%s ", zone->name);
59#endif /* CONFIG_NUMA */
60 }
61 printk(KERN_CONT "\n");
62 }
63 }
64}
65
66void __init mminit_verify_pageflags_layout(void)
67{
68 int shift, width;
69 unsigned long or_mask, add_mask;
70
71 shift = 8 * sizeof(unsigned long);
72 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH;
73 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
74 "Section %d Node %d Zone %d Flags %d\n",
75 SECTIONS_WIDTH,
76 NODES_WIDTH,
77 ZONES_WIDTH,
78 NR_PAGEFLAGS);
79 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
80 "Section %d Node %d Zone %d\n",
81 SECTIONS_SHIFT,
82 NODES_SHIFT,
83 ZONES_SHIFT);
84 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets",
85 "Section %lu Node %lu Zone %lu\n",
86 (unsigned long)SECTIONS_PGSHIFT,
87 (unsigned long)NODES_PGSHIFT,
88 (unsigned long)ZONES_PGSHIFT);
89 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid",
90 "Zone ID: %lu -> %lu\n",
91 (unsigned long)ZONEID_PGOFF,
92 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT));
93 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
94 "location: %d -> %d unused %d -> %d flags %d -> %d\n",
95 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
96#ifdef NODE_NOT_IN_PAGE_FLAGS
97 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
98 "Node not in page flags");
99#endif
100
101 if (SECTIONS_WIDTH) {
102 shift -= SECTIONS_WIDTH;
103 BUG_ON(shift != SECTIONS_PGSHIFT);
104 }
105 if (NODES_WIDTH) {
106 shift -= NODES_WIDTH;
107 BUG_ON(shift != NODES_PGSHIFT);
108 }
109 if (ZONES_WIDTH) {
110 shift -= ZONES_WIDTH;
111 BUG_ON(shift != ZONES_PGSHIFT);
112 }
113
114 /* Check for bitmask overlaps */
115 or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
116 (NODES_MASK << NODES_PGSHIFT) |
117 (SECTIONS_MASK << SECTIONS_PGSHIFT);
118 add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
119 (NODES_MASK << NODES_PGSHIFT) +
120 (SECTIONS_MASK << SECTIONS_PGSHIFT);
121 BUG_ON(or_mask != add_mask);
122}
123
124void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone,
125 unsigned long nid, unsigned long pfn)
126{
127 BUG_ON(page_to_nid(page) != nid);
128 BUG_ON(page_zonenum(page) != zone);
129 BUG_ON(page_to_pfn(page) != pfn);
130}
131
132static __init int set_mminit_loglevel(char *str)
133{
134 get_option(&str, &mminit_loglevel);
135 return 0;
136}
137early_param("mminit_loglevel", set_mminit_loglevel);
138#endif /* CONFIG_DEBUG_MEMORY_INIT */
139
140struct kobject *mm_kobj;
141EXPORT_SYMBOL_GPL(mm_kobj);
142
143static int __init mm_sysfs_init(void)
144{
145 mm_kobj = kobject_create_and_add("mm", kernel_kobj);
146 if (!mm_kobj)
147 return -ENOMEM;
148
149 return 0;
150}
151
152__initcall(mm_sysfs_init);
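The verification above is compiled in only with CONFIG_DEBUG_MEMORY_INIT and stays silent unless the boot parameter raises mminit_loglevel. As a hedged sketch, other early-init code could hook into the same facility along these lines (sketch_verify_node() is invented here; mminit_dprintk() is assumed to be the printk-style helper this series adds to mm/internal.h):

	/* Illustrative only; boot with e.g. "mminit_loglevel=4" to see output. */
	static void __init sketch_verify_node(int nid)
	{
		pg_data_t *pgdat = NODE_DATA(nid);

		if (mminit_loglevel < MMINIT_VERIFY)
			return;

		mminit_dprintk(MMINIT_VERIFY, "node_spans",
			"node %d spans pfns %lu -> %lu\n", nid,
			pgdat->node_start_pfn,
			pgdat->node_start_pfn + pgdat->node_spanned_pages);
	}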
diff --git a/mm/mmap.c b/mm/mmap.c
index 3354fdd83d4b..e7a5a68a9c2e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -26,12 +26,15 @@
26#include <linux/mount.h> 26#include <linux/mount.h>
27#include <linux/mempolicy.h> 27#include <linux/mempolicy.h>
28#include <linux/rmap.h> 28#include <linux/rmap.h>
29#include <linux/mmu_notifier.h>
29 30
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/cacheflush.h> 32#include <asm/cacheflush.h>
32#include <asm/tlb.h> 33#include <asm/tlb.h>
33#include <asm/mmu_context.h> 34#include <asm/mmu_context.h>
34 35
36#include "internal.h"
37
35#ifndef arch_mmap_check 38#ifndef arch_mmap_check
36#define arch_mmap_check(addr, len, flags) (0) 39#define arch_mmap_check(addr, len, flags) (0)
37#endif 40#endif
@@ -72,8 +75,9 @@ pgprot_t protection_map[16] = {
72 75
73pgprot_t vm_get_page_prot(unsigned long vm_flags) 76pgprot_t vm_get_page_prot(unsigned long vm_flags)
74{ 77{
75 return protection_map[vm_flags & 78 return __pgprot(pgprot_val(protection_map[vm_flags &
76 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; 79 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
80 pgprot_val(arch_vm_get_page_prot(vm_flags)));
77} 81}
78EXPORT_SYMBOL(vm_get_page_prot); 82EXPORT_SYMBOL(vm_get_page_prot);
79 83
@@ -366,7 +370,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
366 if (vma_tmp->vm_end > addr) { 370 if (vma_tmp->vm_end > addr) {
367 vma = vma_tmp; 371 vma = vma_tmp;
368 if (vma_tmp->vm_start <= addr) 372 if (vma_tmp->vm_start <= addr)
369 return vma; 373 break;
370 __rb_link = &__rb_parent->rb_left; 374 __rb_link = &__rb_parent->rb_left;
371 } else { 375 } else {
372 rb_prev = __rb_parent; 376 rb_prev = __rb_parent;
@@ -1026,6 +1030,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
1026 } else { 1030 } else {
1027 switch (flags & MAP_TYPE) { 1031 switch (flags & MAP_TYPE) {
1028 case MAP_SHARED: 1032 case MAP_SHARED:
1033 /*
1034 * Ignore pgoff.
1035 */
1036 pgoff = 0;
1029 vm_flags |= VM_SHARED | VM_MAYSHARE; 1037 vm_flags |= VM_SHARED | VM_MAYSHARE;
1030 break; 1038 break;
1031 case MAP_PRIVATE: 1039 case MAP_PRIVATE:
@@ -1107,6 +1115,9 @@ munmap_back:
1107 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) 1115 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1108 return -ENOMEM; 1116 return -ENOMEM;
1109 1117
1118 if (flags & MAP_NORESERVE)
1119 vm_flags |= VM_NORESERVE;
1120
1110 if (accountable && (!(flags & MAP_NORESERVE) || 1121 if (accountable && (!(flags & MAP_NORESERVE) ||
1111 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { 1122 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
1112 if (vm_flags & VM_SHARED) { 1123 if (vm_flags & VM_SHARED) {
@@ -1762,7 +1773,7 @@ static void unmap_region(struct mm_struct *mm,
1762 update_hiwater_rss(mm); 1773 update_hiwater_rss(mm);
1763 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); 1774 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1764 vm_unacct_memory(nr_accounted); 1775 vm_unacct_memory(nr_accounted);
1765 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, 1776 free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
1766 next? next->vm_start: 0); 1777 next? next->vm_start: 0);
1767 tlb_finish_mmu(tlb, start, end); 1778 tlb_finish_mmu(tlb, start, end);
1768} 1779}
@@ -1806,7 +1817,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1806 struct mempolicy *pol; 1817 struct mempolicy *pol;
1807 struct vm_area_struct *new; 1818 struct vm_area_struct *new;
1808 1819
1809 if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) 1820 if (is_vm_hugetlb_page(vma) && (addr &
1821 ~(huge_page_mask(hstate_vma(vma)))))
1810 return -EINVAL; 1822 return -EINVAL;
1811 1823
1812 if (mm->map_count >= sysctl_max_map_count) 1824 if (mm->map_count >= sysctl_max_map_count)
@@ -2054,6 +2066,7 @@ void exit_mmap(struct mm_struct *mm)
2054 2066
2055 /* mm's last user has gone, and it's about to be pulled down */ 2067 /* mm's last user has gone, and it's about to be pulled down */
2056 arch_exit_mmap(mm); 2068 arch_exit_mmap(mm);
2069 mmu_notifier_release(mm);
2057 2070
2058 lru_add_drain(); 2071 lru_add_drain();
2059 flush_cache_mm(mm); 2072 flush_cache_mm(mm);
@@ -2062,7 +2075,7 @@ void exit_mmap(struct mm_struct *mm)
2062 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2075 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2063 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2076 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2064 vm_unacct_memory(nr_accounted); 2077 vm_unacct_memory(nr_accounted);
2065 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 2078 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
2066 tlb_finish_mmu(tlb, 0, end); 2079 tlb_finish_mmu(tlb, 0, end);
2067 2080
2068 /* 2081 /*
@@ -2261,3 +2274,167 @@ int install_special_mapping(struct mm_struct *mm,
2261 2274
2262 return 0; 2275 return 0;
2263} 2276}
2277
2278static DEFINE_MUTEX(mm_all_locks_mutex);
2279
2280static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2281{
2282 if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
2283 /*
2284 * The LSB of head.next can't change from under us
2285 * because we hold the mm_all_locks_mutex.
2286 */
2287 spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
2288 /*
2289 * We can safely modify head.next after taking the
2290 * anon_vma->lock. If some other vma in this mm shares
2291 * the same anon_vma we won't take it again.
2292 *
2293 * No need of atomic instructions here, head.next
2294 * can't change from under us thanks to the
2295 * anon_vma->lock.
2296 */
2297 if (__test_and_set_bit(0, (unsigned long *)
2298 &anon_vma->head.next))
2299 BUG();
2300 }
2301}
2302
2303static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2304{
2305 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2306 /*
2307 * AS_MM_ALL_LOCKS can't change from under us because
2308 * we hold the mm_all_locks_mutex.
2309 *
2310 * Operations on ->flags have to be atomic because
2311 * even if AS_MM_ALL_LOCKS is stable thanks to the
2312 * mm_all_locks_mutex, there may be other cpus
2313 * changing other bitflags in parallel to us.
2314 */
2315 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
2316 BUG();
2317 spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
2318 }
2319}
2320
2321/*
2322 * This operation locks against the VM for all pte/vma/mm related
2323 * operations that could ever happen on a certain mm. This includes
2324 * vmtruncate, try_to_unmap, and all page faults.
2325 *
2326 * The caller must take the mmap_sem in write mode before calling
2327 * mm_take_all_locks(). The caller isn't allowed to release the
2328 * mmap_sem until mm_drop_all_locks() returns.
2329 *
2330 * mmap_sem in write mode is required in order to block all operations
2331 * that could modify pagetables and free pages without needing to
2332 * alter the vma layout (for example populate_range() with
2333 * nonlinear vmas). It's also needed in write mode to prevent new
2334 * anon_vmas from being associated with existing vmas.
2335 *
2336 * A single task can't take more than one mm_take_all_locks() in a row
2337 * or it would deadlock.
2338 *
2339 * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
2340 * mapping->flags avoid taking the same lock twice, if more than one
2341 * vma in this mm is backed by the same anon_vma or address_space.
2342 *
2343 * We can take all the locks in random order because the VM code
2344 * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
2345 * takes more than one of them in a row. Secondly we're protected
2346 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
2347 *
2348 * mm_take_all_locks() and mm_drop_all_locks are expensive operations
2349 * that may have to take thousands of locks.
2350 *
2351 * mm_take_all_locks() can fail if it's interrupted by signals.
2352 */
2353int mm_take_all_locks(struct mm_struct *mm)
2354{
2355 struct vm_area_struct *vma;
2356 int ret = -EINTR;
2357
2358 BUG_ON(down_read_trylock(&mm->mmap_sem));
2359
2360 mutex_lock(&mm_all_locks_mutex);
2361
2362 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2363 if (signal_pending(current))
2364 goto out_unlock;
2365 if (vma->vm_file && vma->vm_file->f_mapping)
2366 vm_lock_mapping(mm, vma->vm_file->f_mapping);
2367 }
2368
2369 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2370 if (signal_pending(current))
2371 goto out_unlock;
2372 if (vma->anon_vma)
2373 vm_lock_anon_vma(mm, vma->anon_vma);
2374 }
2375
2376 ret = 0;
2377
2378out_unlock:
2379 if (ret)
2380 mm_drop_all_locks(mm);
2381
2382 return ret;
2383}
2384
2385static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2386{
2387 if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
2388 /*
2389 * The LSB of head.next can't change to 0 from under
2390 * us because we hold the mm_all_locks_mutex.
2391 *
2392 * We must however clear the bitflag before unlocking
2393 * the vma so the users using the anon_vma->head will
2394 * never see our bitflag.
2395 *
2396 * No need of atomic instructions here, head.next
2397 * can't change from under us until we release the
2398 * anon_vma->lock.
2399 */
2400 if (!__test_and_clear_bit(0, (unsigned long *)
2401 &anon_vma->head.next))
2402 BUG();
2403 spin_unlock(&anon_vma->lock);
2404 }
2405}
2406
2407static void vm_unlock_mapping(struct address_space *mapping)
2408{
2409 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2410 /*
2411 * AS_MM_ALL_LOCKS can't change to 0 from under us
2412 * because we hold the mm_all_locks_mutex.
2413 */
2414 spin_unlock(&mapping->i_mmap_lock);
2415 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
2416 &mapping->flags))
2417 BUG();
2418 }
2419}
2420
2421/*
2422 * The mmap_sem cannot be released by the caller until
2423 * mm_drop_all_locks() returns.
2424 */
2425void mm_drop_all_locks(struct mm_struct *mm)
2426{
2427 struct vm_area_struct *vma;
2428
2429 BUG_ON(down_read_trylock(&mm->mmap_sem));
2430 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
2431
2432 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2433 if (vma->anon_vma)
2434 vm_unlock_anon_vma(vma->anon_vma);
2435 if (vma->vm_file && vma->vm_file->f_mapping)
2436 vm_unlock_mapping(vma->vm_file->f_mapping);
2437 }
2438
2439 mutex_unlock(&mm_all_locks_mutex);
2440}
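mm_take_all_locks() and mm_drop_all_locks() are consumed by do_mmu_notifier_register() later in this patch; condensed to a sketch (sketch_global_mm_update() is hypothetical and error handling is trimmed), the calling convention described in the comment above amounts to:

	static int sketch_global_mm_update(struct mm_struct *mm)
	{
		int ret;

		down_write(&mm->mmap_sem);	/* required, held until the end */
		ret = mm_take_all_locks(mm);	/* 0 on success, -EINTR on signal */
		if (!ret) {
			/* Every anon_vma->lock and i_mmap_lock of this mm is
			 * held: no pte can change behind our back. */
			mm_drop_all_locks(mm);
		}
		up_write(&mm->mmap_sem);
		return ret;
	}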
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
index 000000000000..5f4ef0250bee
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,277 @@
1/*
2 * linux/mm/mmu_notifier.c
3 *
4 * Copyright (C) 2008 Qumranet, Inc.
5 * Copyright (C) 2008 SGI
6 * Christoph Lameter <clameter@sgi.com>
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2. See
9 * the COPYING file in the top-level directory.
10 */
11
12#include <linux/rculist.h>
13#include <linux/mmu_notifier.h>
14#include <linux/module.h>
15#include <linux/mm.h>
16#include <linux/err.h>
17#include <linux/rcupdate.h>
18#include <linux/sched.h>
19
20/*
21 * This function can't run concurrently against mmu_notifier_register
22 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
23 * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
24 * in parallel despite there being no task using this mm any more,
25 * through the vmas outside of the exit_mmap context, such as with
26 * vmtruncate. This serializes against mmu_notifier_unregister with
27 * the mmu_notifier_mm->lock in addition to RCU and it serializes
28 * against the other mmu notifiers with RCU. struct mmu_notifier_mm
29 * can't go away from under us as exit_mmap holds an mm_count pin
30 * itself.
31 */
32void __mmu_notifier_release(struct mm_struct *mm)
33{
34 struct mmu_notifier *mn;
35
36 spin_lock(&mm->mmu_notifier_mm->lock);
37 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
38 mn = hlist_entry(mm->mmu_notifier_mm->list.first,
39 struct mmu_notifier,
40 hlist);
41 /*
42 * We arrived before mmu_notifier_unregister so
43 * mmu_notifier_unregister will do nothing other than
 44 * wait for ->release to finish and for
45 * mmu_notifier_unregister to return.
46 */
47 hlist_del_init_rcu(&mn->hlist);
48 /*
49 * RCU here will block mmu_notifier_unregister until
50 * ->release returns.
51 */
52 rcu_read_lock();
53 spin_unlock(&mm->mmu_notifier_mm->lock);
54 /*
55 * if ->release runs before mmu_notifier_unregister it
56 * must be handled as it's the only way for the driver
57 * to flush all existing sptes and stop the driver
58 * from establishing any more sptes before all the
59 * pages in the mm are freed.
60 */
61 if (mn->ops->release)
62 mn->ops->release(mn, mm);
63 rcu_read_unlock();
64 spin_lock(&mm->mmu_notifier_mm->lock);
65 }
66 spin_unlock(&mm->mmu_notifier_mm->lock);
67
68 /*
 69 * synchronize_rcu here prevents mmu_notifier_release from
 70 * returning to exit_mmap (which would proceed to free all pages
71 * in the mm) until the ->release method returns, if it was
72 * invoked by mmu_notifier_unregister.
73 *
74 * The mmu_notifier_mm can't go away from under us because one
 75 * mm_count is held by exit_mmap.
76 */
77 synchronize_rcu();
78}
79
80/*
81 * If no young bitflag is supported by the hardware, ->clear_flush_young can
 82 * unmap the address and return 1 or 0 depending on whether the mapping previously
83 * existed or not.
84 */
85int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
86 unsigned long address)
87{
88 struct mmu_notifier *mn;
89 struct hlist_node *n;
90 int young = 0;
91
92 rcu_read_lock();
93 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
94 if (mn->ops->clear_flush_young)
95 young |= mn->ops->clear_flush_young(mn, mm, address);
96 }
97 rcu_read_unlock();
98
99 return young;
100}
101
102void __mmu_notifier_invalidate_page(struct mm_struct *mm,
103 unsigned long address)
104{
105 struct mmu_notifier *mn;
106 struct hlist_node *n;
107
108 rcu_read_lock();
109 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
110 if (mn->ops->invalidate_page)
111 mn->ops->invalidate_page(mn, mm, address);
112 }
113 rcu_read_unlock();
114}
115
116void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
117 unsigned long start, unsigned long end)
118{
119 struct mmu_notifier *mn;
120 struct hlist_node *n;
121
122 rcu_read_lock();
123 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
124 if (mn->ops->invalidate_range_start)
125 mn->ops->invalidate_range_start(mn, mm, start, end);
126 }
127 rcu_read_unlock();
128}
129
130void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
131 unsigned long start, unsigned long end)
132{
133 struct mmu_notifier *mn;
134 struct hlist_node *n;
135
136 rcu_read_lock();
137 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
138 if (mn->ops->invalidate_range_end)
139 mn->ops->invalidate_range_end(mn, mm, start, end);
140 }
141 rcu_read_unlock();
142}
143
144static int do_mmu_notifier_register(struct mmu_notifier *mn,
145 struct mm_struct *mm,
146 int take_mmap_sem)
147{
148 struct mmu_notifier_mm *mmu_notifier_mm;
149 int ret;
150
151 BUG_ON(atomic_read(&mm->mm_users) <= 0);
152
153 ret = -ENOMEM;
154 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
155 if (unlikely(!mmu_notifier_mm))
156 goto out;
157
158 if (take_mmap_sem)
159 down_write(&mm->mmap_sem);
160 ret = mm_take_all_locks(mm);
161 if (unlikely(ret))
162 goto out_cleanup;
163
164 if (!mm_has_notifiers(mm)) {
165 INIT_HLIST_HEAD(&mmu_notifier_mm->list);
166 spin_lock_init(&mmu_notifier_mm->lock);
167 mm->mmu_notifier_mm = mmu_notifier_mm;
168 mmu_notifier_mm = NULL;
169 }
170 atomic_inc(&mm->mm_count);
171
172 /*
173 * Serialize the update against mmu_notifier_unregister. A
174 * side note: mmu_notifier_release can't run concurrently with
175 * us because we hold the mm_users pin (either implicitly as
176 * current->mm or explicitly with get_task_mm() or similar).
177 * We can't race against any other mmu notifier method either
178 * thanks to mm_take_all_locks().
179 */
180 spin_lock(&mm->mmu_notifier_mm->lock);
181 hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
182 spin_unlock(&mm->mmu_notifier_mm->lock);
183
184 mm_drop_all_locks(mm);
185out_cleanup:
186 if (take_mmap_sem)
187 up_write(&mm->mmap_sem);
188 /* kfree() does nothing if mmu_notifier_mm is NULL */
189 kfree(mmu_notifier_mm);
190out:
191 BUG_ON(atomic_read(&mm->mm_users) <= 0);
192 return ret;
193}
194
195/*
196 * Must not hold mmap_sem nor any other VM related lock when calling
197 * this registration function. Must also ensure mm_users can't go down
198 * to zero while this runs to avoid races with mmu_notifier_release,
199 * so mm has to be current->mm or the mm should be pinned safely such
200 * as with get_task_mm(). If the mm is not current->mm, the mm_users
201 * pin should be released by calling mmput after mmu_notifier_register
202 * returns. mmu_notifier_unregister must always be called to
203 * unregister the notifier. mm_count is automatically pinned to allow
204 * mmu_notifier_unregister to safely run at any time later, before or
205 * after exit_mmap. ->release will always be called before exit_mmap
206 * frees the pages.
207 */
208int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
209{
210 return do_mmu_notifier_register(mn, mm, 1);
211}
212EXPORT_SYMBOL_GPL(mmu_notifier_register);
213
214/*
215 * Same as mmu_notifier_register but here the caller must hold the
216 * mmap_sem in write mode.
217 */
218int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
219{
220 return do_mmu_notifier_register(mn, mm, 0);
221}
222EXPORT_SYMBOL_GPL(__mmu_notifier_register);
223
224/* this is called after the last mmu_notifier_unregister() returned */
225void __mmu_notifier_mm_destroy(struct mm_struct *mm)
226{
227 BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list));
228 kfree(mm->mmu_notifier_mm);
229 mm->mmu_notifier_mm = LIST_POISON1; /* debug */
230}
231
232/*
233 * This releases the mm_count pin automatically and frees the mm
234 * structure if it was the last user of it. It serializes against
235 * running mmu notifiers with RCU and against mmu_notifier_unregister
236 * with the unregister lock + RCU. All sptes must be dropped before
237 * calling mmu_notifier_unregister. ->release or any other notifier
238 * method may be invoked concurrently with mmu_notifier_unregister,
239 * and only after mmu_notifier_unregister returned we're guaranteed
240 * that ->release or any other method can't run anymore.
241 */
242void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
243{
244 BUG_ON(atomic_read(&mm->mm_count) <= 0);
245
246 spin_lock(&mm->mmu_notifier_mm->lock);
247 if (!hlist_unhashed(&mn->hlist)) {
248 hlist_del_rcu(&mn->hlist);
249
250 /*
251 * RCU here will force exit_mmap to wait ->release to finish
252 * before freeing the pages.
253 */
254 rcu_read_lock();
255 spin_unlock(&mm->mmu_notifier_mm->lock);
256 /*
257 * exit_mmap will block in mmu_notifier_release to
258 * guarantee ->release is called before freeing the
259 * pages.
260 */
261 if (mn->ops->release)
262 mn->ops->release(mn, mm);
263 rcu_read_unlock();
264 } else
265 spin_unlock(&mm->mmu_notifier_mm->lock);
266
267 /*
268 * Wait for any running method to finish, including
269 * ->release if it was run by mmu_notifier_release instead of us.
270 */
271 synchronize_rcu();
272
273 BUG_ON(atomic_read(&mm->mm_count) <= 0);
274
275 mmdrop(mm);
276}
277EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
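For orientation, a hypothetical consumer of this API (not part of the patch; KVM and GRU are the intended in-tree users, and every sketch_* name below is invented) would wire up its callbacks roughly as follows, observing the registration rules documented above:

	static void sketch_invalidate_page(struct mmu_notifier *mn,
					   struct mm_struct *mm,
					   unsigned long address)
	{
		/* Tear down any secondary mapping (spte, device TLB) for
		 * 'address' here. */
	}

	static void sketch_release(struct mmu_notifier *mn, struct mm_struct *mm)
	{
		/* Flush all secondary mappings; none may be created after this. */
	}

	static const struct mmu_notifier_ops sketch_ops = {
		.release		= sketch_release,
		.invalidate_page	= sketch_invalidate_page,
	};

	static struct mmu_notifier sketch_mn = { .ops = &sketch_ops };

	static int sketch_attach_current(void)
	{
		/* current->mm keeps mm_users elevated, satisfying the rule
		 * above; mmu_notifier_unregister() must be called later. */
		return mmu_notifier_register(&sketch_mn, current->mm);
	}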
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 486ed595ee6f..16ce8b955dcf 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -69,6 +69,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
69 (z->zone && !zref_in_nodemask(z, nodes))) 69 (z->zone && !zref_in_nodemask(z, nodes)))
70 z++; 70 z++;
71 71
72 *zone = zonelist_zone(z++); 72 *zone = zonelist_zone(z);
73 return z; 73 return z;
74} 74}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a5bf31c27375..fded06f923f4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -21,6 +21,7 @@
21#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/swap.h> 22#include <linux/swap.h>
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h>
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25#include <asm/pgtable.h> 26#include <asm/pgtable.h>
26#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
@@ -47,19 +48,17 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
47 if (pte_present(oldpte)) { 48 if (pte_present(oldpte)) {
48 pte_t ptent; 49 pte_t ptent;
49 50
50 /* Avoid an SMP race with hardware updated dirty/clean 51 ptent = ptep_modify_prot_start(mm, addr, pte);
51 * bits by wiping the pte and then setting the new pte
52 * into place.
53 */
54 ptent = ptep_get_and_clear(mm, addr, pte);
55 ptent = pte_modify(ptent, newprot); 52 ptent = pte_modify(ptent, newprot);
53
56 /* 54 /*
57 * Avoid taking write faults for pages we know to be 55 * Avoid taking write faults for pages we know to be
58 * dirty. 56 * dirty.
59 */ 57 */
60 if (dirty_accountable && pte_dirty(ptent)) 58 if (dirty_accountable && pte_dirty(ptent))
61 ptent = pte_mkwrite(ptent); 59 ptent = pte_mkwrite(ptent);
62 set_pte_at(mm, addr, pte, ptent); 60
61 ptep_modify_prot_commit(mm, addr, pte, ptent);
63#ifdef CONFIG_MIGRATION 62#ifdef CONFIG_MIGRATION
64 } else if (!pte_file(oldpte)) { 63 } else if (!pte_file(oldpte)) {
65 swp_entry_t entry = pte_to_swp_entry(oldpte); 64 swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -155,12 +154,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
155 * If we make a private mapping writable we increase our commit; 154 * If we make a private mapping writable we increase our commit;
156 * but (without finer accounting) cannot reduce our commit if we 155 * but (without finer accounting) cannot reduce our commit if we
157 * make it unwritable again. 156 * make it unwritable again.
158 *
159 * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
160 * a MAP_NORESERVE private mapping to writable will now reserve.
161 */ 157 */
162 if (newflags & VM_WRITE) { 158 if (newflags & VM_WRITE) {
163 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { 159 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|
160 VM_SHARED|VM_NORESERVE))) {
164 charged = nrpages; 161 charged = nrpages;
165 if (security_vm_enough_memory(charged)) 162 if (security_vm_enough_memory(charged))
166 return -ENOMEM; 163 return -ENOMEM;
@@ -207,10 +204,12 @@ success:
207 dirty_accountable = 1; 204 dirty_accountable = 1;
208 } 205 }
209 206
207 mmu_notifier_invalidate_range_start(mm, start, end);
210 if (is_vm_hugetlb_page(vma)) 208 if (is_vm_hugetlb_page(vma))
211 hugetlb_change_protection(vma, start, end, vma->vm_page_prot); 209 hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
212 else 210 else
213 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); 211 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
212 mmu_notifier_invalidate_range_end(mm, start, end);
214 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 213 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
215 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 214 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
216 return 0; 215 return 0;
@@ -239,7 +238,7 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
239 end = start + len; 238 end = start + len;
240 if (end <= start) 239 if (end <= start)
241 return -ENOMEM; 240 return -ENOMEM;
242 if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) 241 if (!arch_validate_prot(prot))
243 return -EINVAL; 242 return -EINVAL;
244 243
245 reqprot = prot; 244 reqprot = prot;
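change_pte_range() now funnels protection changes through ptep_modify_prot_start()/ptep_modify_prot_commit(). On architectures that do not override the hooks, the generic fallbacks introduced alongside this change are expected to collapse back to the old wipe-then-set sequence, roughly (a sketch under that assumption, with sketch_* names):

	/* Sketch of the generic, non-overridden behaviour, for reference. */
	static inline pte_t sketch_modify_prot_start(struct mm_struct *mm,
						     unsigned long addr,
						     pte_t *ptep)
	{
		/* Wipe the pte so hardware A/D updates cannot race with us. */
		return ptep_get_and_clear(mm, addr, ptep);
	}

	static inline void sketch_modify_prot_commit(struct mm_struct *mm,
						     unsigned long addr,
						     pte_t *ptep, pte_t pte)
	{
		/* Install the modified pte in one go. */
		set_pte_at(mm, addr, ptep, pte);
	}

Paravirtualized guests, the motivation for the hook, can instead batch the two halves into a single hypervisor operation.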
diff --git a/mm/mremap.c b/mm/mremap.c
index 08e3c7f2bd15..1a7743923c8c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -18,6 +18,7 @@
18#include <linux/highmem.h> 18#include <linux/highmem.h>
19#include <linux/security.h> 19#include <linux/security.h>
20#include <linux/syscalls.h> 20#include <linux/syscalls.h>
21#include <linux/mmu_notifier.h>
21 22
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23#include <asm/cacheflush.h> 24#include <asm/cacheflush.h>
@@ -74,7 +75,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
74 struct mm_struct *mm = vma->vm_mm; 75 struct mm_struct *mm = vma->vm_mm;
75 pte_t *old_pte, *new_pte, pte; 76 pte_t *old_pte, *new_pte, pte;
76 spinlock_t *old_ptl, *new_ptl; 77 spinlock_t *old_ptl, *new_ptl;
78 unsigned long old_start;
77 79
80 old_start = old_addr;
81 mmu_notifier_invalidate_range_start(vma->vm_mm,
82 old_start, old_end);
78 if (vma->vm_file) { 83 if (vma->vm_file) {
79 /* 84 /*
80 * Subtle point from Rajesh Venkatasubramanian: before 85 * Subtle point from Rajesh Venkatasubramanian: before
@@ -116,6 +121,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
116 pte_unmap_unlock(old_pte - 1, old_ptl); 121 pte_unmap_unlock(old_pte - 1, old_ptl);
117 if (mapping) 122 if (mapping)
118 spin_unlock(&mapping->i_mmap_lock); 123 spin_unlock(&mapping->i_mmap_lock);
124 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
119} 125}
120 126
121#define LATENCY_LIMIT (64 * PAGE_SIZE) 127#define LATENCY_LIMIT (64 * PAGE_SIZE)
diff --git a/mm/nommu.c b/mm/nommu.c
index 4462b6a3fcb9..ed75bc962fbe 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,7 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/ptrace.h> 25#include <linux/tracehook.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
@@ -266,6 +266,27 @@ void *vmalloc_node(unsigned long size, int node)
266} 266}
267EXPORT_SYMBOL(vmalloc_node); 267EXPORT_SYMBOL(vmalloc_node);
268 268
269#ifndef PAGE_KERNEL_EXEC
270# define PAGE_KERNEL_EXEC PAGE_KERNEL
271#endif
272
273/**
274 * vmalloc_exec - allocate virtually contiguous, executable memory
275 * @size: allocation size
276 *
277 * Kernel-internal function to allocate enough pages to cover @size from
278 * the page level allocator and map them into contiguous and
279 * executable kernel virtual space.
280 *
281 * For tight control over page level allocator and protection flags
282 * use __vmalloc() instead.
283 */
284
285void *vmalloc_exec(unsigned long size)
286{
287 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
288}
289
269/** 290/**
270 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 291 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
271 * @size: allocation size 292 * @size: allocation size
@@ -745,7 +766,7 @@ static unsigned long determine_vm_flags(struct file *file,
745 * it's being traced - otherwise breakpoints set in it may interfere 766 * it's being traced - otherwise breakpoints set in it may interfere
746 * with another untraced process 767 * with another untraced process
747 */ 768 */
748 if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) 769 if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
749 vm_flags &= ~VM_MAYSHARE; 770 vm_flags &= ~VM_MAYSHARE;
750 771
751 return vm_flags; 772 return vm_flags;
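vmalloc_exec() gains a nommu definition above; a toy caller (purely illustrative, sketch_load_exec() is invented here) shows the intended contract:

	static void *sketch_load_exec(const void *image, size_t len)
	{
		void *code = vmalloc_exec(len);	/* contiguous, executable kernel VA */

		if (code)
			memcpy(code, image, len);
		return code;			/* release with vfree() when done */
	}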
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8a5467ee6265..64e5b4bcd964 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,6 +26,7 @@
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/notifier.h> 27#include <linux/notifier.h>
28#include <linux/memcontrol.h> 28#include <linux/memcontrol.h>
29#include <linux/security.h>
29 30
30int sysctl_panic_on_oom; 31int sysctl_panic_on_oom;
31int sysctl_oom_kill_allocating_task; 32int sysctl_oom_kill_allocating_task;
@@ -128,7 +129,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
128 * Superuser processes are usually more important, so we make it 129 * Superuser processes are usually more important, so we make it
129 * less likely that we kill those. 130 * less likely that we kill those.
130 */ 131 */
131 if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE)) 132 if (has_capability(p, CAP_SYS_ADMIN) ||
133 has_capability(p, CAP_SYS_RESOURCE))
132 points /= 4; 134 points /= 4;
133 135
134 /* 136 /*
@@ -137,7 +139,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
137 * tend to only have this flag set on applications they think 139 * tend to only have this flag set on applications they think
138 * of as important. 140 * of as important.
139 */ 141 */
140 if (__capable(p, CAP_SYS_RAWIO)) 142 if (has_capability(p, CAP_SYS_RAWIO))
141 points /= 4; 143 points /= 4;
142 144
143 /* 145 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 789b6adbef37..24de8b65fdbd 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -126,8 +126,6 @@ static void background_writeout(unsigned long _min_pages);
126static struct prop_descriptor vm_completions; 126static struct prop_descriptor vm_completions;
127static struct prop_descriptor vm_dirties; 127static struct prop_descriptor vm_dirties;
128 128
129static unsigned long determine_dirtyable_memory(void);
130
131/* 129/*
132 * couple the period to the dirty_ratio: 130 * couple the period to the dirty_ratio:
133 * 131 *
@@ -347,7 +345,13 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
347#endif 345#endif
348} 346}
349 347
350static unsigned long determine_dirtyable_memory(void) 348/**
349 * determine_dirtyable_memory - amount of memory that may be used
350 *
351 * Returns the number of pages that can currently be freed and used
352 * by the kernel for direct mappings.
353 */
354unsigned long determine_dirtyable_memory(void)
351{ 355{
352 unsigned long x; 356 unsigned long x;
353 357
@@ -956,6 +960,9 @@ retry:
956 } 960 }
957 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 961 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
958 mapping->writeback_index = index; 962 mapping->writeback_index = index;
963
964 if (wbc->range_cont)
965 wbc->range_start = index << PAGE_CACHE_SHIFT;
959 return ret; 966 return ret;
960} 967}
961EXPORT_SYMBOL(write_cache_pages); 968EXPORT_SYMBOL(write_cache_pages);
@@ -1081,7 +1088,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1081 if (!mapping) 1088 if (!mapping)
1082 return 1; 1089 return 1;
1083 1090
1084 write_lock_irq(&mapping->tree_lock); 1091 spin_lock_irq(&mapping->tree_lock);
1085 mapping2 = page_mapping(page); 1092 mapping2 = page_mapping(page);
1086 if (mapping2) { /* Race with truncate? */ 1093 if (mapping2) { /* Race with truncate? */
1087 BUG_ON(mapping2 != mapping); 1094 BUG_ON(mapping2 != mapping);
@@ -1095,7 +1102,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1095 radix_tree_tag_set(&mapping->page_tree, 1102 radix_tree_tag_set(&mapping->page_tree,
1096 page_index(page), PAGECACHE_TAG_DIRTY); 1103 page_index(page), PAGECACHE_TAG_DIRTY);
1097 } 1104 }
1098 write_unlock_irq(&mapping->tree_lock); 1105 spin_unlock_irq(&mapping->tree_lock);
1099 if (mapping->host) { 1106 if (mapping->host) {
1100 /* !PageAnon && !swapper_space */ 1107 /* !PageAnon && !swapper_space */
1101 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1108 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1251,7 +1258,7 @@ int test_clear_page_writeback(struct page *page)
1251 struct backing_dev_info *bdi = mapping->backing_dev_info; 1258 struct backing_dev_info *bdi = mapping->backing_dev_info;
1252 unsigned long flags; 1259 unsigned long flags;
1253 1260
1254 write_lock_irqsave(&mapping->tree_lock, flags); 1261 spin_lock_irqsave(&mapping->tree_lock, flags);
1255 ret = TestClearPageWriteback(page); 1262 ret = TestClearPageWriteback(page);
1256 if (ret) { 1263 if (ret) {
1257 radix_tree_tag_clear(&mapping->page_tree, 1264 radix_tree_tag_clear(&mapping->page_tree,
@@ -1262,7 +1269,7 @@ int test_clear_page_writeback(struct page *page)
1262 __bdi_writeout_inc(bdi); 1269 __bdi_writeout_inc(bdi);
1263 } 1270 }
1264 } 1271 }
1265 write_unlock_irqrestore(&mapping->tree_lock, flags); 1272 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1266 } else { 1273 } else {
1267 ret = TestClearPageWriteback(page); 1274 ret = TestClearPageWriteback(page);
1268 } 1275 }
@@ -1280,7 +1287,7 @@ int test_set_page_writeback(struct page *page)
1280 struct backing_dev_info *bdi = mapping->backing_dev_info; 1287 struct backing_dev_info *bdi = mapping->backing_dev_info;
1281 unsigned long flags; 1288 unsigned long flags;
1282 1289
1283 write_lock_irqsave(&mapping->tree_lock, flags); 1290 spin_lock_irqsave(&mapping->tree_lock, flags);
1284 ret = TestSetPageWriteback(page); 1291 ret = TestSetPageWriteback(page);
1285 if (!ret) { 1292 if (!ret) {
1286 radix_tree_tag_set(&mapping->page_tree, 1293 radix_tree_tag_set(&mapping->page_tree,
@@ -1293,7 +1300,7 @@ int test_set_page_writeback(struct page *page)
1293 radix_tree_tag_clear(&mapping->page_tree, 1300 radix_tree_tag_clear(&mapping->page_tree,
1294 page_index(page), 1301 page_index(page),
1295 PAGECACHE_TAG_DIRTY); 1302 PAGECACHE_TAG_DIRTY);
1296 write_unlock_irqrestore(&mapping->tree_lock, flags); 1303 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1297 } else { 1304 } else {
1298 ret = TestSetPageWriteback(page); 1305 ret = TestSetPageWriteback(page);
1299 } 1306 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2f552955a02f..27b8681139fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -153,9 +153,9 @@ static unsigned long __meminitdata dma_reserve;
153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; 153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; 154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
156 unsigned long __initdata required_kernelcore; 156 static unsigned long __initdata required_kernelcore;
157 static unsigned long __initdata required_movablecore; 157 static unsigned long __initdata required_movablecore;
158 unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 158 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
159 159
160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
161 int movable_zone; 161 int movable_zone;
@@ -264,17 +264,18 @@ static void free_compound_page(struct page *page)
264 __free_pages_ok(page, compound_order(page)); 264 __free_pages_ok(page, compound_order(page));
265} 265}
266 266
267static void prep_compound_page(struct page *page, unsigned long order) 267void prep_compound_page(struct page *page, unsigned long order)
268{ 268{
269 int i; 269 int i;
270 int nr_pages = 1 << order; 270 int nr_pages = 1 << order;
271 struct page *p = page + 1;
271 272
272 set_compound_page_dtor(page, free_compound_page); 273 set_compound_page_dtor(page, free_compound_page);
273 set_compound_order(page, order); 274 set_compound_order(page, order);
274 __SetPageHead(page); 275 __SetPageHead(page);
275 for (i = 1; i < nr_pages; i++) { 276 for (i = 1; i < nr_pages; i++, p++) {
276 struct page *p = page + i; 277 if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
277 278 p = pfn_to_page(page_to_pfn(page) + i);
278 __SetPageTail(p); 279 __SetPageTail(p);
279 p->first_page = page; 280 p->first_page = page;
280 } 281 }
@@ -284,6 +285,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
284{ 285{
285 int i; 286 int i;
286 int nr_pages = 1 << order; 287 int nr_pages = 1 << order;
288 struct page *p = page + 1;
287 289
288 if (unlikely(compound_order(page) != order)) 290 if (unlikely(compound_order(page) != order))
289 bad_page(page); 291 bad_page(page);
@@ -291,8 +293,9 @@ static void destroy_compound_page(struct page *page, unsigned long order)
291 if (unlikely(!PageHead(page))) 293 if (unlikely(!PageHead(page)))
292 bad_page(page); 294 bad_page(page);
293 __ClearPageHead(page); 295 __ClearPageHead(page);
294 for (i = 1; i < nr_pages; i++) { 296 for (i = 1; i < nr_pages; i++, p++) {
295 struct page *p = page + i; 297 if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
298 p = pfn_to_page(page_to_pfn(page) + i);
296 299
297 if (unlikely(!PageTail(p) | 300 if (unlikely(!PageTail(p) |
298 (p->first_page != page))) 301 (p->first_page != page)))
@@ -432,8 +435,9 @@ static inline void __free_one_page(struct page *page,
432 435
433 buddy = __page_find_buddy(page, page_idx, order); 436 buddy = __page_find_buddy(page, page_idx, order);
434 if (!page_is_buddy(page, buddy, order)) 437 if (!page_is_buddy(page, buddy, order))
435 break; /* Move the buddy up one level. */ 438 break;
436 439
440 /* Our buddy is free, merge with it and move up one order. */
437 list_del(&buddy->lru); 441 list_del(&buddy->lru);
438 zone->free_area[order].nr_free--; 442 zone->free_area[order].nr_free--;
439 rmv_page_order(buddy); 443 rmv_page_order(buddy);
@@ -532,7 +536,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
532/* 536/*
533 * permit the bootmem allocator to evade page validation on high-order frees 537 * permit the bootmem allocator to evade page validation on high-order frees
534 */ 538 */
535void __free_pages_bootmem(struct page *page, unsigned int order) 539void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
536{ 540{
537 if (order == 0) { 541 if (order == 0) {
538 __ClearPageReserved(page); 542 __ClearPageReserved(page);
@@ -673,9 +677,9 @@ static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
673 * Note that start_page and end_pages are not aligned on a pageblock 677 * Note that start_page and end_pages are not aligned on a pageblock
674 * boundary. If alignment is required, use move_freepages_block() 678 * boundary. If alignment is required, use move_freepages_block()
675 */ 679 */
676int move_freepages(struct zone *zone, 680static int move_freepages(struct zone *zone,
677 struct page *start_page, struct page *end_page, 681 struct page *start_page, struct page *end_page,
678 int migratetype) 682 int migratetype)
679{ 683{
680 struct page *page; 684 struct page *page;
681 unsigned long order; 685 unsigned long order;
@@ -693,6 +697,9 @@ int move_freepages(struct zone *zone,
693#endif 697#endif
694 698
695 for (page = start_page; page <= end_page;) { 699 for (page = start_page; page <= end_page;) {
700 /* Make sure we are not inadvertently changing nodes */
701 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
702
696 if (!pfn_valid_within(page_to_pfn(page))) { 703 if (!pfn_valid_within(page_to_pfn(page))) {
697 page++; 704 page++;
698 continue; 705 continue;
@@ -714,7 +721,8 @@ int move_freepages(struct zone *zone,
714 return pages_moved; 721 return pages_moved;
715} 722}
716 723
717int move_freepages_block(struct zone *zone, struct page *page, int migratetype) 724static int move_freepages_block(struct zone *zone, struct page *page,
725 int migratetype)
718{ 726{
719 unsigned long start_pfn, end_pfn; 727 unsigned long start_pfn, end_pfn;
720 struct page *start_page, *end_page; 728 struct page *start_page, *end_page;
@@ -918,7 +926,7 @@ void drain_local_pages(void *arg)
918 */ 926 */
919void drain_all_pages(void) 927void drain_all_pages(void)
920{ 928{
921 on_each_cpu(drain_local_pages, NULL, 0, 1); 929 on_each_cpu(drain_local_pages, NULL, 1);
922} 930}
923 931
924#ifdef CONFIG_HIBERNATION 932#ifdef CONFIG_HIBERNATION
@@ -1429,7 +1437,7 @@ try_next_zone:
1429/* 1437/*
1430 * This is the 'heart' of the zoned buddy allocator. 1438 * This is the 'heart' of the zoned buddy allocator.
1431 */ 1439 */
1432static struct page * 1440struct page *
1433__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1441__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
1434 struct zonelist *zonelist, nodemask_t *nodemask) 1442 struct zonelist *zonelist, nodemask_t *nodemask)
1435{ 1443{
@@ -1632,22 +1640,7 @@ nopage:
1632got_pg: 1640got_pg:
1633 return page; 1641 return page;
1634} 1642}
1635 1643EXPORT_SYMBOL(__alloc_pages_internal);
1636struct page *
1637__alloc_pages(gfp_t gfp_mask, unsigned int order,
1638 struct zonelist *zonelist)
1639{
1640 return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
1641}
1642
1643struct page *
1644__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1645 struct zonelist *zonelist, nodemask_t *nodemask)
1646{
1647 return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
1648}
1649
1650EXPORT_SYMBOL(__alloc_pages);
1651 1644
1652/* 1645/*
1653 * Common helper functions. 1646 * Common helper functions.
@@ -1711,6 +1704,59 @@ void free_pages(unsigned long addr, unsigned int order)
1711 1704
1712EXPORT_SYMBOL(free_pages); 1705EXPORT_SYMBOL(free_pages);
1713 1706
1707/**
1708 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
1709 * @size: the number of bytes to allocate
1710 * @gfp_mask: GFP flags for the allocation
1711 *
1712 * This function is similar to alloc_pages(), except that it allocates the
1713 * minimum number of pages to satisfy the request. alloc_pages() can only
1714 * allocate memory in power-of-two pages.
1715 *
1716 * This function is also limited by MAX_ORDER.
1717 *
1718 * Memory allocated by this function must be released by free_pages_exact().
1719 */
1720void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
1721{
1722 unsigned int order = get_order(size);
1723 unsigned long addr;
1724
1725 addr = __get_free_pages(gfp_mask, order);
1726 if (addr) {
1727 unsigned long alloc_end = addr + (PAGE_SIZE << order);
1728 unsigned long used = addr + PAGE_ALIGN(size);
1729
1730 split_page(virt_to_page(addr), order);
1731 while (used < alloc_end) {
1732 free_page(used);
1733 used += PAGE_SIZE;
1734 }
1735 }
1736
1737 return (void *)addr;
1738}
1739EXPORT_SYMBOL(alloc_pages_exact);
1740
1741/**
1742 * free_pages_exact - release memory allocated via alloc_pages_exact()
1743 * @virt: the value returned by alloc_pages_exact.
1744 * @size: size of allocation, same value as passed to alloc_pages_exact().
1745 *
1746 * Release the memory allocated by a previous call to alloc_pages_exact.
1747 */
1748void free_pages_exact(void *virt, size_t size)
1749{
1750 unsigned long addr = (unsigned long)virt;
1751 unsigned long end = addr + PAGE_ALIGN(size);
1752
1753 while (addr < end) {
1754 free_page(addr);
1755 addr += PAGE_SIZE;
1756 }
1757}
1758EXPORT_SYMBOL(free_pages_exact);
1759
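A short usage sketch for the new pair (sketch_use_exact_pages() is hypothetical and shown only to illustrate the contract):

	static int sketch_use_exact_pages(void)
	{
		/* 5 pages: alloc_pages() would round up to order 3 (8 pages). */
		void *buf = alloc_pages_exact(5 * PAGE_SIZE, GFP_KERNEL);

		if (!buf)
			return -ENOMEM;
		/* ... use the 5 contiguous pages ... */
		free_pages_exact(buf, 5 * PAGE_SIZE);
		return 0;
	}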
1714static unsigned int nr_free_zone_pages(int offset) 1760static unsigned int nr_free_zone_pages(int offset)
1715{ 1761{
1716 struct zoneref *z; 1762 struct zoneref *z;
@@ -2328,12 +2374,11 @@ static void build_zonelists(pg_data_t *pgdat)
2328static void build_zonelist_cache(pg_data_t *pgdat) 2374static void build_zonelist_cache(pg_data_t *pgdat)
2329{ 2375{
2330 pgdat->node_zonelists[0].zlcache_ptr = NULL; 2376 pgdat->node_zonelists[0].zlcache_ptr = NULL;
2331 pgdat->node_zonelists[1].zlcache_ptr = NULL;
2332} 2377}
2333 2378
2334#endif /* CONFIG_NUMA */ 2379#endif /* CONFIG_NUMA */
2335 2380
2336/* return values int ....just for stop_machine_run() */ 2381/* return values int ....just for stop_machine() */
2337static int __build_all_zonelists(void *dummy) 2382static int __build_all_zonelists(void *dummy)
2338{ 2383{
2339 int nid; 2384 int nid;
@@ -2353,11 +2398,12 @@ void build_all_zonelists(void)
2353 2398
2354 if (system_state == SYSTEM_BOOTING) { 2399 if (system_state == SYSTEM_BOOTING) {
2355 __build_all_zonelists(NULL); 2400 __build_all_zonelists(NULL);
2401 mminit_verify_zonelist();
2356 cpuset_init_current_mems_allowed(); 2402 cpuset_init_current_mems_allowed();
2357 } else { 2403 } else {
2358 /* we have to stop all cpus to guarantee there is no user 2404 /* we have to stop all cpus to guarantee there is no user
2359 of zonelist */ 2405 of zonelist */
2360 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); 2406 stop_machine(__build_all_zonelists, NULL, NULL);
2361 /* cpuset refresh routine should be here */ 2407 /* cpuset refresh routine should be here */
2362 } 2408 }
2363 vm_total_pages = nr_free_pagecache_pages(); 2409 vm_total_pages = nr_free_pagecache_pages();
@@ -2476,6 +2522,10 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2476 continue; 2522 continue;
2477 page = pfn_to_page(pfn); 2523 page = pfn_to_page(pfn);
2478 2524
2525 /* Watch out for overlapping nodes */
2526 if (page_to_nid(page) != zone_to_nid(zone))
2527 continue;
2528
2479 /* Blocks with reserved pages will never free, skip them. */ 2529 /* Blocks with reserved pages will never free, skip them. */
2480 if (PageReserved(page)) 2530 if (PageReserved(page))
2481 continue; 2531 continue;
@@ -2535,6 +2585,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2535 } 2585 }
2536 page = pfn_to_page(pfn); 2586 page = pfn_to_page(pfn);
2537 set_page_links(page, zone, nid, pfn); 2587 set_page_links(page, zone, nid, pfn);
2588 mminit_verify_page_links(page, zone, nid, pfn);
2538 init_page_count(page); 2589 init_page_count(page);
2539 reset_page_mapcount(page); 2590 reset_page_mapcount(page);
2540 SetPageReserved(page); 2591 SetPageReserved(page);
@@ -2612,7 +2663,7 @@ static int zone_batchsize(struct zone *zone)
2612 return batch; 2663 return batch;
2613} 2664}
2614 2665
2615inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 2666static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2616{ 2667{
2617 struct per_cpu_pages *pcp; 2668 struct per_cpu_pages *pcp;
2618 2669
@@ -2837,6 +2888,12 @@ __meminit int init_currently_empty_zone(struct zone *zone,
2837 2888
2838 zone->zone_start_pfn = zone_start_pfn; 2889 zone->zone_start_pfn = zone_start_pfn;
2839 2890
2891 mminit_dprintk(MMINIT_TRACE, "memmap_init",
2892 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
2893 pgdat->node_id,
2894 (unsigned long)zone_idx(zone),
2895 zone_start_pfn, (zone_start_pfn + size));
2896
2840 zone_init_free_lists(zone); 2897 zone_init_free_lists(zone);
2841 2898
2842 return 0; 2899 return 0;
@@ -2930,6 +2987,18 @@ void __init free_bootmem_with_active_regions(int nid,
2930 } 2987 }
2931} 2988}
2932 2989
2990void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
2991{
2992 int i;
2993 int ret;
2994
2995 for_each_active_range_index_in_nid(i, nid) {
2996 ret = work_fn(early_node_map[i].start_pfn,
2997 early_node_map[i].end_pfn, data);
2998 if (ret)
2999 break;
3000 }
3001}
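A sketch of how a caller might use the new iterator (hypothetical early-boot code; the work_fn_t typedef and the declaration in include/linux/mm.h are assumed to come from the header half of this patch):

#include <linux/init.h>
#include <linux/mm.h>

/* Hypothetical work_fn_t callback: total up the pages in every active
 * range registered for a node.  The (start_pfn, end_pfn, data) signature
 * and the "stop the walk on non-zero return" convention are taken from
 * the loop above.
 */
static int __init count_active_pages(unsigned long start_pfn,
                                     unsigned long end_pfn, void *data)
{
        unsigned long *total = data;

        *total += end_pfn - start_pfn;  /* end_pfn treated as exclusive */
        return 0;                       /* non-zero would stop the walk */
}

static unsigned long __init node_active_pages(int nid)
{
        unsigned long total = 0;

        work_with_active_regions(nid, count_active_pages, &total);
        return total;
}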
2933/** 3002/**
2934 * sparse_memory_present_with_active_regions - Call memory_present for each active range 3003 * sparse_memory_present_with_active_regions - Call memory_present for each active range
2935 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 3004 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -2964,7 +3033,8 @@ void __init sparse_memory_present_with_active_regions(int nid)
2964void __init push_node_boundaries(unsigned int nid, 3033void __init push_node_boundaries(unsigned int nid,
2965 unsigned long start_pfn, unsigned long end_pfn) 3034 unsigned long start_pfn, unsigned long end_pfn)
2966{ 3035{
2967 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", 3036 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3037 "Entering push_node_boundaries(%u, %lu, %lu)\n",
2968 nid, start_pfn, end_pfn); 3038 nid, start_pfn, end_pfn);
2969 3039
2970 /* Initialise the boundary for this node if necessary */ 3040 /* Initialise the boundary for this node if necessary */
@@ -2982,7 +3052,8 @@ void __init push_node_boundaries(unsigned int nid,
2982static void __meminit account_node_boundary(unsigned int nid, 3052static void __meminit account_node_boundary(unsigned int nid,
2983 unsigned long *start_pfn, unsigned long *end_pfn) 3053 unsigned long *start_pfn, unsigned long *end_pfn)
2984{ 3054{
2985 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", 3055 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3056 "Entering account_node_boundary(%u, %lu, %lu)\n",
2986 nid, *start_pfn, *end_pfn); 3057 nid, *start_pfn, *end_pfn);
2987 3058
2988 /* Return if boundary information has not been provided */ 3059 /* Return if boundary information has not been provided */
@@ -3039,7 +3110,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
3039 * assumption is made that zones within a node are ordered in monotonic 3110 * assumption is made that zones within a node are ordered in monotonic
3040 * increasing memory addresses so that the "highest" populated zone is used 3111 * increasing memory addresses so that the "highest" populated zone is used
3041 */ 3112 */
3042void __init find_usable_zone_for_movable(void) 3113static void __init find_usable_zone_for_movable(void)
3043{ 3114{
3044 int zone_index; 3115 int zone_index;
3045 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 3116 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
@@ -3065,7 +3136,7 @@ void __init find_usable_zone_for_movable(void)
3065 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 3136 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
3066 * zones within a node are in order of monotonic increases memory addresses 3137 * zones within a node are in order of monotonic increases memory addresses
3067 */ 3138 */
3068void __meminit adjust_zone_range_for_zone_movable(int nid, 3139static void __meminit adjust_zone_range_for_zone_movable(int nid,
3069 unsigned long zone_type, 3140 unsigned long zone_type,
3070 unsigned long node_start_pfn, 3141 unsigned long node_start_pfn,
3071 unsigned long node_end_pfn, 3142 unsigned long node_end_pfn,
@@ -3126,7 +3197,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3126 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3197 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3127 * then all holes in the requested range will be accounted for. 3198 * then all holes in the requested range will be accounted for.
3128 */ 3199 */
3129unsigned long __meminit __absent_pages_in_range(int nid, 3200static unsigned long __meminit __absent_pages_in_range(int nid,
3130 unsigned long range_start_pfn, 3201 unsigned long range_start_pfn,
3131 unsigned long range_end_pfn) 3202 unsigned long range_end_pfn)
3132{ 3203{
@@ -3357,8 +3428,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3357 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 3428 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
3358 if (realsize >= memmap_pages) { 3429 if (realsize >= memmap_pages) {
3359 realsize -= memmap_pages; 3430 realsize -= memmap_pages;
3360 printk(KERN_DEBUG 3431 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3361 " %s zone: %lu pages used for memmap\n", 3432 "%s zone: %lu pages used for memmap\n",
3362 zone_names[j], memmap_pages); 3433 zone_names[j], memmap_pages);
3363 } else 3434 } else
3364 printk(KERN_WARNING 3435 printk(KERN_WARNING
@@ -3368,7 +3439,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3368 /* Account for reserved pages */ 3439 /* Account for reserved pages */
3369 if (j == 0 && realsize > dma_reserve) { 3440 if (j == 0 && realsize > dma_reserve) {
3370 realsize -= dma_reserve; 3441 realsize -= dma_reserve;
3371 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 3442 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3443 "%s zone: %lu pages reserved\n",
3372 zone_names[0], dma_reserve); 3444 zone_names[0], dma_reserve);
3373 } 3445 }
3374 3446
@@ -3453,15 +3525,21 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
3453#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 3525#endif /* CONFIG_FLAT_NODE_MEM_MAP */
3454} 3526}
3455 3527
3456void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, 3528void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
3457 unsigned long *zones_size, unsigned long node_start_pfn, 3529 unsigned long node_start_pfn, unsigned long *zholes_size)
3458 unsigned long *zholes_size)
3459{ 3530{
3531 pg_data_t *pgdat = NODE_DATA(nid);
3532
3460 pgdat->node_id = nid; 3533 pgdat->node_id = nid;
3461 pgdat->node_start_pfn = node_start_pfn; 3534 pgdat->node_start_pfn = node_start_pfn;
3462 calculate_node_totalpages(pgdat, zones_size, zholes_size); 3535 calculate_node_totalpages(pgdat, zones_size, zholes_size);
3463 3536
3464 alloc_node_mem_map(pgdat); 3537 alloc_node_mem_map(pgdat);
3538#ifdef CONFIG_FLAT_NODE_MEM_MAP
3539 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
3540 nid, (unsigned long)pgdat,
3541 (unsigned long)pgdat->node_mem_map);
3542#endif
3465 3543
3466 free_area_init_core(pgdat, zones_size, zholes_size); 3544 free_area_init_core(pgdat, zones_size, zholes_size);
3467} 3545}
@@ -3504,10 +3582,13 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3504{ 3582{
3505 int i; 3583 int i;
3506 3584
3507 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " 3585 mminit_dprintk(MMINIT_TRACE, "memory_register",
3508 "%d entries of %d used\n", 3586 "Entering add_active_range(%d, %#lx, %#lx) "
3509 nid, start_pfn, end_pfn, 3587 "%d entries of %d used\n",
3510 nr_nodemap_entries, MAX_ACTIVE_REGIONS); 3588 nid, start_pfn, end_pfn,
3589 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
3590
3591 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
3511 3592
3512 /* Merge with existing active regions if possible */ 3593 /* Merge with existing active regions if possible */
3513 for (i = 0; i < nr_nodemap_entries; i++) { 3594 for (i = 0; i < nr_nodemap_entries; i++) {
@@ -3548,27 +3629,68 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3548} 3629}
3549 3630
3550/** 3631/**
3551 * shrink_active_range - Shrink an existing registered range of PFNs 3632 * remove_active_range - Shrink an existing registered range of PFNs
3552 * @nid: The node id the range is on that should be shrunk 3633 * @nid: The node id the range is on that should be shrunk
3553 * @old_end_pfn: The old end PFN of the range 3634 * @start_pfn: The start PFN of the range to remove
3554 * @new_end_pfn: The new PFN of the range 3635 * @end_pfn: The end PFN of the range to remove
3555 * 3636 *
3556 * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. 3637 * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
3557 * The map is kept at the end physical page range that has already been 3638 * The map is kept near the end physical page range that has already been
3558 * registered with add_active_range(). This function allows an arch to shrink 3639 * registered. This function allows an arch to shrink an existing registered
3559 * an existing registered range. 3640 * range.
3560 */ 3641 */
3561void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, 3642void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
3562 unsigned long new_end_pfn) 3643 unsigned long end_pfn)
3563{ 3644{
3564 int i; 3645 int i, j;
3646 int removed = 0;
3647
3648 printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n",
3649 nid, start_pfn, end_pfn);
3565 3650
3566 /* Find the old active region end and shrink */ 3651 /* Find the old active region end and shrink */
3567 for_each_active_range_index_in_nid(i, nid) 3652 for_each_active_range_index_in_nid(i, nid) {
3568 if (early_node_map[i].end_pfn == old_end_pfn) { 3653 if (early_node_map[i].start_pfn >= start_pfn &&
3569 early_node_map[i].end_pfn = new_end_pfn; 3654 early_node_map[i].end_pfn <= end_pfn) {
3570 break; 3655 /* clear it */
3656 early_node_map[i].start_pfn = 0;
3657 early_node_map[i].end_pfn = 0;
3658 removed = 1;
3659 continue;
3660 }
3661 if (early_node_map[i].start_pfn < start_pfn &&
3662 early_node_map[i].end_pfn > start_pfn) {
3663 unsigned long temp_end_pfn = early_node_map[i].end_pfn;
3664 early_node_map[i].end_pfn = start_pfn;
3665 if (temp_end_pfn > end_pfn)
3666 add_active_range(nid, end_pfn, temp_end_pfn);
3667 continue;
3668 }
3669 if (early_node_map[i].start_pfn >= start_pfn &&
3670 early_node_map[i].end_pfn > end_pfn &&
3671 early_node_map[i].start_pfn < end_pfn) {
3672 early_node_map[i].start_pfn = end_pfn;
3673 continue;
3571 } 3674 }
3675 }
3676
3677 if (!removed)
3678 return;
3679
3680 /* remove the blank ones */
3681 for (i = nr_nodemap_entries - 1; i > 0; i--) {
3682 if (early_node_map[i].nid != nid)
3683 continue;
3684 if (early_node_map[i].end_pfn)
3685 continue;
3686 /* we found it, get rid of it */
3687 for (j = i; j < nr_nodemap_entries - 1; j++)
3688 memcpy(&early_node_map[j], &early_node_map[j+1],
3689 sizeof(early_node_map[j]));
3690 j = nr_nodemap_entries - 1;
3691 memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
3692 nr_nodemap_entries--;
3693 }
3572} 3694}
3573 3695
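To illustrate the three cases handled by the rewritten loop above (a hypothetical layout, not taken from the patch): suppose early_node_map for node 0 holds the single range [0x100, 0x500). remove_active_range(0, 0x200, 0x300) hits the middle case, because the existing range starts below start_pfn: the entry is truncated to [0x100, 0x200), and since its old end 0x500 lies above end_pfn, the tail [0x300, 0x500) is re-registered via add_active_range(), leaving the range split in two. A call that covers the whole entry, say remove_active_range(0, 0x0, 0x600), hits the first case instead: the entry is zeroed, removed is set, and the clean-up loop at the bottom compacts the emptied slot out of early_node_map.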
3574/** 3696/**
@@ -3612,7 +3734,7 @@ static void __init sort_node_map(void)
3612} 3734}
3613 3735
3614/* Find the lowest pfn for a node */ 3736/* Find the lowest pfn for a node */
3615unsigned long __init find_min_pfn_for_node(unsigned long nid) 3737static unsigned long __init find_min_pfn_for_node(int nid)
3616{ 3738{
3617 int i; 3739 int i;
3618 unsigned long min_pfn = ULONG_MAX; 3740 unsigned long min_pfn = ULONG_MAX;
@@ -3623,7 +3745,7 @@ unsigned long __init find_min_pfn_for_node(unsigned long nid)
3623 3745
3624 if (min_pfn == ULONG_MAX) { 3746 if (min_pfn == ULONG_MAX) {
3625 printk(KERN_WARNING 3747 printk(KERN_WARNING
3626 "Could not find start_pfn for node %lu\n", nid); 3748 "Could not find start_pfn for node %d\n", nid);
3627 return 0; 3749 return 0;
3628 } 3750 }
3629 3751
@@ -3641,23 +3763,6 @@ unsigned long __init find_min_pfn_with_active_regions(void)
3641 return find_min_pfn_for_node(MAX_NUMNODES); 3763 return find_min_pfn_for_node(MAX_NUMNODES);
3642} 3764}
3643 3765
3644/**
3645 * find_max_pfn_with_active_regions - Find the maximum PFN registered
3646 *
3647 * It returns the maximum PFN based on information provided via
3648 * add_active_range().
3649 */
3650unsigned long __init find_max_pfn_with_active_regions(void)
3651{
3652 int i;
3653 unsigned long max_pfn = 0;
3654
3655 for (i = 0; i < nr_nodemap_entries; i++)
3656 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
3657
3658 return max_pfn;
3659}
3660
3661/* 3766/*
3662 * early_calculate_totalpages() 3767 * early_calculate_totalpages()
3663 * Sum pages in active regions for movable zone. 3768 * Sum pages in active regions for movable zone.
@@ -3684,7 +3789,7 @@ static unsigned long __init early_calculate_totalpages(void)
3684 * memory. When they don't, some nodes will have more kernelcore than 3789 * memory. When they don't, some nodes will have more kernelcore than
3685 * others 3790 * others
3686 */ 3791 */
3687void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 3792static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3688{ 3793{
3689 int i, nid; 3794 int i, nid;
3690 unsigned long usable_startpfn; 3795 unsigned long usable_startpfn;
@@ -3879,7 +3984,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3879 for (i = 0; i < MAX_NR_ZONES; i++) { 3984 for (i = 0; i < MAX_NR_ZONES; i++) {
3880 if (i == ZONE_MOVABLE) 3985 if (i == ZONE_MOVABLE)
3881 continue; 3986 continue;
3882 printk(" %-8s %8lu -> %8lu\n", 3987 printk(" %-8s %0#10lx -> %0#10lx\n",
3883 zone_names[i], 3988 zone_names[i],
3884 arch_zone_lowest_possible_pfn[i], 3989 arch_zone_lowest_possible_pfn[i],
3885 arch_zone_highest_possible_pfn[i]); 3990 arch_zone_highest_possible_pfn[i]);
@@ -3895,15 +4000,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3895 /* Print out the early_node_map[] */ 4000 /* Print out the early_node_map[] */
3896 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); 4001 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
3897 for (i = 0; i < nr_nodemap_entries; i++) 4002 for (i = 0; i < nr_nodemap_entries; i++)
3898 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, 4003 printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
3899 early_node_map[i].start_pfn, 4004 early_node_map[i].start_pfn,
3900 early_node_map[i].end_pfn); 4005 early_node_map[i].end_pfn);
3901 4006
3902 /* Initialise every node */ 4007 /* Initialise every node */
4008 mminit_verify_pageflags_layout();
3903 setup_nr_node_ids(); 4009 setup_nr_node_ids();
3904 for_each_online_node(nid) { 4010 for_each_online_node(nid) {
3905 pg_data_t *pgdat = NODE_DATA(nid); 4011 pg_data_t *pgdat = NODE_DATA(nid);
3906 free_area_init_node(nid, pgdat, NULL, 4012 free_area_init_node(nid, NULL,
3907 find_min_pfn_for_node(nid), NULL); 4013 find_min_pfn_for_node(nid), NULL);
3908 4014
3909 /* Any memory on that node */ 4015 /* Any memory on that node */
@@ -3968,15 +4074,13 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
3968} 4074}
3969 4075
3970#ifndef CONFIG_NEED_MULTIPLE_NODES 4076#ifndef CONFIG_NEED_MULTIPLE_NODES
3971static bootmem_data_t contig_bootmem_data; 4077struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] };
3972struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
3973
3974EXPORT_SYMBOL(contig_page_data); 4078EXPORT_SYMBOL(contig_page_data);
3975#endif 4079#endif
3976 4080
3977void __init free_area_init(unsigned long *zones_size) 4081void __init free_area_init(unsigned long *zones_size)
3978{ 4082{
3979 free_area_init_node(0, NODE_DATA(0), zones_size, 4083 free_area_init_node(0, zones_size,
3980 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 4084 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
3981} 4085}
3982 4086
@@ -4343,7 +4447,7 @@ void *__init alloc_large_system_hash(const char *tablename,
4343 do { 4447 do {
4344 size = bucketsize << log2qty; 4448 size = bucketsize << log2qty;
4345 if (flags & HASH_EARLY) 4449 if (flags & HASH_EARLY)
4346 table = alloc_bootmem(size); 4450 table = alloc_bootmem_nopanic(size);
4347 else if (hashdist) 4451 else if (hashdist)
4348 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4452 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4349 else { 4453 else {
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 3444b58033c8..b70a7fec1ff6 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -2,7 +2,6 @@
2 * linux/mm/page_isolation.c 2 * linux/mm/page_isolation.c
3 */ 3 */
4 4
5#include <stddef.h>
6#include <linux/mm.h> 5#include <linux/mm.h>
7#include <linux/page-isolation.h> 6#include <linux/page-isolation.h>
8#include <linux/pageblock-flags.h> 7#include <linux/pageblock-flags.h>
@@ -115,8 +114,10 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
115 114
116int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 115int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
117{ 116{
118 unsigned long pfn; 117 unsigned long pfn, flags;
119 struct page *page; 118 struct page *page;
119 struct zone *zone;
120 int ret;
120 121
121 pfn = start_pfn; 122 pfn = start_pfn;
122 /* 123 /*
@@ -132,7 +133,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
132 if (pfn < end_pfn) 133 if (pfn < end_pfn)
133 return -EBUSY; 134 return -EBUSY;
134 /* Check all pages are free or Marked as ISOLATED */ 135 /* Check all pages are free or Marked as ISOLATED */
135 if (__test_page_isolated_in_pageblock(start_pfn, end_pfn)) 136 zone = page_zone(pfn_to_page(pfn));
136 return 0; 137 spin_lock_irqsave(&zone->lock, flags);
137 return -EBUSY; 138 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
139 spin_unlock_irqrestore(&zone->lock, flags);
140 return ret ? 0 : -EBUSY;
138} 141}
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 9d834aa4b979..0cbe0c60c6bf 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -130,7 +130,7 @@ static int __pdflush(struct pdflush_work *my_work)
130 * Thread creation: For how long have there been zero 130 * Thread creation: For how long have there been zero
131 * available threads? 131 * available threads?
132 */ 132 */
133 if (jiffies - last_empty_jifs > 1 * HZ) { 133 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
134 /* unlocked list_empty() test is OK here */ 134 /* unlocked list_empty() test is OK here */
135 if (list_empty(&pdflush_list)) { 135 if (list_empty(&pdflush_list)) {
136 /* unlocked test is OK here */ 136 /* unlocked test is OK here */
@@ -151,7 +151,7 @@ static int __pdflush(struct pdflush_work *my_work)
151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) 151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
152 continue; 152 continue;
153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); 153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
154 if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { 154 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
155 /* Limit exit rate */ 155 /* Limit exit rate */
156 pdf->when_i_went_to_sleep = jiffies; 156 pdf->when_i_went_to_sleep = jiffies;
157 break; /* exeunt */ 157 break; /* exeunt */
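The two pdflush hunks above replace open-coded jiffies subtraction with time_after() from <linux/jiffies.h>, which performs the comparison as a signed difference so it stays correct across a jiffies wrap and makes the "has one second passed?" intent explicit. A minimal sketch of the idiom (hypothetical timeout code, not from this patch):

#include <linux/errno.h>
#include <linux/jiffies.h>
#include <asm/processor.h>

/* Hypothetical helper: poll cond() until it holds or one second elapses.
 * wait_for_condition() and cond() are made-up names.
 */
static int wait_for_condition(int (*cond)(void))
{
        unsigned long deadline = jiffies + 1 * HZ;

        while (!cond()) {
                if (time_after(jiffies, deadline))
                        return -ETIMEDOUT;
                cpu_relax();
        }
        return 0;
}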
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 3f703f7cb398..8dbb6805ef35 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -26,7 +26,10 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
26static unsigned long max_pages(unsigned long min_pages) 26static unsigned long max_pages(unsigned long min_pages)
27{ 27{
28 unsigned long node_free_pages, max; 28 unsigned long node_free_pages, max;
29 struct zone *zones = NODE_DATA(numa_node_id())->node_zones; 29 int node = numa_node_id();
30 struct zone *zones = NODE_DATA(node)->node_zones;
31 int num_cpus_on_node;
32 node_to_cpumask_ptr(cpumask_on_node, node);
30 33
31 node_free_pages = 34 node_free_pages =
32#ifdef CONFIG_ZONE_DMA 35#ifdef CONFIG_ZONE_DMA
@@ -38,6 +41,10 @@ static unsigned long max_pages(unsigned long min_pages)
38 zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); 41 zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
39 42
40 max = node_free_pages / FRACTION_OF_NODE_MEM; 43 max = node_free_pages / FRACTION_OF_NODE_MEM;
44
45 num_cpus_on_node = cpus_weight_nr(*cpumask_on_node);
46 max /= num_cpus_on_node;
47
41 return max(max, min_pages); 48 return max(max, min_pages);
42} 49}
43 50
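A worked example of the new per-CPU cap (hypothetical numbers, and assuming FRACTION_OF_NODE_MEM is 16 as in mm/quicklist.c): a node with about 4 GB free has roughly 1,048,576 free 4 KB pages, so max starts at 65,536 pages. Previously every CPU on the node could grow its quicklists to that full amount, letting eight CPUs pin up to half the node's free memory between them; with the added division by the eight CPUs sharing the node, each CPU is capped at 8,192 pages and the node-wide quicklist total stays near the intended one-sixteenth.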
diff --git a/mm/readahead.c b/mm/readahead.c
index d8723a5f6496..77e8ddf945e9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -382,9 +382,9 @@ ondemand_readahead(struct address_space *mapping,
382 if (hit_readahead_marker) { 382 if (hit_readahead_marker) {
383 pgoff_t start; 383 pgoff_t start;
384 384
385 read_lock_irq(&mapping->tree_lock); 385 rcu_read_lock();
386 start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); 386 start = radix_tree_next_hole(&mapping->page_tree, offset,max+1);
387 read_unlock_irq(&mapping->tree_lock); 387 rcu_read_unlock();
388 388
389 if (!start || start - offset > max) 389 if (!start || start - offset > max)
390 return 0; 390 return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index bf0a5b7cfb8e..0383acfcb068 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,6 +49,7 @@
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kallsyms.h> 50#include <linux/kallsyms.h>
51#include <linux/memcontrol.h> 51#include <linux/memcontrol.h>
52#include <linux/mmu_notifier.h>
52 53
53#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
54 55
@@ -138,7 +139,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
138 anon_vma_free(anon_vma); 139 anon_vma_free(anon_vma);
139} 140}
140 141
141static void anon_vma_ctor(struct kmem_cache *cachep, void *data) 142static void anon_vma_ctor(void *data)
142{ 143{
143 struct anon_vma *anon_vma = data; 144 struct anon_vma *anon_vma = data;
144 145
@@ -223,10 +224,14 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
223/* 224/*
224 * Check that @page is mapped at @address into @mm. 225 * Check that @page is mapped at @address into @mm.
225 * 226 *
227 * If @sync is false, page_check_address may perform a racy check to avoid
228 * the page table lock when the pte is not present (helpful when reclaiming
229 * highly shared pages).
230 *
226 * On success returns with pte mapped and locked. 231 * On success returns with pte mapped and locked.
227 */ 232 */
228pte_t *page_check_address(struct page *page, struct mm_struct *mm, 233pte_t *page_check_address(struct page *page, struct mm_struct *mm,
229 unsigned long address, spinlock_t **ptlp) 234 unsigned long address, spinlock_t **ptlp, int sync)
230{ 235{
231 pgd_t *pgd; 236 pgd_t *pgd;
232 pud_t *pud; 237 pud_t *pud;
@@ -248,7 +253,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
248 253
249 pte = pte_offset_map(pmd, address); 254 pte = pte_offset_map(pmd, address);
250 /* Make a quick check before getting the lock */ 255 /* Make a quick check before getting the lock */
251 if (!pte_present(*pte)) { 256 if (!sync && !pte_present(*pte)) {
252 pte_unmap(pte); 257 pte_unmap(pte);
253 return NULL; 258 return NULL;
254 } 259 }
@@ -280,14 +285,14 @@ static int page_referenced_one(struct page *page,
280 if (address == -EFAULT) 285 if (address == -EFAULT)
281 goto out; 286 goto out;
282 287
283 pte = page_check_address(page, mm, address, &ptl); 288 pte = page_check_address(page, mm, address, &ptl, 0);
284 if (!pte) 289 if (!pte)
285 goto out; 290 goto out;
286 291
287 if (vma->vm_flags & VM_LOCKED) { 292 if (vma->vm_flags & VM_LOCKED) {
288 referenced++; 293 referenced++;
289 *mapcount = 1; /* break early from loop */ 294 *mapcount = 1; /* break early from loop */
290 } else if (ptep_clear_flush_young(vma, address, pte)) 295 } else if (ptep_clear_flush_young_notify(vma, address, pte))
291 referenced++; 296 referenced++;
292 297
293 /* Pretend the page is referenced if the task has the 298 /* Pretend the page is referenced if the task has the
@@ -421,7 +426,7 @@ int page_referenced(struct page *page, int is_locked,
421 referenced += page_referenced_anon(page, mem_cont); 426 referenced += page_referenced_anon(page, mem_cont);
422 else if (is_locked) 427 else if (is_locked)
423 referenced += page_referenced_file(page, mem_cont); 428 referenced += page_referenced_file(page, mem_cont);
424 else if (TestSetPageLocked(page)) 429 else if (!trylock_page(page))
425 referenced++; 430 referenced++;
426 else { 431 else {
427 if (page->mapping) 432 if (page->mapping)
@@ -449,7 +454,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
449 if (address == -EFAULT) 454 if (address == -EFAULT)
450 goto out; 455 goto out;
451 456
452 pte = page_check_address(page, mm, address, &ptl); 457 pte = page_check_address(page, mm, address, &ptl, 1);
453 if (!pte) 458 if (!pte)
454 goto out; 459 goto out;
455 460
@@ -457,7 +462,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
457 pte_t entry; 462 pte_t entry;
458 463
459 flush_cache_page(vma, address, pte_pfn(*pte)); 464 flush_cache_page(vma, address, pte_pfn(*pte));
460 entry = ptep_clear_flush(vma, address, pte); 465 entry = ptep_clear_flush_notify(vma, address, pte);
461 entry = pte_wrprotect(entry); 466 entry = pte_wrprotect(entry);
462 entry = pte_mkclean(entry); 467 entry = pte_mkclean(entry);
463 set_pte_at(mm, address, pte, entry); 468 set_pte_at(mm, address, pte, entry);
@@ -576,14 +581,8 @@ void page_add_anon_rmap(struct page *page,
576 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 581 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
577 if (atomic_inc_and_test(&page->_mapcount)) 582 if (atomic_inc_and_test(&page->_mapcount))
578 __page_set_anon_rmap(page, vma, address); 583 __page_set_anon_rmap(page, vma, address);
579 else { 584 else
580 __page_check_anon_rmap(page, vma, address); 585 __page_check_anon_rmap(page, vma, address);
581 /*
582 * We unconditionally charged during prepare, we uncharge here
583 * This takes care of balancing the reference counts
584 */
585 mem_cgroup_uncharge_page(page);
586 }
587} 586}
588 587
589/** 588/**
@@ -614,12 +613,6 @@ void page_add_file_rmap(struct page *page)
614{ 613{
615 if (atomic_inc_and_test(&page->_mapcount)) 614 if (atomic_inc_and_test(&page->_mapcount))
616 __inc_zone_page_state(page, NR_FILE_MAPPED); 615 __inc_zone_page_state(page, NR_FILE_MAPPED);
617 else
618 /*
619 * We unconditionally charged during prepare, we uncharge here
620 * This takes care of balancing the reference counts
621 */
622 mem_cgroup_uncharge_page(page);
623} 616}
624 617
625#ifdef CONFIG_DEBUG_VM 618#ifdef CONFIG_DEBUG_VM
@@ -670,6 +663,22 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
670 } 663 }
671 664
672 /* 665 /*
666 * Now that the last pte has gone, s390 must transfer dirty
667 * flag from storage key to struct page. We can usually skip
668 * this if the page is anon, so about to be freed; but perhaps
669 * not if it's in swapcache - there might be another pte slot
670 * containing the swap entry, but page not yet written to swap.
671 */
672 if ((!PageAnon(page) || PageSwapCache(page)) &&
673 page_test_dirty(page)) {
674 page_clear_dirty(page);
675 set_page_dirty(page);
676 }
677
678 mem_cgroup_uncharge_page(page);
679 __dec_zone_page_state(page,
680 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
681 /*
673 * It would be tidy to reset the PageAnon mapping here, 682 * It would be tidy to reset the PageAnon mapping here,
674 * but that might overwrite a racing page_add_anon_rmap 683 * but that might overwrite a racing page_add_anon_rmap
675 * which increments mapcount after us but sets mapping 684 * which increments mapcount after us but sets mapping
@@ -678,14 +687,6 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
678 * Leaving it set also helps swapoff to reinstate ptes 687 * Leaving it set also helps swapoff to reinstate ptes
679 * faster for those pages still in swapcache. 688 * faster for those pages still in swapcache.
680 */ 689 */
681 if (page_test_dirty(page)) {
682 page_clear_dirty(page);
683 set_page_dirty(page);
684 }
685 mem_cgroup_uncharge_page(page);
686
687 __dec_zone_page_state(page,
688 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
689 } 690 }
690} 691}
691 692
@@ -707,7 +708,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
707 if (address == -EFAULT) 708 if (address == -EFAULT)
708 goto out; 709 goto out;
709 710
710 pte = page_check_address(page, mm, address, &ptl); 711 pte = page_check_address(page, mm, address, &ptl, 0);
711 if (!pte) 712 if (!pte)
712 goto out; 713 goto out;
713 714
@@ -717,14 +718,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
717 * skipped over this mm) then we should reactivate it. 718 * skipped over this mm) then we should reactivate it.
718 */ 719 */
719 if (!migration && ((vma->vm_flags & VM_LOCKED) || 720 if (!migration && ((vma->vm_flags & VM_LOCKED) ||
720 (ptep_clear_flush_young(vma, address, pte)))) { 721 (ptep_clear_flush_young_notify(vma, address, pte)))) {
721 ret = SWAP_FAIL; 722 ret = SWAP_FAIL;
722 goto out_unmap; 723 goto out_unmap;
723 } 724 }
724 725
725 /* Nuke the page table entry. */ 726 /* Nuke the page table entry. */
726 flush_cache_page(vma, address, page_to_pfn(page)); 727 flush_cache_page(vma, address, page_to_pfn(page));
727 pteval = ptep_clear_flush(vma, address, pte); 728 pteval = ptep_clear_flush_notify(vma, address, pte);
728 729
729 /* Move the dirty bit to the physical page now the pte is gone. */ 730 /* Move the dirty bit to the physical page now the pte is gone. */
730 if (pte_dirty(pteval)) 731 if (pte_dirty(pteval))
@@ -849,12 +850,12 @@ static void try_to_unmap_cluster(unsigned long cursor,
849 page = vm_normal_page(vma, address, *pte); 850 page = vm_normal_page(vma, address, *pte);
850 BUG_ON(!page || PageAnon(page)); 851 BUG_ON(!page || PageAnon(page));
851 852
852 if (ptep_clear_flush_young(vma, address, pte)) 853 if (ptep_clear_flush_young_notify(vma, address, pte))
853 continue; 854 continue;
854 855
855 /* Nuke the page table entry. */ 856 /* Nuke the page table entry. */
856 flush_cache_page(vma, address, pte_pfn(*pte)); 857 flush_cache_page(vma, address, pte_pfn(*pte));
857 pteval = ptep_clear_flush(vma, address, pte); 858 pteval = ptep_clear_flush_notify(vma, address, pte);
858 859
859 /* If nonlinear, store the file page offset in the pte. */ 860 /* If nonlinear, store the file page offset in the pte. */
860 if (page->index != linear_page_index(vma, address)) 861 if (page->index != linear_page_index(vma, address))
diff --git a/mm/shmem.c b/mm/shmem.c
index e2a6ae1a44e9..04fb4f1ab88e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -922,20 +922,26 @@ found:
922 error = 1; 922 error = 1;
923 if (!inode) 923 if (!inode)
924 goto out; 924 goto out;
925 /* Precharge page while we can wait, compensate afterwards */ 925 /* Precharge page using GFP_KERNEL while we can wait */
926 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 926 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
927 if (error) 927 if (error)
928 goto out; 928 goto out;
929 error = radix_tree_preload(GFP_KERNEL); 929 error = radix_tree_preload(GFP_KERNEL);
930 if (error) 930 if (error) {
931 goto uncharge; 931 mem_cgroup_uncharge_cache_page(page);
932 goto out;
933 }
932 error = 1; 934 error = 1;
933 935
934 spin_lock(&info->lock); 936 spin_lock(&info->lock);
935 ptr = shmem_swp_entry(info, idx, NULL); 937 ptr = shmem_swp_entry(info, idx, NULL);
936 if (ptr && ptr->val == entry.val) 938 if (ptr && ptr->val == entry.val) {
937 error = add_to_page_cache(page, inode->i_mapping, 939 error = add_to_page_cache_locked(page, inode->i_mapping,
938 idx, GFP_NOWAIT); 940 idx, GFP_NOWAIT);
941 /* does mem_cgroup_uncharge_cache_page on error */
942 } else /* we must compensate for our precharge above */
943 mem_cgroup_uncharge_cache_page(page);
944
939 if (error == -EEXIST) { 945 if (error == -EEXIST) {
940 struct page *filepage = find_get_page(inode->i_mapping, idx); 946 struct page *filepage = find_get_page(inode->i_mapping, idx);
941 error = 1; 947 error = 1;
@@ -961,8 +967,6 @@ found:
961 shmem_swp_unmap(ptr); 967 shmem_swp_unmap(ptr);
962 spin_unlock(&info->lock); 968 spin_unlock(&info->lock);
963 radix_tree_preload_end(); 969 radix_tree_preload_end();
964uncharge:
965 mem_cgroup_uncharge_page(page);
966out: 970out:
967 unlock_page(page); 971 unlock_page(page);
968 page_cache_release(page); 972 page_cache_release(page);
@@ -1261,7 +1265,7 @@ repeat:
1261 } 1265 }
1262 1266
1263 /* We have to do this with page locked to prevent races */ 1267 /* We have to do this with page locked to prevent races */
1264 if (TestSetPageLocked(swappage)) { 1268 if (!trylock_page(swappage)) {
1265 shmem_swp_unmap(entry); 1269 shmem_swp_unmap(entry);
1266 spin_unlock(&info->lock); 1270 spin_unlock(&info->lock);
1267 wait_on_page_locked(swappage); 1271 wait_on_page_locked(swappage);
@@ -1297,8 +1301,8 @@ repeat:
1297 SetPageUptodate(filepage); 1301 SetPageUptodate(filepage);
1298 set_page_dirty(filepage); 1302 set_page_dirty(filepage);
1299 swap_free(swap); 1303 swap_free(swap);
1300 } else if (!(error = add_to_page_cache( 1304 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1301 swappage, mapping, idx, GFP_NOWAIT))) { 1305 idx, GFP_NOWAIT))) {
1302 info->flags |= SHMEM_PAGEIN; 1306 info->flags |= SHMEM_PAGEIN;
1303 shmem_swp_set(info, entry, 0); 1307 shmem_swp_set(info, entry, 0);
1304 shmem_swp_unmap(entry); 1308 shmem_swp_unmap(entry);
@@ -1311,24 +1315,21 @@ repeat:
1311 shmem_swp_unmap(entry); 1315 shmem_swp_unmap(entry);
1312 spin_unlock(&info->lock); 1316 spin_unlock(&info->lock);
1313 unlock_page(swappage); 1317 unlock_page(swappage);
1318 page_cache_release(swappage);
1314 if (error == -ENOMEM) { 1319 if (error == -ENOMEM) {
1315 /* allow reclaim from this memory cgroup */ 1320 /* allow reclaim from this memory cgroup */
1316 error = mem_cgroup_cache_charge(swappage, 1321 error = mem_cgroup_shrink_usage(current->mm,
1317 current->mm, gfp & ~__GFP_HIGHMEM); 1322 gfp);
1318 if (error) { 1323 if (error)
1319 page_cache_release(swappage);
1320 goto failed; 1324 goto failed;
1321 }
1322 mem_cgroup_uncharge_page(swappage);
1323 } 1325 }
1324 page_cache_release(swappage);
1325 goto repeat; 1326 goto repeat;
1326 } 1327 }
1327 } else if (sgp == SGP_READ && !filepage) { 1328 } else if (sgp == SGP_READ && !filepage) {
1328 shmem_swp_unmap(entry); 1329 shmem_swp_unmap(entry);
1329 filepage = find_get_page(mapping, idx); 1330 filepage = find_get_page(mapping, idx);
1330 if (filepage && 1331 if (filepage &&
1331 (!PageUptodate(filepage) || TestSetPageLocked(filepage))) { 1332 (!PageUptodate(filepage) || !trylock_page(filepage))) {
1332 spin_unlock(&info->lock); 1333 spin_unlock(&info->lock);
1333 wait_on_page_locked(filepage); 1334 wait_on_page_locked(filepage);
1334 page_cache_release(filepage); 1335 page_cache_release(filepage);
@@ -1358,6 +1359,8 @@ repeat:
1358 } 1359 }
1359 1360
1360 if (!filepage) { 1361 if (!filepage) {
1362 int ret;
1363
1361 spin_unlock(&info->lock); 1364 spin_unlock(&info->lock);
1362 filepage = shmem_alloc_page(gfp, info, idx); 1365 filepage = shmem_alloc_page(gfp, info, idx);
1363 if (!filepage) { 1366 if (!filepage) {
@@ -1386,10 +1389,18 @@ repeat:
1386 swap = *entry; 1389 swap = *entry;
1387 shmem_swp_unmap(entry); 1390 shmem_swp_unmap(entry);
1388 } 1391 }
1389 if (error || swap.val || 0 != add_to_page_cache_lru( 1392 ret = error || swap.val;
1390 filepage, mapping, idx, GFP_NOWAIT)) { 1393 if (ret)
1394 mem_cgroup_uncharge_cache_page(filepage);
1395 else
1396 ret = add_to_page_cache_lru(filepage, mapping,
1397 idx, GFP_NOWAIT);
1398 /*
1399 * At add_to_page_cache_lru() failure, uncharge will
1400 * be done automatically.
1401 */
1402 if (ret) {
1391 spin_unlock(&info->lock); 1403 spin_unlock(&info->lock);
1392 mem_cgroup_uncharge_page(filepage);
1393 page_cache_release(filepage); 1404 page_cache_release(filepage);
1394 shmem_unacct_blocks(info->flags, 1); 1405 shmem_unacct_blocks(info->flags, 1);
1395 shmem_free_blocks(inode, 1); 1406 shmem_free_blocks(inode, 1);
@@ -1398,7 +1409,6 @@ repeat:
1398 goto failed; 1409 goto failed;
1399 goto repeat; 1410 goto repeat;
1400 } 1411 }
1401 mem_cgroup_uncharge_page(filepage);
1402 info->flags |= SHMEM_PAGEIN; 1412 info->flags |= SHMEM_PAGEIN;
1403 } 1413 }
1404 1414
@@ -1503,7 +1513,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1503 inode->i_uid = current->fsuid; 1513 inode->i_uid = current->fsuid;
1504 inode->i_gid = current->fsgid; 1514 inode->i_gid = current->fsgid;
1505 inode->i_blocks = 0; 1515 inode->i_blocks = 0;
1506 inode->i_mapping->a_ops = &shmem_aops;
1507 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1516 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1508 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1517 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1509 inode->i_generation = get_seconds(); 1518 inode->i_generation = get_seconds();
@@ -1518,6 +1527,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1518 init_special_inode(inode, mode, dev); 1527 init_special_inode(inode, mode, dev);
1519 break; 1528 break;
1520 case S_IFREG: 1529 case S_IFREG:
1530 inode->i_mapping->a_ops = &shmem_aops;
1521 inode->i_op = &shmem_inode_operations; 1531 inode->i_op = &shmem_inode_operations;
1522 inode->i_fop = &shmem_file_operations; 1532 inode->i_fop = &shmem_file_operations;
1523 mpol_shared_policy_init(&info->policy, 1533 mpol_shared_policy_init(&info->policy,
@@ -1690,26 +1700,38 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1690 file_accessed(filp); 1700 file_accessed(filp);
1691} 1701}
1692 1702
1693static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) 1703static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1704 const struct iovec *iov, unsigned long nr_segs, loff_t pos)
1694{ 1705{
1695 read_descriptor_t desc; 1706 struct file *filp = iocb->ki_filp;
1707 ssize_t retval;
1708 unsigned long seg;
1709 size_t count;
1710 loff_t *ppos = &iocb->ki_pos;
1696 1711
1697 if ((ssize_t) count < 0) 1712 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1698 return -EINVAL; 1713 if (retval)
1699 if (!access_ok(VERIFY_WRITE, buf, count)) 1714 return retval;
1700 return -EFAULT;
1701 if (!count)
1702 return 0;
1703 1715
1704 desc.written = 0; 1716 for (seg = 0; seg < nr_segs; seg++) {
1705 desc.count = count; 1717 read_descriptor_t desc;
1706 desc.arg.buf = buf;
1707 desc.error = 0;
1708 1718
1709 do_shmem_file_read(filp, ppos, &desc, file_read_actor); 1719 desc.written = 0;
1710 if (desc.written) 1720 desc.arg.buf = iov[seg].iov_base;
1711 return desc.written; 1721 desc.count = iov[seg].iov_len;
1712 return desc.error; 1722 if (desc.count == 0)
1723 continue;
1724 desc.error = 0;
1725 do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1726 retval += desc.written;
1727 if (desc.error) {
1728 retval = retval ?: desc.error;
1729 break;
1730 }
1731 if (desc.count > 0)
1732 break;
1733 }
1734 return retval;
1713} 1735}
1714 1736
1715static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1737static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1907,6 +1929,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1907 return error; 1929 return error;
1908 } 1930 }
1909 unlock_page(page); 1931 unlock_page(page);
1932 inode->i_mapping->a_ops = &shmem_aops;
1910 inode->i_op = &shmem_symlink_inode_operations; 1933 inode->i_op = &shmem_symlink_inode_operations;
1911 kaddr = kmap_atomic(page, KM_USER0); 1934 kaddr = kmap_atomic(page, KM_USER0);
1912 memcpy(kaddr, symname, len); 1935 memcpy(kaddr, symname, len);
@@ -2330,7 +2353,7 @@ static void shmem_destroy_inode(struct inode *inode)
2330 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2353 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2331} 2354}
2332 2355
2333static void init_once(struct kmem_cache *cachep, void *foo) 2356static void init_once(void *foo)
2334{ 2357{
2335 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2358 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2336 2359
@@ -2369,8 +2392,9 @@ static const struct file_operations shmem_file_operations = {
2369 .mmap = shmem_mmap, 2392 .mmap = shmem_mmap,
2370#ifdef CONFIG_TMPFS 2393#ifdef CONFIG_TMPFS
2371 .llseek = generic_file_llseek, 2394 .llseek = generic_file_llseek,
2372 .read = shmem_file_read, 2395 .read = do_sync_read,
2373 .write = do_sync_write, 2396 .write = do_sync_write,
2397 .aio_read = shmem_file_aio_read,
2374 .aio_write = generic_file_aio_write, 2398 .aio_write = generic_file_aio_write,
2375 .fsync = simple_sync_file, 2399 .fsync = simple_sync_file,
2376 .splice_read = generic_file_splice_read, 2400 .splice_read = generic_file_splice_read,
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index f5664c5b9eb1..8e5aadd7dcd6 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -191,7 +191,7 @@ shmem_check_acl(struct inode *inode, int mask)
191 * shmem_permission - permission() inode operation 191 * shmem_permission - permission() inode operation
192 */ 192 */
193int 193int
194shmem_permission(struct inode *inode, int mask, struct nameidata *nd) 194shmem_permission(struct inode *inode, int mask)
195{ 195{
196 return generic_permission(inode, mask, shmem_check_acl); 196 return generic_permission(inode, mask, shmem_check_acl);
197} 197}
diff --git a/mm/slab.c b/mm/slab.c
index 06236e4ddc1b..e76eee466886 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -406,7 +406,7 @@ struct kmem_cache {
406 unsigned int dflags; /* dynamic flags */ 406 unsigned int dflags; /* dynamic flags */
407 407
408 /* constructor func */ 408 /* constructor func */
409 void (*ctor)(struct kmem_cache *, void *); 409 void (*ctor)(void *obj);
410 410
411/* 5) cache creation/removal */ 411/* 5) cache creation/removal */
412 const char *name; 412 const char *name;
@@ -1901,15 +1901,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1901#endif 1901#endif
1902 1902
1903#if DEBUG 1903#if DEBUG
1904/** 1904static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
1905 * slab_destroy_objs - destroy a slab and its objects
1906 * @cachep: cache pointer being destroyed
1907 * @slabp: slab pointer being destroyed
1908 *
1909 * Call the registered destructor for each object in a slab that is being
1910 * destroyed.
1911 */
1912static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1913{ 1905{
1914 int i; 1906 int i;
1915 for (i = 0; i < cachep->num; i++) { 1907 for (i = 0; i < cachep->num; i++) {
@@ -1938,7 +1930,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1938 } 1930 }
1939} 1931}
1940#else 1932#else
1941static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) 1933static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
1942{ 1934{
1943} 1935}
1944#endif 1936#endif
@@ -1956,7 +1948,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1956{ 1948{
1957 void *addr = slabp->s_mem - slabp->colouroff; 1949 void *addr = slabp->s_mem - slabp->colouroff;
1958 1950
1959 slab_destroy_objs(cachep, slabp); 1951 slab_destroy_debugcheck(cachep, slabp);
1960 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1952 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1961 struct slab_rcu *slab_rcu; 1953 struct slab_rcu *slab_rcu;
1962 1954
@@ -2145,8 +2137,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2145 */ 2137 */
2146struct kmem_cache * 2138struct kmem_cache *
2147kmem_cache_create (const char *name, size_t size, size_t align, 2139kmem_cache_create (const char *name, size_t size, size_t align,
2148 unsigned long flags, 2140 unsigned long flags, void (*ctor)(void *))
2149 void (*ctor)(struct kmem_cache *, void *))
2150{ 2141{
2151 size_t left_over, slab_size, ralign; 2142 size_t left_over, slab_size, ralign;
2152 struct kmem_cache *cachep = NULL, *pc; 2143 struct kmem_cache *cachep = NULL, *pc;
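The new constructor prototype affects every kmem_cache_create() caller; the anon_vma_ctor() and init_once() hunks earlier in this patch show in-tree conversions. A minimal sketch of an updated caller (all names hypothetical):

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* Hypothetical cache user ("foo" and foo_ctor() are made-up names).
 * Constructors now take only the object pointer; the struct kmem_cache
 * argument is gone.
 */
struct foo {
        int refcount;
        spinlock_t lock;
};

static void foo_ctor(void *obj)
{
        struct foo *f = obj;

        f->refcount = 0;
        spin_lock_init(&f->lock);
}

static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
        foo_cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
                                       SLAB_HWCACHE_ALIGN, foo_ctor);
        return foo_cachep ? 0 : -ENOMEM;
}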
@@ -2454,7 +2445,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
2454 struct kmem_list3 *l3; 2445 struct kmem_list3 *l3;
2455 int node; 2446 int node;
2456 2447
2457 on_each_cpu(do_drain, cachep, 1, 1); 2448 on_each_cpu(do_drain, cachep, 1);
2458 check_irq_on(); 2449 check_irq_on();
2459 for_each_online_node(node) { 2450 for_each_online_node(node) {
2460 l3 = cachep->nodelists[node]; 2451 l3 = cachep->nodelists[node];
@@ -2661,7 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2661 * They must also be threaded. 2652 * They must also be threaded.
2662 */ 2653 */
2663 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2654 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2664 cachep->ctor(cachep, objp + obj_offset(cachep)); 2655 cachep->ctor(objp + obj_offset(cachep));
2665 2656
2666 if (cachep->flags & SLAB_RED_ZONE) { 2657 if (cachep->flags & SLAB_RED_ZONE) {
2667 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2658 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2677,7 +2668,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2677 cachep->buffer_size / PAGE_SIZE, 0); 2668 cachep->buffer_size / PAGE_SIZE, 0);
2678#else 2669#else
2679 if (cachep->ctor) 2670 if (cachep->ctor)
2680 cachep->ctor(cachep, objp); 2671 cachep->ctor(objp);
2681#endif 2672#endif
2682 slab_bufctl(slabp)[i] = i + 1; 2673 slab_bufctl(slabp)[i] = i + 1;
2683 } 2674 }
@@ -3101,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3101#endif 3092#endif
3102 objp += obj_offset(cachep); 3093 objp += obj_offset(cachep);
3103 if (cachep->ctor && cachep->flags & SLAB_POISON) 3094 if (cachep->ctor && cachep->flags & SLAB_POISON)
3104 cachep->ctor(cachep, objp); 3095 cachep->ctor(objp);
3105#if ARCH_SLAB_MINALIGN 3096#if ARCH_SLAB_MINALIGN
3106 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3097 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3107 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3098 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
@@ -3263,9 +3254,12 @@ retry:
3263 3254
3264 if (cpuset_zone_allowed_hardwall(zone, flags) && 3255 if (cpuset_zone_allowed_hardwall(zone, flags) &&
3265 cache->nodelists[nid] && 3256 cache->nodelists[nid] &&
3266 cache->nodelists[nid]->free_objects) 3257 cache->nodelists[nid]->free_objects) {
3267 obj = ____cache_alloc_node(cache, 3258 obj = ____cache_alloc_node(cache,
3268 flags | GFP_THISNODE, nid); 3259 flags | GFP_THISNODE, nid);
3260 if (obj)
3261 break;
3262 }
3269 } 3263 }
3270 3264
3271 if (!obj) { 3265 if (!obj) {
@@ -3936,7 +3930,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3936 } 3930 }
3937 new->cachep = cachep; 3931 new->cachep = cachep;
3938 3932
3939 on_each_cpu(do_ccupdate_local, (void *)new, 1, 1); 3933 on_each_cpu(do_ccupdate_local, (void *)new, 1);
3940 3934
3941 check_irq_on(); 3935 check_irq_on();
3942 cachep->batchcount = batchcount; 3936 cachep->batchcount = batchcount;
@@ -4478,4 +4472,3 @@ size_t ksize(const void *objp)
4478 4472
4479 return obj_size(virt_to_cache(objp)); 4473 return obj_size(virt_to_cache(objp));
4480} 4474}
4481EXPORT_SYMBOL(ksize);
diff --git a/mm/slob.c b/mm/slob.c
index a3ad6671adf1..cb675d126791 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -130,17 +130,17 @@ static LIST_HEAD(free_slob_large);
130 */ 130 */
131static inline int slob_page(struct slob_page *sp) 131static inline int slob_page(struct slob_page *sp)
132{ 132{
133 return test_bit(PG_active, &sp->flags); 133 return PageSlobPage((struct page *)sp);
134} 134}
135 135
136static inline void set_slob_page(struct slob_page *sp) 136static inline void set_slob_page(struct slob_page *sp)
137{ 137{
138 __set_bit(PG_active, &sp->flags); 138 __SetPageSlobPage((struct page *)sp);
139} 139}
140 140
141static inline void clear_slob_page(struct slob_page *sp) 141static inline void clear_slob_page(struct slob_page *sp)
142{ 142{
143 __clear_bit(PG_active, &sp->flags); 143 __ClearPageSlobPage((struct page *)sp);
144} 144}
145 145
146/* 146/*
@@ -148,19 +148,19 @@ static inline void clear_slob_page(struct slob_page *sp)
148 */ 148 */
149static inline int slob_page_free(struct slob_page *sp) 149static inline int slob_page_free(struct slob_page *sp)
150{ 150{
151 return test_bit(PG_private, &sp->flags); 151 return PageSlobFree((struct page *)sp);
152} 152}
153 153
154static void set_slob_page_free(struct slob_page *sp, struct list_head *list) 154static void set_slob_page_free(struct slob_page *sp, struct list_head *list)
155{ 155{
156 list_add(&sp->list, list); 156 list_add(&sp->list, list);
157 __set_bit(PG_private, &sp->flags); 157 __SetPageSlobFree((struct page *)sp);
158} 158}
159 159
160static inline void clear_slob_page_free(struct slob_page *sp) 160static inline void clear_slob_page_free(struct slob_page *sp)
161{ 161{
162 list_del(&sp->list); 162 list_del(&sp->list);
163 __clear_bit(PG_private, &sp->flags); 163 __ClearPageSlobFree((struct page *)sp);
164} 164}
165 165
166#define SLOB_UNIT sizeof(slob_t) 166#define SLOB_UNIT sizeof(slob_t)
@@ -514,23 +514,23 @@ size_t ksize(const void *block)
514 return 0; 514 return 0;
515 515
516 sp = (struct slob_page *)virt_to_page(block); 516 sp = (struct slob_page *)virt_to_page(block);
517 if (slob_page(sp)) 517 if (slob_page(sp)) {
518 return ((slob_t *)block - 1)->units + SLOB_UNIT; 518 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
519 else 519 unsigned int *m = (unsigned int *)(block - align);
520 return SLOB_UNITS(*m) * SLOB_UNIT;
521 } else
520 return sp->page.private; 522 return sp->page.private;
521} 523}
522EXPORT_SYMBOL(ksize);
523 524
524struct kmem_cache { 525struct kmem_cache {
525 unsigned int size, align; 526 unsigned int size, align;
526 unsigned long flags; 527 unsigned long flags;
527 const char *name; 528 const char *name;
528 void (*ctor)(struct kmem_cache *, void *); 529 void (*ctor)(void *);
529}; 530};
530 531
531struct kmem_cache *kmem_cache_create(const char *name, size_t size, 532struct kmem_cache *kmem_cache_create(const char *name, size_t size,
532 size_t align, unsigned long flags, 533 size_t align, unsigned long flags, void (*ctor)(void *))
533 void (*ctor)(struct kmem_cache *, void *))
534{ 534{
535 struct kmem_cache *c; 535 struct kmem_cache *c;
536 536
@@ -575,7 +575,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
575 b = slob_new_page(flags, get_order(c->size), node); 575 b = slob_new_page(flags, get_order(c->size), node);
576 576
577 if (c->ctor) 577 if (c->ctor)
578 c->ctor(c, b); 578 c->ctor(b);
579 579
580 return b; 580 return b;
581} 581}
diff --git a/mm/slub.c b/mm/slub.c
index 0987d1cd943c..0c83e6afe7b2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5,7 +5,7 @@
5 * The allocator synchronizes using per slab locks and only 5 * The allocator synchronizes using per slab locks and only
6 * uses a centralized lock to manage a pool of partial slabs. 6 * uses a centralized lock to manage a pool of partial slabs.
7 * 7 *
8 * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com> 8 * (C) 2007 SGI, Christoph Lameter
9 */ 9 */
10 10
11#include <linux/mm.h> 11#include <linux/mm.h>
@@ -102,44 +102,12 @@
102 * the fast path and disables lockless freelists. 102 * the fast path and disables lockless freelists.
103 */ 103 */
104 104
105#define FROZEN (1 << PG_active)
106
107#ifdef CONFIG_SLUB_DEBUG 105#ifdef CONFIG_SLUB_DEBUG
108#define SLABDEBUG (1 << PG_error) 106#define SLABDEBUG 1
109#else 107#else
110#define SLABDEBUG 0 108#define SLABDEBUG 0
111#endif 109#endif
112 110
113static inline int SlabFrozen(struct page *page)
114{
115 return page->flags & FROZEN;
116}
117
118static inline void SetSlabFrozen(struct page *page)
119{
120 page->flags |= FROZEN;
121}
122
123static inline void ClearSlabFrozen(struct page *page)
124{
125 page->flags &= ~FROZEN;
126}
127
128static inline int SlabDebug(struct page *page)
129{
130 return page->flags & SLABDEBUG;
131}
132
133static inline void SetSlabDebug(struct page *page)
134{
135 page->flags |= SLABDEBUG;
136}
137
138static inline void ClearSlabDebug(struct page *page)
139{
140 page->flags &= ~SLABDEBUG;
141}
142
143/* 111/*
144 * Issues still to be resolved: 112 * Issues still to be resolved:
145 * 113 *
@@ -411,7 +379,7 @@ static void set_track(struct kmem_cache *s, void *object,
411 if (addr) { 379 if (addr) {
412 p->addr = addr; 380 p->addr = addr;
413 p->cpu = smp_processor_id(); 381 p->cpu = smp_processor_id();
414 p->pid = current ? current->pid : -1; 382 p->pid = current->pid;
415 p->when = jiffies; 383 p->when = jiffies;
416 } else 384 } else
417 memset(p, 0, sizeof(struct track)); 385 memset(p, 0, sizeof(struct track));
@@ -431,9 +399,8 @@ static void print_track(const char *s, struct track *t)
431 if (!t->addr) 399 if (!t->addr)
432 return; 400 return;
433 401
434 printk(KERN_ERR "INFO: %s in ", s); 402 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
435 __print_symbol("%s", (unsigned long)t->addr); 403 s, t->addr, jiffies - t->when, t->cpu, t->pid);
436 printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
437} 404}
438 405
439static void print_tracking(struct kmem_cache *s, void *object) 406static void print_tracking(struct kmem_cache *s, void *object)
@@ -493,7 +460,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
493 if (p > addr + 16) 460 if (p > addr + 16)
494 print_section("Bytes b4", p - 16, 16); 461 print_section("Bytes b4", p - 16, 16);
495 462
496 print_section("Object", p, min(s->objsize, 128)); 463 print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE));
497 464
498 if (s->flags & SLAB_RED_ZONE) 465 if (s->flags & SLAB_RED_ZONE)
499 print_section("Redzone", p + s->objsize, 466 print_section("Redzone", p + s->objsize,
@@ -972,7 +939,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
972 } 939 }
973 940
974 /* Special debug activities for freeing objects */ 941 /* Special debug activities for freeing objects */
975 if (!SlabFrozen(page) && !page->freelist) 942 if (!PageSlubFrozen(page) && !page->freelist)
976 remove_full(s, page); 943 remove_full(s, page);
977 if (s->flags & SLAB_STORE_USER) 944 if (s->flags & SLAB_STORE_USER)
978 set_track(s, object, TRACK_FREE, addr); 945 set_track(s, object, TRACK_FREE, addr);
@@ -1045,7 +1012,7 @@ __setup("slub_debug", setup_slub_debug);
1045 1012
1046static unsigned long kmem_cache_flags(unsigned long objsize, 1013static unsigned long kmem_cache_flags(unsigned long objsize,
1047 unsigned long flags, const char *name, 1014 unsigned long flags, const char *name,
1048 void (*ctor)(struct kmem_cache *, void *)) 1015 void (*ctor)(void *))
1049{ 1016{
1050 /* 1017 /*
1051 * Enable debugging if selected on the kernel commandline. 1018 * Enable debugging if selected on the kernel commandline.
@@ -1073,7 +1040,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1073static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1040static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1074static inline unsigned long kmem_cache_flags(unsigned long objsize, 1041static inline unsigned long kmem_cache_flags(unsigned long objsize,
1075 unsigned long flags, const char *name, 1042 unsigned long flags, const char *name,
1076 void (*ctor)(struct kmem_cache *, void *)) 1043 void (*ctor)(void *))
1077{ 1044{
1078 return flags; 1045 return flags;
1079} 1046}
@@ -1136,7 +1103,7 @@ static void setup_object(struct kmem_cache *s, struct page *page,
1136{ 1103{
1137 setup_object_debug(s, page, object); 1104 setup_object_debug(s, page, object);
1138 if (unlikely(s->ctor)) 1105 if (unlikely(s->ctor))
1139 s->ctor(s, object); 1106 s->ctor(object);
1140} 1107}
1141 1108
1142static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1109static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1158,7 +1125,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1158 page->flags |= 1 << PG_slab; 1125 page->flags |= 1 << PG_slab;
1159 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 1126 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
1160 SLAB_STORE_USER | SLAB_TRACE)) 1127 SLAB_STORE_USER | SLAB_TRACE))
1161 SetSlabDebug(page); 1128 __SetPageSlubDebug(page);
1162 1129
1163 start = page_address(page); 1130 start = page_address(page);
1164 1131
@@ -1185,14 +1152,14 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1185 int order = compound_order(page); 1152 int order = compound_order(page);
1186 int pages = 1 << order; 1153 int pages = 1 << order;
1187 1154
1188 if (unlikely(SlabDebug(page))) { 1155 if (unlikely(SLABDEBUG && PageSlubDebug(page))) {
1189 void *p; 1156 void *p;
1190 1157
1191 slab_pad_check(s, page); 1158 slab_pad_check(s, page);
1192 for_each_object(p, s, page_address(page), 1159 for_each_object(p, s, page_address(page),
1193 page->objects) 1160 page->objects)
1194 check_object(s, page, p, 0); 1161 check_object(s, page, p, 0);
1195 ClearSlabDebug(page); 1162 __ClearPageSlubDebug(page);
1196 } 1163 }
1197 1164
1198 mod_zone_page_state(page_zone(page), 1165 mod_zone_page_state(page_zone(page),
@@ -1289,7 +1256,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
1289 if (slab_trylock(page)) { 1256 if (slab_trylock(page)) {
1290 list_del(&page->lru); 1257 list_del(&page->lru);
1291 n->nr_partial--; 1258 n->nr_partial--;
1292 SetSlabFrozen(page); 1259 __SetPageSlubFrozen(page);
1293 return 1; 1260 return 1;
1294 } 1261 }
1295 return 0; 1262 return 0;
@@ -1362,7 +1329,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1362 n = get_node(s, zone_to_nid(zone)); 1329 n = get_node(s, zone_to_nid(zone));
1363 1330
1364 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1331 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1365 n->nr_partial > MIN_PARTIAL) { 1332 n->nr_partial > n->min_partial) {
1366 page = get_partial_node(n); 1333 page = get_partial_node(n);
1367 if (page) 1334 if (page)
1368 return page; 1335 return page;
@@ -1399,7 +1366,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1399 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1366 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1400 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); 1367 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
1401 1368
1402 ClearSlabFrozen(page); 1369 __ClearPageSlubFrozen(page);
1403 if (page->inuse) { 1370 if (page->inuse) {
1404 1371
1405 if (page->freelist) { 1372 if (page->freelist) {
@@ -1407,13 +1374,14 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1407 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1374 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1408 } else { 1375 } else {
1409 stat(c, DEACTIVATE_FULL); 1376 stat(c, DEACTIVATE_FULL);
1410 if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) 1377 if (SLABDEBUG && PageSlubDebug(page) &&
1378 (s->flags & SLAB_STORE_USER))
1411 add_full(n, page); 1379 add_full(n, page);
1412 } 1380 }
1413 slab_unlock(page); 1381 slab_unlock(page);
1414 } else { 1382 } else {
1415 stat(c, DEACTIVATE_EMPTY); 1383 stat(c, DEACTIVATE_EMPTY);
1416 if (n->nr_partial < MIN_PARTIAL) { 1384 if (n->nr_partial < n->min_partial) {
1417 /* 1385 /*
1418 * Adding an empty slab to the partial slabs in order 1386 * Adding an empty slab to the partial slabs in order
1419 * to avoid page allocator overhead. This slab needs 1387 * to avoid page allocator overhead. This slab needs
@@ -1496,15 +1464,7 @@ static void flush_cpu_slab(void *d)
1496 1464
1497static void flush_all(struct kmem_cache *s) 1465static void flush_all(struct kmem_cache *s)
1498{ 1466{
1499#ifdef CONFIG_SMP 1467 on_each_cpu(flush_cpu_slab, s, 1);
1500 on_each_cpu(flush_cpu_slab, s, 1, 1);
1501#else
1502 unsigned long flags;
1503
1504 local_irq_save(flags);
1505 flush_cpu_slab(s);
1506 local_irq_restore(flags);
1507#endif
1508} 1468}
1509 1469
1510/* 1470/*
@@ -1560,7 +1520,7 @@ load_freelist:
1560 object = c->page->freelist; 1520 object = c->page->freelist;
1561 if (unlikely(!object)) 1521 if (unlikely(!object))
1562 goto another_slab; 1522 goto another_slab;
1563 if (unlikely(SlabDebug(c->page))) 1523 if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
1564 goto debug; 1524 goto debug;
1565 1525
1566 c->freelist = object[c->offset]; 1526 c->freelist = object[c->offset];
@@ -1597,7 +1557,7 @@ new_slab:
1597 if (c->page) 1557 if (c->page)
1598 flush_slab(s, c); 1558 flush_slab(s, c);
1599 slab_lock(new); 1559 slab_lock(new);
1600 SetSlabFrozen(new); 1560 __SetPageSlubFrozen(new);
1601 c->page = new; 1561 c->page = new;
1602 goto load_freelist; 1562 goto load_freelist;
1603 } 1563 }
@@ -1628,9 +1588,11 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1628 void **object; 1588 void **object;
1629 struct kmem_cache_cpu *c; 1589 struct kmem_cache_cpu *c;
1630 unsigned long flags; 1590 unsigned long flags;
1591 unsigned int objsize;
1631 1592
1632 local_irq_save(flags); 1593 local_irq_save(flags);
1633 c = get_cpu_slab(s, smp_processor_id()); 1594 c = get_cpu_slab(s, smp_processor_id());
1595 objsize = c->objsize;
1634 if (unlikely(!c->freelist || !node_match(c, node))) 1596 if (unlikely(!c->freelist || !node_match(c, node)))
1635 1597
1636 object = __slab_alloc(s, gfpflags, node, addr, c); 1598 object = __slab_alloc(s, gfpflags, node, addr, c);
@@ -1643,7 +1605,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1643 local_irq_restore(flags); 1605 local_irq_restore(flags);
1644 1606
1645 if (unlikely((gfpflags & __GFP_ZERO) && object)) 1607 if (unlikely((gfpflags & __GFP_ZERO) && object))
1646 memset(object, 0, c->objsize); 1608 memset(object, 0, objsize);
1647 1609
1648 return object; 1610 return object;
1649} 1611}
@@ -1681,7 +1643,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1681 stat(c, FREE_SLOWPATH); 1643 stat(c, FREE_SLOWPATH);
1682 slab_lock(page); 1644 slab_lock(page);
1683 1645
1684 if (unlikely(SlabDebug(page))) 1646 if (unlikely(SLABDEBUG && PageSlubDebug(page)))
1685 goto debug; 1647 goto debug;
1686 1648
1687checks_ok: 1649checks_ok:
@@ -1689,7 +1651,7 @@ checks_ok:
1689 page->freelist = object; 1651 page->freelist = object;
1690 page->inuse--; 1652 page->inuse--;
1691 1653
1692 if (unlikely(SlabFrozen(page))) { 1654 if (unlikely(PageSlubFrozen(page))) {
1693 stat(c, FREE_FROZEN); 1655 stat(c, FREE_FROZEN);
1694 goto out_unlock; 1656 goto out_unlock;
1695 } 1657 }
@@ -1951,13 +1913,26 @@ static void init_kmem_cache_cpu(struct kmem_cache *s,
1951#endif 1913#endif
1952} 1914}
1953 1915
1954static void init_kmem_cache_node(struct kmem_cache_node *n) 1916static void
1917init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
1955{ 1918{
1956 n->nr_partial = 0; 1919 n->nr_partial = 0;
1920
1921 /*
1922 * The larger the object size is, the more pages we want on the partial
1923 * list to avoid pounding the page allocator excessively.
1924 */
1925 n->min_partial = ilog2(s->size);
1926 if (n->min_partial < MIN_PARTIAL)
1927 n->min_partial = MIN_PARTIAL;
1928 else if (n->min_partial > MAX_PARTIAL)
1929 n->min_partial = MAX_PARTIAL;
1930
1957 spin_lock_init(&n->list_lock); 1931 spin_lock_init(&n->list_lock);
1958 INIT_LIST_HEAD(&n->partial); 1932 INIT_LIST_HEAD(&n->partial);
1959#ifdef CONFIG_SLUB_DEBUG 1933#ifdef CONFIG_SLUB_DEBUG
1960 atomic_long_set(&n->nr_slabs, 0); 1934 atomic_long_set(&n->nr_slabs, 0);
1935 atomic_long_set(&n->total_objects, 0);
1961 INIT_LIST_HEAD(&n->full); 1936 INIT_LIST_HEAD(&n->full);
1962#endif 1937#endif
1963} 1938}
@@ -2125,7 +2100,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
2125 init_object(kmalloc_caches, n, 1); 2100 init_object(kmalloc_caches, n, 1);
2126 init_tracking(kmalloc_caches, n); 2101 init_tracking(kmalloc_caches, n);
2127#endif 2102#endif
2128 init_kmem_cache_node(n); 2103 init_kmem_cache_node(n, kmalloc_caches);
2129 inc_slabs_node(kmalloc_caches, node, page->objects); 2104 inc_slabs_node(kmalloc_caches, node, page->objects);
2130 2105
2131 /* 2106 /*
@@ -2182,7 +2157,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2182 2157
2183 } 2158 }
2184 s->node[node] = n; 2159 s->node[node] = n;
2185 init_kmem_cache_node(n); 2160 init_kmem_cache_node(n, s);
2186 } 2161 }
2187 return 1; 2162 return 1;
2188} 2163}
@@ -2193,7 +2168,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
2193 2168
2194static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 2169static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2195{ 2170{
2196 init_kmem_cache_node(&s->local_node); 2171 init_kmem_cache_node(&s->local_node, s);
2197 return 1; 2172 return 1;
2198} 2173}
2199#endif 2174#endif
@@ -2324,7 +2299,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2324static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2299static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2325 const char *name, size_t size, 2300 const char *name, size_t size,
2326 size_t align, unsigned long flags, 2301 size_t align, unsigned long flags,
2327 void (*ctor)(struct kmem_cache *, void *)) 2302 void (*ctor)(void *))
2328{ 2303{
2329 memset(s, 0, kmem_size); 2304 memset(s, 0, kmem_size);
2330 s->name = name; 2305 s->name = name;
@@ -2338,7 +2313,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2338 2313
2339 s->refcount = 1; 2314 s->refcount = 1;
2340#ifdef CONFIG_NUMA 2315#ifdef CONFIG_NUMA
2341 s->remote_node_defrag_ratio = 100; 2316 s->remote_node_defrag_ratio = 1000;
2342#endif 2317#endif
2343 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2318 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
2344 goto error; 2319 goto error;
@@ -2753,7 +2728,6 @@ size_t ksize(const void *object)
2753 */ 2728 */
2754 return s->size; 2729 return s->size;
2755} 2730}
2756EXPORT_SYMBOL(ksize);
2757 2731
2758void kfree(const void *x) 2732void kfree(const void *x)
2759{ 2733{
@@ -2765,6 +2739,7 @@ void kfree(const void *x)
2765 2739
2766 page = virt_to_head_page(x); 2740 page = virt_to_head_page(x);
2767 if (unlikely(!PageSlab(page))) { 2741 if (unlikely(!PageSlab(page))) {
2742 BUG_ON(!PageCompound(page));
2768 put_page(page); 2743 put_page(page);
2769 return; 2744 return;
2770 } 2745 }
@@ -2927,7 +2902,7 @@ static int slab_mem_going_online_callback(void *arg)
2927 ret = -ENOMEM; 2902 ret = -ENOMEM;
2928 goto out; 2903 goto out;
2929 } 2904 }
2930 init_kmem_cache_node(n); 2905 init_kmem_cache_node(n, s);
2931 s->node[nid] = n; 2906 s->node[nid] = n;
2932 } 2907 }
2933out: 2908out:
@@ -2995,8 +2970,6 @@ void __init kmem_cache_init(void)
2995 create_kmalloc_cache(&kmalloc_caches[1], 2970 create_kmalloc_cache(&kmalloc_caches[1],
2996 "kmalloc-96", 96, GFP_KERNEL); 2971 "kmalloc-96", 96, GFP_KERNEL);
2997 caches++; 2972 caches++;
2998 }
2999 if (KMALLOC_MIN_SIZE <= 128) {
3000 create_kmalloc_cache(&kmalloc_caches[2], 2973 create_kmalloc_cache(&kmalloc_caches[2],
3001 "kmalloc-192", 192, GFP_KERNEL); 2974 "kmalloc-192", 192, GFP_KERNEL);
3002 caches++; 2975 caches++;
@@ -3026,6 +2999,16 @@ void __init kmem_cache_init(void)
3026 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) 2999 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
3027 size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; 3000 size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
3028 3001
3002 if (KMALLOC_MIN_SIZE == 128) {
3003 /*
3004 * The 192 byte sized cache is not used if the alignment
3005 * is 128 byte. Redirect kmalloc to use the 256 byte cache
3006 * instead.
3007 */
3008 for (i = 128 + 8; i <= 192; i += 8)
3009 size_index[(i - 1) / 8] = 8;
3010 }
3011
3029 slab_state = UP; 3012 slab_state = UP;
3030 3013
3031 /* Provide the correct kmalloc names now that the caches are up */ 3014 /* Provide the correct kmalloc names now that the caches are up */
@@ -3071,7 +3054,7 @@ static int slab_unmergeable(struct kmem_cache *s)
3071 3054
3072static struct kmem_cache *find_mergeable(size_t size, 3055static struct kmem_cache *find_mergeable(size_t size,
3073 size_t align, unsigned long flags, const char *name, 3056 size_t align, unsigned long flags, const char *name,
3074 void (*ctor)(struct kmem_cache *, void *)) 3057 void (*ctor)(void *))
3075{ 3058{
3076 struct kmem_cache *s; 3059 struct kmem_cache *s;
3077 3060
@@ -3111,8 +3094,7 @@ static struct kmem_cache *find_mergeable(size_t size,
3111} 3094}
3112 3095
3113struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3096struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3114 size_t align, unsigned long flags, 3097 size_t align, unsigned long flags, void (*ctor)(void *))
3115 void (*ctor)(struct kmem_cache *, void *))
3116{ 3098{
3117 struct kmem_cache *s; 3099 struct kmem_cache *s;
3118 3100
@@ -3315,12 +3297,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3315 s->name, page); 3297 s->name, page);
3316 3298
3317 if (s->flags & DEBUG_DEFAULT_FLAGS) { 3299 if (s->flags & DEBUG_DEFAULT_FLAGS) {
3318 if (!SlabDebug(page)) 3300 if (!PageSlubDebug(page))
3319 printk(KERN_ERR "SLUB %s: SlabDebug not set " 3301 printk(KERN_ERR "SLUB %s: SlubDebug not set "
3320 "on slab 0x%p\n", s->name, page); 3302 "on slab 0x%p\n", s->name, page);
3321 } else { 3303 } else {
3322 if (SlabDebug(page)) 3304 if (PageSlubDebug(page))
3323 printk(KERN_ERR "SLUB %s: SlabDebug set on " 3305 printk(KERN_ERR "SLUB %s: SlubDebug set on "
3324 "slab 0x%p\n", s->name, page); 3306 "slab 0x%p\n", s->name, page);
3325 } 3307 }
3326} 3308}
@@ -4077,7 +4059,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
4077 if (err) 4059 if (err)
4078 return err; 4060 return err;
4079 4061
4080 if (ratio < 100) 4062 if (ratio <= 100)
4081 s->remote_node_defrag_ratio = ratio * 10; 4063 s->remote_node_defrag_ratio = ratio * 10;
4082 4064
4083 return length; 4065 return length;
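
The hunks above replace the fixed MIN_PARTIAL comparisons in get_any_partial() and unfreeze_slab() with a per-node min_partial sized in init_kmem_cache_node(): ilog2() of the object size, clamped to a small range, so caches with larger objects keep more partial slabs and pound the page allocator less. A minimal userspace sketch of that heuristic follows; the clamp bounds are assumed stand-ins for slub.c's MIN_PARTIAL/MAX_PARTIAL, not values taken from this diff.

/* Standalone sketch of the per-node min_partial heuristic: larger objects
 * keep more slabs on the partial list. The bounds below are illustrative.
 */
#include <stdio.h>

#define SKETCH_MIN_PARTIAL 5    /* assumed lower bound */
#define SKETCH_MAX_PARTIAL 10   /* assumed upper bound */

static unsigned int ilog2_ul(unsigned long x)
{
        unsigned int r = 0;

        while (x >>= 1)
                r++;
        return r;
}

static unsigned long min_partial_for(unsigned long size)
{
        unsigned long n = ilog2_ul(size);

        if (n < SKETCH_MIN_PARTIAL)
                n = SKETCH_MIN_PARTIAL;
        else if (n > SKETCH_MAX_PARTIAL)
                n = SKETCH_MAX_PARTIAL;
        return n;
}

int main(void)
{
        unsigned long sizes[] = { 8, 64, 256, 4096, 65536 };
        unsigned int i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                printf("object size %6lu -> min_partial %lu\n",
                       sizes[i], min_partial_for(sizes[i]));
        return 0;
}

With the assumed bounds, a 64-byte object size yields 6 while 4KiB objects hit the upper clamp; the exact numbers depend on the real MIN_PARTIAL/MAX_PARTIAL in slub.c.
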
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 99c4f36eb8a3..a91b5f8fcaf6 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Virtual Memory Map support 2 * Virtual Memory Map support
3 * 3 *
4 * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>. 4 * (C) 2007 sgi. Christoph Lameter.
5 * 5 *
6 * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn, 6 * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
7 * virt_to_page, page_address() to be implemented as a base offset 7 * virt_to_page, page_address() to be implemented as a base offset
diff --git a/mm/sparse.c b/mm/sparse.c
index 36511c7b5e2c..39db301b920d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -147,22 +147,41 @@ static inline int sparse_early_nid(struct mem_section *section)
147 return (section->section_mem_map >> SECTION_NID_SHIFT); 147 return (section->section_mem_map >> SECTION_NID_SHIFT);
148} 148}
149 149
150/* Record a memory area against a node. */ 150/* Validate the physical addressing limitations of the model */
151void __init memory_present(int nid, unsigned long start, unsigned long end) 151void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
152 unsigned long *end_pfn)
152{ 153{
153 unsigned long max_arch_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); 154 unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
154 unsigned long pfn;
155 155
156 /* 156 /*
157 * Sanity checks - do not allow an architecture to pass 157 * Sanity checks - do not allow an architecture to pass
158 * in larger pfns than the maximum scope of sparsemem: 158 * in larger pfns than the maximum scope of sparsemem:
159 */ 159 */
160 if (start >= max_arch_pfn) 160 if (*start_pfn > max_sparsemem_pfn) {
161 return; 161 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
162 if (end >= max_arch_pfn) 162 "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
163 end = max_arch_pfn; 163 *start_pfn, *end_pfn, max_sparsemem_pfn);
164 WARN_ON_ONCE(1);
165 *start_pfn = max_sparsemem_pfn;
166 *end_pfn = max_sparsemem_pfn;
167 }
168
169 if (*end_pfn > max_sparsemem_pfn) {
170 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
171 "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
172 *start_pfn, *end_pfn, max_sparsemem_pfn);
173 WARN_ON_ONCE(1);
174 *end_pfn = max_sparsemem_pfn;
175 }
176}
177
178/* Record a memory area against a node. */
179void __init memory_present(int nid, unsigned long start, unsigned long end)
180{
181 unsigned long pfn;
164 182
165 start &= PAGE_SECTION_MASK; 183 start &= PAGE_SECTION_MASK;
184 mminit_validate_memmodel_limits(&start, &end);
166 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { 185 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
167 unsigned long section = pfn_to_section_nr(pfn); 186 unsigned long section = pfn_to_section_nr(pfn);
168 struct mem_section *ms; 187 struct mem_section *ms;
@@ -187,6 +206,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
187 unsigned long pfn; 206 unsigned long pfn;
188 unsigned long nr_pages = 0; 207 unsigned long nr_pages = 0;
189 208
209 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
190 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 210 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
191 if (nid != early_pfn_to_nid(pfn)) 211 if (nid != early_pfn_to_nid(pfn))
192 continue; 212 continue;
@@ -248,16 +268,92 @@ static unsigned long *__kmalloc_section_usemap(void)
248} 268}
249#endif /* CONFIG_MEMORY_HOTPLUG */ 269#endif /* CONFIG_MEMORY_HOTPLUG */
250 270
271#ifdef CONFIG_MEMORY_HOTREMOVE
272static unsigned long * __init
273sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
274{
275 unsigned long section_nr;
276
277 /*
278 * A page may contain usemaps for other sections preventing the
279 * page being freed and making a section unremovable while
 280 * other sections referencing the usemap remain active. Similarly,
281 * a pgdat can prevent a section being removed. If section A
282 * contains a pgdat and section B contains the usemap, both
283 * sections become inter-dependent. This allocates usemaps
284 * from the same section as the pgdat where possible to avoid
285 * this problem.
286 */
287 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
288 return alloc_bootmem_section(usemap_size(), section_nr);
289}
290
291static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
292{
293 unsigned long usemap_snr, pgdat_snr;
294 static unsigned long old_usemap_snr = NR_MEM_SECTIONS;
295 static unsigned long old_pgdat_snr = NR_MEM_SECTIONS;
296 struct pglist_data *pgdat = NODE_DATA(nid);
297 int usemap_nid;
298
299 usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
300 pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
301 if (usemap_snr == pgdat_snr)
302 return;
303
304 if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
305 /* skip redundant message */
306 return;
307
308 old_usemap_snr = usemap_snr;
309 old_pgdat_snr = pgdat_snr;
310
311 usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
312 if (usemap_nid != nid) {
313 printk(KERN_INFO
314 "node %d must be removed before remove section %ld\n",
315 nid, usemap_snr);
316 return;
317 }
318 /*
319 * There is a circular dependency.
320 * Some platforms allow un-removable section because they will just
321 * gather other removable sections for dynamic partitioning.
322 * Just notify un-removable section's number here.
323 */
324 printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr,
325 pgdat_snr, nid);
326 printk(KERN_CONT
327 " have a circular dependency on usemap and pgdat allocations\n");
328}
329#else
330static unsigned long * __init
331sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
332{
333 return NULL;
334}
335
336static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
337{
338}
339#endif /* CONFIG_MEMORY_HOTREMOVE */
340
251static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) 341static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
252{ 342{
253 unsigned long *usemap; 343 unsigned long *usemap;
254 struct mem_section *ms = __nr_to_section(pnum); 344 struct mem_section *ms = __nr_to_section(pnum);
255 int nid = sparse_early_nid(ms); 345 int nid = sparse_early_nid(ms);
256 346
257 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); 347 usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
258 if (usemap) 348 if (usemap)
259 return usemap; 349 return usemap;
260 350
351 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
352 if (usemap) {
353 check_usemap_section_nr(nid, usemap);
354 return usemap;
355 }
356
261 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ 357 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
262 nid = 0; 358 nid = 0;
263 359
@@ -280,7 +376,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
280} 376}
281#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 377#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
282 378
283struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 379static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
284{ 380{
285 struct page *map; 381 struct page *map;
286 struct mem_section *ms = __nr_to_section(pnum); 382 struct mem_section *ms = __nr_to_section(pnum);
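
memory_present() and node_memmap_size_bytes() now run their pfn range through mminit_validate_memmodel_limits(), which truncates anything beyond what SPARSEMEM can address and warns instead of silently registering it. A small standalone sketch of that clamp, with a made-up limit standing in for MAX_PHYSMEM_BITS - PAGE_SHIFT:

/* Sketch of the pfn-range clamping: out-of-range starts collapse the whole
 * range, out-of-range ends are truncated. The limit here is illustrative.
 */
#include <stdio.h>

#define SKETCH_MAX_SPARSEMEM_PFN (1UL << 24)    /* assumed limit */

static void validate_pfn_range(unsigned long *start_pfn, unsigned long *end_pfn)
{
        if (*start_pfn > SKETCH_MAX_SPARSEMEM_PFN) {
                fprintf(stderr, "start of range %lu -> %lu exceeds max %lu\n",
                        *start_pfn, *end_pfn, SKETCH_MAX_SPARSEMEM_PFN);
                *start_pfn = SKETCH_MAX_SPARSEMEM_PFN;
                *end_pfn = SKETCH_MAX_SPARSEMEM_PFN;
        }
        if (*end_pfn > SKETCH_MAX_SPARSEMEM_PFN) {
                fprintf(stderr, "end of range %lu -> %lu exceeds max %lu\n",
                        *start_pfn, *end_pfn, SKETCH_MAX_SPARSEMEM_PFN);
                *end_pfn = SKETCH_MAX_SPARSEMEM_PFN;
        }
}

int main(void)
{
        unsigned long start = 1UL << 20, end = 1UL << 30;

        validate_pfn_range(&start, &end);
        printf("clamped range: %lu -> %lu\n", start, end);
        return 0;
}
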
diff --git a/mm/swap.c b/mm/swap.c
index 45c9f25a8a3b..9e0cb3118079 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,9 +34,9 @@
34/* How many pages do we try to swap or page in/out together? */ 34/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 35int page_cluster;
36 36
37static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; 37static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs);
38static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; 38static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs);
39static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; 39static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
40 40
41/* 41/*
42 * This path almost never happens for VM activity - pages are normally 42 * This path almost never happens for VM activity - pages are normally
@@ -278,9 +278,10 @@ int lru_add_drain_all(void)
278 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it 278 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
279 * for the remainder of the operation. 279 * for the remainder of the operation.
280 * 280 *
281 * The locking in this function is against shrink_cache(): we recheck the 281 * The locking in this function is against shrink_inactive_list(): we recheck
282 * page count inside the lock to see whether shrink_cache grabbed the page 282 * the page count inside the lock to see whether shrink_inactive_list()
283 * via the LRU. If it did, give up: shrink_cache will free it. 283 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
284 * will free it.
284 */ 285 */
285void release_pages(struct page **pages, int nr, int cold) 286void release_pages(struct page **pages, int nr, int cold)
286{ 287{
@@ -443,7 +444,7 @@ void pagevec_strip(struct pagevec *pvec)
443 for (i = 0; i < pagevec_count(pvec); i++) { 444 for (i = 0; i < pagevec_count(pvec); i++) {
444 struct page *page = pvec->pages[i]; 445 struct page *page = pvec->pages[i];
445 446
446 if (PagePrivate(page) && !TestSetPageLocked(page)) { 447 if (PagePrivate(page) && trylock_page(page)) {
447 if (PagePrivate(page)) 448 if (PagePrivate(page))
448 try_to_release_page(page, 0); 449 try_to_release_page(page, 0);
449 unlock_page(page); 450 unlock_page(page);
@@ -493,7 +494,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
493 */ 494 */
494#define ACCT_THRESHOLD max(16, NR_CPUS * 2) 495#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
495 496
496static DEFINE_PER_CPU(long, committed_space) = 0; 497static DEFINE_PER_CPU(long, committed_space);
497 498
498void vm_acct_memory(long pages) 499void vm_acct_memory(long pages)
499{ 500{
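
The TestSetPageLocked() -> trylock_page() conversion in pagevec_strip() above recurs in swap_state.c, swapfile.c, truncate.c and vmscan.c below, and the return sense flips: the old helper reported that the page was already locked (the attempt failed), the new one reports that the lock was taken. A small C11-atomics sketch of the two conventions; this models the idea, not the kernel's page-flag implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_page {
        atomic_flag locked;
};

/* old style: true means "was already locked", i.e. we did NOT get the lock */
static bool test_set_page_locked(struct fake_page *page)
{
        return atomic_flag_test_and_set(&page->locked);
}

/* new style: true means "we got the lock" */
static bool trylock_page(struct fake_page *page)
{
        return !atomic_flag_test_and_set(&page->locked);
}

static void unlock_page(struct fake_page *page)
{
        atomic_flag_clear(&page->locked);
}

int main(void)
{
        struct fake_page page = { ATOMIC_FLAG_INIT };

        if (trylock_page(&page)) {              /* new convention */
                printf("locked via trylock_page()\n");
                unlock_page(&page);
        }
        if (!test_set_page_locked(&page)) {     /* old convention */
                printf("locked via !TestSetPageLocked()\n");
                unlock_page(&page);
        }
        return 0;
}
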
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d8aadaf2a0ba..797c3831cbec 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -39,7 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = {
39 39
40struct address_space swapper_space = { 40struct address_space swapper_space = {
41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
42 .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), 42 .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
43 .a_ops = &swap_aops, 43 .a_ops = &swap_aops,
44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), 44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
45 .backing_dev_info = &swap_backing_dev_info, 45 .backing_dev_info = &swap_backing_dev_info,
@@ -56,15 +56,16 @@ static struct {
56 56
57void show_swap_cache_info(void) 57void show_swap_cache_info(void)
58{ 58{
59 printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", 59 printk("%lu pages in swap cache\n", total_swapcache_pages);
60 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
60 swap_cache_info.add_total, swap_cache_info.del_total, 61 swap_cache_info.add_total, swap_cache_info.del_total,
61 swap_cache_info.find_success, swap_cache_info.find_total); 62 swap_cache_info.find_success, swap_cache_info.find_total);
62 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 63 printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10));
63 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); 64 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
64} 65}
65 66
66/* 67/*
67 * add_to_swap_cache resembles add_to_page_cache on swapper_space, 68 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
68 * but sets SwapCache flag and private instead of mapping and index. 69 * but sets SwapCache flag and private instead of mapping and index.
69 */ 70 */
70int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) 71int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
@@ -76,19 +77,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
76 BUG_ON(PagePrivate(page)); 77 BUG_ON(PagePrivate(page));
77 error = radix_tree_preload(gfp_mask); 78 error = radix_tree_preload(gfp_mask);
78 if (!error) { 79 if (!error) {
79 write_lock_irq(&swapper_space.tree_lock); 80 page_cache_get(page);
81 SetPageSwapCache(page);
82 set_page_private(page, entry.val);
83
84 spin_lock_irq(&swapper_space.tree_lock);
80 error = radix_tree_insert(&swapper_space.page_tree, 85 error = radix_tree_insert(&swapper_space.page_tree,
81 entry.val, page); 86 entry.val, page);
82 if (!error) { 87 if (likely(!error)) {
83 page_cache_get(page);
84 SetPageSwapCache(page);
85 set_page_private(page, entry.val);
86 total_swapcache_pages++; 88 total_swapcache_pages++;
87 __inc_zone_page_state(page, NR_FILE_PAGES); 89 __inc_zone_page_state(page, NR_FILE_PAGES);
88 INC_CACHE_INFO(add_total); 90 INC_CACHE_INFO(add_total);
89 } 91 }
90 write_unlock_irq(&swapper_space.tree_lock); 92 spin_unlock_irq(&swapper_space.tree_lock);
91 radix_tree_preload_end(); 93 radix_tree_preload_end();
94
95 if (unlikely(error)) {
96 set_page_private(page, 0UL);
97 ClearPageSwapCache(page);
98 page_cache_release(page);
99 }
92 } 100 }
93 return error; 101 return error;
94} 102}
@@ -175,9 +183,9 @@ void delete_from_swap_cache(struct page *page)
175 183
176 entry.val = page_private(page); 184 entry.val = page_private(page);
177 185
178 write_lock_irq(&swapper_space.tree_lock); 186 spin_lock_irq(&swapper_space.tree_lock);
179 __delete_from_swap_cache(page); 187 __delete_from_swap_cache(page);
180 write_unlock_irq(&swapper_space.tree_lock); 188 spin_unlock_irq(&swapper_space.tree_lock);
181 189
182 swap_free(entry); 190 swap_free(entry);
183 page_cache_release(page); 191 page_cache_release(page);
@@ -193,7 +201,7 @@ void delete_from_swap_cache(struct page *page)
193 */ 201 */
194static inline void free_swap_cache(struct page *page) 202static inline void free_swap_cache(struct page *page)
195{ 203{
196 if (PageSwapCache(page) && !TestSetPageLocked(page)) { 204 if (PageSwapCache(page) && trylock_page(page)) {
197 remove_exclusive_swap_page(page); 205 remove_exclusive_swap_page(page);
198 unlock_page(page); 206 unlock_page(page);
199 } 207 }
@@ -294,9 +302,9 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
294 * re-using the just freed swap entry for an existing page. 302 * re-using the just freed swap entry for an existing page.
295 * May fail (-ENOMEM) if radix-tree node allocation failed. 303 * May fail (-ENOMEM) if radix-tree node allocation failed.
296 */ 304 */
297 SetPageLocked(new_page); 305 set_page_locked(new_page);
298 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); 306 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
299 if (!err) { 307 if (likely(!err)) {
300 /* 308 /*
301 * Initiate read into locked page and return. 309 * Initiate read into locked page and return.
302 */ 310 */
@@ -304,7 +312,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
304 swap_readpage(NULL, new_page); 312 swap_readpage(NULL, new_page);
305 return new_page; 313 return new_page;
306 } 314 }
307 ClearPageLocked(new_page); 315 clear_page_locked(new_page);
308 swap_free(entry); 316 swap_free(entry);
309 } while (err != -ENOMEM); 317 } while (err != -ENOMEM);
310 318
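
add_to_swap_cache() now takes its page reference and sets PageSwapCache and page->private before acquiring the tree_lock (itself converted from a rwlock to a spinlock), performs the radix-tree insert under the lock, and unwinds the setup only if the insert fails. A standalone sketch of that prepare/insert/roll-back ordering, with a mutex and a stub insert standing in for the kernel primitives; the names here are illustrative only.

#include <pthread.h>
#include <stdio.h>

struct sketch_page {
        int refcount;
        int swapcache;
        unsigned long private_val;
};

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

/* stand-in for radix_tree_insert(): returns 0 on success */
static int stub_tree_insert(unsigned long key, struct sketch_page *page)
{
        (void)key;
        (void)page;
        return 0;
}

static int sketch_add_to_swap_cache(struct sketch_page *page, unsigned long entry)
{
        int error;

        /* set up page state before taking the lock */
        page->refcount++;
        page->swapcache = 1;
        page->private_val = entry;

        pthread_mutex_lock(&tree_lock);
        error = stub_tree_insert(entry, page);
        pthread_mutex_unlock(&tree_lock);

        if (error) {
                /* roll the setup back; nobody else saw the page yet */
                page->private_val = 0;
                page->swapcache = 0;
                page->refcount--;
        }
        return error;
}

int main(void)
{
        struct sketch_page page = { 1, 0, 0 };

        if (!sketch_add_to_swap_cache(&page, 42))
                printf("inserted: refcount=%d swapcache=%d private=%lu\n",
                       page.refcount, page.swapcache, page.private_val);
        return 0;
}
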
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bd1bb5920306..1e330f2998fa 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,17 +33,18 @@
33#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
34#include <linux/swapops.h> 34#include <linux/swapops.h>
35 35
36DEFINE_SPINLOCK(swap_lock); 36static DEFINE_SPINLOCK(swap_lock);
37unsigned int nr_swapfiles; 37static unsigned int nr_swapfiles;
38long total_swap_pages; 38long total_swap_pages;
39static int swap_overflow; 39static int swap_overflow;
40static int least_priority;
40 41
41static const char Bad_file[] = "Bad swap file entry "; 42static const char Bad_file[] = "Bad swap file entry ";
42static const char Unused_file[] = "Unused swap file entry "; 43static const char Unused_file[] = "Unused swap file entry ";
43static const char Bad_offset[] = "Bad swap offset entry "; 44static const char Bad_offset[] = "Bad swap offset entry ";
44static const char Unused_offset[] = "Unused swap offset entry "; 45static const char Unused_offset[] = "Unused swap offset entry ";
45 46
46struct swap_list_t swap_list = {-1, -1}; 47static struct swap_list_t swap_list = {-1, -1};
47 48
48static struct swap_info_struct swap_info[MAX_SWAPFILES]; 49static struct swap_info_struct swap_info[MAX_SWAPFILES];
49 50
@@ -368,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page)
368 retval = 0; 369 retval = 0;
369 if (p->swap_map[swp_offset(entry)] == 1) { 370 if (p->swap_map[swp_offset(entry)] == 1) {
370 /* Recheck the page count with the swapcache lock held.. */ 371 /* Recheck the page count with the swapcache lock held.. */
371 write_lock_irq(&swapper_space.tree_lock); 372 spin_lock_irq(&swapper_space.tree_lock);
372 if ((page_count(page) == 2) && !PageWriteback(page)) { 373 if ((page_count(page) == 2) && !PageWriteback(page)) {
373 __delete_from_swap_cache(page); 374 __delete_from_swap_cache(page);
374 SetPageDirty(page); 375 SetPageDirty(page);
375 retval = 1; 376 retval = 1;
376 } 377 }
377 write_unlock_irq(&swapper_space.tree_lock); 378 spin_unlock_irq(&swapper_space.tree_lock);
378 } 379 }
379 spin_unlock(&swap_lock); 380 spin_unlock(&swap_lock);
380 381
@@ -402,7 +403,7 @@ void free_swap_and_cache(swp_entry_t entry)
402 if (p) { 403 if (p) {
403 if (swap_entry_free(p, swp_offset(entry)) == 1) { 404 if (swap_entry_free(p, swp_offset(entry)) == 1) {
404 page = find_get_page(&swapper_space, entry.val); 405 page = find_get_page(&swapper_space, entry.val);
405 if (page && unlikely(TestSetPageLocked(page))) { 406 if (page && unlikely(!trylock_page(page))) {
406 page_cache_release(page); 407 page_cache_release(page);
407 page = NULL; 408 page = NULL;
408 } 409 }
@@ -655,8 +656,8 @@ static int unuse_mm(struct mm_struct *mm,
655 656
656 if (!down_read_trylock(&mm->mmap_sem)) { 657 if (!down_read_trylock(&mm->mmap_sem)) {
657 /* 658 /*
658 * Activate page so shrink_cache is unlikely to unmap its 659 * Activate page so shrink_inactive_list is unlikely to unmap
659 * ptes while lock is dropped, so swapoff can make progress. 660 * its ptes while lock is dropped, so swapoff can make progress.
660 */ 661 */
661 activate_page(page); 662 activate_page(page);
662 unlock_page(page); 663 unlock_page(page);
@@ -1260,6 +1261,11 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1260 /* just pick something that's safe... */ 1261 /* just pick something that's safe... */
1261 swap_list.next = swap_list.head; 1262 swap_list.next = swap_list.head;
1262 } 1263 }
1264 if (p->prio < 0) {
1265 for (i = p->next; i >= 0; i = swap_info[i].next)
1266 swap_info[i].prio = p->prio--;
1267 least_priority++;
1268 }
1263 nr_swap_pages -= p->pages; 1269 nr_swap_pages -= p->pages;
1264 total_swap_pages -= p->pages; 1270 total_swap_pages -= p->pages;
1265 p->flags &= ~SWP_WRITEOK; 1271 p->flags &= ~SWP_WRITEOK;
@@ -1272,9 +1278,14 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1272 if (err) { 1278 if (err) {
1273 /* re-insert swap space back into swap_list */ 1279 /* re-insert swap space back into swap_list */
1274 spin_lock(&swap_lock); 1280 spin_lock(&swap_lock);
1275 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) 1281 if (p->prio < 0)
1282 p->prio = --least_priority;
1283 prev = -1;
1284 for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
1276 if (p->prio >= swap_info[i].prio) 1285 if (p->prio >= swap_info[i].prio)
1277 break; 1286 break;
1287 prev = i;
1288 }
1278 p->next = i; 1289 p->next = i;
1279 if (prev < 0) 1290 if (prev < 0)
1280 swap_list.head = swap_list.next = p - swap_info; 1291 swap_list.head = swap_list.next = p - swap_info;
@@ -1447,7 +1458,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1447 unsigned int type; 1458 unsigned int type;
1448 int i, prev; 1459 int i, prev;
1449 int error; 1460 int error;
1450 static int least_priority;
1451 union swap_header *swap_header = NULL; 1461 union swap_header *swap_header = NULL;
1452 int swap_header_version; 1462 int swap_header_version;
1453 unsigned int nr_good_pages = 0; 1463 unsigned int nr_good_pages = 0;
@@ -1455,7 +1465,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1455 sector_t span; 1465 sector_t span;
1456 unsigned long maxpages = 1; 1466 unsigned long maxpages = 1;
1457 int swapfilesize; 1467 int swapfilesize;
1458 unsigned short *swap_map; 1468 unsigned short *swap_map = NULL;
1459 struct page *page = NULL; 1469 struct page *page = NULL;
1460 struct inode *inode = NULL; 1470 struct inode *inode = NULL;
1461 int did_down = 0; 1471 int did_down = 0;
@@ -1474,22 +1484,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1474 } 1484 }
1475 if (type >= nr_swapfiles) 1485 if (type >= nr_swapfiles)
1476 nr_swapfiles = type+1; 1486 nr_swapfiles = type+1;
1487 memset(p, 0, sizeof(*p));
1477 INIT_LIST_HEAD(&p->extent_list); 1488 INIT_LIST_HEAD(&p->extent_list);
1478 p->flags = SWP_USED; 1489 p->flags = SWP_USED;
1479 p->swap_file = NULL;
1480 p->old_block_size = 0;
1481 p->swap_map = NULL;
1482 p->lowest_bit = 0;
1483 p->highest_bit = 0;
1484 p->cluster_nr = 0;
1485 p->inuse_pages = 0;
1486 p->next = -1; 1490 p->next = -1;
1487 if (swap_flags & SWAP_FLAG_PREFER) {
1488 p->prio =
1489 (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
1490 } else {
1491 p->prio = --least_priority;
1492 }
1493 spin_unlock(&swap_lock); 1491 spin_unlock(&swap_lock);
1494 name = getname(specialfile); 1492 name = getname(specialfile);
1495 error = PTR_ERR(name); 1493 error = PTR_ERR(name);
@@ -1632,19 +1630,20 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1632 goto bad_swap; 1630 goto bad_swap;
1633 1631
1634 /* OK, set up the swap map and apply the bad block list */ 1632 /* OK, set up the swap map and apply the bad block list */
1635 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { 1633 swap_map = vmalloc(maxpages * sizeof(short));
1634 if (!swap_map) {
1636 error = -ENOMEM; 1635 error = -ENOMEM;
1637 goto bad_swap; 1636 goto bad_swap;
1638 } 1637 }
1639 1638
1640 error = 0; 1639 error = 0;
1641 memset(p->swap_map, 0, maxpages * sizeof(short)); 1640 memset(swap_map, 0, maxpages * sizeof(short));
1642 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1641 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1643 int page_nr = swap_header->info.badpages[i]; 1642 int page_nr = swap_header->info.badpages[i];
1644 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) 1643 if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
1645 error = -EINVAL; 1644 error = -EINVAL;
1646 else 1645 else
1647 p->swap_map[page_nr] = SWAP_MAP_BAD; 1646 swap_map[page_nr] = SWAP_MAP_BAD;
1648 } 1647 }
1649 nr_good_pages = swap_header->info.last_page - 1648 nr_good_pages = swap_header->info.last_page -
1650 swap_header->info.nr_badpages - 1649 swap_header->info.nr_badpages -
@@ -1654,7 +1653,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1654 } 1653 }
1655 1654
1656 if (nr_good_pages) { 1655 if (nr_good_pages) {
1657 p->swap_map[0] = SWAP_MAP_BAD; 1656 swap_map[0] = SWAP_MAP_BAD;
1658 p->max = maxpages; 1657 p->max = maxpages;
1659 p->pages = nr_good_pages; 1658 p->pages = nr_good_pages;
1660 nr_extents = setup_swap_extents(p, &span); 1659 nr_extents = setup_swap_extents(p, &span);
@@ -1672,6 +1671,12 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1672 1671
1673 mutex_lock(&swapon_mutex); 1672 mutex_lock(&swapon_mutex);
1674 spin_lock(&swap_lock); 1673 spin_lock(&swap_lock);
1674 if (swap_flags & SWAP_FLAG_PREFER)
1675 p->prio =
1676 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
1677 else
1678 p->prio = --least_priority;
1679 p->swap_map = swap_map;
1675 p->flags = SWP_ACTIVE; 1680 p->flags = SWP_ACTIVE;
1676 nr_swap_pages += nr_good_pages; 1681 nr_swap_pages += nr_good_pages;
1677 total_swap_pages += nr_good_pages; 1682 total_swap_pages += nr_good_pages;
@@ -1707,12 +1712,8 @@ bad_swap:
1707 destroy_swap_extents(p); 1712 destroy_swap_extents(p);
1708bad_swap_2: 1713bad_swap_2:
1709 spin_lock(&swap_lock); 1714 spin_lock(&swap_lock);
1710 swap_map = p->swap_map;
1711 p->swap_file = NULL; 1715 p->swap_file = NULL;
1712 p->swap_map = NULL;
1713 p->flags = 0; 1716 p->flags = 0;
1714 if (!(swap_flags & SWAP_FLAG_PREFER))
1715 ++least_priority;
1716 spin_unlock(&swap_lock); 1717 spin_unlock(&swap_lock);
1717 vfree(swap_map); 1718 vfree(swap_map);
1718 if (swap_file) 1719 if (swap_file)
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index ae532f501943..8d7a27a6335c 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -65,31 +65,31 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
65 if (!dentry) 65 if (!dentry)
66 goto put_memory; 66 goto put_memory;
67 67
68 error = -ENFILE;
69 file = get_empty_filp();
70 if (!file)
71 goto put_dentry;
72
68 error = -ENOSPC; 73 error = -ENOSPC;
69 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); 74 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
70 if (!inode) 75 if (!inode)
71 goto put_dentry; 76 goto close_file;
72 77
73 d_instantiate(dentry, inode); 78 d_instantiate(dentry, inode);
74 error = -ENFILE; 79 inode->i_size = size;
75 file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
76 &ramfs_file_operations);
77 if (!file)
78 goto put_dentry;
79
80 inode->i_nlink = 0; /* It is unlinked */ 80 inode->i_nlink = 0; /* It is unlinked */
81 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
82 &ramfs_file_operations);
81 83
82 /* notify everyone as to the change of file size */ 84#ifndef CONFIG_MMU
83 error = do_truncate(dentry, size, 0, file); 85 error = ramfs_nommu_expand_for_mapping(inode, size);
84 if (error < 0) 86 if (error)
85 goto close_file; 87 goto close_file;
86 88#endif
87 return file; 89 return file;
88 90
89close_file: 91close_file:
90 put_filp(file); 92 put_filp(file);
91 return ERR_PTR(error);
92
93put_dentry: 93put_dentry:
94 dput(dentry); 94 dput(dentry);
95put_memory: 95put_memory:
diff --git a/mm/truncate.c b/mm/truncate.c
index b8961cb63414..6650c1d878b4 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -104,7 +104,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
104 cancel_dirty_page(page, PAGE_CACHE_SIZE); 104 cancel_dirty_page(page, PAGE_CACHE_SIZE);
105 105
106 remove_from_page_cache(page); 106 remove_from_page_cache(page);
107 ClearPageUptodate(page);
108 ClearPageMappedToDisk(page); 107 ClearPageMappedToDisk(page);
109 page_cache_release(page); /* pagecache ref */ 108 page_cache_release(page); /* pagecache ref */
110} 109}
@@ -188,7 +187,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
188 if (page_index > next) 187 if (page_index > next)
189 next = page_index; 188 next = page_index;
190 next++; 189 next++;
191 if (TestSetPageLocked(page)) 190 if (!trylock_page(page))
192 continue; 191 continue;
193 if (PageWriteback(page)) { 192 if (PageWriteback(page)) {
194 unlock_page(page); 193 unlock_page(page);
@@ -281,7 +280,7 @@ unsigned long __invalidate_mapping_pages(struct address_space *mapping,
281 pgoff_t index; 280 pgoff_t index;
282 int lock_failed; 281 int lock_failed;
283 282
284 lock_failed = TestSetPageLocked(page); 283 lock_failed = !trylock_page(page);
285 284
286 /* 285 /*
287 * We really shouldn't be looking at the ->index of an 286 * We really shouldn't be looking at the ->index of an
@@ -349,18 +348,17 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
349 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 348 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
350 return 0; 349 return 0;
351 350
352 write_lock_irq(&mapping->tree_lock); 351 spin_lock_irq(&mapping->tree_lock);
353 if (PageDirty(page)) 352 if (PageDirty(page))
354 goto failed; 353 goto failed;
355 354
356 BUG_ON(PagePrivate(page)); 355 BUG_ON(PagePrivate(page));
357 __remove_from_page_cache(page); 356 __remove_from_page_cache(page);
358 write_unlock_irq(&mapping->tree_lock); 357 spin_unlock_irq(&mapping->tree_lock);
359 ClearPageUptodate(page);
360 page_cache_release(page); /* pagecache ref */ 358 page_cache_release(page); /* pagecache ref */
361 return 1; 359 return 1;
362failed: 360failed:
363 write_unlock_irq(&mapping->tree_lock); 361 spin_unlock_irq(&mapping->tree_lock);
364 return 0; 362 return 0;
365} 363}
366 364
@@ -382,7 +380,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
382 * Any pages which are found to be mapped into pagetables are unmapped prior to 380 * Any pages which are found to be mapped into pagetables are unmapped prior to
383 * invalidation. 381 * invalidation.
384 * 382 *
385 * Returns -EIO if any pages could not be invalidated. 383 * Returns -EBUSY if any pages could not be invalidated.
386 */ 384 */
387int invalidate_inode_pages2_range(struct address_space *mapping, 385int invalidate_inode_pages2_range(struct address_space *mapping,
388 pgoff_t start, pgoff_t end) 386 pgoff_t start, pgoff_t end)
@@ -442,7 +440,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
442 ret2 = do_launder_page(mapping, page); 440 ret2 = do_launder_page(mapping, page);
443 if (ret2 == 0) { 441 if (ret2 == 0) {
444 if (!invalidate_complete_page2(mapping, page)) 442 if (!invalidate_complete_page2(mapping, page))
445 ret2 = -EIO; 443 ret2 = -EBUSY;
446 } 444 }
447 if (ret2 < 0) 445 if (ret2 < 0)
448 ret = ret2; 446 ret = ret2;
diff --git a/mm/util.c b/mm/util.c
index 8f18683825bc..cb00b748ce47 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,7 +1,9 @@
1#include <linux/mm.h>
1#include <linux/slab.h> 2#include <linux/slab.h>
2#include <linux/string.h> 3#include <linux/string.h>
3#include <linux/module.h> 4#include <linux/module.h>
4#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h>
5#include <asm/uaccess.h> 7#include <asm/uaccess.h>
6 8
7/** 9/**
@@ -68,25 +70,22 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
68EXPORT_SYMBOL(kmemdup); 70EXPORT_SYMBOL(kmemdup);
69 71
70/** 72/**
71 * krealloc - reallocate memory. The contents will remain unchanged. 73 * __krealloc - like krealloc() but don't free @p.
72 * @p: object to reallocate memory for. 74 * @p: object to reallocate memory for.
73 * @new_size: how many bytes of memory are required. 75 * @new_size: how many bytes of memory are required.
74 * @flags: the type of memory to allocate. 76 * @flags: the type of memory to allocate.
75 * 77 *
76 * The contents of the object pointed to are preserved up to the 78 * This function is like krealloc() except it never frees the originally
77 * lesser of the new and old sizes. If @p is %NULL, krealloc() 79 * allocated buffer. Use this if you don't want to free the buffer immediately
78 * behaves exactly like kmalloc(). If @size is 0 and @p is not a 80 * like, for example, with RCU.
79 * %NULL pointer, the object pointed to is freed.
80 */ 81 */
81void *krealloc(const void *p, size_t new_size, gfp_t flags) 82void *__krealloc(const void *p, size_t new_size, gfp_t flags)
82{ 83{
83 void *ret; 84 void *ret;
84 size_t ks = 0; 85 size_t ks = 0;
85 86
86 if (unlikely(!new_size)) { 87 if (unlikely(!new_size))
87 kfree(p);
88 return ZERO_SIZE_PTR; 88 return ZERO_SIZE_PTR;
89 }
90 89
91 if (p) 90 if (p)
92 ks = ksize(p); 91 ks = ksize(p);
@@ -95,10 +94,37 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
95 return (void *)p; 94 return (void *)p;
96 95
97 ret = kmalloc_track_caller(new_size, flags); 96 ret = kmalloc_track_caller(new_size, flags);
98 if (ret && p) { 97 if (ret && p)
99 memcpy(ret, p, ks); 98 memcpy(ret, p, ks);
99
100 return ret;
101}
102EXPORT_SYMBOL(__krealloc);
103
104/**
105 * krealloc - reallocate memory. The contents will remain unchanged.
106 * @p: object to reallocate memory for.
107 * @new_size: how many bytes of memory are required.
108 * @flags: the type of memory to allocate.
109 *
110 * The contents of the object pointed to are preserved up to the
111 * lesser of the new and old sizes. If @p is %NULL, krealloc()
112 * behaves exactly like kmalloc(). If @size is 0 and @p is not a
113 * %NULL pointer, the object pointed to is freed.
114 */
115void *krealloc(const void *p, size_t new_size, gfp_t flags)
116{
117 void *ret;
118
119 if (unlikely(!new_size)) {
100 kfree(p); 120 kfree(p);
121 return ZERO_SIZE_PTR;
101 } 122 }
123
124 ret = __krealloc(p, new_size, flags);
125 if (ret && p != ret)
126 kfree(p);
127
102 return ret; 128 return ret;
103} 129}
104EXPORT_SYMBOL(krealloc); 130EXPORT_SYMBOL(krealloc);
@@ -136,3 +162,27 @@ char *strndup_user(const char __user *s, long n)
136 return p; 162 return p;
137} 163}
138EXPORT_SYMBOL(strndup_user); 164EXPORT_SYMBOL(strndup_user);
165
166#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
167void arch_pick_mmap_layout(struct mm_struct *mm)
168{
169 mm->mmap_base = TASK_UNMAPPED_BASE;
170 mm->get_unmapped_area = arch_get_unmapped_area;
171 mm->unmap_area = arch_unmap_area;
172}
173#endif
174
175int __attribute__((weak)) get_user_pages_fast(unsigned long start,
176 int nr_pages, int write, struct page **pages)
177{
178 struct mm_struct *mm = current->mm;
179 int ret;
180
181 down_read(&mm->mmap_sem);
182 ret = get_user_pages(current, mm, start, nr_pages,
183 write, 0, pages, NULL);
184 up_read(&mm->mmap_sem);
185
186 return ret;
187}
188EXPORT_SYMBOL_GPL(get_user_pages_fast);
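
krealloc() is split so that the copying core, __krealloc(), never frees the original buffer, letting callers defer the free (for example until after an RCU grace period), while krealloc() itself frees the old buffer only when a different pointer comes back. A userspace analogue using plain malloc()/memcpy(); the function names and the caller-supplied old size are illustrative, not kernel API.

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

/* grow to new_size without freeing @p; old_size is tracked by the caller */
static void *grow_nofree(const void *p, size_t old_size, size_t new_size)
{
        void *ret;

        if (!new_size)
                return NULL;

        ret = malloc(new_size);
        if (ret && p)
                memcpy(ret, p, old_size < new_size ? old_size : new_size);
        return ret;
}

static void *grow(void *p, size_t old_size, size_t new_size)
{
        void *ret;

        if (!new_size) {
                free(p);
                return NULL;
        }

        ret = grow_nofree(p, old_size, new_size);
        if (ret && ret != p)
                free(p);        /* only drop the old buffer on success */
        return ret;
}

int main(void)
{
        char *buf = grow(NULL, 0, 8);

        if (!buf)
                return 1;
        strcpy(buf, "seven!!");
        buf = grow(buf, 8, 64);
        printf("%s\n", buf);
        free(buf);
        return 0;
}
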
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 830a5580c5d7..bba06c41fc59 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -388,16 +388,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
388 return; 388 return;
389 389
390 if ((PAGE_SIZE-1) & (unsigned long)addr) { 390 if ((PAGE_SIZE-1) & (unsigned long)addr) {
391 printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); 391 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
392 WARN_ON(1);
393 return; 392 return;
394 } 393 }
395 394
396 area = remove_vm_area(addr); 395 area = remove_vm_area(addr);
397 if (unlikely(!area)) { 396 if (unlikely(!area)) {
398 printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 397 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
399 addr); 398 addr);
400 WARN_ON(1);
401 return; 399 return;
402 } 400 }
403 401
@@ -938,6 +936,25 @@ static void s_stop(struct seq_file *m, void *p)
938 read_unlock(&vmlist_lock); 936 read_unlock(&vmlist_lock);
939} 937}
940 938
939static void show_numa_info(struct seq_file *m, struct vm_struct *v)
940{
941 if (NUMA_BUILD) {
942 unsigned int nr, *counters = m->private;
943
944 if (!counters)
945 return;
946
947 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
948
949 for (nr = 0; nr < v->nr_pages; nr++)
950 counters[page_to_nid(v->pages[nr])]++;
951
952 for_each_node_state(nr, N_HIGH_MEMORY)
953 if (counters[nr])
954 seq_printf(m, " N%u=%u", nr, counters[nr]);
955 }
956}
957
941static int s_show(struct seq_file *m, void *p) 958static int s_show(struct seq_file *m, void *p)
942{ 959{
943 struct vm_struct *v = p; 960 struct vm_struct *v = p;
@@ -974,6 +991,7 @@ static int s_show(struct seq_file *m, void *p)
974 if (v->flags & VM_VPAGES) 991 if (v->flags & VM_VPAGES)
975 seq_printf(m, " vpages"); 992 seq_printf(m, " vpages");
976 993
994 show_numa_info(m, v);
977 seq_putc(m, '\n'); 995 seq_putc(m, '\n');
978 return 0; 996 return 0;
979} 997}
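
show_numa_info() extends each /proc/vmallocinfo line with per-node page counts: zero one counter per node, bump the bucket for every page in the area, then print only the non-empty buckets as " N<node>=<count>". A trivial standalone sketch with a faked node lookup in place of page_to_nid():

#include <stdio.h>

#define SKETCH_NR_NODES 4

static int fake_page_to_nid(unsigned int pfn)
{
        return pfn % SKETCH_NR_NODES;   /* pretend pages alternate nodes */
}

int main(void)
{
        unsigned int counters[SKETCH_NR_NODES] = { 0 };
        unsigned int nr, nr_pages = 10;

        for (nr = 0; nr < nr_pages; nr++)
                counters[fake_page_to_nid(nr)]++;

        for (nr = 0; nr < SKETCH_NR_NODES; nr++)
                if (counters[nr])
                        printf(" N%u=%u", nr, counters[nr]);
        printf("\n");
        return 0;
}
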
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 967d30ccd92b..1ff1a58e7c10 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -38,6 +38,7 @@
38#include <linux/kthread.h> 38#include <linux/kthread.h>
39#include <linux/freezer.h> 39#include <linux/freezer.h>
40#include <linux/memcontrol.h> 40#include <linux/memcontrol.h>
41#include <linux/delayacct.h>
41 42
42#include <asm/tlbflush.h> 43#include <asm/tlbflush.h>
43#include <asm/div64.h> 44#include <asm/div64.h>
@@ -390,17 +391,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
390} 391}
391 392
392/* 393/*
393 * Attempt to detach a locked page from its ->mapping. If it is dirty or if 394 * Same as remove_mapping, but if the page is removed from the mapping, it
394 * someone else has a ref on the page, abort and return 0. If it was 395 * gets returned with a refcount of 0.
395 * successfully detached, return 1. Assumes the caller has a single ref on
396 * this page.
397 */ 396 */
398int remove_mapping(struct address_space *mapping, struct page *page) 397static int __remove_mapping(struct address_space *mapping, struct page *page)
399{ 398{
400 BUG_ON(!PageLocked(page)); 399 BUG_ON(!PageLocked(page));
401 BUG_ON(mapping != page_mapping(page)); 400 BUG_ON(mapping != page_mapping(page));
402 401
403 write_lock_irq(&mapping->tree_lock); 402 spin_lock_irq(&mapping->tree_lock);
404 /* 403 /*
405 * The non racy check for a busy page. 404 * The non racy check for a busy page.
406 * 405 *
@@ -426,28 +425,48 @@ int remove_mapping(struct address_space *mapping, struct page *page)
426 * Note that if SetPageDirty is always performed via set_page_dirty, 425 * Note that if SetPageDirty is always performed via set_page_dirty,
427 * and thus under tree_lock, then this ordering is not required. 426 * and thus under tree_lock, then this ordering is not required.
428 */ 427 */
429 if (unlikely(page_count(page) != 2)) 428 if (!page_freeze_refs(page, 2))
430 goto cannot_free; 429 goto cannot_free;
431 smp_rmb(); 430 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
432 if (unlikely(PageDirty(page))) 431 if (unlikely(PageDirty(page))) {
432 page_unfreeze_refs(page, 2);
433 goto cannot_free; 433 goto cannot_free;
434 }
434 435
435 if (PageSwapCache(page)) { 436 if (PageSwapCache(page)) {
436 swp_entry_t swap = { .val = page_private(page) }; 437 swp_entry_t swap = { .val = page_private(page) };
437 __delete_from_swap_cache(page); 438 __delete_from_swap_cache(page);
438 write_unlock_irq(&mapping->tree_lock); 439 spin_unlock_irq(&mapping->tree_lock);
439 swap_free(swap); 440 swap_free(swap);
440 __put_page(page); /* The pagecache ref */ 441 } else {
441 return 1; 442 __remove_from_page_cache(page);
443 spin_unlock_irq(&mapping->tree_lock);
442 } 444 }
443 445
444 __remove_from_page_cache(page);
445 write_unlock_irq(&mapping->tree_lock);
446 __put_page(page);
447 return 1; 446 return 1;
448 447
449cannot_free: 448cannot_free:
450 write_unlock_irq(&mapping->tree_lock); 449 spin_unlock_irq(&mapping->tree_lock);
450 return 0;
451}
452
453/*
454 * Attempt to detach a locked page from its ->mapping. If it is dirty or if
455 * someone else has a ref on the page, abort and return 0. If it was
456 * successfully detached, return 1. Assumes the caller has a single ref on
457 * this page.
458 */
459int remove_mapping(struct address_space *mapping, struct page *page)
460{
461 if (__remove_mapping(mapping, page)) {
462 /*
463 * Unfreezing the refcount with 1 rather than 2 effectively
464 * drops the pagecache ref for us without requiring another
465 * atomic operation.
466 */
467 page_unfreeze_refs(page, 1);
468 return 1;
469 }
451 return 0; 470 return 0;
452} 471}
453 472
@@ -477,7 +496,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
477 page = lru_to_page(page_list); 496 page = lru_to_page(page_list);
478 list_del(&page->lru); 497 list_del(&page->lru);
479 498
480 if (TestSetPageLocked(page)) 499 if (!trylock_page(page))
481 goto keep; 500 goto keep;
482 501
483 VM_BUG_ON(PageActive(page)); 502 VM_BUG_ON(PageActive(page));
@@ -563,7 +582,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
563 * A synchronous write - probably a ramdisk. Go 582 * A synchronous write - probably a ramdisk. Go
564 * ahead and try to reclaim the page. 583 * ahead and try to reclaim the page.
565 */ 584 */
566 if (TestSetPageLocked(page)) 585 if (!trylock_page(page))
567 goto keep; 586 goto keep;
568 if (PageDirty(page) || PageWriteback(page)) 587 if (PageDirty(page) || PageWriteback(page))
569 goto keep_locked; 588 goto keep_locked;
@@ -597,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list,
597 if (PagePrivate(page)) { 616 if (PagePrivate(page)) {
598 if (!try_to_release_page(page, sc->gfp_mask)) 617 if (!try_to_release_page(page, sc->gfp_mask))
599 goto activate_locked; 618 goto activate_locked;
600 if (!mapping && page_count(page) == 1) 619 if (!mapping && page_count(page) == 1) {
601 goto free_it; 620 unlock_page(page);
621 if (put_page_testzero(page))
622 goto free_it;
623 else {
624 /*
625 * rare race with speculative reference.
626 * the speculative reference will free
627 * this page shortly, so we may
628 * increment nr_reclaimed here (and
629 * leave it off the LRU).
630 */
631 nr_reclaimed++;
632 continue;
633 }
634 }
602 } 635 }
603 636
604 if (!mapping || !remove_mapping(mapping, page)) 637 if (!mapping || !__remove_mapping(mapping, page))
605 goto keep_locked; 638 goto keep_locked;
606 639
607free_it:
608 unlock_page(page); 640 unlock_page(page);
641free_it:
609 nr_reclaimed++; 642 nr_reclaimed++;
610 if (!pagevec_add(&freed_pvec, page)) 643 if (!pagevec_add(&freed_pvec, page)) {
611 __pagevec_release_nonlru(&freed_pvec); 644 __pagevec_free(&freed_pvec);
645 pagevec_reinit(&freed_pvec);
646 }
612 continue; 647 continue;
613 648
614activate_locked: 649activate_locked:
@@ -622,7 +657,7 @@ keep:
622 } 657 }
623 list_splice(&ret_pages, page_list); 658 list_splice(&ret_pages, page_list);
624 if (pagevec_count(&freed_pvec)) 659 if (pagevec_count(&freed_pvec))
625 __pagevec_release_nonlru(&freed_pvec); 660 __pagevec_free(&freed_pvec);
626 count_vm_events(PGACTIVATE, pgactivate); 661 count_vm_events(PGACTIVATE, pgactivate);
627 return nr_reclaimed; 662 return nr_reclaimed;
628} 663}
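Two consequences of the speculative-reference scheme show up in the free-path changes above. First, a page detached by __remove_mapping() comes back with its refcount frozen at zero, so the batch of freed pages is handed to __pagevec_free() instead of having a reference dropped through __pagevec_release_nonlru(). Second, the !mapping case must tolerate a racing speculative reference: the page is unlocked first and only freed if put_page_testzero() confirms ours was the last reference; otherwise the other holder frees it shortly, and reclaim just counts the page and moves on. A hedged user-space model of that last-reference decision (fake_page and put_testzero() are made up for the example):

/* last_ref.c - model of the put_page_testzero() decision in the free path. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_page { atomic_int refcount; };

/* Drop one reference and report whether it was the last one. */
static bool put_testzero(struct fake_page *p)
{
    return atomic_fetch_sub(&p->refcount, 1) == 1;
}

int main(void)
{
    struct fake_page page;
    int nr_reclaimed = 0;

    atomic_init(&page.refcount, 1);    /* only our reference... usually */

    /* the page would be unlocked here, before dropping our reference */
    if (put_testzero(&page)) {
        nr_reclaimed++;                /* we freed it ourselves */
        puts("freed the page");
    } else {
        /* a speculative reference won the race; its holder frees the
         * page shortly, so count it as reclaimed and leave it alone */
        nr_reclaimed++;
        puts("lost the race; the other holder will free it");
    }
    printf("nr_reclaimed = %d\n", nr_reclaimed);
    return 0;
}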
@@ -1316,6 +1351,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1316 struct zone *zone; 1351 struct zone *zone;
1317 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1352 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1318 1353
1354 delayacct_freepages_start();
1355
1319 if (scan_global_lru(sc)) 1356 if (scan_global_lru(sc))
1320 count_vm_event(ALLOCSTALL); 1357 count_vm_event(ALLOCSTALL);
1321 /* 1358 /*
@@ -1371,7 +1408,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1371 if (sc->nr_scanned && priority < DEF_PRIORITY - 2) 1408 if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
1372 congestion_wait(WRITE, HZ/10); 1409 congestion_wait(WRITE, HZ/10);
1373 } 1410 }
1374 /* top priority shrink_caches still had more to do? don't OOM, then */ 1411 /* top priority shrink_zones still had more to do? don't OOM, then */
1375 if (!sc->all_unreclaimable && scan_global_lru(sc)) 1412 if (!sc->all_unreclaimable && scan_global_lru(sc))
1376 ret = nr_reclaimed; 1413 ret = nr_reclaimed;
1377out: 1414out:
@@ -1396,6 +1433,8 @@ out:
1396 } else 1433 } else
1397 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); 1434 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1398 1435
1436 delayacct_freepages_end();
1437
1399 return ret; 1438 return ret;
1400} 1439}
1401 1440
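delayacct_freepages_start() and delayacct_freepages_end() simply bracket the whole direct-reclaim pass so the time a task spends here can be charged to its delay accounting. The shape is "timestamp on entry, accumulate the delta on exit"; a rough user-space analogue of that bracketing (the real delayacct code records the result per task, which this sketch does not attempt):

/* reclaim_delay.c - bracket a slow path with start/end timestamps. */
#include <stdio.h>
#include <time.h>

static struct timespec reclaim_start;
static long long reclaim_delay_ns;    /* accumulated across calls */

static void freepages_start(void)
{
    clock_gettime(CLOCK_MONOTONIC, &reclaim_start);
}

static void freepages_end(void)
{
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC, &now);
    reclaim_delay_ns += (now.tv_sec - reclaim_start.tv_sec) * 1000000000LL
                      + (now.tv_nsec - reclaim_start.tv_nsec);
}

static int try_to_free_pages_model(void)
{
    freepages_start();
    /* ... the actual reclaim work would run here ... */
    freepages_end();
    return 0;
}

int main(void)
{
    try_to_free_pages_model();
    printf("time charged to reclaim: %lld ns\n", reclaim_delay_ns);
    return 0;
}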
@@ -1940,7 +1979,7 @@ module_init(kswapd_init)
1940int zone_reclaim_mode __read_mostly; 1979int zone_reclaim_mode __read_mostly;
1941 1980
1942#define RECLAIM_OFF 0 1981#define RECLAIM_OFF 0
1943#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ 1982#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
1944#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 1983#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
1945#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 1984#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
1946 1985
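zone_reclaim_mode is an ordinary bitmask set via sysctl; the comment updates above only change which function names the bits are described as invoking. Testing the bits looks like this (a stand-alone restatement of the flag values from the hunk, not the kernel code itself):

/* reclaim_mode.c - how the RECLAIM_* bits of zone_reclaim_mode combine. */
#include <stdio.h>

#define RECLAIM_OFF     0
#define RECLAIM_ZONE    (1 << 0)    /* run reclaim on the zone */
#define RECLAIM_WRITE   (1 << 1)    /* write out pages during reclaim */
#define RECLAIM_SWAP    (1 << 2)    /* swap pages out during reclaim */

int main(void)
{
    int zone_reclaim_mode = RECLAIM_ZONE | RECLAIM_WRITE;    /* e.g. sysctl value 3 */

    if (zone_reclaim_mode == RECLAIM_OFF)
        puts("zone reclaim disabled");
    if (zone_reclaim_mode & RECLAIM_ZONE)
        puts("reclaim pages from the local zone first");
    if (zone_reclaim_mode & RECLAIM_WRITE)
        puts("dirty pages may be written out during zone reclaim");
    if (zone_reclaim_mode & RECLAIM_SWAP)
        puts("mapped pages may be swapped out during zone reclaim");
    return 0;
}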
diff --git a/mm/vmstat.c b/mm/vmstat.c
index db9eabb2c5b3..d7826af2fb07 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -13,6 +13,7 @@
13#include <linux/err.h> 13#include <linux/err.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/vmstat.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17 18
18#ifdef CONFIG_VM_EVENT_COUNTERS 19#ifdef CONFIG_VM_EVENT_COUNTERS
@@ -26,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
26 27
27 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); 28 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
28 29
29 for_each_cpu_mask(cpu, *cpumask) { 30 for_each_cpu_mask_nr(cpu, *cpumask) {
30 struct vm_event_state *this = &per_cpu(vm_event_states, cpu); 31 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
31 32
32 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 33 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
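sum_vm_events() folds every listed CPU's private event counters into a single result array; the change above only switches to the nr-bounded iterator so the walk stops at the highest possible CPU id rather than scanning all NR_CPUS slots. A small stand-alone model of that summation (the array sizes, the bitmask type and sum_events() itself are invented for the example):

/* sum_events.c - model of summing per-CPU event counters over a CPU mask. */
#include <stdio.h>
#include <string.h>

#define NR_EVENT_ITEMS  4
#define NR_CPUS         8    /* compile-time maximum */

static unsigned long per_cpu_events[NR_CPUS][NR_EVENT_ITEMS];

/* Sum only the CPUs set in 'online_mask', stopping at nr_cpu_ids rather
 * than walking all NR_CPUS slots - the point of the _nr iterator. */
static void sum_events(unsigned long *ret, unsigned long online_mask,
                       int nr_cpu_ids)
{
    memset(ret, 0, NR_EVENT_ITEMS * sizeof(unsigned long));

    for (int cpu = 0; cpu < nr_cpu_ids; cpu++) {
        if (!(online_mask & (1UL << cpu)))
            continue;
        for (int i = 0; i < NR_EVENT_ITEMS; i++)
            ret[i] += per_cpu_events[cpu][i];
    }
}

int main(void)
{
    unsigned long totals[NR_EVENT_ITEMS];

    per_cpu_events[0][1] = 3;
    per_cpu_events[2][1] = 4;

    sum_events(totals, 0x5 /* CPUs 0 and 2 */, 4 /* nr_cpu_ids */);
    printf("event 1 total: %lu\n", totals[1]);    /* prints 7 */
    return 0;
}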
@@ -515,9 +516,26 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
515 continue; 516 continue;
516 517
517 page = pfn_to_page(pfn); 518 page = pfn_to_page(pfn);
519#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES
520 /*
521 * Ordinarily, memory holes in flatmem still have a valid
522 * memmap for the PFN range. However, an architecture for
523 * embedded systems (e.g. ARM) can free up the memmap backing
524 * holes to save memory on the assumption the memmap is
525 * never used. The page_zone linkages are then broken even
526 * though pfn_valid() returns true. Skip the page if the
527 * linkages are broken. Even if this test passed, the impact
528 * is that the counters for the movable type are off but
529 * fragmentation monitoring is likely meaningless on small
530 * systems.
531 */
532 if (page_zone(page) != zone)
533 continue;
534#endif
518 mtype = get_pageblock_migratetype(page); 535 mtype = get_pageblock_migratetype(page);
519 536
520 count[mtype]++; 537 if (mtype < MIGRATE_TYPES)
538 count[mtype]++;
521 } 539 }
522 540
523 /* Print counts */ 541 /* Print counts */
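The last hunk hardens the pageblock walk in two ways: on flatmem configurations that free the memmap backing memory holes, a pfn can pass pfn_valid() yet reference a struct page whose zone linkage is stale, so such pages are skipped; and the migratetype read from the pageblock bitmap is range-checked before being used as an array index. A simplified sketch of that loop shape, with made-up types standing in for struct page and the zone (none of this is the real vmstat code):

/* blockcount.c - count pageblocks per migrate type, with the two guards
 * added by the patch: skip pages whose zone linkage is broken and bound
 * the migratetype before indexing the counters. */
#include <stdio.h>

#define MIGRATE_TYPES    5
#define PAGEBLOCK_PAGES  1024    /* walk stride, invented here */

struct fake_page { int zone_id; int migratetype; };

static void count_blocks(const struct fake_page *memmap, int zone_id,
                         unsigned long start_pfn, unsigned long end_pfn,
                         unsigned long *count)
{
    for (unsigned long pfn = start_pfn; pfn < end_pfn;
         pfn += PAGEBLOCK_PAGES) {
        const struct fake_page *page = &memmap[pfn / PAGEBLOCK_PAGES];

        /* guard 1: a hole whose memmap was freed may still look
         * valid but no longer point back at this zone - skip it */
        if (page->zone_id != zone_id)
            continue;

        /* guard 2: never index past the counter array */
        if (page->migratetype < MIGRATE_TYPES)
            count[page->migratetype]++;
    }
}

int main(void)
{
    struct fake_page memmap[4] = {
        { .zone_id = 0, .migratetype = 1 },
        { .zone_id = 1, .migratetype = 1 },    /* stale linkage */
        { .zone_id = 0, .migratetype = 7 },    /* bogus type */
        { .zone_id = 0, .migratetype = 3 },
    };
    unsigned long count[MIGRATE_TYPES] = { 0 };

    count_blocks(memmap, 0, 0, 4UL * PAGEBLOCK_PAGES, count);
    printf("type 1: %lu, type 3: %lu\n", count[1], count[3]);    /* 1, 1 */
    return 0;
}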