author    Ingo Molnar <mingo@elte.hu>  2008-08-14 06:19:59 -0400
committer Ingo Molnar <mingo@elte.hu>  2008-08-14 06:19:59 -0400
commit    8d7ccaa545490cdffdfaff0842436a8dd85cf47b (patch)
tree      8129b5907161bc6ae26deb3645ce1e280c5e1f51 /mm
parent    b2139aa0eec330c711c5a279db361e5ef1178e78 (diff)
parent    30a2f3c60a84092c8084dfe788b710f8d0768cd4 (diff)

Merge commit 'v2.6.27-rc3' into x86/prototypes

Conflicts:
	include/asm-x86/dma-mapping.h

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |    5
-rw-r--r--  mm/Makefile           |    3
-rw-r--r--  mm/allocpercpu.c      |   24
-rw-r--r--  mm/bootmem.c          |  935
-rw-r--r--  mm/filemap.c          |  422
-rw-r--r--  mm/filemap_xip.c      |    5
-rw-r--r--  mm/fremap.c           |    3
-rw-r--r--  mm/hugetlb.c          | 1681
-rw-r--r--  mm/internal.h         |   61
-rw-r--r--  mm/madvise.c          |    4
-rw-r--r--  mm/memcontrol.c       |  369
-rw-r--r--  mm/memory.c           |  322
-rw-r--r--  mm/memory_hotplug.c   |   80
-rw-r--r--  mm/mempolicy.c        |   10
-rw-r--r--  mm/migrate.c          |   53
-rw-r--r--  mm/mlock.c            |    2
-rw-r--r--  mm/mm_init.c          |  152
-rw-r--r--  mm/mmap.c             |  180
-rw-r--r--  mm/mmu_notifier.c     |  277
-rw-r--r--  mm/mprotect.c         |    9
-rw-r--r--  mm/mremap.c           |    6
-rw-r--r--  mm/nommu.c            |   25
-rw-r--r--  mm/page-writeback.c   |   12
-rw-r--r--  mm/page_alloc.c       |  175
-rw-r--r--  mm/pdflush.c          |    4
-rw-r--r--  mm/readahead.c        |    6
-rw-r--r--  mm/rmap.c             |   34
-rw-r--r--  mm/shmem.c            |  106
-rw-r--r--  mm/shmem_acl.c        |    2
-rw-r--r--  mm/slab.c             |   12
-rw-r--r--  mm/slob.c             |   20
-rw-r--r--  mm/slub.c             |  105
-rw-r--r--  mm/sparse.c           |  116
-rw-r--r--  mm/swap.c             |   17
-rw-r--r--  mm/swap_state.c       |   38
-rw-r--r--  mm/swapfile.c         |   65
-rw-r--r--  mm/truncate.c         |   12
-rw-r--r--  mm/util.c             |   70
-rw-r--r--  mm/vmalloc.c          |   26
-rw-r--r--  mm/vmscan.c           |   93
-rw-r--r--  mm/vmstat.c           |    3
41 files changed, 3918 insertions(+), 1626 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index c4de85285bb4..0bd9c2dbb2a0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -174,7 +174,7 @@ config SPLIT_PTLOCK_CPUS
 config MIGRATION
 	bool "Page migration"
 	def_bool y
-	depends on NUMA
+	depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
 	help
 	  Allows the migration of the physical location of pages of processes
 	  while the virtual addresses are not changed. This is useful for
@@ -205,3 +205,6 @@ config NR_QUICK
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
+
+config MMU_NOTIFIER
+	bool
diff --git a/mm/Makefile b/mm/Makefile
index 18c143b3c46c..da4ccf015aea 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-			   page_isolation.o $(mmu-y)
+			   page_isolation.o mm_init.o $(mmu-y)
 
 obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE) += bounce.o
@@ -25,6 +25,7 @@ obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 05f2b4009ccc..4297bc41bfd2 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -18,27 +18,28 @@
  * Depopulating per-cpu data for a cpu going offline would be a typical
  * use case. You need to register a cpu hotplug handler for that purpose.
  */
-void percpu_depopulate(void *__pdata, int cpu)
+static void percpu_depopulate(void *__pdata, int cpu)
 {
 	struct percpu_data *pdata = __percpu_disguise(__pdata);
 
 	kfree(pdata->ptrs[cpu]);
 	pdata->ptrs[cpu] = NULL;
 }
-EXPORT_SYMBOL_GPL(percpu_depopulate);
 
 /**
  * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
  * @__pdata: per-cpu data to depopulate
  * @mask: depopulate per-cpu data for cpu's selected through mask bits
  */
-void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
+static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
 {
 	int cpu;
-	for_each_cpu_mask(cpu, *mask)
+	for_each_cpu_mask_nr(cpu, *mask)
 		percpu_depopulate(__pdata, cpu);
 }
-EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
+
+#define percpu_depopulate_mask(__pdata, mask) \
+	__percpu_depopulate_mask((__pdata), &(mask))
 
 /**
  * percpu_populate - populate per-cpu data for given cpu
@@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
  * use case. You need to register a cpu hotplug handler for that purpose.
  * Per-cpu object is populated with zeroed buffer.
  */
-void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
+static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
 {
 	struct percpu_data *pdata = __percpu_disguise(__pdata);
 	int node = cpu_to_node(cpu);
@@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
 	pdata->ptrs[cpu] = kzalloc(size, gfp);
 	return pdata->ptrs[cpu];
 }
-EXPORT_SYMBOL_GPL(percpu_populate);
 
 /**
  * percpu_populate_mask - populate per-cpu data for more cpu's
@@ -79,14 +79,14 @@ EXPORT_SYMBOL_GPL(percpu_populate);
  *
  * Per-cpu objects are populated with zeroed buffers.
  */
-int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
+static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
 			cpumask_t *mask)
 {
 	cpumask_t populated;
 	int cpu;
 
 	cpus_clear(populated);
-	for_each_cpu_mask(cpu, *mask)
+	for_each_cpu_mask_nr(cpu, *mask)
 		if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
 			__percpu_depopulate_mask(__pdata, &populated);
 			return -ENOMEM;
@@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
 			cpu_set(cpu, populated);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(__percpu_populate_mask);
+
+#define percpu_populate_mask(__pdata, size, gfp, mask) \
+	__percpu_populate_mask((__pdata), (size), (gfp), &(mask))
 
 /**
  * percpu_alloc_mask - initial setup of per-cpu data
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 8d9f60e06f62..4af15d0340ad 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * linux/mm/bootmem.c 2 * bootmem - A boot-time physical memory allocator and configurator
3 * 3 *
4 * Copyright (C) 1999 Ingo Molnar 4 * Copyright (C) 1999 Ingo Molnar
5 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 5 * 1999 Kanoj Sarcar, SGI
6 * 2008 Johannes Weiner
6 * 7 *
7 * simple boot-time physical memory area allocator and 8 * Access to this subsystem has to be serialized externally (which is true
8 * free memory collector. It's used to deal with reserved 9 * for the boot process anyway).
9 * system memory and memory holes as well.
10 */ 10 */
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/pfn.h> 12#include <linux/pfn.h>
@@ -19,15 +19,10 @@
19 19
20#include "internal.h" 20#include "internal.h"
21 21
22/*
23 * Access to this subsystem has to be serialized externally. (this is
24 * true for the boot process anyway)
25 */
26unsigned long max_low_pfn; 22unsigned long max_low_pfn;
27unsigned long min_low_pfn; 23unsigned long min_low_pfn;
28unsigned long max_pfn; 24unsigned long max_pfn;
29 25
30static LIST_HEAD(bdata_list);
31#ifdef CONFIG_CRASH_DUMP 26#ifdef CONFIG_CRASH_DUMP
32/* 27/*
33 * If we have booted due to a crash, max_pfn will be a very low value. We need 28 * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -36,63 +31,72 @@ static LIST_HEAD(bdata_list);
36unsigned long saved_max_pfn; 31unsigned long saved_max_pfn;
37#endif 32#endif
38 33
39/* return the number of _pages_ that will be allocated for the boot bitmap */ 34bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
40unsigned long __init bootmem_bootmap_pages(unsigned long pages) 35
36static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
37
38static int bootmem_debug;
39
40static int __init bootmem_debug_setup(char *buf)
41{ 41{
42 unsigned long mapsize; 42 bootmem_debug = 1;
43 return 0;
44}
45early_param("bootmem_debug", bootmem_debug_setup);
43 46
44 mapsize = (pages+7)/8; 47#define bdebug(fmt, args...) ({ \
45 mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; 48 if (unlikely(bootmem_debug)) \
46 mapsize >>= PAGE_SHIFT; 49 printk(KERN_INFO \
50 "bootmem::%s " fmt, \
51 __FUNCTION__, ## args); \
52})
47 53
48 return mapsize; 54static unsigned long __init bootmap_bytes(unsigned long pages)
55{
56 unsigned long bytes = (pages + 7) / 8;
57
58 return ALIGN(bytes, sizeof(long));
49} 59}
50 60
51/* 61/**
52 * link bdata in order 62 * bootmem_bootmap_pages - calculate bitmap size in pages
63 * @pages: number of pages the bitmap has to represent
53 */ 64 */
54static void __init link_bootmem(bootmem_data_t *bdata) 65unsigned long __init bootmem_bootmap_pages(unsigned long pages)
55{ 66{
56 bootmem_data_t *ent; 67 unsigned long bytes = bootmap_bytes(pages);
57 68
58 if (list_empty(&bdata_list)) { 69 return PAGE_ALIGN(bytes) >> PAGE_SHIFT;
59 list_add(&bdata->list, &bdata_list);
60 return;
61 }
62 /* insert in order */
63 list_for_each_entry(ent, &bdata_list, list) {
64 if (bdata->node_boot_start < ent->node_boot_start) {
65 list_add_tail(&bdata->list, &ent->list);
66 return;
67 }
68 }
69 list_add_tail(&bdata->list, &bdata_list);
70} 70}
71 71
72/* 72/*
73 * Given an initialised bdata, it returns the size of the boot bitmap 73 * link bdata in order
74 */ 74 */
75static unsigned long __init get_mapsize(bootmem_data_t *bdata) 75static void __init link_bootmem(bootmem_data_t *bdata)
76{ 76{
77 unsigned long mapsize; 77 struct list_head *iter;
78 unsigned long start = PFN_DOWN(bdata->node_boot_start);
79 unsigned long end = bdata->node_low_pfn;
80 78
81 mapsize = ((end - start) + 7) / 8; 79 list_for_each(iter, &bdata_list) {
82 return ALIGN(mapsize, sizeof(long)); 80 bootmem_data_t *ent;
81
82 ent = list_entry(iter, bootmem_data_t, list);
83 if (bdata->node_min_pfn < ent->node_min_pfn)
84 break;
85 }
86 list_add_tail(&bdata->list, iter);
83} 87}
84 88
85/* 89/*
86 * Called once to set up the allocator itself. 90 * Called once to set up the allocator itself.
87 */ 91 */
88static unsigned long __init init_bootmem_core(pg_data_t *pgdat, 92static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
89 unsigned long mapstart, unsigned long start, unsigned long end) 93 unsigned long mapstart, unsigned long start, unsigned long end)
90{ 94{
91 bootmem_data_t *bdata = pgdat->bdata;
92 unsigned long mapsize; 95 unsigned long mapsize;
93 96
97 mminit_validate_memmodel_limits(&start, &end);
94 bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); 98 bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
95 bdata->node_boot_start = PFN_PHYS(start); 99 bdata->node_min_pfn = start;
96 bdata->node_low_pfn = end; 100 bdata->node_low_pfn = end;
97 link_bootmem(bdata); 101 link_bootmem(bdata);
98 102
@@ -100,429 +104,461 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
100 * Initially all pages are reserved - setup_arch() has to 104 * Initially all pages are reserved - setup_arch() has to
101 * register free RAM areas explicitly. 105 * register free RAM areas explicitly.
102 */ 106 */
103 mapsize = get_mapsize(bdata); 107 mapsize = bootmap_bytes(end - start);
104 memset(bdata->node_bootmem_map, 0xff, mapsize); 108 memset(bdata->node_bootmem_map, 0xff, mapsize);
105 109
110 bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
111 bdata - bootmem_node_data, start, mapstart, end, mapsize);
112
106 return mapsize; 113 return mapsize;
107} 114}
108 115
109/* 116/**
110 * Marks a particular physical memory range as unallocatable. Usable RAM 117 * init_bootmem_node - register a node as boot memory
111 * might be used for boot-time allocations - or it might get added 118 * @pgdat: node to register
112 * to the free page pool later on. 119 * @freepfn: pfn where the bitmap for this node is to be placed
120 * @startpfn: first pfn on the node
121 * @endpfn: first pfn after the node
122 *
123 * Returns the number of bytes needed to hold the bitmap for this node.
113 */ 124 */
114static int __init can_reserve_bootmem_core(bootmem_data_t *bdata, 125unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
115 unsigned long addr, unsigned long size, int flags) 126 unsigned long startpfn, unsigned long endpfn)
116{ 127{
117 unsigned long sidx, eidx; 128 return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
118 unsigned long i; 129}
119 130
120 BUG_ON(!size); 131/**
132 * init_bootmem - register boot memory
133 * @start: pfn where the bitmap is to be placed
134 * @pages: number of available physical pages
135 *
136 * Returns the number of bytes needed to hold the bitmap.
137 */
138unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
139{
140 max_low_pfn = pages;
141 min_low_pfn = start;
142 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
143}
121 144
122 /* out of range, don't hold other */ 145static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
123 if (addr + size < bdata->node_boot_start || 146{
124 PFN_DOWN(addr) > bdata->node_low_pfn) 147 int aligned;
148 struct page *page;
149 unsigned long start, end, pages, count = 0;
150
151 if (!bdata->node_bootmem_map)
125 return 0; 152 return 0;
126 153
154 start = bdata->node_min_pfn;
155 end = bdata->node_low_pfn;
156
127 /* 157 /*
128 * Round up to index to the range. 158 * If the start is aligned to the machines wordsize, we might
159 * be able to free pages in bulks of that order.
129 */ 160 */
130 if (addr > bdata->node_boot_start) 161 aligned = !(start & (BITS_PER_LONG - 1));
131 sidx= PFN_DOWN(addr - bdata->node_boot_start);
132 else
133 sidx = 0;
134 162
135 eidx = PFN_UP(addr + size - bdata->node_boot_start); 163 bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
136 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 164 bdata - bootmem_node_data, start, end, aligned);
137 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
138 165
139 for (i = sidx; i < eidx; i++) { 166 while (start < end) {
140 if (test_bit(i, bdata->node_bootmem_map)) { 167 unsigned long *map, idx, vec;
141 if (flags & BOOTMEM_EXCLUSIVE)
142 return -EBUSY;
143 }
144 }
145 168
146 return 0; 169 map = bdata->node_bootmem_map;
170 idx = start - bdata->node_min_pfn;
171 vec = ~map[idx / BITS_PER_LONG];
147 172
148} 173 if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
174 int order = ilog2(BITS_PER_LONG);
149 175
150static void __init reserve_bootmem_core(bootmem_data_t *bdata, 176 __free_pages_bootmem(pfn_to_page(start), order);
151 unsigned long addr, unsigned long size, int flags) 177 count += BITS_PER_LONG;
152{ 178 } else {
153 unsigned long sidx, eidx; 179 unsigned long off = 0;
154 unsigned long i;
155
156 BUG_ON(!size);
157 180
158 /* out of range */ 181 while (vec && off < BITS_PER_LONG) {
159 if (addr + size < bdata->node_boot_start || 182 if (vec & 1) {
160 PFN_DOWN(addr) > bdata->node_low_pfn) 183 page = pfn_to_page(start + off);
161 return; 184 __free_pages_bootmem(page, 0);
185 count++;
186 }
187 vec >>= 1;
188 off++;
189 }
190 }
191 start += BITS_PER_LONG;
192 }
162 193
163 /* 194 page = virt_to_page(bdata->node_bootmem_map);
164 * Round up to index to the range. 195 pages = bdata->node_low_pfn - bdata->node_min_pfn;
165 */ 196 pages = bootmem_bootmap_pages(pages);
166 if (addr > bdata->node_boot_start) 197 count += pages;
167 sidx= PFN_DOWN(addr - bdata->node_boot_start); 198 while (pages--)
168 else 199 __free_pages_bootmem(page++, 0);
169 sidx = 0;
170 200
171 eidx = PFN_UP(addr + size - bdata->node_boot_start); 201 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
172 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start))
173 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
174 202
175 for (i = sidx; i < eidx; i++) { 203 return count;
176 if (test_and_set_bit(i, bdata->node_bootmem_map)) {
177#ifdef CONFIG_DEBUG_BOOTMEM
178 printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
179#endif
180 }
181 }
182} 204}
183 205
184static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, 206/**
185 unsigned long size) 207 * free_all_bootmem_node - release a node's free pages to the buddy allocator
208 * @pgdat: node to be released
209 *
210 * Returns the number of pages actually released.
211 */
212unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
186{ 213{
187 unsigned long sidx, eidx; 214 register_page_bootmem_info_node(pgdat);
188 unsigned long i; 215 return free_all_bootmem_core(pgdat->bdata);
189 216}
190 BUG_ON(!size);
191 217
192 /* out range */ 218/**
193 if (addr + size < bdata->node_boot_start || 219 * free_all_bootmem - release free pages to the buddy allocator
194 PFN_DOWN(addr) > bdata->node_low_pfn) 220 *
195 return; 221 * Returns the number of pages actually released.
196 /* 222 */
197 * round down end of usable mem, partially free pages are 223unsigned long __init free_all_bootmem(void)
198 * considered reserved. 224{
199 */ 225 return free_all_bootmem_core(NODE_DATA(0)->bdata);
226}
200 227
201 if (addr >= bdata->node_boot_start && addr < bdata->last_success) 228static void __init __free(bootmem_data_t *bdata,
202 bdata->last_success = addr; 229 unsigned long sidx, unsigned long eidx)
230{
231 unsigned long idx;
203 232
204 /* 233 bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
205 * Round up to index to the range. 234 sidx + bdata->node_min_pfn,
206 */ 235 eidx + bdata->node_min_pfn);
207 if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start))
208 sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
209 else
210 sidx = 0;
211 236
212 eidx = PFN_DOWN(addr + size - bdata->node_boot_start); 237 if (bdata->hint_idx > sidx)
213 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 238 bdata->hint_idx = sidx;
214 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
215 239
216 for (i = sidx; i < eidx; i++) { 240 for (idx = sidx; idx < eidx; idx++)
217 if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) 241 if (!test_and_clear_bit(idx, bdata->node_bootmem_map))
218 BUG(); 242 BUG();
219 }
220} 243}
221 244
222/* 245static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
223 * We 'merge' subsequent allocations to save space. We might 'lose' 246 unsigned long eidx, int flags)
224 * some fraction of a page if allocations cannot be satisfied due to
225 * size constraints on boxes where there is physical RAM space
226 * fragmentation - in these cases (mostly large memory boxes) this
227 * is not a problem.
228 *
229 * On low memory boxes we get it right in 100% of the cases.
230 *
231 * alignment has to be a power of 2 value.
232 *
233 * NOTE: This function is _not_ reentrant.
234 */
235void * __init
236__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
237 unsigned long align, unsigned long goal, unsigned long limit)
238{ 247{
239 unsigned long areasize, preferred; 248 unsigned long idx;
240 unsigned long i, start = 0, incr, eidx, end_pfn; 249 int exclusive = flags & BOOTMEM_EXCLUSIVE;
241 void *ret; 250
242 unsigned long node_boot_start; 251 bdebug("nid=%td start=%lx end=%lx flags=%x\n",
243 void *node_bootmem_map; 252 bdata - bootmem_node_data,
244 253 sidx + bdata->node_min_pfn,
245 if (!size) { 254 eidx + bdata->node_min_pfn,
246 printk("__alloc_bootmem_core(): zero-sized request\n"); 255 flags);
247 BUG(); 256
248 } 257 for (idx = sidx; idx < eidx; idx++)
249 BUG_ON(align & (align-1)); 258 if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
250 259 if (exclusive) {
251 /* on nodes without memory - bootmem_map is NULL */ 260 __free(bdata, sidx, idx);
252 if (!bdata->node_bootmem_map) 261 return -EBUSY;
253 return NULL; 262 }
263 bdebug("silent double reserve of PFN %lx\n",
264 idx + bdata->node_min_pfn);
265 }
266 return 0;
267}
254 268
255 /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */ 269static int __init mark_bootmem_node(bootmem_data_t *bdata,
256 node_boot_start = bdata->node_boot_start; 270 unsigned long start, unsigned long end,
257 node_bootmem_map = bdata->node_bootmem_map; 271 int reserve, int flags)
258 if (align) { 272{
259 node_boot_start = ALIGN(bdata->node_boot_start, align); 273 unsigned long sidx, eidx;
260 if (node_boot_start > bdata->node_boot_start)
261 node_bootmem_map = (unsigned long *)bdata->node_bootmem_map +
262 PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG;
263 }
264 274
265 if (limit && node_boot_start >= limit) 275 bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
266 return NULL; 276 bdata - bootmem_node_data, start, end, reserve, flags);
267 277
268 end_pfn = bdata->node_low_pfn; 278 BUG_ON(start < bdata->node_min_pfn);
269 limit = PFN_DOWN(limit); 279 BUG_ON(end > bdata->node_low_pfn);
270 if (limit && end_pfn > limit)
271 end_pfn = limit;
272 280
273 eidx = end_pfn - PFN_DOWN(node_boot_start); 281 sidx = start - bdata->node_min_pfn;
282 eidx = end - bdata->node_min_pfn;
274 283
275 /* 284 if (reserve)
276 * We try to allocate bootmem pages above 'goal' 285 return __reserve(bdata, sidx, eidx, flags);
277 * first, then we try to allocate lower pages. 286 else
278 */ 287 __free(bdata, sidx, eidx);
279 preferred = 0; 288 return 0;
280 if (goal && PFN_DOWN(goal) < end_pfn) { 289}
281 if (goal > node_boot_start)
282 preferred = goal - node_boot_start;
283
284 if (bdata->last_success > node_boot_start &&
285 bdata->last_success - node_boot_start >= preferred)
286 if (!limit || (limit && limit > bdata->last_success))
287 preferred = bdata->last_success - node_boot_start;
288 }
289 290
290 preferred = PFN_DOWN(ALIGN(preferred, align)); 291static int __init mark_bootmem(unsigned long start, unsigned long end,
291 areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; 292 int reserve, int flags)
292 incr = align >> PAGE_SHIFT ? : 1; 293{
294 unsigned long pos;
295 bootmem_data_t *bdata;
293 296
294restart_scan: 297 pos = start;
295 for (i = preferred; i < eidx;) { 298 list_for_each_entry(bdata, &bdata_list, list) {
296 unsigned long j; 299 int err;
300 unsigned long max;
297 301
298 i = find_next_zero_bit(node_bootmem_map, eidx, i); 302 if (pos < bdata->node_min_pfn ||
299 i = ALIGN(i, incr); 303 pos >= bdata->node_low_pfn) {
300 if (i >= eidx) 304 BUG_ON(pos != start);
301 break;
302 if (test_bit(i, node_bootmem_map)) {
303 i += incr;
304 continue; 305 continue;
305 } 306 }
306 for (j = i + 1; j < i + areasize; ++j) {
307 if (j >= eidx)
308 goto fail_block;
309 if (test_bit(j, node_bootmem_map))
310 goto fail_block;
311 }
312 start = i;
313 goto found;
314 fail_block:
315 i = ALIGN(j, incr);
316 if (i == j)
317 i += incr;
318 }
319 307
320 if (preferred > 0) { 308 max = min(bdata->node_low_pfn, end);
321 preferred = 0;
322 goto restart_scan;
323 }
324 return NULL;
325 309
326found: 310 err = mark_bootmem_node(bdata, pos, max, reserve, flags);
327 bdata->last_success = PFN_PHYS(start) + node_boot_start; 311 if (reserve && err) {
328 BUG_ON(start >= eidx); 312 mark_bootmem(start, pos, 0, 0);
329 313 return err;
330 /*
331 * Is the next page of the previous allocation-end the start
332 * of this allocation's buffer? If yes then we can 'merge'
333 * the previous partial page with this allocation.
334 */
335 if (align < PAGE_SIZE &&
336 bdata->last_offset && bdata->last_pos+1 == start) {
337 unsigned long offset, remaining_size;
338 offset = ALIGN(bdata->last_offset, align);
339 BUG_ON(offset > PAGE_SIZE);
340 remaining_size = PAGE_SIZE - offset;
341 if (size < remaining_size) {
342 areasize = 0;
343 /* last_pos unchanged */
344 bdata->last_offset = offset + size;
345 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
346 offset + node_boot_start);
347 } else {
348 remaining_size = size - remaining_size;
349 areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
350 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
351 offset + node_boot_start);
352 bdata->last_pos = start + areasize - 1;
353 bdata->last_offset = remaining_size;
354 } 314 }
355 bdata->last_offset &= ~PAGE_MASK;
356 } else {
357 bdata->last_pos = start + areasize - 1;
358 bdata->last_offset = size & ~PAGE_MASK;
359 ret = phys_to_virt(start * PAGE_SIZE + node_boot_start);
360 }
361 315
362 /* 316 if (max == end)
363 * Reserve the area now: 317 return 0;
364 */ 318 pos = bdata->node_low_pfn;
365 for (i = start; i < start + areasize; i++) 319 }
366 if (unlikely(test_and_set_bit(i, node_bootmem_map))) 320 BUG();
367 BUG();
368 memset(ret, 0, size);
369 return ret;
370} 321}
371 322
372static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) 323/**
324 * free_bootmem_node - mark a page range as usable
325 * @pgdat: node the range resides on
326 * @physaddr: starting address of the range
327 * @size: size of the range in bytes
328 *
329 * Partial pages will be considered reserved and left as they are.
330 *
331 * The range must reside completely on the specified node.
332 */
333void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
334 unsigned long size)
373{ 335{
374 struct page *page; 336 unsigned long start, end;
375 unsigned long pfn;
376 bootmem_data_t *bdata = pgdat->bdata;
377 unsigned long i, count, total = 0;
378 unsigned long idx;
379 unsigned long *map;
380 int gofast = 0;
381
382 BUG_ON(!bdata->node_bootmem_map);
383
384 count = 0;
385 /* first extant page of the node */
386 pfn = PFN_DOWN(bdata->node_boot_start);
387 idx = bdata->node_low_pfn - pfn;
388 map = bdata->node_bootmem_map;
389 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
390 if (bdata->node_boot_start == 0 ||
391 ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
392 gofast = 1;
393 for (i = 0; i < idx; ) {
394 unsigned long v = ~map[i / BITS_PER_LONG];
395
396 if (gofast && v == ~0UL) {
397 int order;
398
399 page = pfn_to_page(pfn);
400 count += BITS_PER_LONG;
401 order = ffs(BITS_PER_LONG) - 1;
402 __free_pages_bootmem(page, order);
403 i += BITS_PER_LONG;
404 page += BITS_PER_LONG;
405 } else if (v) {
406 unsigned long m;
407
408 page = pfn_to_page(pfn);
409 for (m = 1; m && i < idx; m<<=1, page++, i++) {
410 if (v & m) {
411 count++;
412 __free_pages_bootmem(page, 0);
413 }
414 }
415 } else {
416 i += BITS_PER_LONG;
417 }
418 pfn += BITS_PER_LONG;
419 }
420 total += count;
421 337
422 /* 338 start = PFN_UP(physaddr);
423 * Now free the allocator bitmap itself, it's not 339 end = PFN_DOWN(physaddr + size);
424 * needed anymore:
425 */
426 page = virt_to_page(bdata->node_bootmem_map);
427 count = 0;
428 idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
429 for (i = 0; i < idx; i++, page++) {
430 __free_pages_bootmem(page, 0);
431 count++;
432 }
433 total += count;
434 bdata->node_bootmem_map = NULL;
435 340
436 return total; 341 mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
437} 342}
438 343
439unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, 344/**
440 unsigned long startpfn, unsigned long endpfn) 345 * free_bootmem - mark a page range as usable
441{ 346 * @addr: starting address of the range
442 return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); 347 * @size: size of the range in bytes
443} 348 *
444 349 * Partial pages will be considered reserved and left as they are.
445int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 350 *
446 unsigned long size, int flags) 351 * The range must be contiguous but may span node boundaries.
352 */
353void __init free_bootmem(unsigned long addr, unsigned long size)
447{ 354{
448 int ret; 355 unsigned long start, end;
449 356
450 ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); 357 start = PFN_UP(addr);
451 if (ret < 0) 358 end = PFN_DOWN(addr + size);
452 return -ENOMEM;
453 reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
454 359
455 return 0; 360 mark_bootmem(start, end, 0, 0);
456} 361}
457 362
458void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 363/**
459 unsigned long size) 364 * reserve_bootmem_node - mark a page range as reserved
365 * @pgdat: node the range resides on
366 * @physaddr: starting address of the range
367 * @size: size of the range in bytes
368 * @flags: reservation flags (see linux/bootmem.h)
369 *
370 * Partial pages will be reserved.
371 *
372 * The range must reside completely on the specified node.
373 */
374int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
375 unsigned long size, int flags)
460{ 376{
461 free_bootmem_core(pgdat->bdata, physaddr, size); 377 unsigned long start, end;
462}
463 378
464unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 379 start = PFN_DOWN(physaddr);
465{ 380 end = PFN_UP(physaddr + size);
466 register_page_bootmem_info_node(pgdat);
467 return free_all_bootmem_core(pgdat);
468}
469 381
470unsigned long __init init_bootmem(unsigned long start, unsigned long pages) 382 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
471{
472 max_low_pfn = pages;
473 min_low_pfn = start;
474 return init_bootmem_core(NODE_DATA(0), start, 0, pages);
475} 383}
476 384
477#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 385#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
386/**
387 * reserve_bootmem - mark a page range as usable
388 * @addr: starting address of the range
389 * @size: size of the range in bytes
390 * @flags: reservation flags (see linux/bootmem.h)
391 *
392 * Partial pages will be reserved.
393 *
394 * The range must be contiguous but may span node boundaries.
395 */
478int __init reserve_bootmem(unsigned long addr, unsigned long size, 396int __init reserve_bootmem(unsigned long addr, unsigned long size,
479 int flags) 397 int flags)
480{ 398{
481 bootmem_data_t *bdata; 399 unsigned long start, end;
482 int ret;
483 400
484 list_for_each_entry(bdata, &bdata_list, list) { 401 start = PFN_DOWN(addr);
485 ret = can_reserve_bootmem_core(bdata, addr, size, flags); 402 end = PFN_UP(addr + size);
486 if (ret < 0)
487 return ret;
488 }
489 list_for_each_entry(bdata, &bdata_list, list)
490 reserve_bootmem_core(bdata, addr, size, flags);
491 403
492 return 0; 404 return mark_bootmem(start, end, 1, flags);
493} 405}
494#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 406#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
495 407
496void __init free_bootmem(unsigned long addr, unsigned long size) 408static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
409 unsigned long size, unsigned long align,
410 unsigned long goal, unsigned long limit)
497{ 411{
498 bootmem_data_t *bdata; 412 unsigned long fallback = 0;
499 list_for_each_entry(bdata, &bdata_list, list) 413 unsigned long min, max, start, sidx, midx, step;
500 free_bootmem_core(bdata, addr, size);
501}
502 414
503unsigned long __init free_all_bootmem(void) 415 BUG_ON(!size);
504{ 416 BUG_ON(align & (align - 1));
505 return free_all_bootmem_core(NODE_DATA(0)); 417 BUG_ON(limit && goal + size > limit);
418
419 if (!bdata->node_bootmem_map)
420 return NULL;
421
422 bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
423 bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
424 align, goal, limit);
425
426 min = bdata->node_min_pfn;
427 max = bdata->node_low_pfn;
428
429 goal >>= PAGE_SHIFT;
430 limit >>= PAGE_SHIFT;
431
432 if (limit && max > limit)
433 max = limit;
434 if (max <= min)
435 return NULL;
436
437 step = max(align >> PAGE_SHIFT, 1UL);
438
439 if (goal && min < goal && goal < max)
440 start = ALIGN(goal, step);
441 else
442 start = ALIGN(min, step);
443
444 sidx = start - bdata->node_min_pfn;;
445 midx = max - bdata->node_min_pfn;
446
447 if (bdata->hint_idx > sidx) {
448 /*
449 * Handle the valid case of sidx being zero and still
450 * catch the fallback below.
451 */
452 fallback = sidx + 1;
453 sidx = ALIGN(bdata->hint_idx, step);
454 }
455
456 while (1) {
457 int merge;
458 void *region;
459 unsigned long eidx, i, start_off, end_off;
460find_block:
461 sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
462 sidx = ALIGN(sidx, step);
463 eidx = sidx + PFN_UP(size);
464
465 if (sidx >= midx || eidx > midx)
466 break;
467
468 for (i = sidx; i < eidx; i++)
469 if (test_bit(i, bdata->node_bootmem_map)) {
470 sidx = ALIGN(i, step);
471 if (sidx == i)
472 sidx += step;
473 goto find_block;
474 }
475
476 if (bdata->last_end_off &&
477 PFN_DOWN(bdata->last_end_off) + 1 == sidx)
478 start_off = ALIGN(bdata->last_end_off, align);
479 else
480 start_off = PFN_PHYS(sidx);
481
482 merge = PFN_DOWN(start_off) < sidx;
483 end_off = start_off + size;
484
485 bdata->last_end_off = end_off;
486 bdata->hint_idx = PFN_UP(end_off);
487
488 /*
489 * Reserve the area now:
490 */
491 if (__reserve(bdata, PFN_DOWN(start_off) + merge,
492 PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
493 BUG();
494
495 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
496 start_off);
497 memset(region, 0, size);
498 return region;
499 }
500
501 if (fallback) {
502 sidx = ALIGN(fallback - 1, step);
503 fallback = 0;
504 goto find_block;
505 }
506
507 return NULL;
506} 508}
507 509
508void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, 510static void * __init ___alloc_bootmem_nopanic(unsigned long size,
509 unsigned long goal) 511 unsigned long align,
512 unsigned long goal,
513 unsigned long limit)
510{ 514{
511 bootmem_data_t *bdata; 515 bootmem_data_t *bdata;
512 void *ptr;
513 516
517restart:
514 list_for_each_entry(bdata, &bdata_list, list) { 518 list_for_each_entry(bdata, &bdata_list, list) {
515 ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); 519 void *region;
516 if (ptr) 520
517 return ptr; 521 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
522 continue;
523 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
524 break;
525
526 region = alloc_bootmem_core(bdata, size, align, goal, limit);
527 if (region)
528 return region;
529 }
530
531 if (goal) {
532 goal = 0;
533 goto restart;
518 } 534 }
535
519 return NULL; 536 return NULL;
520} 537}
521 538
522void * __init __alloc_bootmem(unsigned long size, unsigned long align, 539/**
523 unsigned long goal) 540 * __alloc_bootmem_nopanic - allocate boot memory without panicking
541 * @size: size of the request in bytes
542 * @align: alignment of the region
543 * @goal: preferred starting address of the region
544 *
545 * The goal is dropped if it can not be satisfied and the allocation will
546 * fall back to memory below @goal.
547 *
548 * Allocation may happen on any node in the system.
549 *
550 * Returns NULL on failure.
551 */
552void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
553 unsigned long goal)
524{ 554{
525 void *mem = __alloc_bootmem_nopanic(size,align,goal); 555 return ___alloc_bootmem_nopanic(size, align, goal, 0);
556}
557
558static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
559 unsigned long goal, unsigned long limit)
560{
561 void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
526 562
527 if (mem) 563 if (mem)
528 return mem; 564 return mem;
@@ -534,78 +570,135 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
534 return NULL; 570 return NULL;
535} 571}
536 572
573/**
574 * __alloc_bootmem - allocate boot memory
575 * @size: size of the request in bytes
576 * @align: alignment of the region
577 * @goal: preferred starting address of the region
578 *
579 * The goal is dropped if it can not be satisfied and the allocation will
580 * fall back to memory below @goal.
581 *
582 * Allocation may happen on any node in the system.
583 *
584 * The function panics if the request can not be satisfied.
585 */
586void * __init __alloc_bootmem(unsigned long size, unsigned long align,
587 unsigned long goal)
588{
589 return ___alloc_bootmem(size, align, goal, 0);
590}
537 591
538void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 592static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
539 unsigned long align, unsigned long goal) 593 unsigned long size, unsigned long align,
594 unsigned long goal, unsigned long limit)
540{ 595{
541 void *ptr; 596 void *ptr;
542 597
543 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 598 ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
544 if (ptr) 599 if (ptr)
545 return ptr; 600 return ptr;
546 601
547 return __alloc_bootmem(size, align, goal); 602 return ___alloc_bootmem(size, align, goal, limit);
603}
604
605/**
606 * __alloc_bootmem_node - allocate boot memory from a specific node
607 * @pgdat: node to allocate from
608 * @size: size of the request in bytes
609 * @align: alignment of the region
610 * @goal: preferred starting address of the region
611 *
612 * The goal is dropped if it can not be satisfied and the allocation will
613 * fall back to memory below @goal.
614 *
615 * Allocation may fall back to any node in the system if the specified node
616 * can not hold the requested memory.
617 *
618 * The function panics if the request can not be satisfied.
619 */
620void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
621 unsigned long align, unsigned long goal)
622{
623 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
548} 624}
549 625
550#ifdef CONFIG_SPARSEMEM 626#ifdef CONFIG_SPARSEMEM
627/**
628 * alloc_bootmem_section - allocate boot memory from a specific section
629 * @size: size of the request in bytes
630 * @section_nr: sparse map section to allocate from
631 *
632 * Return NULL on failure.
633 */
551void * __init alloc_bootmem_section(unsigned long size, 634void * __init alloc_bootmem_section(unsigned long size,
552 unsigned long section_nr) 635 unsigned long section_nr)
553{ 636{
554 void *ptr; 637 bootmem_data_t *bdata;
555 unsigned long limit, goal, start_nr, end_nr, pfn; 638 unsigned long pfn, goal, limit;
556 struct pglist_data *pgdat;
557 639
558 pfn = section_nr_to_pfn(section_nr); 640 pfn = section_nr_to_pfn(section_nr);
559 goal = PFN_PHYS(pfn); 641 goal = pfn << PAGE_SHIFT;
560 limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1; 642 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
561 pgdat = NODE_DATA(early_pfn_to_nid(pfn)); 643 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
562 ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal,
563 limit);
564 644
565 if (!ptr) 645 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
566 return NULL; 646}
647#endif
567 648
568 start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr))); 649void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
569 end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size)); 650 unsigned long align, unsigned long goal)
570 if (start_nr != section_nr || end_nr != section_nr) { 651{
571 printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n", 652 void *ptr;
572 section_nr);
573 free_bootmem_core(pgdat->bdata, __pa(ptr), size);
574 ptr = NULL;
575 }
576 653
577 return ptr; 654 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
655 if (ptr)
656 return ptr;
657
658 return __alloc_bootmem_nopanic(size, align, goal);
578} 659}
579#endif
580 660
581#ifndef ARCH_LOW_ADDRESS_LIMIT 661#ifndef ARCH_LOW_ADDRESS_LIMIT
582#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL 662#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
583#endif 663#endif
584 664
665/**
666 * __alloc_bootmem_low - allocate low boot memory
667 * @size: size of the request in bytes
668 * @align: alignment of the region
669 * @goal: preferred starting address of the region
670 *
671 * The goal is dropped if it can not be satisfied and the allocation will
672 * fall back to memory below @goal.
673 *
674 * Allocation may happen on any node in the system.
675 *
676 * The function panics if the request can not be satisfied.
677 */
585void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, 678void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
586 unsigned long goal) 679 unsigned long goal)
587{ 680{
588 bootmem_data_t *bdata; 681 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
589 void *ptr;
590
591 list_for_each_entry(bdata, &bdata_list, list) {
592 ptr = __alloc_bootmem_core(bdata, size, align, goal,
593 ARCH_LOW_ADDRESS_LIMIT);
594 if (ptr)
595 return ptr;
596 }
597
598 /*
599 * Whoops, we cannot satisfy the allocation request.
600 */
601 printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
602 panic("Out of low memory");
603 return NULL;
604} 682}
605 683
684/**
685 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
686 * @pgdat: node to allocate from
687 * @size: size of the request in bytes
688 * @align: alignment of the region
689 * @goal: preferred starting address of the region
690 *
691 * The goal is dropped if it can not be satisfied and the allocation will
692 * fall back to memory below @goal.
693 *
694 * Allocation may fall back to any node in the system if the specified node
695 * can not hold the requested memory.
696 *
697 * The function panics if the request can not be satisfied.
698 */
606void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 699void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
607 unsigned long align, unsigned long goal) 700 unsigned long align, unsigned long goal)
608{ 701{
609 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, 702 return ___alloc_bootmem_node(pgdat->bdata, size, align,
610 ARCH_LOW_ADDRESS_LIMIT); 703 goal, ARCH_LOW_ADDRESS_LIMIT);
611} 704}
diff --git a/mm/filemap.c b/mm/filemap.c
index 65d9d9e2b755..54e968650855 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,9 +42,6 @@
42 42
43#include <asm/mman.h> 43#include <asm/mman.h>
44 44
45static ssize_t
46generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
47 loff_t offset, unsigned long nr_segs);
48 45
49/* 46/*
50 * Shared mappings implemented 30.11.1994. It's not fully working yet, 47 * Shared mappings implemented 30.11.1994. It's not fully working yet,
@@ -112,13 +109,13 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
112/* 109/*
113 * Remove a page from the page cache and free it. Caller has to make 110 * Remove a page from the page cache and free it. Caller has to make
114 * sure the page is locked and that nobody else uses it - or that usage 111 * sure the page is locked and that nobody else uses it - or that usage
115 * is safe. The caller must hold a write_lock on the mapping's tree_lock. 112 * is safe. The caller must hold the mapping's tree_lock.
116 */ 113 */
117void __remove_from_page_cache(struct page *page) 114void __remove_from_page_cache(struct page *page)
118{ 115{
119 struct address_space *mapping = page->mapping; 116 struct address_space *mapping = page->mapping;
120 117
121 mem_cgroup_uncharge_page(page); 118 mem_cgroup_uncharge_cache_page(page);
122 radix_tree_delete(&mapping->page_tree, page->index); 119 radix_tree_delete(&mapping->page_tree, page->index);
123 page->mapping = NULL; 120 page->mapping = NULL;
124 mapping->nrpages--; 121 mapping->nrpages--;
@@ -144,9 +141,9 @@ void remove_from_page_cache(struct page *page)
144 141
145 BUG_ON(!PageLocked(page)); 142 BUG_ON(!PageLocked(page));
146 143
147 write_lock_irq(&mapping->tree_lock); 144 spin_lock_irq(&mapping->tree_lock);
148 __remove_from_page_cache(page); 145 __remove_from_page_cache(page);
149 write_unlock_irq(&mapping->tree_lock); 146 spin_unlock_irq(&mapping->tree_lock);
150} 147}
151 148
152static int sync_page(void *word) 149static int sync_page(void *word)
@@ -445,48 +442,52 @@ int filemap_write_and_wait_range(struct address_space *mapping,
445} 442}
446 443
447/** 444/**
448 * add_to_page_cache - add newly allocated pagecache pages 445 * add_to_page_cache_locked - add a locked page to the pagecache
449 * @page: page to add 446 * @page: page to add
450 * @mapping: the page's address_space 447 * @mapping: the page's address_space
451 * @offset: page index 448 * @offset: page index
452 * @gfp_mask: page allocation mode 449 * @gfp_mask: page allocation mode
453 * 450 *
454 * This function is used to add newly allocated pagecache pages; 451 * This function is used to add a page to the pagecache. It must be locked.
455 * the page is new, so we can just run SetPageLocked() against it.
456 * The other page state flags were set by rmqueue().
457 *
458 * This function does not add the page to the LRU. The caller must do that. 452 * This function does not add the page to the LRU. The caller must do that.
459 */ 453 */
460int add_to_page_cache(struct page *page, struct address_space *mapping, 454int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
461 pgoff_t offset, gfp_t gfp_mask) 455 pgoff_t offset, gfp_t gfp_mask)
462{ 456{
463 int error = mem_cgroup_cache_charge(page, current->mm, 457 int error;
458
459 VM_BUG_ON(!PageLocked(page));
460
461 error = mem_cgroup_cache_charge(page, current->mm,
464 gfp_mask & ~__GFP_HIGHMEM); 462 gfp_mask & ~__GFP_HIGHMEM);
465 if (error) 463 if (error)
466 goto out; 464 goto out;
467 465
468 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 466 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
469 if (error == 0) { 467 if (error == 0) {
470 write_lock_irq(&mapping->tree_lock); 468 page_cache_get(page);
469 page->mapping = mapping;
470 page->index = offset;
471
472 spin_lock_irq(&mapping->tree_lock);
471 error = radix_tree_insert(&mapping->page_tree, offset, page); 473 error = radix_tree_insert(&mapping->page_tree, offset, page);
472 if (!error) { 474 if (likely(!error)) {
473 page_cache_get(page);
474 SetPageLocked(page);
475 page->mapping = mapping;
476 page->index = offset;
477 mapping->nrpages++; 475 mapping->nrpages++;
478 __inc_zone_page_state(page, NR_FILE_PAGES); 476 __inc_zone_page_state(page, NR_FILE_PAGES);
479 } else 477 } else {
480 mem_cgroup_uncharge_page(page); 478 page->mapping = NULL;
479 mem_cgroup_uncharge_cache_page(page);
480 page_cache_release(page);
481 }
481 482
482 write_unlock_irq(&mapping->tree_lock); 483 spin_unlock_irq(&mapping->tree_lock);
483 radix_tree_preload_end(); 484 radix_tree_preload_end();
484 } else 485 } else
485 mem_cgroup_uncharge_page(page); 486 mem_cgroup_uncharge_cache_page(page);
486out: 487out:
487 return error; 488 return error;
488} 489}
489EXPORT_SYMBOL(add_to_page_cache); 490EXPORT_SYMBOL(add_to_page_cache_locked);
490 491
491int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 492int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
492 pgoff_t offset, gfp_t gfp_mask) 493 pgoff_t offset, gfp_t gfp_mask)
@@ -557,14 +558,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
557 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. 558 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
558 * 559 *
559 * The first mb is necessary to safely close the critical section opened by the 560 * The first mb is necessary to safely close the critical section opened by the
560 * TestSetPageLocked(), the second mb is necessary to enforce ordering between 561 * test_and_set_bit() to lock the page; the second mb is necessary to enforce
561 * the clear_bit and the read of the waitqueue (to avoid SMP races with a 562 * ordering between the clear_bit and the read of the waitqueue (to avoid SMP
562 * parallel wait_on_page_locked()). 563 * races with a parallel wait_on_page_locked()).
563 */ 564 */
564void unlock_page(struct page *page) 565void unlock_page(struct page *page)
565{ 566{
566 smp_mb__before_clear_bit(); 567 smp_mb__before_clear_bit();
567 if (!TestClearPageLocked(page)) 568 if (!test_and_clear_bit(PG_locked, &page->flags))
568 BUG(); 569 BUG();
569 smp_mb__after_clear_bit(); 570 smp_mb__after_clear_bit();
570 wake_up_page(page, PG_locked); 571 wake_up_page(page, PG_locked);
@@ -636,15 +637,35 @@ void __lock_page_nosync(struct page *page)
636 * Is there a pagecache struct page at the given (mapping, offset) tuple? 637 * Is there a pagecache struct page at the given (mapping, offset) tuple?
637 * If yes, increment its refcount and return it; if no, return NULL. 638 * If yes, increment its refcount and return it; if no, return NULL.
638 */ 639 */
639struct page * find_get_page(struct address_space *mapping, pgoff_t offset) 640struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
640{ 641{
642 void **pagep;
641 struct page *page; 643 struct page *page;
642 644
643 read_lock_irq(&mapping->tree_lock); 645 rcu_read_lock();
644 page = radix_tree_lookup(&mapping->page_tree, offset); 646repeat:
645 if (page) 647 page = NULL;
646 page_cache_get(page); 648 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
647 read_unlock_irq(&mapping->tree_lock); 649 if (pagep) {
650 page = radix_tree_deref_slot(pagep);
651 if (unlikely(!page || page == RADIX_TREE_RETRY))
652 goto repeat;
653
654 if (!page_cache_get_speculative(page))
655 goto repeat;
656
657 /*
658 * Has the page moved?
659 * This is part of the lockless pagecache protocol. See
660 * include/linux/pagemap.h for details.
661 */
662 if (unlikely(page != *pagep)) {
663 page_cache_release(page);
664 goto repeat;
665 }
666 }
667 rcu_read_unlock();
668
648 return page; 669 return page;
649} 670}
650EXPORT_SYMBOL(find_get_page); 671EXPORT_SYMBOL(find_get_page);
@@ -659,32 +680,22 @@ EXPORT_SYMBOL(find_get_page);
659 * 680 *
660 * Returns zero if the page was not present. find_lock_page() may sleep. 681 * Returns zero if the page was not present. find_lock_page() may sleep.
661 */ 682 */
662struct page *find_lock_page(struct address_space *mapping, 683struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
663 pgoff_t offset)
664{ 684{
665 struct page *page; 685 struct page *page;
666 686
667repeat: 687repeat:
668 read_lock_irq(&mapping->tree_lock); 688 page = find_get_page(mapping, offset);
669 page = radix_tree_lookup(&mapping->page_tree, offset);
670 if (page) { 689 if (page) {
671 page_cache_get(page); 690 lock_page(page);
672 if (TestSetPageLocked(page)) { 691 /* Has the page been truncated? */
673 read_unlock_irq(&mapping->tree_lock); 692 if (unlikely(page->mapping != mapping)) {
674 __lock_page(page); 693 unlock_page(page);
675 694 page_cache_release(page);
676 /* Has the page been truncated while we slept? */ 695 goto repeat;
677 if (unlikely(page->mapping != mapping)) {
678 unlock_page(page);
679 page_cache_release(page);
680 goto repeat;
681 }
682 VM_BUG_ON(page->index != offset);
683 goto out;
684 } 696 }
697 VM_BUG_ON(page->index != offset);
685 } 698 }
686 read_unlock_irq(&mapping->tree_lock);
687out:
688 return page; 699 return page;
689} 700}
690EXPORT_SYMBOL(find_lock_page); 701EXPORT_SYMBOL(find_lock_page);
@@ -750,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
750{ 761{
751 unsigned int i; 762 unsigned int i;
752 unsigned int ret; 763 unsigned int ret;
764 unsigned int nr_found;
765
766 rcu_read_lock();
767restart:
768 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
769 (void ***)pages, start, nr_pages);
770 ret = 0;
771 for (i = 0; i < nr_found; i++) {
772 struct page *page;
773repeat:
774 page = radix_tree_deref_slot((void **)pages[i]);
775 if (unlikely(!page))
776 continue;
777 /*
778 * this can only trigger if nr_found == 1, making livelock
779 * a non issue.
780 */
781 if (unlikely(page == RADIX_TREE_RETRY))
782 goto restart;
753 783
754 read_lock_irq(&mapping->tree_lock); 784 if (!page_cache_get_speculative(page))
755 ret = radix_tree_gang_lookup(&mapping->page_tree, 785 goto repeat;
756 (void **)pages, start, nr_pages); 786
757 for (i = 0; i < ret; i++) 787 /* Has the page moved? */
758 page_cache_get(pages[i]); 788 if (unlikely(page != *((void **)pages[i]))) {
759 read_unlock_irq(&mapping->tree_lock); 789 page_cache_release(page);
790 goto repeat;
791 }
792
793 pages[ret] = page;
794 ret++;
795 }
796 rcu_read_unlock();
760 return ret; 797 return ret;
761} 798}
762 799
@@ -777,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
777{ 814{
778 unsigned int i; 815 unsigned int i;
779 unsigned int ret; 816 unsigned int ret;
817 unsigned int nr_found;
818
819 rcu_read_lock();
820restart:
821 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
822 (void ***)pages, index, nr_pages);
823 ret = 0;
824 for (i = 0; i < nr_found; i++) {
825 struct page *page;
826repeat:
827 page = radix_tree_deref_slot((void **)pages[i]);
828 if (unlikely(!page))
829 continue;
830 /*
831 * this can only trigger if nr_found == 1, making livelock
832 * a non issue.
833 */
834 if (unlikely(page == RADIX_TREE_RETRY))
835 goto restart;
780 836
781 read_lock_irq(&mapping->tree_lock); 837 if (page->mapping == NULL || page->index != index)
782 ret = radix_tree_gang_lookup(&mapping->page_tree,
783 (void **)pages, index, nr_pages);
784 for (i = 0; i < ret; i++) {
785 if (pages[i]->mapping == NULL || pages[i]->index != index)
786 break; 838 break;
787 839
788 page_cache_get(pages[i]); 840 if (!page_cache_get_speculative(page))
841 goto repeat;
842
843 /* Has the page moved? */
844 if (unlikely(page != *((void **)pages[i]))) {
845 page_cache_release(page);
846 goto repeat;
847 }
848
849 pages[ret] = page;
850 ret++;
789 index++; 851 index++;
790 } 852 }
791 read_unlock_irq(&mapping->tree_lock); 853 rcu_read_unlock();
792 return i; 854 return ret;
793} 855}
794EXPORT_SYMBOL(find_get_pages_contig); 856EXPORT_SYMBOL(find_get_pages_contig);
795 857
@@ -809,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
809{ 871{
810 unsigned int i; 872 unsigned int i;
811 unsigned int ret; 873 unsigned int ret;
874 unsigned int nr_found;
875
876 rcu_read_lock();
877restart:
878 nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
879 (void ***)pages, *index, nr_pages, tag);
880 ret = 0;
881 for (i = 0; i < nr_found; i++) {
882 struct page *page;
883repeat:
884 page = radix_tree_deref_slot((void **)pages[i]);
885 if (unlikely(!page))
886 continue;
887 /*
888 * this can only trigger if nr_found == 1, making livelock
889 * a non issue.
890 */
891 if (unlikely(page == RADIX_TREE_RETRY))
892 goto restart;
893
894 if (!page_cache_get_speculative(page))
895 goto repeat;
896
897 /* Has the page moved? */
898 if (unlikely(page != *((void **)pages[i]))) {
899 page_cache_release(page);
900 goto repeat;
901 }
902
903 pages[ret] = page;
904 ret++;
905 }
906 rcu_read_unlock();
812 907
813 read_lock_irq(&mapping->tree_lock);
814 ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
815 (void **)pages, *index, nr_pages, tag);
816 for (i = 0; i < ret; i++)
817 page_cache_get(pages[i]);
818 if (ret) 908 if (ret)
819 *index = pages[ret - 1]->index + 1; 909 *index = pages[ret - 1]->index + 1;
820 read_unlock_irq(&mapping->tree_lock); 910
821 return ret; 911 return ret;
822} 912}
823EXPORT_SYMBOL(find_get_pages_tag); 913EXPORT_SYMBOL(find_get_pages_tag);
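
The three gang-lookup loops above (find_get_pages, find_get_pages_contig, find_get_pages_tag) all switch to the same lockless pattern: walk the radix-tree slots under rcu_read_lock(), restart the walk when a slot reads back as RADIX_TREE_RETRY, take a speculative reference with page_cache_get_speculative(), then re-read the slot to confirm the page has not moved before keeping it. The following is a minimal user-space sketch of that try-get-and-recheck step, using C11 atomics and made-up names (obj, obj_get_speculative, lookup_stable) rather than kernel API:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
        atomic_int refcount;            /* 0 means the object is being freed */
};

/* Take a reference only if the object is still live (page_cache_get_speculative analogue). */
static bool obj_get_speculative(struct obj *o)
{
        int c = atomic_load(&o->refcount);

        while (c > 0)
                if (atomic_compare_exchange_weak(&o->refcount, &c, c + 1))
                        return true;
        return false;                   /* raced with the final put */
}

/* Look up through a published slot and return a referenced, stable object. */
static struct obj *lookup_stable(struct obj *_Atomic *slot)
{
        struct obj *o;

        for (;;) {
                o = atomic_load(slot);
                if (!o)
                        return NULL;
                if (!obj_get_speculative(o))
                        continue;       /* object being freed: retry the lookup */
                if (o == atomic_load(slot))
                        return o;       /* slot unchanged: the reference is valid */
                atomic_fetch_sub(&o->refcount, 1);  /* "page moved": drop and retry */
        }
}

int main(void)
{
        static struct obj page = { .refcount = 1 };
        struct obj *_Atomic slot = &page;
        struct obj *o = lookup_stable(&slot);

        printf("got object, refcount now %d\n", o ? atomic_load(&o->refcount) : 0);
        return 0;
}

If the recheck fails, the reference is dropped and the lookup retried, which is exactly what the "Has the page moved?" branches above do.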
@@ -841,7 +931,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
841 struct page *page = find_get_page(mapping, index); 931 struct page *page = find_get_page(mapping, index);
842 932
843 if (page) { 933 if (page) {
844 if (!TestSetPageLocked(page)) 934 if (trylock_page(page))
845 return page; 935 return page;
846 page_cache_release(page); 936 page_cache_release(page);
847 return NULL; 937 return NULL;
@@ -933,8 +1023,17 @@ find_page:
933 ra, filp, page, 1023 ra, filp, page,
934 index, last_index - index); 1024 index, last_index - index);
935 } 1025 }
936 if (!PageUptodate(page)) 1026 if (!PageUptodate(page)) {
937 goto page_not_up_to_date; 1027 if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
1028 !mapping->a_ops->is_partially_uptodate)
1029 goto page_not_up_to_date;
1030 if (!trylock_page(page))
1031 goto page_not_up_to_date;
1032 if (!mapping->a_ops->is_partially_uptodate(page,
1033 desc, offset))
1034 goto page_not_up_to_date_locked;
1035 unlock_page(page);
1036 }
938page_ok: 1037page_ok:
939 /* 1038 /*
940 * i_size must be checked after we know the page is Uptodate. 1039 * i_size must be checked after we know the page is Uptodate.
@@ -1004,6 +1103,7 @@ page_not_up_to_date:
1004 if (lock_page_killable(page)) 1103 if (lock_page_killable(page))
1005 goto readpage_eio; 1104 goto readpage_eio;
1006 1105
1106page_not_up_to_date_locked:
1007 /* Did it get truncated before we got the lock? */ 1107 /* Did it get truncated before we got the lock? */
1008 if (!page->mapping) { 1108 if (!page->mapping) {
1009 unlock_page(page); 1109 unlock_page(page);
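
The two hunks above change do_generic_file_read() so that a page which is not fully uptodate can still satisfy the read: only when the filesystem provides ->is_partially_uptodate and its block size is smaller than the page size (i_blkbits < PAGE_CACHE_SHIFT), and only if trylock_page() succeeds without sleeping, is the per-block check consulted; otherwise the code falls back to the existing slow path, now also reachable via the new page_not_up_to_date_locked label. A compact user-space restatement of that decision, with hypothetical parameter names standing in for the inode and address_space fields:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12   /* assumption: 4 KB pages */

/* Can this read be served from a page that is not fully up to date? */
static bool try_partial_uptodate(unsigned int blkbits,
                                 bool has_is_partially_uptodate,
                                 bool got_page_lock,
                                 bool blocks_covering_range_uptodate)
{
        if (blkbits == PAGE_SHIFT || !has_is_partially_uptodate)
                return false;   /* whole-page uptodate is the only option */
        if (!got_page_lock)
                return false;   /* never sleep on the fast path */
        return blocks_covering_range_uptodate;  /* the fs's per-block verdict */
}

int main(void)
{
        /* ext-style fs: 1 KB blocks (blkbits = 10), per-block check available. */
        printf("%d\n", try_partial_uptodate(10, true, true, true));  /* 1 */
        printf("%d\n", try_partial_uptodate(12, true, true, true));  /* 0 */
        return 0;
}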
@@ -1200,42 +1300,41 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1200 1300
1201 mapping = filp->f_mapping; 1301 mapping = filp->f_mapping;
1202 inode = mapping->host; 1302 inode = mapping->host;
1203 retval = 0;
1204 if (!count) 1303 if (!count)
1205 goto out; /* skip atime */ 1304 goto out; /* skip atime */
1206 size = i_size_read(inode); 1305 size = i_size_read(inode);
1207 if (pos < size) { 1306 if (pos < size) {
1208 retval = generic_file_direct_IO(READ, iocb, 1307 retval = filemap_write_and_wait(mapping);
1209 iov, pos, nr_segs); 1308 if (!retval) {
1309 retval = mapping->a_ops->direct_IO(READ, iocb,
1310 iov, pos, nr_segs);
1311 }
1210 if (retval > 0) 1312 if (retval > 0)
1211 *ppos = pos + retval; 1313 *ppos = pos + retval;
1212 } 1314 if (retval) {
1213 if (likely(retval != 0)) { 1315 file_accessed(filp);
1214 file_accessed(filp); 1316 goto out;
1215 goto out; 1317 }
1216 } 1318 }
1217 } 1319 }
1218 1320
1219 retval = 0; 1321 for (seg = 0; seg < nr_segs; seg++) {
1220 if (count) { 1322 read_descriptor_t desc;
1221 for (seg = 0; seg < nr_segs; seg++) {
1222 read_descriptor_t desc;
1223 1323
1224 desc.written = 0; 1324 desc.written = 0;
1225 desc.arg.buf = iov[seg].iov_base; 1325 desc.arg.buf = iov[seg].iov_base;
1226 desc.count = iov[seg].iov_len; 1326 desc.count = iov[seg].iov_len;
1227 if (desc.count == 0) 1327 if (desc.count == 0)
1228 continue; 1328 continue;
1229 desc.error = 0; 1329 desc.error = 0;
1230 do_generic_file_read(filp,ppos,&desc,file_read_actor); 1330 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1231 retval += desc.written; 1331 retval += desc.written;
1232 if (desc.error) { 1332 if (desc.error) {
1233 retval = retval ?: desc.error; 1333 retval = retval ?: desc.error;
1234 break; 1334 break;
1235 }
1236 if (desc.count > 0)
1237 break;
1238 } 1335 }
1336 if (desc.count > 0)
1337 break;
1239 } 1338 }
1240out: 1339out:
1241 return retval; 1340 return retval;
@@ -1669,8 +1768,9 @@ static int __remove_suid(struct dentry *dentry, int kill)
1669 return notify_change(dentry, &newattrs); 1768 return notify_change(dentry, &newattrs);
1670} 1769}
1671 1770
1672int remove_suid(struct dentry *dentry) 1771int file_remove_suid(struct file *file)
1673{ 1772{
1773 struct dentry *dentry = file->f_path.dentry;
1674 int killsuid = should_remove_suid(dentry); 1774 int killsuid = should_remove_suid(dentry);
1675 int killpriv = security_inode_need_killpriv(dentry); 1775 int killpriv = security_inode_need_killpriv(dentry);
1676 int error = 0; 1776 int error = 0;
@@ -1684,7 +1784,7 @@ int remove_suid(struct dentry *dentry)
1684 1784
1685 return error; 1785 return error;
1686} 1786}
1687EXPORT_SYMBOL(remove_suid); 1787EXPORT_SYMBOL(file_remove_suid);
1688 1788
1689static size_t __iovec_copy_from_user_inatomic(char *vaddr, 1789static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1690 const struct iovec *iov, size_t base, size_t bytes) 1790 const struct iovec *iov, size_t base, size_t bytes)
@@ -1779,7 +1879,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
1779 * The !iov->iov_len check ensures we skip over unlikely 1879 * The !iov->iov_len check ensures we skip over unlikely
 1780 * zero-length segments (without overrunning the iovec). 1880 * zero-length segments (without overrunning the iovec).
1781 */ 1881 */
1782 while (bytes || unlikely(!iov->iov_len && i->count)) { 1882 while (bytes || unlikely(i->count && !iov->iov_len)) {
1783 int copy; 1883 int copy;
1784 1884
1785 copy = min(bytes, iov->iov_len - base); 1885 copy = min(bytes, iov->iov_len - base);
@@ -2004,11 +2104,55 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2004 struct address_space *mapping = file->f_mapping; 2104 struct address_space *mapping = file->f_mapping;
2005 struct inode *inode = mapping->host; 2105 struct inode *inode = mapping->host;
2006 ssize_t written; 2106 ssize_t written;
2107 size_t write_len;
2108 pgoff_t end;
2007 2109
2008 if (count != ocount) 2110 if (count != ocount)
2009 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); 2111 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2010 2112
2011 written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); 2113 /*
2114 * Unmap all mmappings of the file up-front.
2115 *
2116 * This will cause any pte dirty bits to be propagated into the
2117 * pageframes for the subsequent filemap_write_and_wait().
2118 */
2119 write_len = iov_length(iov, *nr_segs);
2120 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
2121 if (mapping_mapped(mapping))
2122 unmap_mapping_range(mapping, pos, write_len, 0);
2123
2124 written = filemap_write_and_wait(mapping);
2125 if (written)
2126 goto out;
2127
2128 /*
2129 * After a write we want buffered reads to be sure to go to disk to get
 2130 * the new data. We invalidate clean cached pages from the region we're
2131 * about to write. We do this *before* the write so that we can return
2132 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
2133 */
2134 if (mapping->nrpages) {
2135 written = invalidate_inode_pages2_range(mapping,
2136 pos >> PAGE_CACHE_SHIFT, end);
2137 if (written)
2138 goto out;
2139 }
2140
2141 written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
2142
2143 /*
2144 * Finally, try again to invalidate clean pages which might have been
2145 * cached by non-direct readahead, or faulted in by get_user_pages()
2146 * if the source of the write was an mmap'ed region of the file
2147 * we're writing. Either one is a pretty crazy thing to do,
2148 * so we don't support it 100%. If this invalidation
2149 * fails, tough, the write still worked...
2150 */
2151 if (mapping->nrpages) {
2152 invalidate_inode_pages2_range(mapping,
2153 pos >> PAGE_CACHE_SHIFT, end);
2154 }
2155
2012 if (written > 0) { 2156 if (written > 0) {
2013 loff_t end = pos + written; 2157 loff_t end = pos + written;
2014 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2158 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
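
With generic_file_direct_IO() folded away (its removal appears further down), generic_file_direct_write() now performs the whole sequence itself: unmap the file's mappings so pte dirty bits reach the pageframes, flush with filemap_write_and_wait(), invalidate the target range before the write so a failure cannot clobber -EIOCBQUEUED, call ->direct_IO(), then invalidate once more on a best-effort basis. The stub model below only illustrates that ordering and its error handling; every helper is a placeholder for the kernel call named in its comment, not a real implementation:

#include <stdio.h>

/* Stand-ins for the kernel calls; each returns 0 on success. */
static int unmap_file_range(void)  { return 0; }   /* unmap_mapping_range()           */
static int write_and_wait(void)    { return 0; }   /* filemap_write_and_wait()        */
static int invalidate_range(void)  { return 0; }   /* invalidate_inode_pages2_range() */
static long do_direct_io(long len) { return len; } /* ->direct_IO(WRITE, ...)         */

static long direct_write(long len, int mapped, int cached)
{
        long written;

        if (mapped)
                unmap_file_range();     /* push pte dirty bits into the pageframes */

        written = write_and_wait();     /* flush dirty pagecache to disk first */
        if (written)
                return written;

        if (cached) {
                written = invalidate_range();   /* drop clean pages *before* writing */
                if (written)
                        return written; /* so a later -EIO can't clobber -EIOCBQUEUED */
        }

        written = do_direct_io(len);

        if (cached)
                invalidate_range();     /* best effort; a failure here is tolerated */

        return written;
}

int main(void)
{
        printf("wrote %ld bytes\n", direct_write(4096, 1, 1));
        return 0;
}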
@@ -2024,6 +2168,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2024 * i_mutex is held, which protects generic_osync_inode() from 2168 * i_mutex is held, which protects generic_osync_inode() from
2025 * livelocking. AIO O_DIRECT ops attempt to sync metadata here. 2169 * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
2026 */ 2170 */
2171out:
2027 if ((written >= 0 || written == -EIOCBQUEUED) && 2172 if ((written >= 0 || written == -EIOCBQUEUED) &&
2028 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2173 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2029 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); 2174 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
@@ -2395,7 +2540,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2395 if (count == 0) 2540 if (count == 0)
2396 goto out; 2541 goto out;
2397 2542
2398 err = remove_suid(file->f_path.dentry); 2543 err = file_remove_suid(file);
2399 if (err) 2544 if (err)
2400 goto out; 2545 goto out;
2401 2546
@@ -2511,66 +2656,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2511} 2656}
2512EXPORT_SYMBOL(generic_file_aio_write); 2657EXPORT_SYMBOL(generic_file_aio_write);
2513 2658
2514/*
2515 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
2516 * went wrong during pagecache shootdown.
2517 */
2518static ssize_t
2519generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2520 loff_t offset, unsigned long nr_segs)
2521{
2522 struct file *file = iocb->ki_filp;
2523 struct address_space *mapping = file->f_mapping;
2524 ssize_t retval;
2525 size_t write_len;
2526 pgoff_t end = 0; /* silence gcc */
2527
2528 /*
2529 * If it's a write, unmap all mmappings of the file up-front. This
2530 * will cause any pte dirty bits to be propagated into the pageframes
2531 * for the subsequent filemap_write_and_wait().
2532 */
2533 if (rw == WRITE) {
2534 write_len = iov_length(iov, nr_segs);
2535 end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
2536 if (mapping_mapped(mapping))
2537 unmap_mapping_range(mapping, offset, write_len, 0);
2538 }
2539
2540 retval = filemap_write_and_wait(mapping);
2541 if (retval)
2542 goto out;
2543
2544 /*
2545 * After a write we want buffered reads to be sure to go to disk to get
2546 * the new data. We invalidate clean cached page from the region we're
2547 * about to write. We do this *before* the write so that we can return
2548 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
2549 */
2550 if (rw == WRITE && mapping->nrpages) {
2551 retval = invalidate_inode_pages2_range(mapping,
2552 offset >> PAGE_CACHE_SHIFT, end);
2553 if (retval)
2554 goto out;
2555 }
2556
2557 retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
2558
2559 /*
2560 * Finally, try again to invalidate clean pages which might have been
2561 * cached by non-direct readahead, or faulted in by get_user_pages()
2562 * if the source of the write was an mmap'ed region of the file
2563 * we're writing. Either one is a pretty crazy thing to do,
2564 * so we don't support it 100%. If this invalidation
2565 * fails, tough, the write still worked...
2566 */
2567 if (rw == WRITE && mapping->nrpages) {
2568 invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end);
2569 }
2570out:
2571 return retval;
2572}
2573
2574/** 2659/**
2575 * try_to_release_page() - release old fs-specific metadata on a page 2660 * try_to_release_page() - release old fs-specific metadata on a page
2576 * 2661 *
@@ -2582,9 +2667,8 @@ out:
2582 * Otherwise return zero. 2667 * Otherwise return zero.
2583 * 2668 *
2584 * The @gfp_mask argument specifies whether I/O may be performed to release 2669 * The @gfp_mask argument specifies whether I/O may be performed to release
2585 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). 2670 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
2586 * 2671 *
2587 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
2588 */ 2672 */
2589int try_to_release_page(struct page *page, gfp_t gfp_mask) 2673int try_to_release_page(struct page *page, gfp_t gfp_mask)
2590{ 2674{
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 3e744abcce9d..380ab402d711 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/uio.h> 14#include <linux/uio.h>
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/mmu_notifier.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
18#include <asm/io.h> 19#include <asm/io.h>
@@ -188,7 +189,7 @@ __xip_unmap (struct address_space * mapping,
188 if (pte) { 189 if (pte) {
189 /* Nuke the page table entry. */ 190 /* Nuke the page table entry. */
190 flush_cache_page(vma, address, pte_pfn(*pte)); 191 flush_cache_page(vma, address, pte_pfn(*pte));
191 pteval = ptep_clear_flush(vma, address, pte); 192 pteval = ptep_clear_flush_notify(vma, address, pte);
192 page_remove_rmap(page, vma); 193 page_remove_rmap(page, vma);
193 dec_mm_counter(mm, file_rss); 194 dec_mm_counter(mm, file_rss);
194 BUG_ON(pte_dirty(pteval)); 195 BUG_ON(pte_dirty(pteval));
@@ -380,7 +381,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
380 if (count == 0) 381 if (count == 0)
381 goto out_backing; 382 goto out_backing;
382 383
383 ret = remove_suid(filp->f_path.dentry); 384 ret = file_remove_suid(filp);
384 if (ret) 385 if (ret)
385 goto out_backing; 386 goto out_backing;
386 387
diff --git a/mm/fremap.c b/mm/fremap.c
index 07a9c82ce1a3..7881638e4a12 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -15,6 +15,7 @@
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/mmu_notifier.h>
18 19
19#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
20#include <asm/cacheflush.h> 21#include <asm/cacheflush.h>
@@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
214 spin_unlock(&mapping->i_mmap_lock); 215 spin_unlock(&mapping->i_mmap_lock);
215 } 216 }
216 217
218 mmu_notifier_invalidate_range_start(mm, start, start + size);
217 err = populate_range(mm, vma, start, size, pgoff); 219 err = populate_range(mm, vma, start, size, pgoff);
220 mmu_notifier_invalidate_range_end(mm, start, start + size);
218 if (!err && !(flags & MAP_NONBLOCK)) { 221 if (!err && !(flags & MAP_NONBLOCK)) {
219 if (unlikely(has_write_lock)) { 222 if (unlikely(has_write_lock)) {
220 downgrade_write(&mm->mmap_sem); 223 downgrade_write(&mm->mmap_sem);
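
The filemap_xip.c and fremap.c hunks above hook existing pte-modifying paths into the mmu_notifier infrastructure introduced by this merge: either through the combined ptep_clear_flush_notify(), or by bracketing a bulk update with mmu_notifier_invalidate_range_start()/_end() so that secondary MMUs (KVM, GRU, XPMEM and the like) can drop their stale translations before the primary ptes change. A user-space caricature of that bracketing pattern, with stub notifier hooks standing in for the registered notifier list:

#include <stdio.h>

/* Stub "secondary MMU" hooks; the kernel walks a list of registered notifiers. */
static void notifier_range_start(unsigned long start, unsigned long end)
{
        printf("secondary MMU: invalidate [%#lx, %#lx)\n", start, end);
}

static void notifier_range_end(unsigned long start, unsigned long end)
{
        printf("secondary MMU: range [%#lx, %#lx) may be re-faulted\n", start, end);
}

/* Placeholder for the primary-MMU work (populate_range() in the hunk above). */
static int rewrite_ptes(unsigned long start, unsigned long size)
{
        printf("primary MMU: rewriting ptes for %#lx bytes at %#lx\n", size, start);
        return 0;
}

int main(void)
{
        unsigned long start = 0x700000000000UL, size = 0x4000;
        int err;

        notifier_range_start(start, start + size);
        err = rewrite_ptes(start, size);
        notifier_range_end(start, start + size);

        return err;
}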
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ab171274ef21..67a71191136e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -9,43 +9,357 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/sysctl.h> 10#include <linux/sysctl.h>
11#include <linux/highmem.h> 11#include <linux/highmem.h>
12#include <linux/mmu_notifier.h>
12#include <linux/nodemask.h> 13#include <linux/nodemask.h>
13#include <linux/pagemap.h> 14#include <linux/pagemap.h>
14#include <linux/mempolicy.h> 15#include <linux/mempolicy.h>
15#include <linux/cpuset.h> 16#include <linux/cpuset.h>
16#include <linux/mutex.h> 17#include <linux/mutex.h>
18#include <linux/bootmem.h>
19#include <linux/sysfs.h>
17 20
18#include <asm/page.h> 21#include <asm/page.h>
19#include <asm/pgtable.h> 22#include <asm/pgtable.h>
23#include <asm/io.h>
20 24
21#include <linux/hugetlb.h> 25#include <linux/hugetlb.h>
22#include "internal.h" 26#include "internal.h"
23 27
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 28const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26static unsigned long surplus_huge_pages;
27static unsigned long nr_overcommit_huge_pages;
28unsigned long max_huge_pages;
29unsigned long sysctl_overcommit_huge_pages;
30static struct list_head hugepage_freelists[MAX_NUMNODES];
31static unsigned int nr_huge_pages_node[MAX_NUMNODES];
32static unsigned int free_huge_pages_node[MAX_NUMNODES];
33static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
34static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 29static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35unsigned long hugepages_treat_as_movable; 30unsigned long hugepages_treat_as_movable;
36static int hugetlb_next_nid; 31
32static int max_hstate;
33unsigned int default_hstate_idx;
34struct hstate hstates[HUGE_MAX_HSTATE];
35
36__initdata LIST_HEAD(huge_boot_pages);
37
38/* for command line parsing */
39static struct hstate * __initdata parsed_hstate;
40static unsigned long __initdata default_hstate_max_huge_pages;
41static unsigned long __initdata default_hstate_size;
42
43#define for_each_hstate(h) \
44 for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
37 45
38/* 46/*
39 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 47 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
40 */ 48 */
41static DEFINE_SPINLOCK(hugetlb_lock); 49static DEFINE_SPINLOCK(hugetlb_lock);
42 50
43static void clear_huge_page(struct page *page, unsigned long addr) 51/*
52 * Region tracking -- allows tracking of reservations and instantiated pages
53 * across the pages in a mapping.
54 *
55 * The region data structures are protected by a combination of the mmap_sem
 56 * and the hugetlb_instantiation_mutex. To access or modify a region, the caller
57 * must either hold the mmap_sem for write, or the mmap_sem for read and
58 * the hugetlb_instantiation mutex:
59 *
60 * down_write(&mm->mmap_sem);
61 * or
62 * down_read(&mm->mmap_sem);
63 * mutex_lock(&hugetlb_instantiation_mutex);
64 */
65struct file_region {
66 struct list_head link;
67 long from;
68 long to;
69};
70
71static long region_add(struct list_head *head, long f, long t)
72{
73 struct file_region *rg, *nrg, *trg;
74
75 /* Locate the region we are either in or before. */
76 list_for_each_entry(rg, head, link)
77 if (f <= rg->to)
78 break;
79
80 /* Round our left edge to the current segment if it encloses us. */
81 if (f > rg->from)
82 f = rg->from;
83
84 /* Check for and consume any regions we now overlap with. */
85 nrg = rg;
86 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
87 if (&rg->link == head)
88 break;
89 if (rg->from > t)
90 break;
91
 92 /* If this area reaches higher, then extend our area to
93 * include it completely. If this is not the first area
94 * which we intend to reuse, free it. */
95 if (rg->to > t)
96 t = rg->to;
97 if (rg != nrg) {
98 list_del(&rg->link);
99 kfree(rg);
100 }
101 }
102 nrg->from = f;
103 nrg->to = t;
104 return 0;
105}
106
107static long region_chg(struct list_head *head, long f, long t)
108{
109 struct file_region *rg, *nrg;
110 long chg = 0;
111
112 /* Locate the region we are before or in. */
113 list_for_each_entry(rg, head, link)
114 if (f <= rg->to)
115 break;
116
 117 /* If we are below the current region, then a new region is required.
 118 * Subtle: allocate a new region at the position but make it zero
119 * size such that we can guarantee to record the reservation. */
120 if (&rg->link == head || t < rg->from) {
121 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
122 if (!nrg)
123 return -ENOMEM;
124 nrg->from = f;
125 nrg->to = f;
126 INIT_LIST_HEAD(&nrg->link);
127 list_add(&nrg->link, rg->link.prev);
128
129 return t - f;
130 }
131
132 /* Round our left edge to the current segment if it encloses us. */
133 if (f > rg->from)
134 f = rg->from;
135 chg = t - f;
136
137 /* Check for and consume any regions we now overlap with. */
138 list_for_each_entry(rg, rg->link.prev, link) {
139 if (&rg->link == head)
140 break;
141 if (rg->from > t)
142 return chg;
143
 144 /* We overlap with this area; if it extends further than
145 * us then we must extend ourselves. Account for its
146 * existing reservation. */
147 if (rg->to > t) {
148 chg += rg->to - t;
149 t = rg->to;
150 }
151 chg -= rg->to - rg->from;
152 }
153 return chg;
154}
155
156static long region_truncate(struct list_head *head, long end)
157{
158 struct file_region *rg, *trg;
159 long chg = 0;
160
161 /* Locate the region we are either in or before. */
162 list_for_each_entry(rg, head, link)
163 if (end <= rg->to)
164 break;
165 if (&rg->link == head)
166 return 0;
167
168 /* If we are in the middle of a region then adjust it. */
169 if (end > rg->from) {
170 chg = rg->to - end;
171 rg->to = end;
172 rg = list_entry(rg->link.next, typeof(*rg), link);
173 }
174
175 /* Drop any remaining regions. */
176 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
177 if (&rg->link == head)
178 break;
179 chg += rg->to - rg->from;
180 list_del(&rg->link);
181 kfree(rg);
182 }
183 return chg;
184}
185
186static long region_count(struct list_head *head, long f, long t)
187{
188 struct file_region *rg;
189 long chg = 0;
190
191 /* Locate each segment we overlap with, and count that overlap. */
192 list_for_each_entry(rg, head, link) {
193 int seg_from;
194 int seg_to;
195
196 if (rg->to <= f)
197 continue;
198 if (rg->from >= t)
199 break;
200
201 seg_from = max(rg->from, f);
202 seg_to = min(rg->to, t);
203
204 chg += seg_to - seg_from;
205 }
206
207 return chg;
208}
209
210/*
211 * Convert the address within this vma to the page offset within
212 * the mapping, in pagecache page units; huge pages here.
213 */
214static pgoff_t vma_hugecache_offset(struct hstate *h,
215 struct vm_area_struct *vma, unsigned long address)
216{
217 return ((address - vma->vm_start) >> huge_page_shift(h)) +
218 (vma->vm_pgoff >> huge_page_order(h));
219}
220
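
vma_hugecache_offset() turns a faulting address into the file's page-cache index measured in huge pages: the offset into the VMA is shifted down by the huge page size, and vm_pgoff (kept in base-page units) is shifted down by the huge page order. A quick worked example, assuming 2 MB huge pages on 4 KB base pages (the sizes are an assumption, not fixed by the code above):

#include <stdio.h>

int main(void)
{
        unsigned long huge_shift = 21;                  /* 2 MB huge pages (assumed)      */
        unsigned long huge_order = huge_shift - 12;     /* 4 KB base pages -> order 9     */
        unsigned long vm_start   = 0x40000000UL;
        unsigned long vm_pgoff   = 512;                 /* file offset 2 MB in 4 KB units */
        unsigned long address    = vm_start + (5UL << 20);  /* fault 5 MB into the VMA   */

        unsigned long idx = ((address - vm_start) >> huge_shift) +
                            (vm_pgoff >> huge_order);

        printf("huge-page index in the file: %lu\n", idx);  /* 2 + 1 = 3 */
        return 0;
}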
221/*
222 * Flags for MAP_PRIVATE reservations. These are stored in the bottom
223 * bits of the reservation map pointer, which are always clear due to
224 * alignment.
225 */
226#define HPAGE_RESV_OWNER (1UL << 0)
227#define HPAGE_RESV_UNMAPPED (1UL << 1)
228#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
229
230/*
231 * These helpers are used to track how many pages are reserved for
232 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 233 * is guaranteed to have its future faults succeed.
234 *
235 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
236 * the reserve counters are updated with the hugetlb_lock held. It is safe
237 * to reset the VMA at fork() time as it is not in use yet and there is no
238 * chance of the global counters getting corrupted as a result of the values.
239 *
240 * The private mapping reservation is represented in a subtly different
241 * manner to a shared mapping. A shared mapping has a region map associated
242 * with the underlying file, this region map represents the backing file
 243 * pages which have ever had a reservation assigned; this persists even
244 * after the page is instantiated. A private mapping has a region map
245 * associated with the original mmap which is attached to all VMAs which
246 * reference it, this region map represents those offsets which have consumed
 247 * reservation, i.e. where pages have been instantiated.
248 */
249static unsigned long get_vma_private_data(struct vm_area_struct *vma)
250{
251 return (unsigned long)vma->vm_private_data;
252}
253
254static void set_vma_private_data(struct vm_area_struct *vma,
255 unsigned long value)
256{
257 vma->vm_private_data = (void *)value;
258}
259
260struct resv_map {
261 struct kref refs;
262 struct list_head regions;
263};
264
265struct resv_map *resv_map_alloc(void)
266{
267 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
268 if (!resv_map)
269 return NULL;
270
271 kref_init(&resv_map->refs);
272 INIT_LIST_HEAD(&resv_map->regions);
273
274 return resv_map;
275}
276
277void resv_map_release(struct kref *ref)
278{
279 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
280
281 /* Clear out any active regions before we release the map. */
282 region_truncate(&resv_map->regions, 0);
283 kfree(resv_map);
284}
285
286static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
287{
288 VM_BUG_ON(!is_vm_hugetlb_page(vma));
289 if (!(vma->vm_flags & VM_SHARED))
290 return (struct resv_map *)(get_vma_private_data(vma) &
291 ~HPAGE_RESV_MASK);
292 return 0;
293}
294
295static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
296{
297 VM_BUG_ON(!is_vm_hugetlb_page(vma));
298 VM_BUG_ON(vma->vm_flags & VM_SHARED);
299
300 set_vma_private_data(vma, (get_vma_private_data(vma) &
301 HPAGE_RESV_MASK) | (unsigned long)map);
302}
303
304static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
305{
306 VM_BUG_ON(!is_vm_hugetlb_page(vma));
307 VM_BUG_ON(vma->vm_flags & VM_SHARED);
308
309 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
310}
311
312static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
313{
314 VM_BUG_ON(!is_vm_hugetlb_page(vma));
315
316 return (get_vma_private_data(vma) & flag) != 0;
317}
318
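
Because a resv_map is kmalloc()ed, its address is aligned far beyond 4 bytes, so the two low bits of vm_private_data can carry HPAGE_RESV_OWNER and HPAGE_RESV_UNMAPPED while the remaining bits still hold the pointer; that is all the get/set_vma_private_data(), vma_resv_map() and set_vma_resv_flags() helpers above do. A small stand-alone demonstration of the same pointer-plus-flags packing:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RESV_OWNER      0x1UL
#define RESV_UNMAPPED   0x2UL
#define RESV_MASK       (RESV_OWNER | RESV_UNMAPPED)

int main(void)
{
        void *map = malloc(64);         /* stands in for a struct resv_map */
        uintptr_t priv;

        assert(((uintptr_t)map & RESV_MASK) == 0);      /* allocator alignment frees 2 bits */

        priv = ((uintptr_t)map & ~RESV_MASK) | RESV_OWNER;  /* set_vma_resv_map + flag */

        printf("owner flag set: %s\n", (priv & RESV_OWNER) ? "yes" : "no");
        printf("map pointer:    %p\n", (void *)(priv & ~RESV_MASK));

        free((void *)(priv & ~RESV_MASK));
        return 0;
}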
319/* Decrement the reserved pages in the hugepage pool by one */
320static void decrement_hugepage_resv_vma(struct hstate *h,
321 struct vm_area_struct *vma)
322{
323 if (vma->vm_flags & VM_NORESERVE)
324 return;
325
326 if (vma->vm_flags & VM_SHARED) {
327 /* Shared mappings always use reserves */
328 h->resv_huge_pages--;
329 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
330 /*
331 * Only the process that called mmap() has reserves for
332 * private mappings.
333 */
334 h->resv_huge_pages--;
335 }
336}
337
338/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
339void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
340{
341 VM_BUG_ON(!is_vm_hugetlb_page(vma));
342 if (!(vma->vm_flags & VM_SHARED))
343 vma->vm_private_data = (void *)0;
344}
345
346/* Returns true if the VMA has associated reserve pages */
347static int vma_has_reserves(struct vm_area_struct *vma)
348{
349 if (vma->vm_flags & VM_SHARED)
350 return 1;
351 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
352 return 1;
353 return 0;
354}
355
356static void clear_huge_page(struct page *page,
357 unsigned long addr, unsigned long sz)
44{ 358{
45 int i; 359 int i;
46 360
47 might_sleep(); 361 might_sleep();
48 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { 362 for (i = 0; i < sz/PAGE_SIZE; i++) {
49 cond_resched(); 363 cond_resched();
50 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 364 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
51 } 365 }
@@ -55,42 +369,44 @@ static void copy_huge_page(struct page *dst, struct page *src,
55 unsigned long addr, struct vm_area_struct *vma) 369 unsigned long addr, struct vm_area_struct *vma)
56{ 370{
57 int i; 371 int i;
372 struct hstate *h = hstate_vma(vma);
58 373
59 might_sleep(); 374 might_sleep();
60 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { 375 for (i = 0; i < pages_per_huge_page(h); i++) {
61 cond_resched(); 376 cond_resched();
62 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 377 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
63 } 378 }
64} 379}
65 380
66static void enqueue_huge_page(struct page *page) 381static void enqueue_huge_page(struct hstate *h, struct page *page)
67{ 382{
68 int nid = page_to_nid(page); 383 int nid = page_to_nid(page);
69 list_add(&page->lru, &hugepage_freelists[nid]); 384 list_add(&page->lru, &h->hugepage_freelists[nid]);
70 free_huge_pages++; 385 h->free_huge_pages++;
71 free_huge_pages_node[nid]++; 386 h->free_huge_pages_node[nid]++;
72} 387}
73 388
74static struct page *dequeue_huge_page(void) 389static struct page *dequeue_huge_page(struct hstate *h)
75{ 390{
76 int nid; 391 int nid;
77 struct page *page = NULL; 392 struct page *page = NULL;
78 393
79 for (nid = 0; nid < MAX_NUMNODES; ++nid) { 394 for (nid = 0; nid < MAX_NUMNODES; ++nid) {
80 if (!list_empty(&hugepage_freelists[nid])) { 395 if (!list_empty(&h->hugepage_freelists[nid])) {
81 page = list_entry(hugepage_freelists[nid].next, 396 page = list_entry(h->hugepage_freelists[nid].next,
82 struct page, lru); 397 struct page, lru);
83 list_del(&page->lru); 398 list_del(&page->lru);
84 free_huge_pages--; 399 h->free_huge_pages--;
85 free_huge_pages_node[nid]--; 400 h->free_huge_pages_node[nid]--;
86 break; 401 break;
87 } 402 }
88 } 403 }
89 return page; 404 return page;
90} 405}
91 406
92static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, 407static struct page *dequeue_huge_page_vma(struct hstate *h,
93 unsigned long address) 408 struct vm_area_struct *vma,
409 unsigned long address, int avoid_reserve)
94{ 410{
95 int nid; 411 int nid;
96 struct page *page = NULL; 412 struct page *page = NULL;
@@ -101,18 +417,33 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
101 struct zone *zone; 417 struct zone *zone;
102 struct zoneref *z; 418 struct zoneref *z;
103 419
420 /*
 421 * A child process with MAP_PRIVATE mappings created by its parent
 422 * has no page reserves. This check ensures that reservations are
 423 * not "stolen". The child may still get SIGKILLed.
424 */
425 if (!vma_has_reserves(vma) &&
426 h->free_huge_pages - h->resv_huge_pages == 0)
427 return NULL;
428
429 /* If reserves cannot be used, ensure enough pages are in the pool */
430 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
431 return NULL;
432
104 for_each_zone_zonelist_nodemask(zone, z, zonelist, 433 for_each_zone_zonelist_nodemask(zone, z, zonelist,
105 MAX_NR_ZONES - 1, nodemask) { 434 MAX_NR_ZONES - 1, nodemask) {
106 nid = zone_to_nid(zone); 435 nid = zone_to_nid(zone);
107 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && 436 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
108 !list_empty(&hugepage_freelists[nid])) { 437 !list_empty(&h->hugepage_freelists[nid])) {
109 page = list_entry(hugepage_freelists[nid].next, 438 page = list_entry(h->hugepage_freelists[nid].next,
110 struct page, lru); 439 struct page, lru);
111 list_del(&page->lru); 440 list_del(&page->lru);
112 free_huge_pages--; 441 h->free_huge_pages--;
113 free_huge_pages_node[nid]--; 442 h->free_huge_pages_node[nid]--;
114 if (vma && vma->vm_flags & VM_MAYSHARE) 443
115 resv_huge_pages--; 444 if (!avoid_reserve)
445 decrement_hugepage_resv_vma(h, vma);
446
116 break; 447 break;
117 } 448 }
118 } 449 }
@@ -120,12 +451,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
120 return page; 451 return page;
121} 452}
122 453
123static void update_and_free_page(struct page *page) 454static void update_and_free_page(struct hstate *h, struct page *page)
124{ 455{
125 int i; 456 int i;
126 nr_huge_pages--; 457
127 nr_huge_pages_node[page_to_nid(page)]--; 458 h->nr_huge_pages--;
128 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { 459 h->nr_huge_pages_node[page_to_nid(page)]--;
460 for (i = 0; i < pages_per_huge_page(h); i++) {
129 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 461 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
130 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 462 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
131 1 << PG_private | 1<< PG_writeback); 463 1 << PG_private | 1<< PG_writeback);
@@ -133,11 +465,27 @@ static void update_and_free_page(struct page *page)
133 set_compound_page_dtor(page, NULL); 465 set_compound_page_dtor(page, NULL);
134 set_page_refcounted(page); 466 set_page_refcounted(page);
135 arch_release_hugepage(page); 467 arch_release_hugepage(page);
136 __free_pages(page, HUGETLB_PAGE_ORDER); 468 __free_pages(page, huge_page_order(h));
469}
470
471struct hstate *size_to_hstate(unsigned long size)
472{
473 struct hstate *h;
474
475 for_each_hstate(h) {
476 if (huge_page_size(h) == size)
477 return h;
478 }
479 return NULL;
137} 480}
138 481
139static void free_huge_page(struct page *page) 482static void free_huge_page(struct page *page)
140{ 483{
484 /*
485 * Can't pass hstate in here because it is called from the
486 * compound page destructor.
487 */
488 struct hstate *h = page_hstate(page);
141 int nid = page_to_nid(page); 489 int nid = page_to_nid(page);
142 struct address_space *mapping; 490 struct address_space *mapping;
143 491
@@ -147,12 +495,12 @@ static void free_huge_page(struct page *page)
147 INIT_LIST_HEAD(&page->lru); 495 INIT_LIST_HEAD(&page->lru);
148 496
149 spin_lock(&hugetlb_lock); 497 spin_lock(&hugetlb_lock);
150 if (surplus_huge_pages_node[nid]) { 498 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
151 update_and_free_page(page); 499 update_and_free_page(h, page);
152 surplus_huge_pages--; 500 h->surplus_huge_pages--;
153 surplus_huge_pages_node[nid]--; 501 h->surplus_huge_pages_node[nid]--;
154 } else { 502 } else {
155 enqueue_huge_page(page); 503 enqueue_huge_page(h, page);
156 } 504 }
157 spin_unlock(&hugetlb_lock); 505 spin_unlock(&hugetlb_lock);
158 if (mapping) 506 if (mapping)
@@ -164,7 +512,7 @@ static void free_huge_page(struct page *page)
164 * balanced by operating on them in a round-robin fashion. 512 * balanced by operating on them in a round-robin fashion.
165 * Returns 1 if an adjustment was made. 513 * Returns 1 if an adjustment was made.
166 */ 514 */
167static int adjust_pool_surplus(int delta) 515static int adjust_pool_surplus(struct hstate *h, int delta)
168{ 516{
169 static int prev_nid; 517 static int prev_nid;
170 int nid = prev_nid; 518 int nid = prev_nid;
@@ -177,15 +525,15 @@ static int adjust_pool_surplus(int delta)
177 nid = first_node(node_online_map); 525 nid = first_node(node_online_map);
178 526
179 /* To shrink on this node, there must be a surplus page */ 527 /* To shrink on this node, there must be a surplus page */
180 if (delta < 0 && !surplus_huge_pages_node[nid]) 528 if (delta < 0 && !h->surplus_huge_pages_node[nid])
181 continue; 529 continue;
182 /* Surplus cannot exceed the total number of pages */ 530 /* Surplus cannot exceed the total number of pages */
183 if (delta > 0 && surplus_huge_pages_node[nid] >= 531 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
184 nr_huge_pages_node[nid]) 532 h->nr_huge_pages_node[nid])
185 continue; 533 continue;
186 534
187 surplus_huge_pages += delta; 535 h->surplus_huge_pages += delta;
188 surplus_huge_pages_node[nid] += delta; 536 h->surplus_huge_pages_node[nid] += delta;
189 ret = 1; 537 ret = 1;
190 break; 538 break;
191 } while (nid != prev_nid); 539 } while (nid != prev_nid);
@@ -194,59 +542,74 @@ static int adjust_pool_surplus(int delta)
194 return ret; 542 return ret;
195} 543}
196 544
197static struct page *alloc_fresh_huge_page_node(int nid) 545static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
546{
547 set_compound_page_dtor(page, free_huge_page);
548 spin_lock(&hugetlb_lock);
549 h->nr_huge_pages++;
550 h->nr_huge_pages_node[nid]++;
551 spin_unlock(&hugetlb_lock);
552 put_page(page); /* free it into the hugepage allocator */
553}
554
555static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
198{ 556{
199 struct page *page; 557 struct page *page;
200 558
559 if (h->order >= MAX_ORDER)
560 return NULL;
561
201 page = alloc_pages_node(nid, 562 page = alloc_pages_node(nid,
202 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 563 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
203 __GFP_REPEAT|__GFP_NOWARN, 564 __GFP_REPEAT|__GFP_NOWARN,
204 HUGETLB_PAGE_ORDER); 565 huge_page_order(h));
205 if (page) { 566 if (page) {
206 if (arch_prepare_hugepage(page)) { 567 if (arch_prepare_hugepage(page)) {
207 __free_pages(page, HUGETLB_PAGE_ORDER); 568 __free_pages(page, huge_page_order(h));
208 return NULL; 569 return NULL;
209 } 570 }
210 set_compound_page_dtor(page, free_huge_page); 571 prep_new_huge_page(h, page, nid);
211 spin_lock(&hugetlb_lock);
212 nr_huge_pages++;
213 nr_huge_pages_node[nid]++;
214 spin_unlock(&hugetlb_lock);
215 put_page(page); /* free it into the hugepage allocator */
216 } 572 }
217 573
218 return page; 574 return page;
219} 575}
220 576
221static int alloc_fresh_huge_page(void) 577/*
578 * Use a helper variable to find the next node and then
579 * copy it back to hugetlb_next_nid afterwards:
580 * otherwise there's a window in which a racer might
581 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
582 * But we don't need to use a spin_lock here: it really
583 * doesn't matter if occasionally a racer chooses the
584 * same nid as we do. Move nid forward in the mask even
585 * if we just successfully allocated a hugepage so that
586 * the next caller gets hugepages on the next node.
587 */
588static int hstate_next_node(struct hstate *h)
589{
590 int next_nid;
591 next_nid = next_node(h->hugetlb_next_nid, node_online_map);
592 if (next_nid == MAX_NUMNODES)
593 next_nid = first_node(node_online_map);
594 h->hugetlb_next_nid = next_nid;
595 return next_nid;
596}
597
598static int alloc_fresh_huge_page(struct hstate *h)
222{ 599{
223 struct page *page; 600 struct page *page;
224 int start_nid; 601 int start_nid;
225 int next_nid; 602 int next_nid;
226 int ret = 0; 603 int ret = 0;
227 604
228 start_nid = hugetlb_next_nid; 605 start_nid = h->hugetlb_next_nid;
229 606
230 do { 607 do {
231 page = alloc_fresh_huge_page_node(hugetlb_next_nid); 608 page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
232 if (page) 609 if (page)
233 ret = 1; 610 ret = 1;
234 /* 611 next_nid = hstate_next_node(h);
235 * Use a helper variable to find the next node and then 612 } while (!page && h->hugetlb_next_nid != start_nid);
236 * copy it back to hugetlb_next_nid afterwards:
237 * otherwise there's a window in which a racer might
238 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
239 * But we don't need to use a spin_lock here: it really
240 * doesn't matter if occasionally a racer chooses the
241 * same nid as we do. Move nid forward in the mask even
242 * if we just successfully allocated a hugepage so that
243 * the next caller gets hugepages on the next node.
244 */
245 next_nid = next_node(hugetlb_next_nid, node_online_map);
246 if (next_nid == MAX_NUMNODES)
247 next_nid = first_node(node_online_map);
248 hugetlb_next_nid = next_nid;
249 } while (!page && hugetlb_next_nid != start_nid);
250 613
251 if (ret) 614 if (ret)
252 count_vm_event(HTLB_BUDDY_PGALLOC); 615 count_vm_event(HTLB_BUDDY_PGALLOC);
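
The comment block above, now attached to the factored-out hstate_next_node(), describes a deliberately lock-free round robin: the per-hstate cursor always advances to the next online node, wrapping at the end, whether or not the allocation succeeded, so successive callers spread across nodes and a rare race merely makes two callers pick the same node. Reduced to its essentials (four online, consecutively numbered nodes are assumed):

#include <stdio.h>

static int next_node_rr(int cur, int nr_online)
{
        return (cur + 1) % nr_online;   /* wrap like next_node() + first_node() */
}

int main(void)
{
        int nid = 0;

        for (int i = 0; i < 6; i++) {
                printf("try huge page allocation on node %d\n", nid);
                nid = next_node_rr(nid, 4);     /* advance whether or not it worked */
        }
        return 0;
}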
@@ -256,12 +619,15 @@ static int alloc_fresh_huge_page(void)
256 return ret; 619 return ret;
257} 620}
258 621
259static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, 622static struct page *alloc_buddy_huge_page(struct hstate *h,
260 unsigned long address) 623 struct vm_area_struct *vma, unsigned long address)
261{ 624{
262 struct page *page; 625 struct page *page;
263 unsigned int nid; 626 unsigned int nid;
264 627
628 if (h->order >= MAX_ORDER)
629 return NULL;
630
265 /* 631 /*
266 * Assume we will successfully allocate the surplus page to 632 * Assume we will successfully allocate the surplus page to
267 * prevent racing processes from causing the surplus to exceed 633 * prevent racing processes from causing the surplus to exceed
@@ -286,18 +652,23 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
286 * per-node value is checked there. 652 * per-node value is checked there.
287 */ 653 */
288 spin_lock(&hugetlb_lock); 654 spin_lock(&hugetlb_lock);
289 if (surplus_huge_pages >= nr_overcommit_huge_pages) { 655 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
290 spin_unlock(&hugetlb_lock); 656 spin_unlock(&hugetlb_lock);
291 return NULL; 657 return NULL;
292 } else { 658 } else {
293 nr_huge_pages++; 659 h->nr_huge_pages++;
294 surplus_huge_pages++; 660 h->surplus_huge_pages++;
295 } 661 }
296 spin_unlock(&hugetlb_lock); 662 spin_unlock(&hugetlb_lock);
297 663
298 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 664 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
299 __GFP_REPEAT|__GFP_NOWARN, 665 __GFP_REPEAT|__GFP_NOWARN,
300 HUGETLB_PAGE_ORDER); 666 huge_page_order(h));
667
668 if (page && arch_prepare_hugepage(page)) {
669 __free_pages(page, huge_page_order(h));
670 return NULL;
671 }
301 672
302 spin_lock(&hugetlb_lock); 673 spin_lock(&hugetlb_lock);
303 if (page) { 674 if (page) {
@@ -312,12 +683,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
312 /* 683 /*
313 * We incremented the global counters already 684 * We incremented the global counters already
314 */ 685 */
315 nr_huge_pages_node[nid]++; 686 h->nr_huge_pages_node[nid]++;
316 surplus_huge_pages_node[nid]++; 687 h->surplus_huge_pages_node[nid]++;
317 __count_vm_event(HTLB_BUDDY_PGALLOC); 688 __count_vm_event(HTLB_BUDDY_PGALLOC);
318 } else { 689 } else {
319 nr_huge_pages--; 690 h->nr_huge_pages--;
320 surplus_huge_pages--; 691 h->surplus_huge_pages--;
321 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 692 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
322 } 693 }
323 spin_unlock(&hugetlb_lock); 694 spin_unlock(&hugetlb_lock);
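
alloc_buddy_huge_page() accounts optimistically: it raises nr_huge_pages and surplus_huge_pages under hugetlb_lock, drops the lock for the potentially sleeping page allocation, and lowers the counters again if the allocation fails, which is what keeps concurrent callers from pushing the surplus past nr_overcommit_huge_pages. The same bump-outside-lock-then-roll-back idea in a small pthread sketch (illustrative only, with made-up names):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static long surplus, limit = 8;

static void *try_alloc(void)            /* placeholder for the real allocation */
{
        return malloc(64);
}

static void *alloc_surplus_object(void)
{
        void *p;

        pthread_mutex_lock(&lock);
        if (surplus >= limit) {         /* would exceed the overcommit limit */
                pthread_mutex_unlock(&lock);
                return NULL;
        }
        surplus++;                      /* optimistic: assume we will succeed */
        pthread_mutex_unlock(&lock);

        p = try_alloc();                /* may sleep; must not hold the lock */

        if (!p) {
                pthread_mutex_lock(&lock);
                surplus--;              /* roll back the optimistic bump */
                pthread_mutex_unlock(&lock);
        }
        return p;
}

int main(void)
{
        void *p = alloc_surplus_object();

        printf("surplus objects: %ld (%s)\n", surplus, p ? "allocated" : "failed");
        free(p);
        return 0;
}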
@@ -329,16 +700,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
 329 * Increase the hugetlb pool such that it can accommodate a reservation 700 * Increase the hugetlb pool such that it can accommodate a reservation
330 * of size 'delta'. 701 * of size 'delta'.
331 */ 702 */
332static int gather_surplus_pages(int delta) 703static int gather_surplus_pages(struct hstate *h, int delta)
333{ 704{
334 struct list_head surplus_list; 705 struct list_head surplus_list;
335 struct page *page, *tmp; 706 struct page *page, *tmp;
336 int ret, i; 707 int ret, i;
337 int needed, allocated; 708 int needed, allocated;
338 709
339 needed = (resv_huge_pages + delta) - free_huge_pages; 710 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
340 if (needed <= 0) { 711 if (needed <= 0) {
341 resv_huge_pages += delta; 712 h->resv_huge_pages += delta;
342 return 0; 713 return 0;
343 } 714 }
344 715
@@ -349,7 +720,7 @@ static int gather_surplus_pages(int delta)
349retry: 720retry:
350 spin_unlock(&hugetlb_lock); 721 spin_unlock(&hugetlb_lock);
351 for (i = 0; i < needed; i++) { 722 for (i = 0; i < needed; i++) {
352 page = alloc_buddy_huge_page(NULL, 0); 723 page = alloc_buddy_huge_page(h, NULL, 0);
353 if (!page) { 724 if (!page) {
354 /* 725 /*
355 * We were not able to allocate enough pages to 726 * We were not able to allocate enough pages to
@@ -370,7 +741,8 @@ retry:
370 * because either resv_huge_pages or free_huge_pages may have changed. 741 * because either resv_huge_pages or free_huge_pages may have changed.
371 */ 742 */
372 spin_lock(&hugetlb_lock); 743 spin_lock(&hugetlb_lock);
373 needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); 744 needed = (h->resv_huge_pages + delta) -
745 (h->free_huge_pages + allocated);
374 if (needed > 0) 746 if (needed > 0)
375 goto retry; 747 goto retry;
376 748
@@ -383,7 +755,7 @@ retry:
383 * before they are reserved. 755 * before they are reserved.
384 */ 756 */
385 needed += allocated; 757 needed += allocated;
386 resv_huge_pages += delta; 758 h->resv_huge_pages += delta;
387 ret = 0; 759 ret = 0;
388free: 760free:
389 /* Free the needed pages to the hugetlb pool */ 761 /* Free the needed pages to the hugetlb pool */
@@ -391,7 +763,7 @@ free:
391 if ((--needed) < 0) 763 if ((--needed) < 0)
392 break; 764 break;
393 list_del(&page->lru); 765 list_del(&page->lru);
394 enqueue_huge_page(page); 766 enqueue_huge_page(h, page);
395 } 767 }
396 768
397 /* Free unnecessary surplus pages to the buddy allocator */ 769 /* Free unnecessary surplus pages to the buddy allocator */
@@ -419,7 +791,8 @@ free:
419 * allocated to satisfy the reservation must be explicitly freed if they were 791 * allocated to satisfy the reservation must be explicitly freed if they were
420 * never used. 792 * never used.
421 */ 793 */
422static void return_unused_surplus_pages(unsigned long unused_resv_pages) 794static void return_unused_surplus_pages(struct hstate *h,
795 unsigned long unused_resv_pages)
423{ 796{
424 static int nid = -1; 797 static int nid = -1;
425 struct page *page; 798 struct page *page;
@@ -434,157 +807,269 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
434 unsigned long remaining_iterations = num_online_nodes(); 807 unsigned long remaining_iterations = num_online_nodes();
435 808
436 /* Uncommit the reservation */ 809 /* Uncommit the reservation */
437 resv_huge_pages -= unused_resv_pages; 810 h->resv_huge_pages -= unused_resv_pages;
438 811
439 nr_pages = min(unused_resv_pages, surplus_huge_pages); 812 /* Cannot return gigantic pages currently */
813 if (h->order >= MAX_ORDER)
814 return;
815
816 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
440 817
441 while (remaining_iterations-- && nr_pages) { 818 while (remaining_iterations-- && nr_pages) {
442 nid = next_node(nid, node_online_map); 819 nid = next_node(nid, node_online_map);
443 if (nid == MAX_NUMNODES) 820 if (nid == MAX_NUMNODES)
444 nid = first_node(node_online_map); 821 nid = first_node(node_online_map);
445 822
446 if (!surplus_huge_pages_node[nid]) 823 if (!h->surplus_huge_pages_node[nid])
447 continue; 824 continue;
448 825
449 if (!list_empty(&hugepage_freelists[nid])) { 826 if (!list_empty(&h->hugepage_freelists[nid])) {
450 page = list_entry(hugepage_freelists[nid].next, 827 page = list_entry(h->hugepage_freelists[nid].next,
451 struct page, lru); 828 struct page, lru);
452 list_del(&page->lru); 829 list_del(&page->lru);
453 update_and_free_page(page); 830 update_and_free_page(h, page);
454 free_huge_pages--; 831 h->free_huge_pages--;
455 free_huge_pages_node[nid]--; 832 h->free_huge_pages_node[nid]--;
456 surplus_huge_pages--; 833 h->surplus_huge_pages--;
457 surplus_huge_pages_node[nid]--; 834 h->surplus_huge_pages_node[nid]--;
458 nr_pages--; 835 nr_pages--;
459 remaining_iterations = num_online_nodes(); 836 remaining_iterations = num_online_nodes();
460 } 837 }
461 } 838 }
462} 839}
463 840
841/*
842 * Determine if the huge page at addr within the vma has an associated
 843 * reservation. Where it does not, we will need to logically increase
844 * reservation and actually increase quota before an allocation can occur.
 845 * Where any new reservation would be required, the reservation change is
 846 * prepared, but not committed. Once the page has been quota'd, allocated
 847 * and instantiated, the change should be committed via vma_commit_reservation.
848 * No action is required on failure.
849 */
850static int vma_needs_reservation(struct hstate *h,
851 struct vm_area_struct *vma, unsigned long addr)
852{
853 struct address_space *mapping = vma->vm_file->f_mapping;
854 struct inode *inode = mapping->host;
855
856 if (vma->vm_flags & VM_SHARED) {
857 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
858 return region_chg(&inode->i_mapping->private_list,
859 idx, idx + 1);
860
861 } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
862 return 1;
863
864 } else {
865 int err;
866 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
867 struct resv_map *reservations = vma_resv_map(vma);
464 868
465static struct page *alloc_huge_page_shared(struct vm_area_struct *vma, 869 err = region_chg(&reservations->regions, idx, idx + 1);
466 unsigned long addr) 870 if (err < 0)
871 return err;
872 return 0;
873 }
874}
875static void vma_commit_reservation(struct hstate *h,
876 struct vm_area_struct *vma, unsigned long addr)
467{ 877{
468 struct page *page; 878 struct address_space *mapping = vma->vm_file->f_mapping;
879 struct inode *inode = mapping->host;
469 880
470 spin_lock(&hugetlb_lock); 881 if (vma->vm_flags & VM_SHARED) {
471 page = dequeue_huge_page_vma(vma, addr); 882 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
472 spin_unlock(&hugetlb_lock); 883 region_add(&inode->i_mapping->private_list, idx, idx + 1);
473 return page ? page : ERR_PTR(-VM_FAULT_OOM); 884
885 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
886 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
887 struct resv_map *reservations = vma_resv_map(vma);
888
889 /* Mark this page used in the map. */
890 region_add(&reservations->regions, idx, idx + 1);
891 }
474} 892}
475 893
476static struct page *alloc_huge_page_private(struct vm_area_struct *vma, 894static struct page *alloc_huge_page(struct vm_area_struct *vma,
477 unsigned long addr) 895 unsigned long addr, int avoid_reserve)
478{ 896{
479 struct page *page = NULL; 897 struct hstate *h = hstate_vma(vma);
898 struct page *page;
899 struct address_space *mapping = vma->vm_file->f_mapping;
900 struct inode *inode = mapping->host;
901 unsigned int chg;
480 902
481 if (hugetlb_get_quota(vma->vm_file->f_mapping, 1)) 903 /*
482 return ERR_PTR(-VM_FAULT_SIGBUS); 904 * Processes that did not create the mapping will have no reserves and
905 * will not have accounted against quota. Check that the quota can be
906 * made before satisfying the allocation
907 * MAP_NORESERVE mappings may also need pages and quota allocated
908 * if no reserve mapping overlaps.
909 */
910 chg = vma_needs_reservation(h, vma, addr);
911 if (chg < 0)
912 return ERR_PTR(chg);
913 if (chg)
914 if (hugetlb_get_quota(inode->i_mapping, chg))
915 return ERR_PTR(-ENOSPC);
483 916
484 spin_lock(&hugetlb_lock); 917 spin_lock(&hugetlb_lock);
485 if (free_huge_pages > resv_huge_pages) 918 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
486 page = dequeue_huge_page_vma(vma, addr);
487 spin_unlock(&hugetlb_lock); 919 spin_unlock(&hugetlb_lock);
920
488 if (!page) { 921 if (!page) {
489 page = alloc_buddy_huge_page(vma, addr); 922 page = alloc_buddy_huge_page(h, vma, addr);
490 if (!page) { 923 if (!page) {
491 hugetlb_put_quota(vma->vm_file->f_mapping, 1); 924 hugetlb_put_quota(inode->i_mapping, chg);
492 return ERR_PTR(-VM_FAULT_OOM); 925 return ERR_PTR(-VM_FAULT_OOM);
493 } 926 }
494 } 927 }
928
929 set_page_refcounted(page);
930 set_page_private(page, (unsigned long) mapping);
931
932 vma_commit_reservation(h, vma, addr);
933
495 return page; 934 return page;
496} 935}
497 936
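
Taken together, the new alloc_huge_page() is a charge-then-commit sequence: vma_needs_reservation() says how many pages the fault would add, hugetlb_get_quota() charges the filesystem quota for that many, the page comes from the pool or from a surplus allocation, and only on success does vma_commit_reservation() record the consumed offset; on failure the quota is handed back. A stubbed outline of that unwinding, in which every helper is a placeholder rather than the kernel function it is named after:

#include <stdio.h>

static long needs_reservation(void)   { return 1; }            /* vma_needs_reservation() */
static int  charge_quota(long n)      { (void)n; return 0; }    /* hugetlb_get_quota()     */
static void return_quota(long n)      { (void)n; }              /* hugetlb_put_quota()     */
static void commit_reservation(void)  { }                       /* vma_commit_reservation()*/
static void *get_page_from_pool(void)
{
        static char fake_page;          /* dequeue or surplus allocation stand-in */
        return &fake_page;
}

static void *alloc_huge_page_sketch(void)
{
        long chg = needs_reservation();
        void *page;

        if (chg < 0)
                return NULL;            /* ERR_PTR(chg) in the real code */
        if (chg && charge_quota(chg))
                return NULL;            /* ERR_PTR(-ENOSPC)              */

        page = get_page_from_pool();
        if (!page) {
                return_quota(chg);      /* undo the charge on failure    */
                return NULL;            /* ERR_PTR(-VM_FAULT_OOM)        */
        }

        commit_reservation();           /* record the consumed offset    */
        return page;
}

int main(void)
{
        printf("allocation %s\n", alloc_huge_page_sketch() ? "succeeded" : "failed");
        return 0;
}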
498static struct page *alloc_huge_page(struct vm_area_struct *vma, 937__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
499 unsigned long addr)
500{ 938{
501 struct page *page; 939 struct huge_bootmem_page *m;
502 struct address_space *mapping = vma->vm_file->f_mapping; 940 int nr_nodes = nodes_weight(node_online_map);
503 941
504 if (vma->vm_flags & VM_MAYSHARE) 942 while (nr_nodes) {
505 page = alloc_huge_page_shared(vma, addr); 943 void *addr;
506 else 944
507 page = alloc_huge_page_private(vma, addr); 945 addr = __alloc_bootmem_node_nopanic(
946 NODE_DATA(h->hugetlb_next_nid),
947 huge_page_size(h), huge_page_size(h), 0);
508 948
509 if (!IS_ERR(page)) { 949 if (addr) {
510 set_page_refcounted(page); 950 /*
511 set_page_private(page, (unsigned long) mapping); 951 * Use the beginning of the huge page to store the
952 * huge_bootmem_page struct (until gather_bootmem
953 * puts them into the mem_map).
954 */
955 m = addr;
956 if (m)
957 goto found;
958 }
959 hstate_next_node(h);
960 nr_nodes--;
512 } 961 }
513 return page; 962 return 0;
963
964found:
965 BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
966 /* Put them into a private list first because mem_map is not up yet */
967 list_add(&m->list, &huge_boot_pages);
968 m->hstate = h;
969 return 1;
514} 970}
515 971
516static int __init hugetlb_init(void) 972/* Put bootmem huge pages into the standard lists after mem_map is up */
973static void __init gather_bootmem_prealloc(void)
517{ 974{
518 unsigned long i; 975 struct huge_bootmem_page *m;
519 976
520 if (HPAGE_SHIFT == 0) 977 list_for_each_entry(m, &huge_boot_pages, list) {
521 return 0; 978 struct page *page = virt_to_page(m);
522 979 struct hstate *h = m->hstate;
523 for (i = 0; i < MAX_NUMNODES; ++i) 980 __ClearPageReserved(page);
524 INIT_LIST_HEAD(&hugepage_freelists[i]); 981 WARN_ON(page_count(page) != 1);
982 prep_compound_page(page, h->order);
983 prep_new_huge_page(h, page, page_to_nid(page));
984 }
985}
525 986
526 hugetlb_next_nid = first_node(node_online_map); 987static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
988{
989 unsigned long i;
527 990
528 for (i = 0; i < max_huge_pages; ++i) { 991 for (i = 0; i < h->max_huge_pages; ++i) {
529 if (!alloc_fresh_huge_page()) 992 if (h->order >= MAX_ORDER) {
993 if (!alloc_bootmem_huge_page(h))
994 break;
995 } else if (!alloc_fresh_huge_page(h))
530 break; 996 break;
531 } 997 }
532 max_huge_pages = free_huge_pages = nr_huge_pages = i; 998 h->max_huge_pages = i;
533 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
534 return 0;
535} 999}
536module_init(hugetlb_init);
537 1000
538static int __init hugetlb_setup(char *s) 1001static void __init hugetlb_init_hstates(void)
539{ 1002{
540 if (sscanf(s, "%lu", &max_huge_pages) <= 0) 1003 struct hstate *h;
541 max_huge_pages = 0; 1004
542 return 1; 1005 for_each_hstate(h) {
1006 /* oversize hugepages were init'ed in early boot */
1007 if (h->order < MAX_ORDER)
1008 hugetlb_hstate_alloc_pages(h);
1009 }
543} 1010}
544__setup("hugepages=", hugetlb_setup);
545 1011
546static unsigned int cpuset_mems_nr(unsigned int *array) 1012static char * __init memfmt(char *buf, unsigned long n)
547{ 1013{
548 int node; 1014 if (n >= (1UL << 30))
549 unsigned int nr = 0; 1015 sprintf(buf, "%lu GB", n >> 30);
550 1016 else if (n >= (1UL << 20))
551 for_each_node_mask(node, cpuset_current_mems_allowed) 1017 sprintf(buf, "%lu MB", n >> 20);
552 nr += array[node]; 1018 else
1019 sprintf(buf, "%lu KB", n >> 10);
1020 return buf;
1021}
553 1022
554 return nr; 1023static void __init report_hugepages(void)
1024{
1025 struct hstate *h;
1026
1027 for_each_hstate(h) {
1028 char buf[32];
1029 printk(KERN_INFO "HugeTLB registered %s page size, "
1030 "pre-allocated %ld pages\n",
1031 memfmt(buf, huge_page_size(h)),
1032 h->free_huge_pages);
1033 }
555} 1034}
556 1035
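
memfmt() simply scales the byte count by the largest unit that fits, and report_hugepages() prints the result once per hstate at boot. A user-space copy of the helper shows the strings it produces:

#include <stdio.h>

static char *memfmt(char *buf, unsigned long n)
{
        if (n >= (1UL << 30))
                sprintf(buf, "%lu GB", n >> 30);
        else if (n >= (1UL << 20))
                sprintf(buf, "%lu MB", n >> 20);
        else
                sprintf(buf, "%lu KB", n >> 10);
        return buf;
}

int main(void)
{
        char buf[32];

        printf("%s\n", memfmt(buf, 2UL << 20));         /* 2 MB  */
        printf("%s\n", memfmt(buf, 1UL << 30));         /* 1 GB  */
        printf("%s\n", memfmt(buf, 16UL << 30));        /* 16 GB */
        return 0;
}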
557#ifdef CONFIG_SYSCTL
558#ifdef CONFIG_HIGHMEM 1036#ifdef CONFIG_HIGHMEM
559static void try_to_free_low(unsigned long count) 1037static void try_to_free_low(struct hstate *h, unsigned long count)
560{ 1038{
561 int i; 1039 int i;
562 1040
1041 if (h->order >= MAX_ORDER)
1042 return;
1043
563 for (i = 0; i < MAX_NUMNODES; ++i) { 1044 for (i = 0; i < MAX_NUMNODES; ++i) {
564 struct page *page, *next; 1045 struct page *page, *next;
565 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 1046 struct list_head *freel = &h->hugepage_freelists[i];
566 if (count >= nr_huge_pages) 1047 list_for_each_entry_safe(page, next, freel, lru) {
1048 if (count >= h->nr_huge_pages)
567 return; 1049 return;
568 if (PageHighMem(page)) 1050 if (PageHighMem(page))
569 continue; 1051 continue;
570 list_del(&page->lru); 1052 list_del(&page->lru);
571 update_and_free_page(page); 1053 update_and_free_page(h, page);
572 free_huge_pages--; 1054 h->free_huge_pages--;
573 free_huge_pages_node[page_to_nid(page)]--; 1055 h->free_huge_pages_node[page_to_nid(page)]--;
574 } 1056 }
575 } 1057 }
576} 1058}
577#else 1059#else
578static inline void try_to_free_low(unsigned long count) 1060static inline void try_to_free_low(struct hstate *h, unsigned long count)
579{ 1061{
580} 1062}
581#endif 1063#endif
582 1064
583#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) 1065#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
584static unsigned long set_max_huge_pages(unsigned long count) 1066static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
585{ 1067{
586 unsigned long min_count, ret; 1068 unsigned long min_count, ret;
587 1069
1070 if (h->order >= MAX_ORDER)
1071 return h->max_huge_pages;
1072
588 /* 1073 /*
589 * Increase the pool size 1074 * Increase the pool size
590 * First take pages out of surplus state. Then make up the 1075 * First take pages out of surplus state. Then make up the
@@ -597,20 +1082,19 @@ static unsigned long set_max_huge_pages(unsigned long count)
597 * within all the constraints specified by the sysctls. 1082 * within all the constraints specified by the sysctls.
598 */ 1083 */
599 spin_lock(&hugetlb_lock); 1084 spin_lock(&hugetlb_lock);
600 while (surplus_huge_pages && count > persistent_huge_pages) { 1085 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
601 if (!adjust_pool_surplus(-1)) 1086 if (!adjust_pool_surplus(h, -1))
602 break; 1087 break;
603 } 1088 }
604 1089
605 while (count > persistent_huge_pages) { 1090 while (count > persistent_huge_pages(h)) {
606 int ret;
607 /* 1091 /*
608 * If this allocation races such that we no longer need the 1092 * If this allocation races such that we no longer need the
609 * page, free_huge_page will handle it by freeing the page 1093 * page, free_huge_page will handle it by freeing the page
610 * and reducing the surplus. 1094 * and reducing the surplus.
611 */ 1095 */
612 spin_unlock(&hugetlb_lock); 1096 spin_unlock(&hugetlb_lock);
613 ret = alloc_fresh_huge_page(); 1097 ret = alloc_fresh_huge_page(h);
614 spin_lock(&hugetlb_lock); 1098 spin_lock(&hugetlb_lock);
615 if (!ret) 1099 if (!ret)
616 goto out; 1100 goto out;
@@ -632,31 +1116,305 @@ static unsigned long set_max_huge_pages(unsigned long count)
632 * and won't grow the pool anywhere else. Not until one of the 1116 * and won't grow the pool anywhere else. Not until one of the
633 * sysctls are changed, or the surplus pages go out of use. 1117 * sysctls are changed, or the surplus pages go out of use.
634 */ 1118 */
635 min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; 1119 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
636 min_count = max(count, min_count); 1120 min_count = max(count, min_count);
637 try_to_free_low(min_count); 1121 try_to_free_low(h, min_count);
638 while (min_count < persistent_huge_pages) { 1122 while (min_count < persistent_huge_pages(h)) {
639 struct page *page = dequeue_huge_page(); 1123 struct page *page = dequeue_huge_page(h);
640 if (!page) 1124 if (!page)
641 break; 1125 break;
642 update_and_free_page(page); 1126 update_and_free_page(h, page);
643 } 1127 }
644 while (count < persistent_huge_pages) { 1128 while (count < persistent_huge_pages(h)) {
645 if (!adjust_pool_surplus(1)) 1129 if (!adjust_pool_surplus(h, 1))
646 break; 1130 break;
647 } 1131 }
648out: 1132out:
649 ret = persistent_huge_pages; 1133 ret = persistent_huge_pages(h);
650 spin_unlock(&hugetlb_lock); 1134 spin_unlock(&hugetlb_lock);
651 return ret; 1135 return ret;
652} 1136}
653 1137
1138#define HSTATE_ATTR_RO(_name) \
1139 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1140
1141#define HSTATE_ATTR(_name) \
1142 static struct kobj_attribute _name##_attr = \
1143 __ATTR(_name, 0644, _name##_show, _name##_store)
1144
1145static struct kobject *hugepages_kobj;
1146static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1147
1148static struct hstate *kobj_to_hstate(struct kobject *kobj)
1149{
1150 int i;
1151 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1152 if (hstate_kobjs[i] == kobj)
1153 return &hstates[i];
1154 BUG();
1155 return NULL;
1156}
1157
1158static ssize_t nr_hugepages_show(struct kobject *kobj,
1159 struct kobj_attribute *attr, char *buf)
1160{
1161 struct hstate *h = kobj_to_hstate(kobj);
1162 return sprintf(buf, "%lu\n", h->nr_huge_pages);
1163}
1164static ssize_t nr_hugepages_store(struct kobject *kobj,
1165 struct kobj_attribute *attr, const char *buf, size_t count)
1166{
1167 int err;
1168 unsigned long input;
1169 struct hstate *h = kobj_to_hstate(kobj);
1170
1171 err = strict_strtoul(buf, 10, &input);
1172 if (err)
1173 return 0;
1174
1175 h->max_huge_pages = set_max_huge_pages(h, input);
1176
1177 return count;
1178}
1179HSTATE_ATTR(nr_hugepages);
1180
1181static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1182 struct kobj_attribute *attr, char *buf)
1183{
1184 struct hstate *h = kobj_to_hstate(kobj);
1185 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1186}
1187static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1188 struct kobj_attribute *attr, const char *buf, size_t count)
1189{
1190 int err;
1191 unsigned long input;
1192 struct hstate *h = kobj_to_hstate(kobj);
1193
1194 err = strict_strtoul(buf, 10, &input);
1195 if (err)
1196 return 0;
1197
1198 spin_lock(&hugetlb_lock);
1199 h->nr_overcommit_huge_pages = input;
1200 spin_unlock(&hugetlb_lock);
1201
1202 return count;
1203}
1204HSTATE_ATTR(nr_overcommit_hugepages);
1205
1206static ssize_t free_hugepages_show(struct kobject *kobj,
1207 struct kobj_attribute *attr, char *buf)
1208{
1209 struct hstate *h = kobj_to_hstate(kobj);
1210 return sprintf(buf, "%lu\n", h->free_huge_pages);
1211}
1212HSTATE_ATTR_RO(free_hugepages);
1213
1214static ssize_t resv_hugepages_show(struct kobject *kobj,
1215 struct kobj_attribute *attr, char *buf)
1216{
1217 struct hstate *h = kobj_to_hstate(kobj);
1218 return sprintf(buf, "%lu\n", h->resv_huge_pages);
1219}
1220HSTATE_ATTR_RO(resv_hugepages);
1221
1222static ssize_t surplus_hugepages_show(struct kobject *kobj,
1223 struct kobj_attribute *attr, char *buf)
1224{
1225 struct hstate *h = kobj_to_hstate(kobj);
1226 return sprintf(buf, "%lu\n", h->surplus_huge_pages);
1227}
1228HSTATE_ATTR_RO(surplus_hugepages);
1229
1230static struct attribute *hstate_attrs[] = {
1231 &nr_hugepages_attr.attr,
1232 &nr_overcommit_hugepages_attr.attr,
1233 &free_hugepages_attr.attr,
1234 &resv_hugepages_attr.attr,
1235 &surplus_hugepages_attr.attr,
1236 NULL,
1237};
1238
1239static struct attribute_group hstate_attr_group = {
1240 .attrs = hstate_attrs,
1241};
1242
1243static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
1244{
1245 int retval;
1246
1247 hstate_kobjs[h - hstates] = kobject_create_and_add(h->name,
1248 hugepages_kobj);
1249 if (!hstate_kobjs[h - hstates])
1250 return -ENOMEM;
1251
1252 retval = sysfs_create_group(hstate_kobjs[h - hstates],
1253 &hstate_attr_group);
1254 if (retval)
1255 kobject_put(hstate_kobjs[h - hstates]);
1256
1257 return retval;
1258}
1259
1260static void __init hugetlb_sysfs_init(void)
1261{
1262 struct hstate *h;
1263 int err;
1264
1265 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
1266 if (!hugepages_kobj)
1267 return;
1268
1269 for_each_hstate(h) {
1270 err = hugetlb_sysfs_add_hstate(h);
1271 if (err)
1272 printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
1273 h->name);
1274 }
1275}
1276
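
A minimal userspace sketch (not part of this patch) of reading the per-hstate counters that hugetlb_sysfs_init() exposes under /sys/kernel/mm/hugepages/; the directory name hugepages-2048kB is an assumption that depends on the architecture's supported huge page sizes.

#include <stdio.h>

int main(void)
{
	/* Assumed path; on other architectures the size component differs. */
	const char *base = "/sys/kernel/mm/hugepages/hugepages-2048kB";
	const char *files[] = { "nr_hugepages", "nr_overcommit_hugepages",
				"free_hugepages", "resv_hugepages",
				"surplus_hugepages" };
	char path[256];
	unsigned long val;
	size_t i;

	for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
		FILE *f;

		snprintf(path, sizeof(path), "%s/%s", base, files[i]);
		f = fopen(path, "r");
		if (!f)
			continue;	/* hstate directory or file not present */
		if (fscanf(f, "%lu", &val) == 1)
			printf("%s = %lu\n", files[i], val);
		fclose(f);
	}
	return 0;
}
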
1277static void __exit hugetlb_exit(void)
1278{
1279 struct hstate *h;
1280
1281 for_each_hstate(h) {
1282 kobject_put(hstate_kobjs[h - hstates]);
1283 }
1284
1285 kobject_put(hugepages_kobj);
1286}
1287module_exit(hugetlb_exit);
1288
1289static int __init hugetlb_init(void)
1290{
 1291 /* Some platforms decide whether they support huge pages at boot
 1292 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
 1293 * there is no such support.
1294 */
1295 if (HPAGE_SHIFT == 0)
1296 return 0;
1297
1298 if (!size_to_hstate(default_hstate_size)) {
1299 default_hstate_size = HPAGE_SIZE;
1300 if (!size_to_hstate(default_hstate_size))
1301 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1302 }
1303 default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
1304 if (default_hstate_max_huge_pages)
1305 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1306
1307 hugetlb_init_hstates();
1308
1309 gather_bootmem_prealloc();
1310
1311 report_hugepages();
1312
1313 hugetlb_sysfs_init();
1314
1315 return 0;
1316}
1317module_init(hugetlb_init);
1318
 1319/* Should be called when processing a hugepagesz=... option */
1320void __init hugetlb_add_hstate(unsigned order)
1321{
1322 struct hstate *h;
1323 unsigned long i;
1324
1325 if (size_to_hstate(PAGE_SIZE << order)) {
1326 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1327 return;
1328 }
1329 BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
1330 BUG_ON(order == 0);
1331 h = &hstates[max_hstate++];
1332 h->order = order;
1333 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1334 h->nr_huge_pages = 0;
1335 h->free_huge_pages = 0;
1336 for (i = 0; i < MAX_NUMNODES; ++i)
1337 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1338 h->hugetlb_next_nid = first_node(node_online_map);
1339 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1340 huge_page_size(h)/1024);
1341
1342 parsed_hstate = h;
1343}
1344
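
An illustrative check of the order/mask/name arithmetic above; order 9 and PAGE_SHIFT 12 (the common x86 2 MB case) are assumptions, not values taken from this patch.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int order = 9, page_shift = 12;		/* assumed x86 2 MB case */
	uint64_t size = 1ULL << (order + page_shift);		/* huge_page_size(h) */
	uint64_t mask = ~((1ULL << (order + page_shift)) - 1);	/* h->mask */

	assert(size == 2 * 1024 * 1024);		/* 2 MB huge page */
	assert((0x200000ULL & mask) == 0x200000ULL);	/* aligned address preserved */
	assert((0x2fffffULL & mask) == 0x200000ULL);	/* in-page offset bits cleared */
	printf("hugepages-%llukB\n", (unsigned long long)(size / 1024));
	return 0;
}
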
1345static int __init hugetlb_nrpages_setup(char *s)
1346{
1347 unsigned long *mhp;
1348 static unsigned long *last_mhp;
1349
1350 /*
1351 * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
1352 * so this hugepages= parameter goes to the "default hstate".
1353 */
1354 if (!max_hstate)
1355 mhp = &default_hstate_max_huge_pages;
1356 else
1357 mhp = &parsed_hstate->max_huge_pages;
1358
1359 if (mhp == last_mhp) {
1360 printk(KERN_WARNING "hugepages= specified twice without "
1361 "interleaving hugepagesz=, ignoring\n");
1362 return 1;
1363 }
1364
1365 if (sscanf(s, "%lu", mhp) <= 0)
1366 *mhp = 0;
1367
1368 /*
1369 * Global state is always initialized later in hugetlb_init.
1370 * But we need to allocate >= MAX_ORDER hstates here early to still
1371 * use the bootmem allocator.
1372 */
1373 if (max_hstate && parsed_hstate->order >= MAX_ORDER)
1374 hugetlb_hstate_alloc_pages(parsed_hstate);
1375
1376 last_mhp = mhp;
1377
1378 return 1;
1379}
1380__setup("hugepages=", hugetlb_nrpages_setup);
1381
1382static int __init hugetlb_default_setup(char *s)
1383{
1384 default_hstate_size = memparse(s, &s);
1385 return 1;
1386}
1387__setup("default_hugepagesz=", hugetlb_default_setup);
1388
1389static unsigned int cpuset_mems_nr(unsigned int *array)
1390{
1391 int node;
1392 unsigned int nr = 0;
1393
1394 for_each_node_mask(node, cpuset_current_mems_allowed)
1395 nr += array[node];
1396
1397 return nr;
1398}
1399
1400#ifdef CONFIG_SYSCTL
654int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1401int hugetlb_sysctl_handler(struct ctl_table *table, int write,
655 struct file *file, void __user *buffer, 1402 struct file *file, void __user *buffer,
656 size_t *length, loff_t *ppos) 1403 size_t *length, loff_t *ppos)
657{ 1404{
1405 struct hstate *h = &default_hstate;
1406 unsigned long tmp;
1407
1408 if (!write)
1409 tmp = h->max_huge_pages;
1410
1411 table->data = &tmp;
1412 table->maxlen = sizeof(unsigned long);
658 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1413 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
659 max_huge_pages = set_max_huge_pages(max_huge_pages); 1414
1415 if (write)
1416 h->max_huge_pages = set_max_huge_pages(h, tmp);
1417
660 return 0; 1418 return 0;
661} 1419}
662 1420
@@ -676,10 +1434,22 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
676 struct file *file, void __user *buffer, 1434 struct file *file, void __user *buffer,
677 size_t *length, loff_t *ppos) 1435 size_t *length, loff_t *ppos)
678{ 1436{
1437 struct hstate *h = &default_hstate;
1438 unsigned long tmp;
1439
1440 if (!write)
1441 tmp = h->nr_overcommit_huge_pages;
1442
1443 table->data = &tmp;
1444 table->maxlen = sizeof(unsigned long);
679 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1445 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
680 spin_lock(&hugetlb_lock); 1446
681 nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; 1447 if (write) {
682 spin_unlock(&hugetlb_lock); 1448 spin_lock(&hugetlb_lock);
1449 h->nr_overcommit_huge_pages = tmp;
1450 spin_unlock(&hugetlb_lock);
1451 }
1452
683 return 0; 1453 return 0;
684} 1454}
685 1455
@@ -687,34 +1457,118 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
687 1457
688int hugetlb_report_meminfo(char *buf) 1458int hugetlb_report_meminfo(char *buf)
689{ 1459{
1460 struct hstate *h = &default_hstate;
690 return sprintf(buf, 1461 return sprintf(buf,
691 "HugePages_Total: %5lu\n" 1462 "HugePages_Total: %5lu\n"
692 "HugePages_Free: %5lu\n" 1463 "HugePages_Free: %5lu\n"
693 "HugePages_Rsvd: %5lu\n" 1464 "HugePages_Rsvd: %5lu\n"
694 "HugePages_Surp: %5lu\n" 1465 "HugePages_Surp: %5lu\n"
695 "Hugepagesize: %5lu kB\n", 1466 "Hugepagesize: %5lu kB\n",
696 nr_huge_pages, 1467 h->nr_huge_pages,
697 free_huge_pages, 1468 h->free_huge_pages,
698 resv_huge_pages, 1469 h->resv_huge_pages,
699 surplus_huge_pages, 1470 h->surplus_huge_pages,
700 HPAGE_SIZE/1024); 1471 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
701} 1472}
702 1473
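
A quick arithmetic check of the "Hugepagesize" expression above (illustrative only; order 9 and PAGE_SHIFT 12 are assumed, matching 2 MB huge pages): shifting by 10 fewer bits divides the huge page size by 1024, so the value is reported in kB just as HPAGE_SIZE/1024 was.

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int order = 9, page_shift = 12;		/* assumed 2 MB huge pages */
	unsigned long hpage_size = 1UL << (order + page_shift);
	unsigned long kb = 1UL << (order + page_shift - 10);	/* expression used above */

	assert(kb == hpage_size / 1024);
	printf("Hugepagesize: %5lu kB\n", kb);	/* prints "Hugepagesize:  2048 kB" */
	return 0;
}
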
703int hugetlb_report_node_meminfo(int nid, char *buf) 1474int hugetlb_report_node_meminfo(int nid, char *buf)
704{ 1475{
1476 struct hstate *h = &default_hstate;
705 return sprintf(buf, 1477 return sprintf(buf,
706 "Node %d HugePages_Total: %5u\n" 1478 "Node %d HugePages_Total: %5u\n"
707 "Node %d HugePages_Free: %5u\n" 1479 "Node %d HugePages_Free: %5u\n"
708 "Node %d HugePages_Surp: %5u\n", 1480 "Node %d HugePages_Surp: %5u\n",
709 nid, nr_huge_pages_node[nid], 1481 nid, h->nr_huge_pages_node[nid],
710 nid, free_huge_pages_node[nid], 1482 nid, h->free_huge_pages_node[nid],
711 nid, surplus_huge_pages_node[nid]); 1483 nid, h->surplus_huge_pages_node[nid]);
712} 1484}
713 1485
714/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 1486/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
715unsigned long hugetlb_total_pages(void) 1487unsigned long hugetlb_total_pages(void)
716{ 1488{
717 return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); 1489 struct hstate *h = &default_hstate;
1490 return h->nr_huge_pages * pages_per_huge_page(h);
1491}
1492
1493static int hugetlb_acct_memory(struct hstate *h, long delta)
1494{
1495 int ret = -ENOMEM;
1496
1497 spin_lock(&hugetlb_lock);
1498 /*
1499 * When cpuset is configured, it breaks the strict hugetlb page
 1500 * reservation as the accounting is done on a global variable. Such a
 1501 * reservation is essentially meaningless in the presence of cpusets because
 1502 * the reservation is not checked against page availability for the
 1503 * current cpuset. An application can still be OOM-killed by the kernel
 1504 * for lack of free huge pages in the cpuset that the task is in.
 1505 * Attempting to enforce strict accounting with cpusets is almost
 1506 * impossible (or too ugly) because cpusets are so fluid that tasks and
 1507 * memory nodes can be moved between them dynamically.
1508 *
1509 * The change of semantics for shared hugetlb mapping with cpuset is
1510 * undesirable. However, in order to preserve some of the semantics,
 1511 * we fall back to checking against the current free page availability
 1512 * as a best effort, hopefully minimizing the impact of the semantic
 1513 * change that cpusets introduce.
1514 */
1515 if (delta > 0) {
1516 if (gather_surplus_pages(h, delta) < 0)
1517 goto out;
1518
1519 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
1520 return_unused_surplus_pages(h, delta);
1521 goto out;
1522 }
1523 }
1524
1525 ret = 0;
1526 if (delta < 0)
1527 return_unused_surplus_pages(h, (unsigned long) -delta);
1528
1529out:
1530 spin_unlock(&hugetlb_lock);
1531 return ret;
1532}
1533
1534static void hugetlb_vm_op_open(struct vm_area_struct *vma)
1535{
1536 struct resv_map *reservations = vma_resv_map(vma);
1537
1538 /*
 1539 * This new VMA should share its sibling's reservation map if present.
1540 * The VMA will only ever have a valid reservation map pointer where
1541 * it is being copied for another still existing VMA. As that VMA
 1542 * has a reference to the reservation map, it cannot disappear until
1543 * after this open call completes. It is therefore safe to take a
1544 * new reference here without additional locking.
1545 */
1546 if (reservations)
1547 kref_get(&reservations->refs);
1548}
1549
1550static void hugetlb_vm_op_close(struct vm_area_struct *vma)
1551{
1552 struct hstate *h = hstate_vma(vma);
1553 struct resv_map *reservations = vma_resv_map(vma);
1554 unsigned long reserve;
1555 unsigned long start;
1556 unsigned long end;
1557
1558 if (reservations) {
1559 start = vma_hugecache_offset(h, vma, vma->vm_start);
1560 end = vma_hugecache_offset(h, vma, vma->vm_end);
1561
1562 reserve = (end - start) -
1563 region_count(&reservations->regions, start, end);
1564
1565 kref_put(&reservations->refs, resv_map_release);
1566
1567 if (reserve) {
1568 hugetlb_acct_memory(h, -reserve);
1569 hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
1570 }
1571 }
718} 1572}
719 1573
720/* 1574/*
@@ -731,6 +1585,8 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
731 1585
732struct vm_operations_struct hugetlb_vm_ops = { 1586struct vm_operations_struct hugetlb_vm_ops = {
733 .fault = hugetlb_vm_op_fault, 1587 .fault = hugetlb_vm_op_fault,
1588 .open = hugetlb_vm_op_open,
1589 .close = hugetlb_vm_op_close,
734}; 1590};
735 1591
736static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 1592static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -769,14 +1625,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
769 struct page *ptepage; 1625 struct page *ptepage;
770 unsigned long addr; 1626 unsigned long addr;
771 int cow; 1627 int cow;
1628 struct hstate *h = hstate_vma(vma);
1629 unsigned long sz = huge_page_size(h);
772 1630
773 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 1631 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
774 1632
775 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 1633 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
776 src_pte = huge_pte_offset(src, addr); 1634 src_pte = huge_pte_offset(src, addr);
777 if (!src_pte) 1635 if (!src_pte)
778 continue; 1636 continue;
779 dst_pte = huge_pte_alloc(dst, addr); 1637 dst_pte = huge_pte_alloc(dst, addr, sz);
780 if (!dst_pte) 1638 if (!dst_pte)
781 goto nomem; 1639 goto nomem;
782 1640
@@ -804,7 +1662,7 @@ nomem:
804} 1662}
805 1663
806void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 1664void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
807 unsigned long end) 1665 unsigned long end, struct page *ref_page)
808{ 1666{
809 struct mm_struct *mm = vma->vm_mm; 1667 struct mm_struct *mm = vma->vm_mm;
810 unsigned long address; 1668 unsigned long address;
@@ -812,6 +1670,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
812 pte_t pte; 1670 pte_t pte;
813 struct page *page; 1671 struct page *page;
814 struct page *tmp; 1672 struct page *tmp;
1673 struct hstate *h = hstate_vma(vma);
1674 unsigned long sz = huge_page_size(h);
1675
815 /* 1676 /*
816 * A page gathering list, protected by per file i_mmap_lock. The 1677 * A page gathering list, protected by per file i_mmap_lock. The
817 * lock is used to avoid list corruption from multiple unmapping 1678 * lock is used to avoid list corruption from multiple unmapping
@@ -820,11 +1681,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
820 LIST_HEAD(page_list); 1681 LIST_HEAD(page_list);
821 1682
822 WARN_ON(!is_vm_hugetlb_page(vma)); 1683 WARN_ON(!is_vm_hugetlb_page(vma));
823 BUG_ON(start & ~HPAGE_MASK); 1684 BUG_ON(start & ~huge_page_mask(h));
824 BUG_ON(end & ~HPAGE_MASK); 1685 BUG_ON(end & ~huge_page_mask(h));
825 1686
1687 mmu_notifier_invalidate_range_start(mm, start, end);
826 spin_lock(&mm->page_table_lock); 1688 spin_lock(&mm->page_table_lock);
827 for (address = start; address < end; address += HPAGE_SIZE) { 1689 for (address = start; address < end; address += sz) {
828 ptep = huge_pte_offset(mm, address); 1690 ptep = huge_pte_offset(mm, address);
829 if (!ptep) 1691 if (!ptep)
830 continue; 1692 continue;
@@ -832,6 +1694,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
832 if (huge_pmd_unshare(mm, &address, ptep)) 1694 if (huge_pmd_unshare(mm, &address, ptep))
833 continue; 1695 continue;
834 1696
1697 /*
1698 * If a reference page is supplied, it is because a specific
1699 * page is being unmapped, not a range. Ensure the page we
1700 * are about to unmap is the actual page of interest.
1701 */
1702 if (ref_page) {
1703 pte = huge_ptep_get(ptep);
1704 if (huge_pte_none(pte))
1705 continue;
1706 page = pte_page(pte);
1707 if (page != ref_page)
1708 continue;
1709
1710 /*
1711 * Mark the VMA as having unmapped its page so that
1712 * future faults in this VMA will fail rather than
1713 * looking like data was lost
1714 */
1715 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
1716 }
1717
835 pte = huge_ptep_get_and_clear(mm, address, ptep); 1718 pte = huge_ptep_get_and_clear(mm, address, ptep);
836 if (huge_pte_none(pte)) 1719 if (huge_pte_none(pte))
837 continue; 1720 continue;
@@ -843,6 +1726,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
843 } 1726 }
844 spin_unlock(&mm->page_table_lock); 1727 spin_unlock(&mm->page_table_lock);
845 flush_tlb_range(vma, start, end); 1728 flush_tlb_range(vma, start, end);
1729 mmu_notifier_invalidate_range_end(mm, start, end);
846 list_for_each_entry_safe(page, tmp, &page_list, lru) { 1730 list_for_each_entry_safe(page, tmp, &page_list, lru) {
847 list_del(&page->lru); 1731 list_del(&page->lru);
848 put_page(page); 1732 put_page(page);
@@ -850,31 +1734,71 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
850} 1734}
851 1735
852void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 1736void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
853 unsigned long end) 1737 unsigned long end, struct page *ref_page)
854{ 1738{
1739 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1740 __unmap_hugepage_range(vma, start, end, ref_page);
1741 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
1742}
1743
1744/*
1745 * This is called when the original mapper is failing to COW a MAP_PRIVATE
 1746 * mapping it owns the reserve page for. The intention is to unmap the page
1747 * from other VMAs and let the children be SIGKILLed if they are faulting the
1748 * same region.
1749 */
1750int unmap_ref_private(struct mm_struct *mm,
1751 struct vm_area_struct *vma,
1752 struct page *page,
1753 unsigned long address)
1754{
1755 struct vm_area_struct *iter_vma;
1756 struct address_space *mapping;
1757 struct prio_tree_iter iter;
1758 pgoff_t pgoff;
1759
855 /* 1760 /*
856 * It is undesirable to test vma->vm_file as it should be non-null 1761 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
857 * for valid hugetlb area. However, vm_file will be NULL in the error 1762 * from page cache lookup which is in HPAGE_SIZE units.
858 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
859 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
860 * to clean up. Since no pte has actually been setup, it is safe to
861 * do nothing in this case.
862 */ 1763 */
863 if (vma->vm_file) { 1764 address = address & huge_page_mask(hstate_vma(vma));
864 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 1765 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
865 __unmap_hugepage_range(vma, start, end); 1766 + (vma->vm_pgoff >> PAGE_SHIFT);
866 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 1767 mapping = (struct address_space *)page_private(page);
1768
1769 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1770 /* Do not unmap the current VMA */
1771 if (iter_vma == vma)
1772 continue;
1773
1774 /*
1775 * Unmap the page from other VMAs without their own reserves.
1776 * They get marked to be SIGKILLed if they fault in these
1777 * areas. This is because a future no-page fault on this VMA
1778 * could insert a zeroed page instead of the data existing
 1779 * from the time of fork. This would look like data corruption.
1780 */
1781 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
1782 unmap_hugepage_range(iter_vma,
1783 address, address + HPAGE_SIZE,
1784 page);
867 } 1785 }
1786
1787 return 1;
868} 1788}
869 1789
870static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 1790static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
871 unsigned long address, pte_t *ptep, pte_t pte) 1791 unsigned long address, pte_t *ptep, pte_t pte,
1792 struct page *pagecache_page)
872{ 1793{
1794 struct hstate *h = hstate_vma(vma);
873 struct page *old_page, *new_page; 1795 struct page *old_page, *new_page;
874 int avoidcopy; 1796 int avoidcopy;
1797 int outside_reserve = 0;
875 1798
876 old_page = pte_page(pte); 1799 old_page = pte_page(pte);
877 1800
1801retry_avoidcopy:
878 /* If no-one else is actually using this page, avoid the copy 1802 /* If no-one else is actually using this page, avoid the copy
879 * and just make the page writable */ 1803 * and just make the page writable */
880 avoidcopy = (page_count(old_page) == 1); 1804 avoidcopy = (page_count(old_page) == 1);
@@ -883,11 +1807,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
883 return 0; 1807 return 0;
884 } 1808 }
885 1809
1810 /*
1811 * If the process that created a MAP_PRIVATE mapping is about to
1812 * perform a COW due to a shared page count, attempt to satisfy
1813 * the allocation without using the existing reserves. The pagecache
1814 * page is used to determine if the reserve at this address was
 1815 * consumed or not. If reserves were used, a partially faulted mapping
1816 * at the time of fork() could consume its reserves on COW instead
1817 * of the full address range.
1818 */
1819 if (!(vma->vm_flags & VM_SHARED) &&
1820 is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
1821 old_page != pagecache_page)
1822 outside_reserve = 1;
1823
886 page_cache_get(old_page); 1824 page_cache_get(old_page);
887 new_page = alloc_huge_page(vma, address); 1825 new_page = alloc_huge_page(vma, address, outside_reserve);
888 1826
889 if (IS_ERR(new_page)) { 1827 if (IS_ERR(new_page)) {
890 page_cache_release(old_page); 1828 page_cache_release(old_page);
1829
1830 /*
1831 * If a process owning a MAP_PRIVATE mapping fails to COW,
1832 * it is due to references held by a child and an insufficient
 1833 * huge page pool. To guarantee the original mapper's
1834 * reliability, unmap the page from child processes. The child
1835 * may get SIGKILLed if it later faults.
1836 */
1837 if (outside_reserve) {
1838 BUG_ON(huge_pte_none(pte));
1839 if (unmap_ref_private(mm, vma, old_page, address)) {
1840 BUG_ON(page_count(old_page) != 1);
1841 BUG_ON(huge_pte_none(pte));
1842 goto retry_avoidcopy;
1843 }
1844 WARN_ON_ONCE(1);
1845 }
1846
891 return -PTR_ERR(new_page); 1847 return -PTR_ERR(new_page);
892 } 1848 }
893 1849
@@ -896,7 +1852,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
896 __SetPageUptodate(new_page); 1852 __SetPageUptodate(new_page);
897 spin_lock(&mm->page_table_lock); 1853 spin_lock(&mm->page_table_lock);
898 1854
899 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 1855 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
900 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 1856 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
901 /* Break COW */ 1857 /* Break COW */
902 huge_ptep_clear_flush(vma, address, ptep); 1858 huge_ptep_clear_flush(vma, address, ptep);
@@ -910,19 +1866,44 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
910 return 0; 1866 return 0;
911} 1867}
912 1868
1869/* Return the pagecache page at a given address within a VMA */
1870static struct page *hugetlbfs_pagecache_page(struct hstate *h,
1871 struct vm_area_struct *vma, unsigned long address)
1872{
1873 struct address_space *mapping;
1874 pgoff_t idx;
1875
1876 mapping = vma->vm_file->f_mapping;
1877 idx = vma_hugecache_offset(h, vma, address);
1878
1879 return find_lock_page(mapping, idx);
1880}
1881
913static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 1882static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
914 unsigned long address, pte_t *ptep, int write_access) 1883 unsigned long address, pte_t *ptep, int write_access)
915{ 1884{
1885 struct hstate *h = hstate_vma(vma);
916 int ret = VM_FAULT_SIGBUS; 1886 int ret = VM_FAULT_SIGBUS;
917 unsigned long idx; 1887 pgoff_t idx;
918 unsigned long size; 1888 unsigned long size;
919 struct page *page; 1889 struct page *page;
920 struct address_space *mapping; 1890 struct address_space *mapping;
921 pte_t new_pte; 1891 pte_t new_pte;
922 1892
1893 /*
1894 * Currently, we are forced to kill the process in the event the
1895 * original mapper has unmapped pages from the child due to a failed
 1896 * COW. Warn that such a situation has occurred as it may not be obvious.
1897 */
1898 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
1899 printk(KERN_WARNING
1900 "PID %d killed due to inadequate hugepage pool\n",
1901 current->pid);
1902 return ret;
1903 }
1904
923 mapping = vma->vm_file->f_mapping; 1905 mapping = vma->vm_file->f_mapping;
924 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 1906 idx = vma_hugecache_offset(h, vma, address);
925 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
926 1907
927 /* 1908 /*
928 * Use page lock to guard against racing truncation 1909 * Use page lock to guard against racing truncation
@@ -931,15 +1912,15 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
931retry: 1912retry:
932 page = find_lock_page(mapping, idx); 1913 page = find_lock_page(mapping, idx);
933 if (!page) { 1914 if (!page) {
934 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1915 size = i_size_read(mapping->host) >> huge_page_shift(h);
935 if (idx >= size) 1916 if (idx >= size)
936 goto out; 1917 goto out;
937 page = alloc_huge_page(vma, address); 1918 page = alloc_huge_page(vma, address, 0);
938 if (IS_ERR(page)) { 1919 if (IS_ERR(page)) {
939 ret = -PTR_ERR(page); 1920 ret = -PTR_ERR(page);
940 goto out; 1921 goto out;
941 } 1922 }
942 clear_huge_page(page, address); 1923 clear_huge_page(page, address, huge_page_size(h));
943 __SetPageUptodate(page); 1924 __SetPageUptodate(page);
944 1925
945 if (vma->vm_flags & VM_SHARED) { 1926 if (vma->vm_flags & VM_SHARED) {
@@ -955,14 +1936,26 @@ retry:
955 } 1936 }
956 1937
957 spin_lock(&inode->i_lock); 1938 spin_lock(&inode->i_lock);
958 inode->i_blocks += BLOCKS_PER_HUGEPAGE; 1939 inode->i_blocks += blocks_per_huge_page(h);
959 spin_unlock(&inode->i_lock); 1940 spin_unlock(&inode->i_lock);
960 } else 1941 } else
961 lock_page(page); 1942 lock_page(page);
962 } 1943 }
963 1944
1945 /*
1946 * If we are going to COW a private mapping later, we examine the
1947 * pending reservations for this page now. This will ensure that
1948 * any allocations necessary to record that reservation occur outside
1949 * the spinlock.
1950 */
1951 if (write_access && !(vma->vm_flags & VM_SHARED))
1952 if (vma_needs_reservation(h, vma, address) < 0) {
1953 ret = VM_FAULT_OOM;
1954 goto backout_unlocked;
1955 }
1956
964 spin_lock(&mm->page_table_lock); 1957 spin_lock(&mm->page_table_lock);
965 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1958 size = i_size_read(mapping->host) >> huge_page_shift(h);
966 if (idx >= size) 1959 if (idx >= size)
967 goto backout; 1960 goto backout;
968 1961
@@ -976,7 +1969,7 @@ retry:
976 1969
977 if (write_access && !(vma->vm_flags & VM_SHARED)) { 1970 if (write_access && !(vma->vm_flags & VM_SHARED)) {
978 /* Optimization, do the COW without a second fault */ 1971 /* Optimization, do the COW without a second fault */
979 ret = hugetlb_cow(mm, vma, address, ptep, new_pte); 1972 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
980 } 1973 }
981 1974
982 spin_unlock(&mm->page_table_lock); 1975 spin_unlock(&mm->page_table_lock);
@@ -986,6 +1979,7 @@ out:
986 1979
987backout: 1980backout:
988 spin_unlock(&mm->page_table_lock); 1981 spin_unlock(&mm->page_table_lock);
1982backout_unlocked:
989 unlock_page(page); 1983 unlock_page(page);
990 put_page(page); 1984 put_page(page);
991 goto out; 1985 goto out;
@@ -997,9 +1991,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
997 pte_t *ptep; 1991 pte_t *ptep;
998 pte_t entry; 1992 pte_t entry;
999 int ret; 1993 int ret;
1994 struct page *pagecache_page = NULL;
1000 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 1995 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
1996 struct hstate *h = hstate_vma(vma);
1001 1997
1002 ptep = huge_pte_alloc(mm, address); 1998 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
1003 if (!ptep) 1999 if (!ptep)
1004 return VM_FAULT_OOM; 2000 return VM_FAULT_OOM;
1005 2001
@@ -1012,23 +2008,58 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
1012 entry = huge_ptep_get(ptep); 2008 entry = huge_ptep_get(ptep);
1013 if (huge_pte_none(entry)) { 2009 if (huge_pte_none(entry)) {
1014 ret = hugetlb_no_page(mm, vma, address, ptep, write_access); 2010 ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
1015 mutex_unlock(&hugetlb_instantiation_mutex); 2011 goto out_unlock;
1016 return ret;
1017 } 2012 }
1018 2013
1019 ret = 0; 2014 ret = 0;
1020 2015
2016 /*
2017 * If we are going to COW the mapping later, we examine the pending
2018 * reservations for this page now. This will ensure that any
2019 * allocations necessary to record that reservation occur outside the
2020 * spinlock. For private mappings, we also lookup the pagecache
2021 * page now as it is used to determine if a reservation has been
2022 * consumed.
2023 */
2024 if (write_access && !pte_write(entry)) {
2025 if (vma_needs_reservation(h, vma, address) < 0) {
2026 ret = VM_FAULT_OOM;
2027 goto out_unlock;
2028 }
2029
2030 if (!(vma->vm_flags & VM_SHARED))
2031 pagecache_page = hugetlbfs_pagecache_page(h,
2032 vma, address);
2033 }
2034
1021 spin_lock(&mm->page_table_lock); 2035 spin_lock(&mm->page_table_lock);
1022 /* Check for a racing update before calling hugetlb_cow */ 2036 /* Check for a racing update before calling hugetlb_cow */
1023 if (likely(pte_same(entry, huge_ptep_get(ptep)))) 2037 if (likely(pte_same(entry, huge_ptep_get(ptep))))
1024 if (write_access && !pte_write(entry)) 2038 if (write_access && !pte_write(entry))
1025 ret = hugetlb_cow(mm, vma, address, ptep, entry); 2039 ret = hugetlb_cow(mm, vma, address, ptep, entry,
2040 pagecache_page);
1026 spin_unlock(&mm->page_table_lock); 2041 spin_unlock(&mm->page_table_lock);
2042
2043 if (pagecache_page) {
2044 unlock_page(pagecache_page);
2045 put_page(pagecache_page);
2046 }
2047
2048out_unlock:
1027 mutex_unlock(&hugetlb_instantiation_mutex); 2049 mutex_unlock(&hugetlb_instantiation_mutex);
1028 2050
1029 return ret; 2051 return ret;
1030} 2052}
1031 2053
 2054/* Can be overridden by architectures */
2055__attribute__((weak)) struct page *
2056follow_huge_pud(struct mm_struct *mm, unsigned long address,
2057 pud_t *pud, int write)
2058{
2059 BUG();
2060 return NULL;
2061}
2062
1032int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2063int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1033 struct page **pages, struct vm_area_struct **vmas, 2064 struct page **pages, struct vm_area_struct **vmas,
1034 unsigned long *position, int *length, int i, 2065 unsigned long *position, int *length, int i,
@@ -1037,6 +2068,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1037 unsigned long pfn_offset; 2068 unsigned long pfn_offset;
1038 unsigned long vaddr = *position; 2069 unsigned long vaddr = *position;
1039 int remainder = *length; 2070 int remainder = *length;
2071 struct hstate *h = hstate_vma(vma);
1040 2072
1041 spin_lock(&mm->page_table_lock); 2073 spin_lock(&mm->page_table_lock);
1042 while (vaddr < vma->vm_end && remainder) { 2074 while (vaddr < vma->vm_end && remainder) {
@@ -1048,7 +2080,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 1048 * each hugepage. We have to make sure we get the 2080 * each hugepage. We have to make sure we get the
1049 * first, for the page indexing below to work. 2081 * first, for the page indexing below to work.
1050 */ 2082 */
1051 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); 2083 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
1052 2084
1053 if (!pte || huge_pte_none(huge_ptep_get(pte)) || 2085 if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
1054 (write && !pte_write(huge_ptep_get(pte)))) { 2086 (write && !pte_write(huge_ptep_get(pte)))) {
@@ -1066,7 +2098,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1066 break; 2098 break;
1067 } 2099 }
1068 2100
1069 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; 2101 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
1070 page = pte_page(huge_ptep_get(pte)); 2102 page = pte_page(huge_ptep_get(pte));
1071same_page: 2103same_page:
1072 if (pages) { 2104 if (pages) {
@@ -1082,7 +2114,7 @@ same_page:
1082 --remainder; 2114 --remainder;
1083 ++i; 2115 ++i;
1084 if (vaddr < vma->vm_end && remainder && 2116 if (vaddr < vma->vm_end && remainder &&
1085 pfn_offset < HPAGE_SIZE/PAGE_SIZE) { 2117 pfn_offset < pages_per_huge_page(h)) {
1086 /* 2118 /*
1087 * We use pfn_offset to avoid touching the pageframes 2119 * We use pfn_offset to avoid touching the pageframes
1088 * of this compound page. 2120 * of this compound page.
@@ -1104,13 +2136,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1104 unsigned long start = address; 2136 unsigned long start = address;
1105 pte_t *ptep; 2137 pte_t *ptep;
1106 pte_t pte; 2138 pte_t pte;
2139 struct hstate *h = hstate_vma(vma);
1107 2140
1108 BUG_ON(address >= end); 2141 BUG_ON(address >= end);
1109 flush_cache_range(vma, address, end); 2142 flush_cache_range(vma, address, end);
1110 2143
1111 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2144 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1112 spin_lock(&mm->page_table_lock); 2145 spin_lock(&mm->page_table_lock);
1113 for (; address < end; address += HPAGE_SIZE) { 2146 for (; address < end; address += huge_page_size(h)) {
1114 ptep = huge_pte_offset(mm, address); 2147 ptep = huge_pte_offset(mm, address);
1115 if (!ptep) 2148 if (!ptep)
1116 continue; 2149 continue;
@@ -1128,195 +2161,59 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1128 flush_tlb_range(vma, start, end); 2161 flush_tlb_range(vma, start, end);
1129} 2162}
1130 2163
1131struct file_region { 2164int hugetlb_reserve_pages(struct inode *inode,
1132 struct list_head link; 2165 long from, long to,
1133 long from; 2166 struct vm_area_struct *vma)
1134 long to;
1135};
1136
1137static long region_add(struct list_head *head, long f, long t)
1138{
1139 struct file_region *rg, *nrg, *trg;
1140
1141 /* Locate the region we are either in or before. */
1142 list_for_each_entry(rg, head, link)
1143 if (f <= rg->to)
1144 break;
1145
1146 /* Round our left edge to the current segment if it encloses us. */
1147 if (f > rg->from)
1148 f = rg->from;
1149
1150 /* Check for and consume any regions we now overlap with. */
1151 nrg = rg;
1152 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1153 if (&rg->link == head)
1154 break;
1155 if (rg->from > t)
1156 break;
1157
1158 /* If this area reaches higher then extend our area to
1159 * include it completely. If this is not the first area
1160 * which we intend to reuse, free it. */
1161 if (rg->to > t)
1162 t = rg->to;
1163 if (rg != nrg) {
1164 list_del(&rg->link);
1165 kfree(rg);
1166 }
1167 }
1168 nrg->from = f;
1169 nrg->to = t;
1170 return 0;
1171}
1172
1173static long region_chg(struct list_head *head, long f, long t)
1174{ 2167{
1175 struct file_region *rg, *nrg; 2168 long ret, chg;
1176 long chg = 0; 2169 struct hstate *h = hstate_inode(inode);
1177
1178 /* Locate the region we are before or in. */
1179 list_for_each_entry(rg, head, link)
1180 if (f <= rg->to)
1181 break;
1182
1183 /* If we are below the current region then a new region is required.
1184 * Subtle, allocate a new region at the position but make it zero
1185 * size such that we can guarantee to record the reservation. */
1186 if (&rg->link == head || t < rg->from) {
1187 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
1188 if (!nrg)
1189 return -ENOMEM;
1190 nrg->from = f;
1191 nrg->to = f;
1192 INIT_LIST_HEAD(&nrg->link);
1193 list_add(&nrg->link, rg->link.prev);
1194
1195 return t - f;
1196 }
1197
1198 /* Round our left edge to the current segment if it encloses us. */
1199 if (f > rg->from)
1200 f = rg->from;
1201 chg = t - f;
1202
1203 /* Check for and consume any regions we now overlap with. */
1204 list_for_each_entry(rg, rg->link.prev, link) {
1205 if (&rg->link == head)
1206 break;
1207 if (rg->from > t)
1208 return chg;
1209
1210 /* We overlap with this area, if it extends futher than
1211 * us then we must extend ourselves. Account for its
1212 * existing reservation. */
1213 if (rg->to > t) {
1214 chg += rg->to - t;
1215 t = rg->to;
1216 }
1217 chg -= rg->to - rg->from;
1218 }
1219 return chg;
1220}
1221
1222static long region_truncate(struct list_head *head, long end)
1223{
1224 struct file_region *rg, *trg;
1225 long chg = 0;
1226 2170
1227 /* Locate the region we are either in or before. */ 2171 if (vma && vma->vm_flags & VM_NORESERVE)
1228 list_for_each_entry(rg, head, link)
1229 if (end <= rg->to)
1230 break;
1231 if (&rg->link == head)
1232 return 0; 2172 return 0;
1233 2173
1234 /* If we are in the middle of a region then adjust it. */
1235 if (end > rg->from) {
1236 chg = rg->to - end;
1237 rg->to = end;
1238 rg = list_entry(rg->link.next, typeof(*rg), link);
1239 }
1240
1241 /* Drop any remaining regions. */
1242 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1243 if (&rg->link == head)
1244 break;
1245 chg += rg->to - rg->from;
1246 list_del(&rg->link);
1247 kfree(rg);
1248 }
1249 return chg;
1250}
1251
1252static int hugetlb_acct_memory(long delta)
1253{
1254 int ret = -ENOMEM;
1255
1256 spin_lock(&hugetlb_lock);
1257 /* 2174 /*
1258 * When cpuset is configured, it breaks the strict hugetlb page 2175 * Shared mappings base their reservation on the number of pages that
1259 * reservation as the accounting is done on a global variable. Such 2176 * are already allocated on behalf of the file. Private mappings need
1260 * reservation is completely rubbish in the presence of cpuset because 2177 * to reserve the full area even if read-only as mprotect() may be
1261 * the reservation is not checked against page availability for the 2178 * called to make the mapping read-write. Assume !vma is a shm mapping
1262 * current cpuset. Application can still potentially OOM'ed by kernel
1263 * with lack of free htlb page in cpuset that the task is in.
1264 * Attempt to enforce strict accounting with cpuset is almost
1265 * impossible (or too ugly) because cpuset is too fluid that
1266 * task or memory node can be dynamically moved between cpusets.
1267 *
1268 * The change of semantics for shared hugetlb mapping with cpuset is
1269 * undesirable. However, in order to preserve some of the semantics,
1270 * we fall back to check against current free page availability as
1271 * a best attempt and hopefully to minimize the impact of changing
1272 * semantics that cpuset has.
1273 */ 2179 */
1274 if (delta > 0) { 2180 if (!vma || vma->vm_flags & VM_SHARED)
1275 if (gather_surplus_pages(delta) < 0) 2181 chg = region_chg(&inode->i_mapping->private_list, from, to);
1276 goto out; 2182 else {
1277 2183 struct resv_map *resv_map = resv_map_alloc();
1278 if (delta > cpuset_mems_nr(free_huge_pages_node)) { 2184 if (!resv_map)
1279 return_unused_surplus_pages(delta); 2185 return -ENOMEM;
1280 goto out;
1281 }
1282 }
1283
1284 ret = 0;
1285 if (delta < 0)
1286 return_unused_surplus_pages((unsigned long) -delta);
1287 2186
1288out: 2187 chg = to - from;
1289 spin_unlock(&hugetlb_lock);
1290 return ret;
1291}
1292 2188
1293int hugetlb_reserve_pages(struct inode *inode, long from, long to) 2189 set_vma_resv_map(vma, resv_map);
1294{ 2190 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
1295 long ret, chg; 2191 }
1296 2192
1297 chg = region_chg(&inode->i_mapping->private_list, from, to);
1298 if (chg < 0) 2193 if (chg < 0)
1299 return chg; 2194 return chg;
1300 2195
1301 if (hugetlb_get_quota(inode->i_mapping, chg)) 2196 if (hugetlb_get_quota(inode->i_mapping, chg))
1302 return -ENOSPC; 2197 return -ENOSPC;
1303 ret = hugetlb_acct_memory(chg); 2198 ret = hugetlb_acct_memory(h, chg);
1304 if (ret < 0) { 2199 if (ret < 0) {
1305 hugetlb_put_quota(inode->i_mapping, chg); 2200 hugetlb_put_quota(inode->i_mapping, chg);
1306 return ret; 2201 return ret;
1307 } 2202 }
1308 region_add(&inode->i_mapping->private_list, from, to); 2203 if (!vma || vma->vm_flags & VM_SHARED)
2204 region_add(&inode->i_mapping->private_list, from, to);
1309 return 0; 2205 return 0;
1310} 2206}
1311 2207
1312void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 2208void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
1313{ 2209{
2210 struct hstate *h = hstate_inode(inode);
1314 long chg = region_truncate(&inode->i_mapping->private_list, offset); 2211 long chg = region_truncate(&inode->i_mapping->private_list, offset);
1315 2212
1316 spin_lock(&inode->i_lock); 2213 spin_lock(&inode->i_lock);
1317 inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; 2214 inode->i_blocks -= blocks_per_huge_page(h);
1318 spin_unlock(&inode->i_lock); 2215 spin_unlock(&inode->i_lock);
1319 2216
1320 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2217 hugetlb_put_quota(inode->i_mapping, (chg - freed));
1321 hugetlb_acct_memory(-(chg - freed)); 2218 hugetlb_acct_memory(h, -(chg - freed));
1322} 2219}
diff --git a/mm/internal.h b/mm/internal.h
index 0034e947e4bc..1f43f7416972 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,6 +13,11 @@
13 13
14#include <linux/mm.h> 14#include <linux/mm.h>
15 15
16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
17 unsigned long floor, unsigned long ceiling);
18
19extern void prep_compound_page(struct page *page, unsigned long order);
20
16static inline void set_page_count(struct page *page, int v) 21static inline void set_page_count(struct page *page, int v)
17{ 22{
18 atomic_set(&page->_count, v); 23 atomic_set(&page->_count, v);
@@ -59,4 +64,60 @@ static inline unsigned long page_order(struct page *page)
59#define __paginginit __init 64#define __paginginit __init
60#endif 65#endif
61 66
67/* Memory initialisation debug and verification */
68enum mminit_level {
69 MMINIT_WARNING,
70 MMINIT_VERIFY,
71 MMINIT_TRACE
72};
73
74#ifdef CONFIG_DEBUG_MEMORY_INIT
75
76extern int mminit_loglevel;
77
78#define mminit_dprintk(level, prefix, fmt, arg...) \
79do { \
80 if (level < mminit_loglevel) { \
81 printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
82 printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
83 } \
84} while (0)
85
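
A userspace mock (not kernel code, and only a sketch assuming mminit_loglevel == MMINIT_VERIFY) of the level gating that mminit_dprintk() performs above: a message is emitted only when its level is strictly below mminit_loglevel, with warnings and debug output tagged differently.

#include <stdio.h>

enum mminit_level { MMINIT_WARNING, MMINIT_VERIFY, MMINIT_TRACE };

static int mminit_loglevel = MMINIT_VERIFY;	/* assumed setting */

/* Mirrors the kernel macro's gating; printf stands in for the printk pair. */
#define mminit_dprintk(level, prefix, fmt, ...)				\
do {									\
	if ((level) < mminit_loglevel)					\
		printf("%s mminit::%s " fmt,				\
		       (level) <= MMINIT_WARNING ? "<warn>" : "<dbg>",	\
		       prefix, ##__VA_ARGS__);				\
} while (0)

int main(void)
{
	mminit_dprintk(MMINIT_WARNING, "example", "emitted at level %d\n", 0);
	mminit_dprintk(MMINIT_TRACE, "example", "suppressed at level %d\n", 2);
	return 0;
}
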
86extern void mminit_verify_pageflags_layout(void);
87extern void mminit_verify_page_links(struct page *page,
88 enum zone_type zone, unsigned long nid, unsigned long pfn);
89extern void mminit_verify_zonelist(void);
90
91#else
92
93static inline void mminit_dprintk(enum mminit_level level,
94 const char *prefix, const char *fmt, ...)
95{
96}
97
98static inline void mminit_verify_pageflags_layout(void)
99{
100}
101
102static inline void mminit_verify_page_links(struct page *page,
103 enum zone_type zone, unsigned long nid, unsigned long pfn)
104{
105}
106
107static inline void mminit_verify_zonelist(void)
108{
109}
110#endif /* CONFIG_DEBUG_MEMORY_INIT */
111
112/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
113#if defined(CONFIG_SPARSEMEM)
114extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
115 unsigned long *end_pfn);
116#else
117static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
118 unsigned long *end_pfn)
119{
120}
121#endif /* CONFIG_SPARSEMEM */
122
62#endif 123#endif
diff --git a/mm/madvise.c b/mm/madvise.c
index 23a0ec3e0ea0..f9349c18a1b5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -132,10 +132,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
132 * Application no longer needs these pages. If the pages are dirty, 132 * Application no longer needs these pages. If the pages are dirty,
133 * it's OK to just throw them away. The app will be more careful about 133 * it's OK to just throw them away. The app will be more careful about
134 * data it wants to keep. Be sure to free swap resources too. The 134 * data it wants to keep. Be sure to free swap resources too. The
135 * zap_page_range call sets things up for refill_inactive to actually free 135 * zap_page_range call sets things up for shrink_active_list to actually free
136 * these pages later if no one else has touched them in the meantime, 136 * these pages later if no one else has touched them in the meantime,
137 * although we could add these pages to a global reuse list for 137 * although we could add these pages to a global reuse list for
138 * refill_inactive to pick up before reclaiming other pages. 138 * shrink_active_list to pick up before reclaiming other pages.
139 * 139 *
140 * NB: This interface discards data rather than pushes it out to swap, 140 * NB: This interface discards data rather than pushes it out to swap,
141 * as some implementations do. This has performance implications for 141 * as some implementations do. This has performance implications for
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e46451e1d9b7..0f1f7a7374ba 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,9 +35,9 @@
35 35
36#include <asm/uaccess.h> 36#include <asm/uaccess.h>
37 37
38struct cgroup_subsys mem_cgroup_subsys; 38struct cgroup_subsys mem_cgroup_subsys __read_mostly;
39static const int MEM_CGROUP_RECLAIM_RETRIES = 5; 39static struct kmem_cache *page_cgroup_cache __read_mostly;
40static struct kmem_cache *page_cgroup_cache; 40#define MEM_CGROUP_RECLAIM_RETRIES 5
41 41
42/* 42/*
43 * Statistics for memory cgroup. 43 * Statistics for memory cgroup.
@@ -166,7 +166,6 @@ struct page_cgroup {
166 struct list_head lru; /* per cgroup LRU list */ 166 struct list_head lru; /* per cgroup LRU list */
167 struct page *page; 167 struct page *page;
168 struct mem_cgroup *mem_cgroup; 168 struct mem_cgroup *mem_cgroup;
169 int ref_cnt; /* cached, mapped, migrating */
170 int flags; 169 int flags;
171}; 170};
172#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ 171#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
@@ -185,6 +184,7 @@ static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
185enum charge_type { 184enum charge_type {
186 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 185 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
187 MEM_CGROUP_CHARGE_TYPE_MAPPED, 186 MEM_CGROUP_CHARGE_TYPE_MAPPED,
187 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
188}; 188};
189 189
190/* 190/*
@@ -296,7 +296,7 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
296 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; 296 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
297 297
298 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); 298 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
299 list_del_init(&pc->lru); 299 list_del(&pc->lru);
300} 300}
301 301
302static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, 302static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
@@ -354,6 +354,9 @@ void mem_cgroup_move_lists(struct page *page, bool active)
354 struct mem_cgroup_per_zone *mz; 354 struct mem_cgroup_per_zone *mz;
355 unsigned long flags; 355 unsigned long flags;
356 356
357 if (mem_cgroup_subsys.disabled)
358 return;
359
357 /* 360 /*
358 * We cannot lock_page_cgroup while holding zone's lru_lock, 361 * We cannot lock_page_cgroup while holding zone's lru_lock,
359 * because other holders of lock_page_cgroup can be interrupted 362 * because other holders of lock_page_cgroup can be interrupted
@@ -524,7 +527,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
524 * < 0 if the cgroup is over its limit 527 * < 0 if the cgroup is over its limit
525 */ 528 */
526static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 529static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
527 gfp_t gfp_mask, enum charge_type ctype) 530 gfp_t gfp_mask, enum charge_type ctype,
531 struct mem_cgroup *memcg)
528{ 532{
529 struct mem_cgroup *mem; 533 struct mem_cgroup *mem;
530 struct page_cgroup *pc; 534 struct page_cgroup *pc;
@@ -532,35 +536,8 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
532 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 536 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
533 struct mem_cgroup_per_zone *mz; 537 struct mem_cgroup_per_zone *mz;
534 538
535 if (mem_cgroup_subsys.disabled) 539 pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
536 return 0; 540 if (unlikely(pc == NULL))
537
538 /*
539 * Should page_cgroup's go to their own slab?
540 * One could optimize the performance of the charging routine
541 * by saving a bit in the page_flags and using it as a lock
542 * to see if the cgroup page already has a page_cgroup associated
543 * with it
544 */
545retry:
546 lock_page_cgroup(page);
547 pc = page_get_page_cgroup(page);
548 /*
549 * The page_cgroup exists and
550 * the page has already been accounted.
551 */
552 if (pc) {
553 VM_BUG_ON(pc->page != page);
554 VM_BUG_ON(pc->ref_cnt <= 0);
555
556 pc->ref_cnt++;
557 unlock_page_cgroup(page);
558 goto done;
559 }
560 unlock_page_cgroup(page);
561
562 pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask);
563 if (pc == NULL)
564 goto err; 541 goto err;
565 542
566 /* 543 /*
@@ -569,16 +546,18 @@ retry:
569 * thread group leader migrates. It's possible that mm is not 546 * thread group leader migrates. It's possible that mm is not
570 * set, if so charge the init_mm (happens for pagecache usage). 547 * set, if so charge the init_mm (happens for pagecache usage).
571 */ 548 */
572 if (!mm) 549 if (likely(!memcg)) {
573 mm = &init_mm; 550 rcu_read_lock();
574 551 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
575 rcu_read_lock(); 552 /*
576 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 553 * For every charge from the cgroup, increment reference count
577 /* 554 */
578 * For every charge from the cgroup, increment reference count 555 css_get(&mem->css);
579 */ 556 rcu_read_unlock();
580 css_get(&mem->css); 557 } else {
581 rcu_read_unlock(); 558 mem = memcg;
559 css_get(&memcg->css);
560 }
582 561
583 while (res_counter_charge(&mem->res, PAGE_SIZE)) { 562 while (res_counter_charge(&mem->res, PAGE_SIZE)) {
584 if (!(gfp_mask & __GFP_WAIT)) 563 if (!(gfp_mask & __GFP_WAIT))
@@ -603,25 +582,24 @@ retry:
603 } 582 }
604 } 583 }
605 584
606 pc->ref_cnt = 1;
607 pc->mem_cgroup = mem; 585 pc->mem_cgroup = mem;
608 pc->page = page; 586 pc->page = page;
609 pc->flags = PAGE_CGROUP_FLAG_ACTIVE; 587 /*
 588 * If a page is accounted as page cache, insert it into the inactive list.
 589 * If anonymous, insert it into the active list.
590 */
610 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) 591 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
611 pc->flags = PAGE_CGROUP_FLAG_CACHE; 592 pc->flags = PAGE_CGROUP_FLAG_CACHE;
593 else
594 pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
612 595
613 lock_page_cgroup(page); 596 lock_page_cgroup(page);
614 if (page_get_page_cgroup(page)) { 597 if (unlikely(page_get_page_cgroup(page))) {
615 unlock_page_cgroup(page); 598 unlock_page_cgroup(page);
616 /*
617 * Another charge has been added to this page already.
618 * We take lock_page_cgroup(page) again and read
619 * page->cgroup, increment refcnt.... just retry is OK.
620 */
621 res_counter_uncharge(&mem->res, PAGE_SIZE); 599 res_counter_uncharge(&mem->res, PAGE_SIZE);
622 css_put(&mem->css); 600 css_put(&mem->css);
623 kmem_cache_free(page_cgroup_cache, pc); 601 kmem_cache_free(page_cgroup_cache, pc);
624 goto retry; 602 goto done;
625 } 603 }
626 page_assign_page_cgroup(page, pc); 604 page_assign_page_cgroup(page, pc);
627 605
@@ -642,24 +620,65 @@ err:
642 620
643int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) 621int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
644{ 622{
623 if (mem_cgroup_subsys.disabled)
624 return 0;
625
626 /*
627 * If already mapped, we don't have to account.
628 * If page cache, page->mapping has address_space.
 629 * But page->mapping may have an out-of-use anon_vma pointer;
 630 * detect that with a PageAnon() check. A newly-mapped anonymous page's
 631 * page->mapping is NULL.
632 */
633 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
634 return 0;
635 if (unlikely(!mm))
636 mm = &init_mm;
645 return mem_cgroup_charge_common(page, mm, gfp_mask, 637 return mem_cgroup_charge_common(page, mm, gfp_mask,
646 MEM_CGROUP_CHARGE_TYPE_MAPPED); 638 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
647} 639}
648 640
649int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 641int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
650 gfp_t gfp_mask) 642 gfp_t gfp_mask)
651{ 643{
652 if (!mm) 644 if (mem_cgroup_subsys.disabled)
645 return 0;
646
647 /*
 648 * Corner case handling. This is usually called from add_to_page_cache(),
 649 * but some filesystems (shmem) precharge the page before calling it
 650 * and then call add_to_page_cache() with GFP_NOWAIT.
 651 *
 652 * In the GFP_NOWAIT case the page may already be charged before
 653 * add_to_page_cache() is called (see shmem.c); check for that here to
 654 * avoid charging twice. (It works but pays a slightly larger cost.)
655 */
656 if (!(gfp_mask & __GFP_WAIT)) {
657 struct page_cgroup *pc;
658
659 lock_page_cgroup(page);
660 pc = page_get_page_cgroup(page);
661 if (pc) {
662 VM_BUG_ON(pc->page != page);
663 VM_BUG_ON(!pc->mem_cgroup);
664 unlock_page_cgroup(page);
665 return 0;
666 }
667 unlock_page_cgroup(page);
668 }
669
670 if (unlikely(!mm))
653 mm = &init_mm; 671 mm = &init_mm;
672
654 return mem_cgroup_charge_common(page, mm, gfp_mask, 673 return mem_cgroup_charge_common(page, mm, gfp_mask,
655 MEM_CGROUP_CHARGE_TYPE_CACHE); 674 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
656} 675}
657 676
658/* 677/*
659 * Uncharging is always a welcome operation, we never complain, simply 678 * uncharge if !page_mapped(page)
660 * uncharge.
661 */ 679 */
662void mem_cgroup_uncharge_page(struct page *page) 680static void
681__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
663{ 682{
664 struct page_cgroup *pc; 683 struct page_cgroup *pc;
665 struct mem_cgroup *mem; 684 struct mem_cgroup *mem;
@@ -674,98 +693,153 @@ void mem_cgroup_uncharge_page(struct page *page)
674 */ 693 */
675 lock_page_cgroup(page); 694 lock_page_cgroup(page);
676 pc = page_get_page_cgroup(page); 695 pc = page_get_page_cgroup(page);
677 if (!pc) 696 if (unlikely(!pc))
678 goto unlock; 697 goto unlock;
679 698
680 VM_BUG_ON(pc->page != page); 699 VM_BUG_ON(pc->page != page);
681 VM_BUG_ON(pc->ref_cnt <= 0);
682 700
683 if (--(pc->ref_cnt) == 0) { 701 if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
684 mz = page_cgroup_zoneinfo(pc); 702 && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
685 spin_lock_irqsave(&mz->lru_lock, flags); 703 || page_mapped(page)))
686 __mem_cgroup_remove_list(mz, pc); 704 goto unlock;
687 spin_unlock_irqrestore(&mz->lru_lock, flags);
688 705
689 page_assign_page_cgroup(page, NULL); 706 mz = page_cgroup_zoneinfo(pc);
690 unlock_page_cgroup(page); 707 spin_lock_irqsave(&mz->lru_lock, flags);
708 __mem_cgroup_remove_list(mz, pc);
709 spin_unlock_irqrestore(&mz->lru_lock, flags);
691 710
692 mem = pc->mem_cgroup; 711 page_assign_page_cgroup(page, NULL);
693 res_counter_uncharge(&mem->res, PAGE_SIZE); 712 unlock_page_cgroup(page);
694 css_put(&mem->css);
695 713
696 kmem_cache_free(page_cgroup_cache, pc); 714 mem = pc->mem_cgroup;
697 return; 715 res_counter_uncharge(&mem->res, PAGE_SIZE);
698 } 716 css_put(&mem->css);
699 717
718 kmem_cache_free(page_cgroup_cache, pc);
719 return;
700unlock: 720unlock:
701 unlock_page_cgroup(page); 721 unlock_page_cgroup(page);
702} 722}
703 723
724void mem_cgroup_uncharge_page(struct page *page)
725{
726 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
727}
728
729void mem_cgroup_uncharge_cache_page(struct page *page)
730{
731 VM_BUG_ON(page_mapped(page));
732 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
733}
734
704/* 735/*
705 * Returns non-zero if a page (under migration) has valid page_cgroup member. 736 * Before starting migration, account against new page.
706 * Refcnt of page_cgroup is incremented.
707 */ 737 */
708int mem_cgroup_prepare_migration(struct page *page) 738int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
709{ 739{
710 struct page_cgroup *pc; 740 struct page_cgroup *pc;
741 struct mem_cgroup *mem = NULL;
742 enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
743 int ret = 0;
711 744
712 if (mem_cgroup_subsys.disabled) 745 if (mem_cgroup_subsys.disabled)
713 return 0; 746 return 0;
714 747
715 lock_page_cgroup(page); 748 lock_page_cgroup(page);
716 pc = page_get_page_cgroup(page); 749 pc = page_get_page_cgroup(page);
717 if (pc) 750 if (pc) {
718 pc->ref_cnt++; 751 mem = pc->mem_cgroup;
752 css_get(&mem->css);
753 if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
754 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
755 }
719 unlock_page_cgroup(page); 756 unlock_page_cgroup(page);
720 return pc != NULL; 757 if (mem) {
758 ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
759 ctype, mem);
760 css_put(&mem->css);
761 }
762 return ret;
721} 763}
722 764
723void mem_cgroup_end_migration(struct page *page) 765/* remove redundant charge if migration failed*/
766void mem_cgroup_end_migration(struct page *newpage)
724{ 767{
725 mem_cgroup_uncharge_page(page); 768 /*
 769 * On success, page->mapping is not NULL.
 770 * Special rollback care is needed when
 771 * 1. migration failed (newpage->mapping is cleared in this case), or
 772 * 2. the newpage was moved but never remapped again because the task
 773 * exited and the newpage is now obsolete. The new page may then be
 774 * a swapcache page, so we always call mem_cgroup_uncharge_page()
 775 * to avoid a mess; the page_cgroup is removed only when it is
 776 * really unnecessary. File cache pages are still on the radix-tree
 777 * and need no special care.
778 */
779 if (!newpage->mapping)
780 __mem_cgroup_uncharge_common(newpage,
781 MEM_CGROUP_CHARGE_TYPE_FORCE);
782 else if (PageAnon(newpage))
783 mem_cgroup_uncharge_page(newpage);
726} 784}
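
Taken together, prepare_migration() charges the new page up front and end_migration() drops that charge again if the new page ends up unused. The caller's side, mirroring the migrate.c hunk later in this diff but simplified and with error handling elided, looks roughly like this:

#include <linux/memcontrol.h>
#include <linux/mm.h>

/* hypothetical, stripped-down version of the unmap_and_move() flow */
static int example_migrate_one(struct page *page, struct page *newpage)
{
	int charge = mem_cgroup_prepare_migration(page, newpage);

	if (charge == -ENOMEM)
		return -ENOMEM;

	/* ... unmap "page", copy it into "newpage", remap ... */

	/* drop the redundant charge if "newpage" was never remapped */
	if (!charge)
		mem_cgroup_end_migration(newpage);
	return 0;
}
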
727 785
728/* 786/*
729 * We know both *page* and *newpage* are now not-on-LRU and PG_locked. 787 * Try to shrink memory usage under the specified resource controller.
730 * And no race with uncharge() routines because page_cgroup for *page* 788 * This is typically used to reclaim shmem pages, reducing the side
731 * has extra one reference by mem_cgroup_prepare_migration. 789 * effects of page allocation from shmem when it is used by a mem_cgroup.
732 */ 790 */
733void mem_cgroup_page_migration(struct page *page, struct page *newpage) 791int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
734{ 792{
735 struct page_cgroup *pc; 793 struct mem_cgroup *mem;
736 struct mem_cgroup_per_zone *mz; 794 int progress = 0;
737 unsigned long flags; 795 int retry = MEM_CGROUP_RECLAIM_RETRIES;
738 796
739 lock_page_cgroup(page); 797 if (mem_cgroup_subsys.disabled)
740 pc = page_get_page_cgroup(page); 798 return 0;
741 if (!pc) { 799 if (!mm)
742 unlock_page_cgroup(page); 800 return 0;
743 return;
744 }
745 801
746 mz = page_cgroup_zoneinfo(pc); 802 rcu_read_lock();
747 spin_lock_irqsave(&mz->lru_lock, flags); 803 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
748 __mem_cgroup_remove_list(mz, pc); 804 css_get(&mem->css);
749 spin_unlock_irqrestore(&mz->lru_lock, flags); 805 rcu_read_unlock();
750 806
751 page_assign_page_cgroup(page, NULL); 807 do {
752 unlock_page_cgroup(page); 808 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
809 } while (!progress && --retry);
753 810
754 pc->page = newpage; 811 css_put(&mem->css);
755 lock_page_cgroup(newpage); 812 if (!retry)
756 page_assign_page_cgroup(newpage, pc); 813 return -ENOMEM;
814 return 0;
815}
757 816
758 mz = page_cgroup_zoneinfo(pc); 817int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
759 spin_lock_irqsave(&mz->lru_lock, flags); 818{
760 __mem_cgroup_add_list(mz, pc); 819
761 spin_unlock_irqrestore(&mz->lru_lock, flags); 820 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
821 int progress;
822 int ret = 0;
762 823
763 unlock_page_cgroup(newpage); 824 while (res_counter_set_limit(&memcg->res, val)) {
825 if (signal_pending(current)) {
826 ret = -EINTR;
827 break;
828 }
829 if (!retry_count) {
830 ret = -EBUSY;
831 break;
832 }
833 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL);
834 if (!progress)
835 retry_count--;
836 }
837 return ret;
764} 838}
765 839
840
766/* 841/*
767 * This routine traverses the page_cgroups on the given list and drops them all. 842 * This routine traverses the page_cgroups on the given list and drops them all.
768 * This routine ignores page_cgroup->ref_cnt.
769 * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup. 843 * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup.
770 */ 844 */
771#define FORCE_UNCHARGE_BATCH (128) 845#define FORCE_UNCHARGE_BATCH (128)
@@ -790,12 +864,20 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
790 page = pc->page; 864 page = pc->page;
791 get_page(page); 865 get_page(page);
792 spin_unlock_irqrestore(&mz->lru_lock, flags); 866 spin_unlock_irqrestore(&mz->lru_lock, flags);
793 mem_cgroup_uncharge_page(page); 867 /*
794 put_page(page); 868 * Check if this page is on LRU. !LRU page can be found
795 if (--count <= 0) { 869 * if it's under page migration.
796 count = FORCE_UNCHARGE_BATCH; 870 */
871 if (PageLRU(page)) {
872 __mem_cgroup_uncharge_common(page,
873 MEM_CGROUP_CHARGE_TYPE_FORCE);
874 put_page(page);
875 if (--count <= 0) {
876 count = FORCE_UNCHARGE_BATCH;
877 cond_resched();
878 }
879 } else
797 cond_resched(); 880 cond_resched();
798 }
799 spin_lock_irqsave(&mz->lru_lock, flags); 881 spin_lock_irqsave(&mz->lru_lock, flags);
800 } 882 }
801 spin_unlock_irqrestore(&mz->lru_lock, flags); 883 spin_unlock_irqrestore(&mz->lru_lock, flags);
@@ -810,9 +892,6 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
810 int ret = -EBUSY; 892 int ret = -EBUSY;
811 int node, zid; 893 int node, zid;
812 894
813 if (mem_cgroup_subsys.disabled)
814 return 0;
815
816 css_get(&mem->css); 895 css_get(&mem->css);
817 /* 896 /*
818 * page reclaim code (kswapd etc..) will move pages between 897 * page reclaim code (kswapd etc..) will move pages between
@@ -838,32 +917,34 @@ out:
838 return ret; 917 return ret;
839} 918}
840 919
841static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
842{
843 *tmp = memparse(buf, &buf);
844 if (*buf != '\0')
845 return -EINVAL;
846
847 /*
848 * Round up the value to the closest page size
849 */
850 *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
851 return 0;
852}
853
854static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 920static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
855{ 921{
856 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, 922 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
857 cft->private); 923 cft->private);
858} 924}
859 925/*
860static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 926 * The user of this function is...
861 struct file *file, const char __user *userbuf, 927 * RES_LIMIT.
862 size_t nbytes, loff_t *ppos) 928 */
929static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
930 const char *buffer)
863{ 931{
864 return res_counter_write(&mem_cgroup_from_cont(cont)->res, 932 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
865 cft->private, userbuf, nbytes, ppos, 933 unsigned long long val;
866 mem_cgroup_write_strategy); 934 int ret;
935
936 switch (cft->private) {
937 case RES_LIMIT:
938 /* This function does all necessary parse...reuse it */
939 ret = res_counter_memparse_write_strategy(buffer, &val);
940 if (!ret)
941 ret = mem_cgroup_resize_limit(memcg, val);
942 break;
943 default:
944 ret = -EINVAL; /* should be BUG() ? */
945 break;
946 }
947 return ret;
867} 948}
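
From userspace this handler is reached by writing a human-readable size to the memory.limit_in_bytes file; res_counter_memparse_write_strategy() parses suffixes such as K/M/G, and mem_cgroup_resize_limit() then reclaims until the limit can be set. A small illustrative program (the cgroup mount point and group name are assumptions):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* assumed layout: memory controller mounted at /cgroups, group "grp0" */
	const char *path = "/cgroups/grp0/memory.limit_in_bytes";
	const char *limit = "64M";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, limit, strlen(limit)) < 0)
		perror("write");	/* EBUSY if reclaim could not make room */
	close(fd);
	return 0;
}
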
868 949
869static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 950static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
@@ -940,7 +1021,7 @@ static struct cftype mem_cgroup_files[] = {
940 { 1021 {
941 .name = "limit_in_bytes", 1022 .name = "limit_in_bytes",
942 .private = RES_LIMIT, 1023 .private = RES_LIMIT,
943 .write = mem_cgroup_write, 1024 .write_string = mem_cgroup_write,
944 .read_u64 = mem_cgroup_read, 1025 .read_u64 = mem_cgroup_read,
945 }, 1026 },
946 { 1027 {
@@ -1070,8 +1151,6 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1070static int mem_cgroup_populate(struct cgroup_subsys *ss, 1151static int mem_cgroup_populate(struct cgroup_subsys *ss,
1071 struct cgroup *cont) 1152 struct cgroup *cont)
1072{ 1153{
1073 if (mem_cgroup_subsys.disabled)
1074 return 0;
1075 return cgroup_add_files(cont, ss, mem_cgroup_files, 1154 return cgroup_add_files(cont, ss, mem_cgroup_files,
1076 ARRAY_SIZE(mem_cgroup_files)); 1155 ARRAY_SIZE(mem_cgroup_files));
1077} 1156}
@@ -1084,9 +1163,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1084 struct mm_struct *mm; 1163 struct mm_struct *mm;
1085 struct mem_cgroup *mem, *old_mem; 1164 struct mem_cgroup *mem, *old_mem;
1086 1165
1087 if (mem_cgroup_subsys.disabled)
1088 return;
1089
1090 mm = get_task_mm(p); 1166 mm = get_task_mm(p);
1091 if (mm == NULL) 1167 if (mm == NULL)
1092 return; 1168 return;
@@ -1094,9 +1170,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1094 mem = mem_cgroup_from_cont(cont); 1170 mem = mem_cgroup_from_cont(cont);
1095 old_mem = mem_cgroup_from_cont(old_cont); 1171 old_mem = mem_cgroup_from_cont(old_cont);
1096 1172
1097 if (mem == old_mem)
1098 goto out;
1099
1100 /* 1173 /*
1101 * Only thread group leaders are allowed to migrate, the mm_struct is 1174 * Only thread group leaders are allowed to migrate, the mm_struct is
1102 * in effect owned by the leader 1175 * in effect owned by the leader
diff --git a/mm/memory.c b/mm/memory.c
index 2302d228fe04..1002f473f497 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -51,6 +51,7 @@
51#include <linux/init.h> 51#include <linux/init.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/memcontrol.h> 53#include <linux/memcontrol.h>
54#include <linux/mmu_notifier.h>
54 55
55#include <asm/pgalloc.h> 56#include <asm/pgalloc.h>
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
@@ -61,6 +62,8 @@
61#include <linux/swapops.h> 62#include <linux/swapops.h>
62#include <linux/elf.h> 63#include <linux/elf.h>
63 64
65#include "internal.h"
66
64#ifndef CONFIG_NEED_MULTIPLE_NODES 67#ifndef CONFIG_NEED_MULTIPLE_NODES
65/* use the per-pgdat data instead for discontigmem - mbligh */ 68/* use the per-pgdat data instead for discontigmem - mbligh */
66unsigned long max_mapnr; 69unsigned long max_mapnr;
@@ -211,7 +214,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
211 * 214 *
212 * Must be called with pagetable lock held. 215 * Must be called with pagetable lock held.
213 */ 216 */
214void free_pgd_range(struct mmu_gather **tlb, 217void free_pgd_range(struct mmu_gather *tlb,
215 unsigned long addr, unsigned long end, 218 unsigned long addr, unsigned long end,
216 unsigned long floor, unsigned long ceiling) 219 unsigned long floor, unsigned long ceiling)
217{ 220{
@@ -262,16 +265,16 @@ void free_pgd_range(struct mmu_gather **tlb,
262 return; 265 return;
263 266
264 start = addr; 267 start = addr;
265 pgd = pgd_offset((*tlb)->mm, addr); 268 pgd = pgd_offset(tlb->mm, addr);
266 do { 269 do {
267 next = pgd_addr_end(addr, end); 270 next = pgd_addr_end(addr, end);
268 if (pgd_none_or_clear_bad(pgd)) 271 if (pgd_none_or_clear_bad(pgd))
269 continue; 272 continue;
270 free_pud_range(*tlb, pgd, addr, next, floor, ceiling); 273 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
271 } while (pgd++, addr = next, addr != end); 274 } while (pgd++, addr = next, addr != end);
272} 275}
273 276
274void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, 277void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
275 unsigned long floor, unsigned long ceiling) 278 unsigned long floor, unsigned long ceiling)
276{ 279{
277 while (vma) { 280 while (vma) {
@@ -372,7 +375,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
372 * 375 *
373 * The calling function must still handle the error. 376 * The calling function must still handle the error.
374 */ 377 */
375void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) 378static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
379 unsigned long vaddr)
376{ 380{
377 printk(KERN_ERR "Bad pte = %08llx, process = %s, " 381 printk(KERN_ERR "Bad pte = %08llx, process = %s, "
378 "vm_flags = %lx, vaddr = %lx\n", 382 "vm_flags = %lx, vaddr = %lx\n",
@@ -649,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
649 unsigned long next; 653 unsigned long next;
650 unsigned long addr = vma->vm_start; 654 unsigned long addr = vma->vm_start;
651 unsigned long end = vma->vm_end; 655 unsigned long end = vma->vm_end;
656 int ret;
652 657
653 /* 658 /*
654 * Don't copy ptes where a page fault will fill them correctly. 659 * Don't copy ptes where a page fault will fill them correctly.
@@ -664,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
664 if (is_vm_hugetlb_page(vma)) 669 if (is_vm_hugetlb_page(vma))
665 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 670 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
666 671
672 /*
673 * We need to invalidate the secondary MMU mappings only when
674 * there could be a permission downgrade on the ptes of the
675 * parent mm. And a permission downgrade will only happen if
676 * is_cow_mapping() returns true.
677 */
678 if (is_cow_mapping(vma->vm_flags))
679 mmu_notifier_invalidate_range_start(src_mm, addr, end);
680
681 ret = 0;
667 dst_pgd = pgd_offset(dst_mm, addr); 682 dst_pgd = pgd_offset(dst_mm, addr);
668 src_pgd = pgd_offset(src_mm, addr); 683 src_pgd = pgd_offset(src_mm, addr);
669 do { 684 do {
670 next = pgd_addr_end(addr, end); 685 next = pgd_addr_end(addr, end);
671 if (pgd_none_or_clear_bad(src_pgd)) 686 if (pgd_none_or_clear_bad(src_pgd))
672 continue; 687 continue;
673 if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, 688 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
674 vma, addr, next)) 689 vma, addr, next))) {
675 return -ENOMEM; 690 ret = -ENOMEM;
691 break;
692 }
676 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 693 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
677 return 0; 694
695 if (is_cow_mapping(vma->vm_flags))
696 mmu_notifier_invalidate_range_end(src_mm,
697 vma->vm_start, end);
698 return ret;
678} 699}
679 700
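
The invalidate_range_start/end calls added here only matter to subsystems that mirror the page tables in a secondary MMU. A rough sketch of such a consumer, using only the mmu_notifier interface introduced by this merge (the callback bodies and all "example_" names are hypothetical):

#include <linux/mmu_notifier.h>

static void example_invalidate_range_start(struct mmu_notifier *mn,
					   struct mm_struct *mm,
					   unsigned long start,
					   unsigned long end)
{
	/* shoot down the device's mappings for [start, end) */
}

static void example_invalidate_range_end(struct mmu_notifier *mn,
					 struct mm_struct *mm,
					 unsigned long start,
					 unsigned long end)
{
	/* the primary ptes are stable again; allow re-faulting */
}

static struct mmu_notifier_ops example_mn_ops = {
	.invalidate_range_start	= example_invalidate_range_start,
	.invalidate_range_end	= example_invalidate_range_end,
};

static struct mmu_notifier example_mn = {
	.ops = &example_mn_ops,
};

static int example_attach(struct mm_struct *mm)
{
	/* once registered, the COW permission downgrade in
	 * copy_page_range() above is bracketed by the two callbacks */
	return mmu_notifier_register(&example_mn, mm);
}
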
680static unsigned long zap_pte_range(struct mmu_gather *tlb, 701static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -878,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
878 unsigned long start = start_addr; 899 unsigned long start = start_addr;
879 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; 900 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
880 int fullmm = (*tlbp)->fullmm; 901 int fullmm = (*tlbp)->fullmm;
902 struct mm_struct *mm = vma->vm_mm;
881 903
904 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
882 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { 905 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
883 unsigned long end; 906 unsigned long end;
884 907
@@ -899,9 +922,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
899 } 922 }
900 923
901 if (unlikely(is_vm_hugetlb_page(vma))) { 924 if (unlikely(is_vm_hugetlb_page(vma))) {
902 unmap_hugepage_range(vma, start, end); 925 /*
903 zap_work -= (end - start) / 926 * It is undesirable to test vma->vm_file as it
904 (HPAGE_SIZE / PAGE_SIZE); 927 * should be non-null for valid hugetlb area.
928 * However, vm_file will be NULL in the error
929 * cleanup path of do_mmap_pgoff. When
930 * hugetlbfs ->mmap method fails,
931 * do_mmap_pgoff() nullifies vma->vm_file
932 * before calling this function to clean up.
933 * Since no pte has actually been setup, it is
934 * safe to do nothing in this case.
935 */
936 if (vma->vm_file) {
937 unmap_hugepage_range(vma, start, end, NULL);
938 zap_work -= (end - start) /
939 pages_per_huge_page(hstate_vma(vma));
940 }
941
905 start = end; 942 start = end;
906 } else 943 } else
907 start = unmap_page_range(*tlbp, vma, 944 start = unmap_page_range(*tlbp, vma,
@@ -929,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
929 } 966 }
930 } 967 }
931out: 968out:
969 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
932 return start; /* which is now the end (or restart) address */ 970 return start; /* which is now the end (or restart) address */
933} 971}
934 972
@@ -956,6 +994,29 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
956 return end; 994 return end;
957} 995}
958 996
997/**
998 * zap_vma_ptes - remove ptes mapping the vma
999 * @vma: vm_area_struct holding ptes to be zapped
1000 * @address: starting address of pages to zap
1001 * @size: number of bytes to zap
1002 *
1003 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
1004 *
1005 * The entire address range must be fully contained within the vma.
1006 *
1007 * Returns 0 if successful.
1008 */
1009int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1010 unsigned long size)
1011{
1012 if (address < vma->vm_start || address + size > vma->vm_end ||
1013 !(vma->vm_flags & VM_PFNMAP))
1014 return -1;
1015 zap_page_range(vma, address, size, NULL);
1016 return 0;
1017}
1018EXPORT_SYMBOL_GPL(zap_vma_ptes);
1019
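
zap_vma_ptes() is exported for drivers that manage VM_PFNMAP mappings and occasionally need to pull the ptes out from under userspace, for instance on a device reset. A hypothetical use (the driver-side names are invented):

#include <linux/mm.h>

/* hypothetical: invalidate the user's mapping of device memory so the
 * next access faults and the driver can decide what to map again */
static int example_invalidate_user_mapping(struct vm_area_struct *vma)
{
	if (zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start))
		return -EINVAL;	/* not a VM_PFNMAP vma, or range outside it */
	return 0;
}
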
959/* 1020/*
960 * Do a quick page-table lookup for a single page. 1021 * Do a quick page-table lookup for a single page.
961 */ 1022 */
@@ -982,19 +1043,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
982 goto no_page_table; 1043 goto no_page_table;
983 1044
984 pud = pud_offset(pgd, address); 1045 pud = pud_offset(pgd, address);
985 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 1046 if (pud_none(*pud))
986 goto no_page_table; 1047 goto no_page_table;
987 1048 if (pud_huge(*pud)) {
1049 BUG_ON(flags & FOLL_GET);
1050 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1051 goto out;
1052 }
1053 if (unlikely(pud_bad(*pud)))
1054 goto no_page_table;
1055
988 pmd = pmd_offset(pud, address); 1056 pmd = pmd_offset(pud, address);
989 if (pmd_none(*pmd)) 1057 if (pmd_none(*pmd))
990 goto no_page_table; 1058 goto no_page_table;
991
992 if (pmd_huge(*pmd)) { 1059 if (pmd_huge(*pmd)) {
993 BUG_ON(flags & FOLL_GET); 1060 BUG_ON(flags & FOLL_GET);
994 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1061 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
995 goto out; 1062 goto out;
996 } 1063 }
997
998 if (unlikely(pmd_bad(*pmd))) 1064 if (unlikely(pmd_bad(*pmd)))
999 goto no_page_table; 1065 goto no_page_table;
1000 1066
@@ -1058,11 +1124,9 @@ static inline int use_zero_page(struct vm_area_struct *vma)
1058 if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) 1124 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1059 return 0; 1125 return 0;
1060 /* 1126 /*
1061 * And if we have a fault or a nopfn routine, it's not an 1127 * And if we have a fault routine, it's not an anonymous region.
1062 * anonymous region.
1063 */ 1128 */
1064 return !vma->vm_ops || 1129 return !vma->vm_ops || !vma->vm_ops->fault;
1065 (!vma->vm_ops->fault && !vma->vm_ops->nopfn);
1066} 1130}
1067 1131
1068int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1132int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -1338,6 +1402,11 @@ out:
1338 * 1402 *
1339 * This function should only be called from a vm_ops->fault handler, and 1403 * This function should only be called from a vm_ops->fault handler, and
1340 * in that case the handler should return NULL. 1404 * in that case the handler should return NULL.
1405 *
1406 * vma cannot be a COW mapping.
1407 *
1408 * As this is called only for pages that do not currently exist, we
1409 * do not need to flush old virtual caches or the TLB.
1341 */ 1410 */
1342int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1411int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1343 unsigned long pfn) 1412 unsigned long pfn)
@@ -1548,6 +1617,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1548 unsigned long next; 1617 unsigned long next;
1549 int err; 1618 int err;
1550 1619
1620 BUG_ON(pud_huge(*pud));
1621
1551 pmd = pmd_alloc(mm, pud, addr); 1622 pmd = pmd_alloc(mm, pud, addr);
1552 if (!pmd) 1623 if (!pmd)
1553 return -ENOMEM; 1624 return -ENOMEM;
@@ -1589,10 +1660,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1589{ 1660{
1590 pgd_t *pgd; 1661 pgd_t *pgd;
1591 unsigned long next; 1662 unsigned long next;
1592 unsigned long end = addr + size; 1663 unsigned long start = addr, end = addr + size;
1593 int err; 1664 int err;
1594 1665
1595 BUG_ON(addr >= end); 1666 BUG_ON(addr >= end);
1667 mmu_notifier_invalidate_range_start(mm, start, end);
1596 pgd = pgd_offset(mm, addr); 1668 pgd = pgd_offset(mm, addr);
1597 do { 1669 do {
1598 next = pgd_addr_end(addr, end); 1670 next = pgd_addr_end(addr, end);
@@ -1600,6 +1672,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1600 if (err) 1672 if (err)
1601 break; 1673 break;
1602 } while (pgd++, addr = next, addr != end); 1674 } while (pgd++, addr = next, addr != end);
1675 mmu_notifier_invalidate_range_end(mm, start, end);
1603 return err; 1676 return err;
1604} 1677}
1605EXPORT_SYMBOL_GPL(apply_to_page_range); 1678EXPORT_SYMBOL_GPL(apply_to_page_range);
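
apply_to_page_range() allocates any missing page-table levels and invokes the callback for every pte in the range, now bracketed by the mmu notifier calls. A sketch under the assumption that the callback type is the kernel's pte_fn_t, taking (pte, page-table token, address, private cookie); the "example_" functions are invented:

#include <linux/mm.h>

/* trivial callback: just count how many pte slots were visited */
static int example_count_pte(pte_t *pte, pgtable_t token,
			     unsigned long addr, void *data)
{
	(*(unsigned long *)data)++;
	return 0;	/* a non-zero return would abort the walk */
}

static unsigned long example_count_ptes(struct mm_struct *mm,
					unsigned long addr,
					unsigned long size)
{
	unsigned long count = 0;

	apply_to_page_range(mm, addr, size, example_count_pte, &count);
	return count;
}
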
@@ -1716,7 +1789,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1716 * not dirty accountable. 1789 * not dirty accountable.
1717 */ 1790 */
1718 if (PageAnon(old_page)) { 1791 if (PageAnon(old_page)) {
1719 if (!TestSetPageLocked(old_page)) { 1792 if (trylock_page(old_page)) {
1720 reuse = can_share_swap_page(old_page); 1793 reuse = can_share_swap_page(old_page);
1721 unlock_page(old_page); 1794 unlock_page(old_page);
1722 } 1795 }
@@ -1812,7 +1885,7 @@ gotten:
1812 * seen in the presence of one thread doing SMC and another 1885 * seen in the presence of one thread doing SMC and another
1813 * thread doing COW. 1886 * thread doing COW.
1814 */ 1887 */
1815 ptep_clear_flush(vma, address, page_table); 1888 ptep_clear_flush_notify(vma, address, page_table);
1816 set_pte_at(mm, address, page_table, entry); 1889 set_pte_at(mm, address, page_table, entry);
1817 update_mmu_cache(vma, address, entry); 1890 update_mmu_cache(vma, address, entry);
1818 lru_cache_add_active(new_page); 1891 lru_cache_add_active(new_page);
@@ -2501,59 +2574,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2501 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 2574 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2502} 2575}
2503 2576
2504
2505/*
2506 * do_no_pfn() tries to create a new page mapping for a page without
2507 * a struct_page backing it
2508 *
2509 * As this is called only for pages that do not currently exist, we
2510 * do not need to flush old virtual caches or the TLB.
2511 *
2512 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2513 * but allow concurrent faults), and pte mapped but not yet locked.
2514 * We return with mmap_sem still held, but pte unmapped and unlocked.
2515 *
2516 * It is expected that the ->nopfn handler always returns the same pfn
2517 * for a given virtual mapping.
2518 *
2519 * Mark this `noinline' to prevent it from bloating the main pagefault code.
2520 */
2521static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
2522 unsigned long address, pte_t *page_table, pmd_t *pmd,
2523 int write_access)
2524{
2525 spinlock_t *ptl;
2526 pte_t entry;
2527 unsigned long pfn;
2528
2529 pte_unmap(page_table);
2530 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2531 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2532
2533 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
2534
2535 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2536
2537 if (unlikely(pfn == NOPFN_OOM))
2538 return VM_FAULT_OOM;
2539 else if (unlikely(pfn == NOPFN_SIGBUS))
2540 return VM_FAULT_SIGBUS;
2541 else if (unlikely(pfn == NOPFN_REFAULT))
2542 return 0;
2543
2544 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2545
2546 /* Only go through if we didn't race with anybody else... */
2547 if (pte_none(*page_table)) {
2548 entry = pfn_pte(pfn, vma->vm_page_prot);
2549 if (write_access)
2550 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2551 set_pte_at(mm, address, page_table, entry);
2552 }
2553 pte_unmap_unlock(page_table, ptl);
2554 return 0;
2555}
2556
2557/* 2577/*
2558 * Fault of a previously existing named mapping. Repopulate the pte 2578 * Fault of a previously existing named mapping. Repopulate the pte
2559 * from the encoded file_pte if possible. This enables swappable 2579 * from the encoded file_pte if possible. This enables swappable
@@ -2614,9 +2634,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2614 if (likely(vma->vm_ops->fault)) 2634 if (likely(vma->vm_ops->fault))
2615 return do_linear_fault(mm, vma, address, 2635 return do_linear_fault(mm, vma, address,
2616 pte, pmd, write_access, entry); 2636 pte, pmd, write_access, entry);
2617 if (unlikely(vma->vm_ops->nopfn))
2618 return do_no_pfn(mm, vma, address, pte,
2619 pmd, write_access);
2620 } 2637 }
2621 return do_anonymous_page(mm, vma, address, 2638 return do_anonymous_page(mm, vma, address,
2622 pte, pmd, write_access); 2639 pte, pmd, write_access);
@@ -2748,16 +2765,26 @@ int make_pages_present(unsigned long addr, unsigned long end)
2748 2765
2749 vma = find_vma(current->mm, addr); 2766 vma = find_vma(current->mm, addr);
2750 if (!vma) 2767 if (!vma)
2751 return -1; 2768 return -ENOMEM;
2752 write = (vma->vm_flags & VM_WRITE) != 0; 2769 write = (vma->vm_flags & VM_WRITE) != 0;
2753 BUG_ON(addr >= end); 2770 BUG_ON(addr >= end);
2754 BUG_ON(end > vma->vm_end); 2771 BUG_ON(end > vma->vm_end);
2755 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; 2772 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
2756 ret = get_user_pages(current, current->mm, addr, 2773 ret = get_user_pages(current, current->mm, addr,
2757 len, write, 0, NULL, NULL); 2774 len, write, 0, NULL, NULL);
2758 if (ret < 0) 2775 if (ret < 0) {
2776 /*
2777 SUS requires unusual return values from mlock:
2778 - an invalid address should generate ENOMEM.
2779 - running out of memory should generate EAGAIN.
2780 */
2781 if (ret == -EFAULT)
2782 ret = -ENOMEM;
2783 else if (ret == -ENOMEM)
2784 ret = -EAGAIN;
2759 return ret; 2785 return ret;
2760 return ret == len ? 0 : -1; 2786 }
2787 return ret == len ? 0 : -ENOMEM;
2761} 2788}
2762 2789
2763#if !defined(__HAVE_ARCH_GATE_AREA) 2790#if !defined(__HAVE_ARCH_GATE_AREA)
@@ -2804,6 +2831,86 @@ int in_gate_area_no_task(unsigned long addr)
2804 2831
2805#endif /* __HAVE_ARCH_GATE_AREA */ 2832#endif /* __HAVE_ARCH_GATE_AREA */
2806 2833
2834#ifdef CONFIG_HAVE_IOREMAP_PROT
2835static resource_size_t follow_phys(struct vm_area_struct *vma,
2836 unsigned long address, unsigned int flags,
2837 unsigned long *prot)
2838{
2839 pgd_t *pgd;
2840 pud_t *pud;
2841 pmd_t *pmd;
2842 pte_t *ptep, pte;
2843 spinlock_t *ptl;
2844 resource_size_t phys_addr = 0;
2845 struct mm_struct *mm = vma->vm_mm;
2846
2847 VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
2848
2849 pgd = pgd_offset(mm, address);
2850 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
2851 goto no_page_table;
2852
2853 pud = pud_offset(pgd, address);
2854 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
2855 goto no_page_table;
2856
2857 pmd = pmd_offset(pud, address);
2858 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
2859 goto no_page_table;
2860
2861 /* We cannot handle huge page PFN maps. Luckily they don't exist. */
2862 if (pmd_huge(*pmd))
2863 goto no_page_table;
2864
2865 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
2866 if (!ptep)
2867 goto out;
2868
2869 pte = *ptep;
2870 if (!pte_present(pte))
2871 goto unlock;
2872 if ((flags & FOLL_WRITE) && !pte_write(pte))
2873 goto unlock;
2874 phys_addr = pte_pfn(pte);
2875 phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
2876
2877 *prot = pgprot_val(pte_pgprot(pte));
2878
2879unlock:
2880 pte_unmap_unlock(ptep, ptl);
2881out:
2882 return phys_addr;
2883no_page_table:
2884 return 0;
2885}
2886
2887int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
2888 void *buf, int len, int write)
2889{
2890 resource_size_t phys_addr;
2891 unsigned long prot = 0;
2892 void *maddr;
2893 int offset = addr & (PAGE_SIZE-1);
2894
2895 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
2896 return -EINVAL;
2897
2898 phys_addr = follow_phys(vma, addr, write, &prot);
2899
2900 if (!phys_addr)
2901 return -EINVAL;
2902
2903 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
2904 if (write)
2905 memcpy_toio(maddr + offset, buf, len);
2906 else
2907 memcpy_fromio(buf, maddr + offset, len);
2908 iounmap(maddr);
2909
2910 return len;
2911}
2912#endif
2913
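
With follow_phys()/generic_access_phys() in place, a driver that maps device memory with VM_IO|VM_PFNMAP can let ptrace and friends peek at it simply by pointing its vm_ops->access at the generic helper. A hypothetical hookup (driver names invented; CONFIG_HAVE_IOREMAP_PROT assumed):

#include <linux/fs.h>
#include <linux/mm.h>

static struct vm_operations_struct example_vm_ops = {
#ifdef CONFIG_HAVE_IOREMAP_PROT
	.access = generic_access_phys,	/* generic helper added above */
#endif
};

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &example_vm_ops;
	/* ... remap the device's pfns here (io_remap_pfn_range()) ... */
	return 0;
}
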
2807/* 2914/*
2808 * Access another process' address space. 2915 * Access another process' address space.
2809 * Source/target buffer must be kernel space, 2916 * Source/target buffer must be kernel space,
@@ -2813,7 +2920,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2813{ 2920{
2814 struct mm_struct *mm; 2921 struct mm_struct *mm;
2815 struct vm_area_struct *vma; 2922 struct vm_area_struct *vma;
2816 struct page *page;
2817 void *old_buf = buf; 2923 void *old_buf = buf;
2818 2924
2819 mm = get_task_mm(tsk); 2925 mm = get_task_mm(tsk);
@@ -2825,28 +2931,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2825 while (len) { 2931 while (len) {
2826 int bytes, ret, offset; 2932 int bytes, ret, offset;
2827 void *maddr; 2933 void *maddr;
2934 struct page *page = NULL;
2828 2935
2829 ret = get_user_pages(tsk, mm, addr, 1, 2936 ret = get_user_pages(tsk, mm, addr, 1,
2830 write, 1, &page, &vma); 2937 write, 1, &page, &vma);
2831 if (ret <= 0) 2938 if (ret <= 0) {
2832 break; 2939 /*
2833 2940 * Check if this is a VM_IO | VM_PFNMAP VMA, which
2834 bytes = len; 2941 * we can access using slightly different code.
2835 offset = addr & (PAGE_SIZE-1); 2942 */
2836 if (bytes > PAGE_SIZE-offset) 2943#ifdef CONFIG_HAVE_IOREMAP_PROT
2837 bytes = PAGE_SIZE-offset; 2944 vma = find_vma(mm, addr);
2838 2945 if (!vma)
2839 maddr = kmap(page); 2946 break;
2840 if (write) { 2947 if (vma->vm_ops && vma->vm_ops->access)
2841 copy_to_user_page(vma, page, addr, 2948 ret = vma->vm_ops->access(vma, addr, buf,
2842 maddr + offset, buf, bytes); 2949 len, write);
2843 set_page_dirty_lock(page); 2950 if (ret <= 0)
2951#endif
2952 break;
2953 bytes = ret;
2844 } else { 2954 } else {
2845 copy_from_user_page(vma, page, addr, 2955 bytes = len;
2846 buf, maddr + offset, bytes); 2956 offset = addr & (PAGE_SIZE-1);
2957 if (bytes > PAGE_SIZE-offset)
2958 bytes = PAGE_SIZE-offset;
2959
2960 maddr = kmap(page);
2961 if (write) {
2962 copy_to_user_page(vma, page, addr,
2963 maddr + offset, buf, bytes);
2964 set_page_dirty_lock(page);
2965 } else {
2966 copy_from_user_page(vma, page, addr,
2967 buf, maddr + offset, bytes);
2968 }
2969 kunmap(page);
2970 page_cache_release(page);
2847 } 2971 }
2848 kunmap(page);
2849 page_cache_release(page);
2850 len -= bytes; 2972 len -= bytes;
2851 buf += bytes; 2973 buf += bytes;
2852 addr += bytes; 2974 addr += bytes;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 833f854eabe5..89fee2dcb039 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -62,9 +62,9 @@ static void release_memory_resource(struct resource *res)
62 62
63#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 63#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
64#ifndef CONFIG_SPARSEMEM_VMEMMAP 64#ifndef CONFIG_SPARSEMEM_VMEMMAP
65static void get_page_bootmem(unsigned long info, struct page *page, int magic) 65static void get_page_bootmem(unsigned long info, struct page *page, int type)
66{ 66{
67 atomic_set(&page->_mapcount, magic); 67 atomic_set(&page->_mapcount, type);
68 SetPagePrivate(page); 68 SetPagePrivate(page);
69 set_page_private(page, info); 69 set_page_private(page, info);
70 atomic_inc(&page->_count); 70 atomic_inc(&page->_count);
@@ -72,10 +72,10 @@ static void get_page_bootmem(unsigned long info, struct page *page, int magic)
72 72
73void put_page_bootmem(struct page *page) 73void put_page_bootmem(struct page *page)
74{ 74{
75 int magic; 75 int type;
76 76
77 magic = atomic_read(&page->_mapcount); 77 type = atomic_read(&page->_mapcount);
78 BUG_ON(magic >= -1); 78 BUG_ON(type >= -1);
79 79
80 if (atomic_dec_return(&page->_count) == 1) { 80 if (atomic_dec_return(&page->_count) == 1) {
81 ClearPagePrivate(page); 81 ClearPagePrivate(page);
@@ -86,7 +86,7 @@ void put_page_bootmem(struct page *page)
86 86
87} 87}
88 88
89void register_page_bootmem_info_section(unsigned long start_pfn) 89static void register_page_bootmem_info_section(unsigned long start_pfn)
90{ 90{
91 unsigned long *usemap, mapsize, section_nr, i; 91 unsigned long *usemap, mapsize, section_nr, i;
92 struct mem_section *ms; 92 struct mem_section *ms;
@@ -119,7 +119,7 @@ void register_page_bootmem_info_section(unsigned long start_pfn)
119 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 119 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
120 120
121 for (i = 0; i < mapsize; i++, page++) 121 for (i = 0; i < mapsize; i++, page++)
122 get_page_bootmem(section_nr, page, MIX_INFO); 122 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
123 123
124} 124}
125 125
@@ -429,7 +429,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
429 429
430 if (need_zonelists_rebuild) 430 if (need_zonelists_rebuild)
431 build_all_zonelists(); 431 build_all_zonelists();
432 vm_total_pages = nr_free_pagecache_pages(); 432 else
433 vm_total_pages = nr_free_pagecache_pages();
434
433 writeback_set_ratelimit(); 435 writeback_set_ratelimit();
434 436
435 if (onlined_pages) 437 if (onlined_pages)
@@ -455,7 +457,7 @@ static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
455 /* we can use NODE_DATA(nid) from here */ 457 /* we can use NODE_DATA(nid) from here */
456 458
457 /* init node's zones as empty zones, we don't have any present pages.*/ 459 /* init node's zones as empty zones, we don't have any present pages.*/
458 free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); 460 free_area_init_node(nid, zones_size, start_pfn, zholes_size);
459 461
460 return pgdat; 462 return pgdat;
461} 463}
@@ -521,6 +523,66 @@ EXPORT_SYMBOL_GPL(add_memory);
521 523
522#ifdef CONFIG_MEMORY_HOTREMOVE 524#ifdef CONFIG_MEMORY_HOTREMOVE
523/* 525/*
526 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
527 * set and the size of the free page is given by page_order(). Using this,
528 * the function determines if the pageblock contains only free pages.
 529 * Due to buddy constraints, a free page at least the size of a pageblock
 530 * will be located at the start of the pageblock.
531 */
532static inline int pageblock_free(struct page *page)
533{
534 return PageBuddy(page) && page_order(page) >= pageblock_order;
535}
536
537/* Return the start of the next active pageblock after a given page */
538static struct page *next_active_pageblock(struct page *page)
539{
540 int pageblocks_stride;
541
542 /* Ensure the starting page is pageblock-aligned */
543 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
544
545 /* Move forward by at least 1 * pageblock_nr_pages */
546 pageblocks_stride = 1;
547
548 /* If the entire pageblock is free, move to the end of free page */
549 if (pageblock_free(page))
550 pageblocks_stride += page_order(page) - pageblock_order;
551
552 return page + (pageblocks_stride * pageblock_nr_pages);
553}
554
555/* Checks if this range of memory is likely to be hot-removable. */
556int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
557{
558 int type;
559 struct page *page = pfn_to_page(start_pfn);
560 struct page *end_page = page + nr_pages;
561
562 /* Check the starting page of each pageblock within the range */
563 for (; page < end_page; page = next_active_pageblock(page)) {
564 type = get_pageblock_migratetype(page);
565
566 /*
567 * A pageblock containing MOVABLE or free pages is considered
568 * removable
569 */
570 if (type != MIGRATE_MOVABLE && !pageblock_free(page))
571 return 0;
572
573 /*
574 * A pageblock starting with a PageReserved page is not
575 * considered removable.
576 */
577 if (PageReserved(page))
578 return 0;
579 }
580
581 /* All pageblocks in the memory block are likely to be hot-removable */
582 return 1;
583}
584
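
is_mem_section_removable() only reports a likelihood, so a consumer (the memory sysfs code is the obvious one) would AND it over every present section of a memory block. A rough sketch, assuming the usual SPARSEMEM helpers; the function itself is hypothetical:

#include <linux/memory_hotplug.h>
#include <linux/mmzone.h>

/* hypothetical: 0 = definitely not removable, 1 = likely removable */
static int example_block_removable(unsigned long start_section_nr,
				   unsigned long nr_sections)
{
	unsigned long nr;
	int removable = 1;

	for (nr = start_section_nr; nr < start_section_nr + nr_sections; nr++) {
		if (!present_section_nr(nr))
			continue;
		removable &= is_mem_section_removable(section_nr_to_pfn(nr),
						      PAGES_PER_SECTION);
	}
	return removable;
}
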
585/*
524 * Confirm all pages in a range [start, end) is belongs to the same zone. 586 * Confirm all pages in a range [start, end) is belongs to the same zone.
525 */ 587 */
526static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) 588static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c94e58b192c3..83369058ec13 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -803,7 +803,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
803int do_migrate_pages(struct mm_struct *mm, 803int do_migrate_pages(struct mm_struct *mm,
804 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 804 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
805{ 805{
806 LIST_HEAD(pagelist);
807 int busy = 0; 806 int busy = 0;
808 int err = 0; 807 int err = 0;
809 nodemask_t tmp; 808 nodemask_t tmp;
@@ -1481,7 +1480,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1481 1480
1482 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { 1481 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1483 zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1482 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1484 HPAGE_SHIFT), gfp_flags); 1483 huge_page_shift(hstate_vma(vma))), gfp_flags);
1485 } else { 1484 } else {
1486 zl = policy_zonelist(gfp_flags, *mpol); 1485 zl = policy_zonelist(gfp_flags, *mpol);
1487 if ((*mpol)->mode == MPOL_BIND) 1486 if ((*mpol)->mode == MPOL_BIND)
@@ -2220,9 +2219,12 @@ static void check_huge_range(struct vm_area_struct *vma,
2220{ 2219{
2221 unsigned long addr; 2220 unsigned long addr;
2222 struct page *page; 2221 struct page *page;
2222 struct hstate *h = hstate_vma(vma);
2223 unsigned long sz = huge_page_size(h);
2223 2224
2224 for (addr = start; addr < end; addr += HPAGE_SIZE) { 2225 for (addr = start; addr < end; addr += sz) {
2225 pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); 2226 pte_t *ptep = huge_pte_offset(vma->vm_mm,
2227 addr & huge_page_mask(h));
2226 pte_t pte; 2228 pte_t pte;
2227 2229
2228 if (!ptep) 2230 if (!ptep)
diff --git a/mm/migrate.c b/mm/migrate.c
index 55bd355d170d..2a80136b23bb 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -30,6 +30,7 @@
30#include <linux/vmalloc.h> 30#include <linux/vmalloc.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/syscalls.h>
33 34
34#include "internal.h" 35#include "internal.h"
35 36
@@ -284,7 +285,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
284 285
285 page = migration_entry_to_page(entry); 286 page = migration_entry_to_page(entry);
286 287
287 get_page(page); 288 /*
289 * Once radix-tree replacement of page migration started, page_count
290 * *must* be zero. And, we don't want to call wait_on_page_locked()
291 * against a page without get_page().
 292 * So we use get_page_unless_zero() here. Even if it fails, the page
 293 * fault will simply occur again.
294 */
295 if (!get_page_unless_zero(page))
296 goto out;
288 pte_unmap_unlock(ptep, ptl); 297 pte_unmap_unlock(ptep, ptl);
289 wait_on_page_locked(page); 298 wait_on_page_locked(page);
290 put_page(page); 299 put_page(page);
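
The get_page_unless_zero() change is the usual speculative-reference idiom: take the pin only if the page still has references, otherwise back off and let the caller retry (here, by simply taking the fault again). In isolation the idiom looks like this purely illustrative sketch:

#include <linux/mm.h>
#include <linux/pagemap.h>

static void example_wait_for_page(struct page *page)
{
	/* the page may already be on its way back to the allocator; only
	 * wait on it if we managed to take a real reference */
	if (!get_page_unless_zero(page))
		return;

	wait_on_page_locked(page);
	put_page(page);
}
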
@@ -304,6 +313,7 @@ out:
304static int migrate_page_move_mapping(struct address_space *mapping, 313static int migrate_page_move_mapping(struct address_space *mapping,
305 struct page *newpage, struct page *page) 314 struct page *newpage, struct page *page)
306{ 315{
316 int expected_count;
307 void **pslot; 317 void **pslot;
308 318
309 if (!mapping) { 319 if (!mapping) {
@@ -313,14 +323,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
313 return 0; 323 return 0;
314 } 324 }
315 325
316 write_lock_irq(&mapping->tree_lock); 326 spin_lock_irq(&mapping->tree_lock);
317 327
318 pslot = radix_tree_lookup_slot(&mapping->page_tree, 328 pslot = radix_tree_lookup_slot(&mapping->page_tree,
319 page_index(page)); 329 page_index(page));
320 330
321 if (page_count(page) != 2 + !!PagePrivate(page) || 331 expected_count = 2 + !!PagePrivate(page);
332 if (page_count(page) != expected_count ||
322 (struct page *)radix_tree_deref_slot(pslot) != page) { 333 (struct page *)radix_tree_deref_slot(pslot) != page) {
323 write_unlock_irq(&mapping->tree_lock); 334 spin_unlock_irq(&mapping->tree_lock);
335 return -EAGAIN;
336 }
337
338 if (!page_freeze_refs(page, expected_count)) {
339 spin_unlock_irq(&mapping->tree_lock);
324 return -EAGAIN; 340 return -EAGAIN;
325 } 341 }
326 342
@@ -337,6 +353,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
337 353
338 radix_tree_replace_slot(pslot, newpage); 354 radix_tree_replace_slot(pslot, newpage);
339 355
356 page_unfreeze_refs(page, expected_count);
340 /* 357 /*
341 * Drop cache reference from old page. 358 * Drop cache reference from old page.
342 * We know this isn't the last reference. 359 * We know this isn't the last reference.
@@ -356,7 +373,9 @@ static int migrate_page_move_mapping(struct address_space *mapping,
356 __dec_zone_page_state(page, NR_FILE_PAGES); 373 __dec_zone_page_state(page, NR_FILE_PAGES);
357 __inc_zone_page_state(newpage, NR_FILE_PAGES); 374 __inc_zone_page_state(newpage, NR_FILE_PAGES);
358 375
359 write_unlock_irq(&mapping->tree_lock); 376 spin_unlock_irq(&mapping->tree_lock);
377 if (!PageSwapCache(newpage))
378 mem_cgroup_uncharge_cache_page(page);
360 379
361 return 0; 380 return 0;
362} 381}
@@ -586,7 +605,7 @@ static int move_to_new_page(struct page *newpage, struct page *page)
586 * establishing additional references. We are the only one 605 * establishing additional references. We are the only one
587 * holding a reference to the new page at this point. 606 * holding a reference to the new page at this point.
588 */ 607 */
589 if (TestSetPageLocked(newpage)) 608 if (!trylock_page(newpage))
590 BUG(); 609 BUG();
591 610
592 /* Prepare mapping for the new page.*/ 611 /* Prepare mapping for the new page.*/
@@ -610,7 +629,6 @@ static int move_to_new_page(struct page *newpage, struct page *page)
610 rc = fallback_migrate_page(mapping, newpage, page); 629 rc = fallback_migrate_page(mapping, newpage, page);
611 630
612 if (!rc) { 631 if (!rc) {
613 mem_cgroup_page_migration(page, newpage);
614 remove_migration_ptes(page, newpage); 632 remove_migration_ptes(page, newpage);
615 } else 633 } else
616 newpage->mapping = NULL; 634 newpage->mapping = NULL;
@@ -640,8 +658,16 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
640 /* page was freed from under us. So we are done. */ 658 /* page was freed from under us. So we are done. */
641 goto move_newpage; 659 goto move_newpage;
642 660
661 charge = mem_cgroup_prepare_migration(page, newpage);
662 if (charge == -ENOMEM) {
663 rc = -ENOMEM;
664 goto move_newpage;
665 }
666 /* prepare cgroup just returns 0 or -ENOMEM */
667 BUG_ON(charge);
668
643 rc = -EAGAIN; 669 rc = -EAGAIN;
644 if (TestSetPageLocked(page)) { 670 if (!trylock_page(page)) {
645 if (!force) 671 if (!force)
646 goto move_newpage; 672 goto move_newpage;
647 lock_page(page); 673 lock_page(page);
@@ -691,19 +717,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
691 goto rcu_unlock; 717 goto rcu_unlock;
692 } 718 }
693 719
694 charge = mem_cgroup_prepare_migration(page);
695 /* Establish migration ptes or remove ptes */ 720 /* Establish migration ptes or remove ptes */
696 try_to_unmap(page, 1); 721 try_to_unmap(page, 1);
697 722
698 if (!page_mapped(page)) 723 if (!page_mapped(page))
699 rc = move_to_new_page(newpage, page); 724 rc = move_to_new_page(newpage, page);
700 725
701 if (rc) { 726 if (rc)
702 remove_migration_ptes(page, page); 727 remove_migration_ptes(page, page);
703 if (charge)
704 mem_cgroup_end_migration(page);
705 } else if (charge)
706 mem_cgroup_end_migration(newpage);
707rcu_unlock: 728rcu_unlock:
708 if (rcu_locked) 729 if (rcu_locked)
709 rcu_read_unlock(); 730 rcu_read_unlock();
@@ -724,6 +745,8 @@ unlock:
724 } 745 }
725 746
726move_newpage: 747move_newpage:
748 if (!charge)
749 mem_cgroup_end_migration(newpage);
727 /* 750 /*
728 * Move the new page to the LRU. If migration was not successful 751 * Move the new page to the LRU. If migration was not successful
729 * then this will free the page. 752 * then this will free the page.
@@ -1070,7 +1093,6 @@ out2:
1070 mmput(mm); 1093 mmput(mm);
1071 return err; 1094 return err;
1072} 1095}
1073#endif
1074 1096
1075/* 1097/*
1076 * Call migration functions in the vma_ops that may prepare 1098 * Call migration functions in the vma_ops that may prepare
@@ -1092,3 +1114,4 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1092 } 1114 }
1093 return err; 1115 return err;
1094} 1116}
1117#endif
diff --git a/mm/mlock.c b/mm/mlock.c
index 7b2656055d6a..01fbe93eff5c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -78,8 +78,6 @@ success:
78 78
79 mm->locked_vm -= pages; 79 mm->locked_vm -= pages;
80out: 80out:
81 if (ret == -ENOMEM)
82 ret = -EAGAIN;
83 return ret; 81 return ret;
84} 82}
85 83
diff --git a/mm/mm_init.c b/mm/mm_init.c
new file mode 100644
index 000000000000..936ef2efd892
--- /dev/null
+++ b/mm/mm_init.c
@@ -0,0 +1,152 @@
1/*
2 * mm_init.c - Memory initialisation verification and debugging
3 *
4 * Copyright 2008 IBM Corporation, 2008
5 * Author Mel Gorman <mel@csn.ul.ie>
6 *
7 */
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <linux/kobject.h>
11#include <linux/module.h>
12#include "internal.h"
13
14#ifdef CONFIG_DEBUG_MEMORY_INIT
15int __meminitdata mminit_loglevel;
16
17#ifndef SECTIONS_SHIFT
18#define SECTIONS_SHIFT 0
19#endif
20
21/* The zonelists are simply reported, validation is manual. */
22void mminit_verify_zonelist(void)
23{
24 int nid;
25
26 if (mminit_loglevel < MMINIT_VERIFY)
27 return;
28
29 for_each_online_node(nid) {
30 pg_data_t *pgdat = NODE_DATA(nid);
31 struct zone *zone;
32 struct zoneref *z;
33 struct zonelist *zonelist;
34 int i, listid, zoneid;
35
36 BUG_ON(MAX_ZONELISTS > 2);
37 for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
38
39 /* Identify the zone and nodelist */
40 zoneid = i % MAX_NR_ZONES;
41 listid = i / MAX_NR_ZONES;
42 zonelist = &pgdat->node_zonelists[listid];
43 zone = &pgdat->node_zones[zoneid];
44 if (!populated_zone(zone))
45 continue;
46
47 /* Print information about the zonelist */
48 printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
49 listid > 0 ? "thisnode" : "general", nid,
50 zone->name);
51
52 /* Iterate the zonelist */
53 for_each_zone_zonelist(zone, z, zonelist, zoneid) {
54#ifdef CONFIG_NUMA
55 printk(KERN_CONT "%d:%s ",
56 zone->node, zone->name);
57#else
58 printk(KERN_CONT "0:%s ", zone->name);
59#endif /* CONFIG_NUMA */
60 }
61 printk(KERN_CONT "\n");
62 }
63 }
64}
65
66void __init mminit_verify_pageflags_layout(void)
67{
68 int shift, width;
69 unsigned long or_mask, add_mask;
70
71 shift = 8 * sizeof(unsigned long);
72 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH;
73 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
74 "Section %d Node %d Zone %d Flags %d\n",
75 SECTIONS_WIDTH,
76 NODES_WIDTH,
77 ZONES_WIDTH,
78 NR_PAGEFLAGS);
79 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
80 "Section %d Node %d Zone %d\n",
81 SECTIONS_SHIFT,
82 NODES_SHIFT,
83 ZONES_SHIFT);
84 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets",
85 "Section %lu Node %lu Zone %lu\n",
86 (unsigned long)SECTIONS_PGSHIFT,
87 (unsigned long)NODES_PGSHIFT,
88 (unsigned long)ZONES_PGSHIFT);
89 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid",
90 "Zone ID: %lu -> %lu\n",
91 (unsigned long)ZONEID_PGOFF,
92 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT));
93 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
94 "location: %d -> %d unused %d -> %d flags %d -> %d\n",
95 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
96#ifdef NODE_NOT_IN_PAGE_FLAGS
97 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
98 "Node not in page flags");
99#endif
100
101 if (SECTIONS_WIDTH) {
102 shift -= SECTIONS_WIDTH;
103 BUG_ON(shift != SECTIONS_PGSHIFT);
104 }
105 if (NODES_WIDTH) {
106 shift -= NODES_WIDTH;
107 BUG_ON(shift != NODES_PGSHIFT);
108 }
109 if (ZONES_WIDTH) {
110 shift -= ZONES_WIDTH;
111 BUG_ON(shift != ZONES_PGSHIFT);
112 }
113
114 /* Check for bitmask overlaps */
115 or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
116 (NODES_MASK << NODES_PGSHIFT) |
117 (SECTIONS_MASK << SECTIONS_PGSHIFT);
118 add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
119 (NODES_MASK << NODES_PGSHIFT) +
120 (SECTIONS_MASK << SECTIONS_PGSHIFT);
121 BUG_ON(or_mask != add_mask);
122}
123
124void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone,
125 unsigned long nid, unsigned long pfn)
126{
127 BUG_ON(page_to_nid(page) != nid);
128 BUG_ON(page_zonenum(page) != zone);
129 BUG_ON(page_to_pfn(page) != pfn);
130}
131
132static __init int set_mminit_loglevel(char *str)
133{
134 get_option(&str, &mminit_loglevel);
135 return 0;
136}
137early_param("mminit_loglevel", set_mminit_loglevel);
138#endif /* CONFIG_DEBUG_MEMORY_INIT */
139
140struct kobject *mm_kobj;
141EXPORT_SYMBOL_GPL(mm_kobj);
142
143static int __init mm_sysfs_init(void)
144{
145 mm_kobj = kobject_create_and_add("mm", kernel_kobj);
146 if (!mm_kobj)
147 return -ENOMEM;
148
149 return 0;
150}
151
152__initcall(mm_sysfs_init);
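
The checks above only print when the kernel is booted with a sufficiently high mminit_loglevel (the early_param just before this). Other code under mm/ can hook into the same facility; a hypothetical addition, with the prefix string and function invented, would look like:

#include <linux/init.h>
#include "internal.h"	/* mminit_dprintk(), MMINIT_TRACE */

static void __init example_report_node(int nid, unsigned long present_pages)
{
	/* a no-op unless CONFIG_DEBUG_MEMORY_INIT and a high enough
	 * mminit_loglevel= are both in effect */
	mminit_dprintk(MMINIT_TRACE, "example_node",
		       "node %d has %lu present pages\n",
		       nid, present_pages);
}
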
diff --git a/mm/mmap.c b/mm/mmap.c
index 1d102b956fd8..339cf5c4d5d8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -26,12 +26,15 @@
26#include <linux/mount.h> 26#include <linux/mount.h>
27#include <linux/mempolicy.h> 27#include <linux/mempolicy.h>
28#include <linux/rmap.h> 28#include <linux/rmap.h>
29#include <linux/mmu_notifier.h>
29 30
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/cacheflush.h> 32#include <asm/cacheflush.h>
32#include <asm/tlb.h> 33#include <asm/tlb.h>
33#include <asm/mmu_context.h> 34#include <asm/mmu_context.h>
34 35
36#include "internal.h"
37
35#ifndef arch_mmap_check 38#ifndef arch_mmap_check
36#define arch_mmap_check(addr, len, flags) (0) 39#define arch_mmap_check(addr, len, flags) (0)
37#endif 40#endif
@@ -367,7 +370,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
367 if (vma_tmp->vm_end > addr) { 370 if (vma_tmp->vm_end > addr) {
368 vma = vma_tmp; 371 vma = vma_tmp;
369 if (vma_tmp->vm_start <= addr) 372 if (vma_tmp->vm_start <= addr)
370 return vma; 373 break;
371 __rb_link = &__rb_parent->rb_left; 374 __rb_link = &__rb_parent->rb_left;
372 } else { 375 } else {
373 rb_prev = __rb_parent; 376 rb_prev = __rb_parent;
@@ -1108,6 +1111,9 @@ munmap_back:
1108 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) 1111 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1109 return -ENOMEM; 1112 return -ENOMEM;
1110 1113
1114 if (flags & MAP_NORESERVE)
1115 vm_flags |= VM_NORESERVE;
1116
1111 if (accountable && (!(flags & MAP_NORESERVE) || 1117 if (accountable && (!(flags & MAP_NORESERVE) ||
1112 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { 1118 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
1113 if (vm_flags & VM_SHARED) { 1119 if (vm_flags & VM_SHARED) {
@@ -1763,7 +1769,7 @@ static void unmap_region(struct mm_struct *mm,
1763 update_hiwater_rss(mm); 1769 update_hiwater_rss(mm);
1764 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); 1770 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1765 vm_unacct_memory(nr_accounted); 1771 vm_unacct_memory(nr_accounted);
1766 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, 1772 free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
1767 next? next->vm_start: 0); 1773 next? next->vm_start: 0);
1768 tlb_finish_mmu(tlb, start, end); 1774 tlb_finish_mmu(tlb, start, end);
1769} 1775}
@@ -1807,7 +1813,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1807 struct mempolicy *pol; 1813 struct mempolicy *pol;
1808 struct vm_area_struct *new; 1814 struct vm_area_struct *new;
1809 1815
1810 if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) 1816 if (is_vm_hugetlb_page(vma) && (addr &
1817 ~(huge_page_mask(hstate_vma(vma)))))
1811 return -EINVAL; 1818 return -EINVAL;
1812 1819
1813 if (mm->map_count >= sysctl_max_map_count) 1820 if (mm->map_count >= sysctl_max_map_count)
@@ -2055,6 +2062,7 @@ void exit_mmap(struct mm_struct *mm)
2055 2062
2056 /* mm's last user has gone, and it's about to be pulled down */ 2063 /* mm's last user has gone, and it's about to be pulled down */
2057 arch_exit_mmap(mm); 2064 arch_exit_mmap(mm);
2065 mmu_notifier_release(mm);
2058 2066
2059 lru_add_drain(); 2067 lru_add_drain();
2060 flush_cache_mm(mm); 2068 flush_cache_mm(mm);
@@ -2063,7 +2071,7 @@ void exit_mmap(struct mm_struct *mm)
2063 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2071 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2064 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2072 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2065 vm_unacct_memory(nr_accounted); 2073 vm_unacct_memory(nr_accounted);
2066 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 2074 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
2067 tlb_finish_mmu(tlb, 0, end); 2075 tlb_finish_mmu(tlb, 0, end);
2068 2076
2069 /* 2077 /*
@@ -2262,3 +2270,167 @@ int install_special_mapping(struct mm_struct *mm,
2262 2270
2263 return 0; 2271 return 0;
2264} 2272}
2273
2274static DEFINE_MUTEX(mm_all_locks_mutex);
2275
2276static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2277{
2278 if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
2279 /*
2280 * The LSB of head.next can't change from under us
2281 * because we hold the mm_all_locks_mutex.
2282 */
2283 spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
2284 /*
2285 * We can safely modify head.next after taking the
2286 * anon_vma->lock. If some other vma in this mm shares
2287 * the same anon_vma we won't take it again.
2288 *
2289 * No need of atomic instructions here, head.next
2290 * can't change from under us thanks to the
2291 * anon_vma->lock.
2292 */
2293 if (__test_and_set_bit(0, (unsigned long *)
2294 &anon_vma->head.next))
2295 BUG();
2296 }
2297}
2298
2299static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2300{
2301 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2302 /*
2303 * AS_MM_ALL_LOCKS can't change from under us because
2304 * we hold the mm_all_locks_mutex.
2305 *
2306 * Operations on ->flags have to be atomic because
2307 * even if AS_MM_ALL_LOCKS is stable thanks to the
2308 * mm_all_locks_mutex, there may be other cpus
2309 * changing other bitflags in parallel to us.
2310 */
2311 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
2312 BUG();
2313 spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
2314 }
2315}
2316
2317/*
2318 * This operation locks against the VM for all pte/vma/mm related
2319 * operations that could ever happen on a certain mm. This includes
2320 * vmtruncate, try_to_unmap, and all page faults.
2321 *
2322 * The caller must take the mmap_sem in write mode before calling
2323 * mm_take_all_locks(). The caller isn't allowed to release the
2324 * mmap_sem until mm_drop_all_locks() returns.
2325 *
2326 * mmap_sem in write mode is required in order to block all operations
2327 * that could modify pagetables and free pages without need of
2328 * altering the vma layout (for example populate_range() with
2329 * nonlinear vmas). It's also needed in write mode to avoid new
2330 * anon_vmas being associated with existing vmas.
2331 *
2332 * A single task can't take more than one mm_take_all_locks() in a row
2333 * or it would deadlock.
2334 *
2335 * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
2336 * mapping->flags avoid taking the same lock twice, if more than one
2337 * vma in this mm is backed by the same anon_vma or address_space.
2338 *
2339 * We can take all the locks in random order because the VM code
2340 * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
2341 * takes more than one of them in a row. Secondly we're protected
2342 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
2343 *
2344 * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
2345 * that may have to take thousands of locks.
2346 *
2347 * mm_take_all_locks() can fail if it's interrupted by signals.
2348 */
2349int mm_take_all_locks(struct mm_struct *mm)
2350{
2351 struct vm_area_struct *vma;
2352 int ret = -EINTR;
2353
2354 BUG_ON(down_read_trylock(&mm->mmap_sem));
2355
2356 mutex_lock(&mm_all_locks_mutex);
2357
2358 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2359 if (signal_pending(current))
2360 goto out_unlock;
2361 if (vma->vm_file && vma->vm_file->f_mapping)
2362 vm_lock_mapping(mm, vma->vm_file->f_mapping);
2363 }
2364
2365 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2366 if (signal_pending(current))
2367 goto out_unlock;
2368 if (vma->anon_vma)
2369 vm_lock_anon_vma(mm, vma->anon_vma);
2370 }
2371
2372 ret = 0;
2373
2374out_unlock:
2375 if (ret)
2376 mm_drop_all_locks(mm);
2377
2378 return ret;
2379}
2380
2381static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2382{
2383 if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
2384 /*
2385 * The LSB of head.next can't change to 0 from under
2386 * us because we hold the mm_all_locks_mutex.
2387 *
2388 * We must however clear the bitflag before unlocking
2389 * the vma so the users using the anon_vma->head will
2390 * never see our bitflag.
2391 *
2392 * No need of atomic instructions here, head.next
2393 * can't change from under us until we release the
2394 * anon_vma->lock.
2395 */
2396 if (!__test_and_clear_bit(0, (unsigned long *)
2397 &anon_vma->head.next))
2398 BUG();
2399 spin_unlock(&anon_vma->lock);
2400 }
2401}
2402
2403static void vm_unlock_mapping(struct address_space *mapping)
2404{
2405 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2406 /*
2407 * AS_MM_ALL_LOCKS can't change to 0 from under us
2408 * because we hold the mm_all_locks_mutex.
2409 */
2410 spin_unlock(&mapping->i_mmap_lock);
2411 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
2412 &mapping->flags))
2413 BUG();
2414 }
2415}
2416
2417/*
2418 * The mmap_sem cannot be released by the caller until
2419 * mm_drop_all_locks() returns.
2420 */
2421void mm_drop_all_locks(struct mm_struct *mm)
2422{
2423 struct vm_area_struct *vma;
2424
2425 BUG_ON(down_read_trylock(&mm->mmap_sem));
2426 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
2427
2428 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2429 if (vma->anon_vma)
2430 vm_unlock_anon_vma(vma->anon_vma);
2431 if (vma->vm_file && vma->vm_file->f_mapping)
2432 vm_unlock_mapping(vma->vm_file->f_mapping);
2433 }
2434
2435 mutex_unlock(&mm_all_locks_mutex);
2436}
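
For reference, a minimal sketch of the calling convention documented above (the wrapper name is hypothetical, and error handling is reduced to the -EINTR case):

	/* Sketch only: the caller must already hold a reference on mm. */
	static int with_all_vm_locks(struct mm_struct *mm)
	{
		int ret;

		down_write(&mm->mmap_sem);
		ret = mm_take_all_locks(mm);	/* fails with -EINTR on a pending signal */
		if (!ret) {
			/*
			 * Page faults, vmtruncate and try_to_unmap are all
			 * excluded here, so mm-wide state can be changed.
			 */
			mm_drop_all_locks(mm);
		}
		up_write(&mm->mmap_sem);
		return ret;
	}

This mirrors what do_mmu_notifier_register() below does around its notifier-list update.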
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
index 000000000000..5f4ef0250bee
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,277 @@
1/*
2 * linux/mm/mmu_notifier.c
3 *
4 * Copyright (C) 2008 Qumranet, Inc.
5 * Copyright (C) 2008 SGI
6 * Christoph Lameter <clameter@sgi.com>
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2. See
9 * the COPYING file in the top-level directory.
10 */
11
12#include <linux/rculist.h>
13#include <linux/mmu_notifier.h>
14#include <linux/module.h>
15#include <linux/mm.h>
16#include <linux/err.h>
17#include <linux/rcupdate.h>
18#include <linux/sched.h>
19
20/*
21 * This function can't run concurrently against mmu_notifier_register
22 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
23 * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
24 * in parallel despite there being no task using this mm any more,
25 * through the vmas outside of the exit_mmap context, such as with
26 * vmtruncate. This serializes against mmu_notifier_unregister with
27 * the mmu_notifier_mm->lock in addition to RCU and it serializes
28 * against the other mmu notifiers with RCU. struct mmu_notifier_mm
29 * can't go away from under us as exit_mmap holds an mm_count pin
30 * itself.
31 */
32void __mmu_notifier_release(struct mm_struct *mm)
33{
34 struct mmu_notifier *mn;
35
36 spin_lock(&mm->mmu_notifier_mm->lock);
37 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
38 mn = hlist_entry(mm->mmu_notifier_mm->list.first,
39 struct mmu_notifier,
40 hlist);
41 /*
42 * We arrived before mmu_notifier_unregister so
43 * mmu_notifier_unregister will do nothing other than
44 * wait for ->release to finish and then
45 * return.
46 */
47 hlist_del_init_rcu(&mn->hlist);
48 /*
49 * RCU here will block mmu_notifier_unregister until
50 * ->release returns.
51 */
52 rcu_read_lock();
53 spin_unlock(&mm->mmu_notifier_mm->lock);
54 /*
55 * if ->release runs before mmu_notifier_unregister it
56 * must be handled as it's the only way for the driver
57 * to flush all existing sptes and stop the driver
58 * from establishing any more sptes before all the
59 * pages in the mm are freed.
60 */
61 if (mn->ops->release)
62 mn->ops->release(mn, mm);
63 rcu_read_unlock();
64 spin_lock(&mm->mmu_notifier_mm->lock);
65 }
66 spin_unlock(&mm->mmu_notifier_mm->lock);
67
68 /*
69 * synchronize_rcu here prevents mmu_notifier_release from
70 * returning to exit_mmap (which would proceed to free all pages
71 * in the mm) until the ->release method returns, if it was
72 * invoked by mmu_notifier_unregister.
73 *
74 * The mmu_notifier_mm can't go away from under us because one
75 * mm_count is held by exit_mmap.
76 */
77 synchronize_rcu();
78}
79
80/*
81 * If no young bitflag is supported by the hardware, ->clear_flush_young can
82 * unmap the address and return 1 or 0 depending on whether the mapping
83 * previously existed or not.
84 */
85int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
86 unsigned long address)
87{
88 struct mmu_notifier *mn;
89 struct hlist_node *n;
90 int young = 0;
91
92 rcu_read_lock();
93 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
94 if (mn->ops->clear_flush_young)
95 young |= mn->ops->clear_flush_young(mn, mm, address);
96 }
97 rcu_read_unlock();
98
99 return young;
100}
101
102void __mmu_notifier_invalidate_page(struct mm_struct *mm,
103 unsigned long address)
104{
105 struct mmu_notifier *mn;
106 struct hlist_node *n;
107
108 rcu_read_lock();
109 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
110 if (mn->ops->invalidate_page)
111 mn->ops->invalidate_page(mn, mm, address);
112 }
113 rcu_read_unlock();
114}
115
116void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
117 unsigned long start, unsigned long end)
118{
119 struct mmu_notifier *mn;
120 struct hlist_node *n;
121
122 rcu_read_lock();
123 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
124 if (mn->ops->invalidate_range_start)
125 mn->ops->invalidate_range_start(mn, mm, start, end);
126 }
127 rcu_read_unlock();
128}
129
130void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
131 unsigned long start, unsigned long end)
132{
133 struct mmu_notifier *mn;
134 struct hlist_node *n;
135
136 rcu_read_lock();
137 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
138 if (mn->ops->invalidate_range_end)
139 mn->ops->invalidate_range_end(mn, mm, start, end);
140 }
141 rcu_read_unlock();
142}
143
144static int do_mmu_notifier_register(struct mmu_notifier *mn,
145 struct mm_struct *mm,
146 int take_mmap_sem)
147{
148 struct mmu_notifier_mm *mmu_notifier_mm;
149 int ret;
150
151 BUG_ON(atomic_read(&mm->mm_users) <= 0);
152
153 ret = -ENOMEM;
154 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
155 if (unlikely(!mmu_notifier_mm))
156 goto out;
157
158 if (take_mmap_sem)
159 down_write(&mm->mmap_sem);
160 ret = mm_take_all_locks(mm);
161 if (unlikely(ret))
162 goto out_cleanup;
163
164 if (!mm_has_notifiers(mm)) {
165 INIT_HLIST_HEAD(&mmu_notifier_mm->list);
166 spin_lock_init(&mmu_notifier_mm->lock);
167 mm->mmu_notifier_mm = mmu_notifier_mm;
168 mmu_notifier_mm = NULL;
169 }
170 atomic_inc(&mm->mm_count);
171
172 /*
173 * Serialize the update against mmu_notifier_unregister. A
174 * side note: mmu_notifier_release can't run concurrently with
175 * us because we hold the mm_users pin (either implicitly as
176 * current->mm or explicitly with get_task_mm() or similar).
177 * We can't race against any other mmu notifier method either
178 * thanks to mm_take_all_locks().
179 */
180 spin_lock(&mm->mmu_notifier_mm->lock);
181 hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
182 spin_unlock(&mm->mmu_notifier_mm->lock);
183
184 mm_drop_all_locks(mm);
185out_cleanup:
186 if (take_mmap_sem)
187 up_write(&mm->mmap_sem);
188 /* kfree() does nothing if mmu_notifier_mm is NULL */
189 kfree(mmu_notifier_mm);
190out:
191 BUG_ON(atomic_read(&mm->mm_users) <= 0);
192 return ret;
193}
194
195/*
196 * Must not hold mmap_sem nor any other VM related lock when calling
197 * this registration function. Must also ensure mm_users can't go down
198 * to zero while this runs to avoid races with mmu_notifier_release,
199 * so mm has to be current->mm or the mm should be pinned safely such
200 * as with get_task_mm(). If the mm is not current->mm, the mm_users
201 * pin should be released by calling mmput after mmu_notifier_register
202 * returns. mmu_notifier_unregister must be always called to
203 * unregister the notifier. mm_count is automatically pinned to allow
204 * mmu_notifier_unregister to safely run at any time later, before or
205 * after exit_mmap. ->release will always be called before exit_mmap
206 * frees the pages.
207 */
208int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
209{
210 return do_mmu_notifier_register(mn, mm, 1);
211}
212EXPORT_SYMBOL_GPL(mmu_notifier_register);
213
214/*
215 * Same as mmu_notifier_register but here the caller must hold the
216 * mmap_sem in write mode.
217 */
218int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
219{
220 return do_mmu_notifier_register(mn, mm, 0);
221}
222EXPORT_SYMBOL_GPL(__mmu_notifier_register);
223
224/* this is called after the last mmu_notifier_unregister() returned */
225void __mmu_notifier_mm_destroy(struct mm_struct *mm)
226{
227 BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list));
228 kfree(mm->mmu_notifier_mm);
229 mm->mmu_notifier_mm = LIST_POISON1; /* debug */
230}
231
232/*
233 * This releases the mm_count pin automatically and frees the mm
234 * structure if it was the last user of it. It serializes against
235 * running mmu notifiers with RCU and against mmu_notifier_unregister
236 * with the unregister lock + RCU. All sptes must be dropped before
237 * calling mmu_notifier_unregister. ->release or any other notifier
238 * method may be invoked concurrently with mmu_notifier_unregister,
239 * and only after mmu_notifier_unregister has returned are we guaranteed
240 * that ->release or any other method can't run anymore.
241 */
242void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
243{
244 BUG_ON(atomic_read(&mm->mm_count) <= 0);
245
246 spin_lock(&mm->mmu_notifier_mm->lock);
247 if (!hlist_unhashed(&mn->hlist)) {
248 hlist_del_rcu(&mn->hlist);
249
250 /*
251 * RCU here will force exit_mmap to wait for ->release to finish
252 * before freeing the pages.
253 */
254 rcu_read_lock();
255 spin_unlock(&mm->mmu_notifier_mm->lock);
256 /*
257 * exit_mmap will block in mmu_notifier_release to
258 * guarantee ->release is called before freeing the
259 * pages.
260 */
261 if (mn->ops->release)
262 mn->ops->release(mn, mm);
263 rcu_read_unlock();
264 } else
265 spin_unlock(&mm->mmu_notifier_mm->lock);
266
267 /*
268 * Wait for any running method to finish, including
269 * ->release if it was run by mmu_notifier_release instead of us.
270 */
271 synchronize_rcu();
272
273 BUG_ON(atomic_read(&mm->mm_count) <= 0);
274
275 mmdrop(mm);
276}
277EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
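
A hedged driver-side sketch of this interface, with the ops layout inferred from the calls above (->release, ->clear_flush_young, ->invalidate_page, ->invalidate_range_start/end); the my_* names are placeholders, not part of this patch:

	static void my_release(struct mmu_notifier *mn, struct mm_struct *mm)
	{
		/* drop every secondary-MMU mapping (spte) for this mm */
	}

	static void my_range_start(struct mmu_notifier *mn, struct mm_struct *mm,
				   unsigned long start, unsigned long end)
	{
		/* shoot down secondary mappings covering [start, end) */
	}

	static const struct mmu_notifier_ops my_ops = {
		.release		= my_release,
		.invalidate_range_start	= my_range_start,
		/* .invalidate_range_end, .invalidate_page, .clear_flush_young ... */
	};

	static struct mmu_notifier my_mn = { .ops = &my_ops };

	static int my_attach(void)
	{
		/* current->mm carries the required mm_users pin implicitly */
		return mmu_notifier_register(&my_mn, current->mm);
	}

Teardown is the reverse: mmu_notifier_unregister(&my_mn, mm) once all sptes are gone, which also drops the mm_count pin taken at registration time.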
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 360d9cc8b38c..fded06f923f4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -21,6 +21,7 @@
21#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/swap.h> 22#include <linux/swap.h>
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h>
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25#include <asm/pgtable.h> 26#include <asm/pgtable.h>
26#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
@@ -153,12 +154,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
153 * If we make a private mapping writable we increase our commit; 154 * If we make a private mapping writable we increase our commit;
154 * but (without finer accounting) cannot reduce our commit if we 155 * but (without finer accounting) cannot reduce our commit if we
155 * make it unwritable again. 156 * make it unwritable again.
156 *
157 * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
158 * a MAP_NORESERVE private mapping to writable will now reserve.
159 */ 157 */
160 if (newflags & VM_WRITE) { 158 if (newflags & VM_WRITE) {
161 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { 159 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|
160 VM_SHARED|VM_NORESERVE))) {
162 charged = nrpages; 161 charged = nrpages;
163 if (security_vm_enough_memory(charged)) 162 if (security_vm_enough_memory(charged))
164 return -ENOMEM; 163 return -ENOMEM;
@@ -205,10 +204,12 @@ success:
205 dirty_accountable = 1; 204 dirty_accountable = 1;
206 } 205 }
207 206
207 mmu_notifier_invalidate_range_start(mm, start, end);
208 if (is_vm_hugetlb_page(vma)) 208 if (is_vm_hugetlb_page(vma))
209 hugetlb_change_protection(vma, start, end, vma->vm_page_prot); 209 hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
210 else 210 else
211 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); 211 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
212 mmu_notifier_invalidate_range_end(mm, start, end);
212 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 213 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
213 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 214 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
214 return 0; 215 return 0;
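
The pattern here generalizes to every site in this series that rewrites PTEs: bracket the batch with a start/end pair so a secondary MMU can tear down its mappings before the primary PTEs change and re-fault them afterwards. A caller-side sketch (the helper name is made up):

	static void rewrite_range(struct mm_struct *mm,
				  unsigned long start, unsigned long end)
	{
		mmu_notifier_invalidate_range_start(mm, start, end);
		/* ... walk the page tables and modify the PTEs in [start, end) ... */
		mmu_notifier_invalidate_range_end(mm, start, end);
	}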
diff --git a/mm/mremap.c b/mm/mremap.c
index 08e3c7f2bd15..1a7743923c8c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -18,6 +18,7 @@
18#include <linux/highmem.h> 18#include <linux/highmem.h>
19#include <linux/security.h> 19#include <linux/security.h>
20#include <linux/syscalls.h> 20#include <linux/syscalls.h>
21#include <linux/mmu_notifier.h>
21 22
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23#include <asm/cacheflush.h> 24#include <asm/cacheflush.h>
@@ -74,7 +75,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
74 struct mm_struct *mm = vma->vm_mm; 75 struct mm_struct *mm = vma->vm_mm;
75 pte_t *old_pte, *new_pte, pte; 76 pte_t *old_pte, *new_pte, pte;
76 spinlock_t *old_ptl, *new_ptl; 77 spinlock_t *old_ptl, *new_ptl;
78 unsigned long old_start;
77 79
80 old_start = old_addr;
81 mmu_notifier_invalidate_range_start(vma->vm_mm,
82 old_start, old_end);
78 if (vma->vm_file) { 83 if (vma->vm_file) {
79 /* 84 /*
80 * Subtle point from Rajesh Venkatasubramanian: before 85 * Subtle point from Rajesh Venkatasubramanian: before
@@ -116,6 +121,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
116 pte_unmap_unlock(old_pte - 1, old_ptl); 121 pte_unmap_unlock(old_pte - 1, old_ptl);
117 if (mapping) 122 if (mapping)
118 spin_unlock(&mapping->i_mmap_lock); 123 spin_unlock(&mapping->i_mmap_lock);
124 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
119} 125}
120 126
121#define LATENCY_LIMIT (64 * PAGE_SIZE) 127#define LATENCY_LIMIT (64 * PAGE_SIZE)
diff --git a/mm/nommu.c b/mm/nommu.c
index 4462b6a3fcb9..ed75bc962fbe 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,7 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/ptrace.h> 25#include <linux/tracehook.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
@@ -266,6 +266,27 @@ void *vmalloc_node(unsigned long size, int node)
266} 266}
267EXPORT_SYMBOL(vmalloc_node); 267EXPORT_SYMBOL(vmalloc_node);
268 268
269#ifndef PAGE_KERNEL_EXEC
270# define PAGE_KERNEL_EXEC PAGE_KERNEL
271#endif
272
273/**
274 * vmalloc_exec - allocate virtually contiguous, executable memory
275 * @size: allocation size
276 *
277 * Kernel-internal function to allocate enough pages to cover @size
278 * from the page level allocator and map them into contiguous and
279 * executable kernel virtual space.
280 *
281 * For tight control over page level allocator and protection flags
282 * use __vmalloc() instead.
283 */
284
285void *vmalloc_exec(unsigned long size)
286{
287 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
288}
289
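
On nommu this is simply __vmalloc() with an executable protection, falling back to PAGE_KERNEL where the architecture defines no PAGE_KERNEL_EXEC. A hedged usage sketch, with copy_exec() as a made-up helper:

	static void *copy_exec(const void *code, size_t len)
	{
		void *buf = vmalloc_exec(len);

		if (buf) {
			memcpy(buf, code, len);
			flush_icache_range((unsigned long)buf,
					   (unsigned long)buf + len);
		}
		return buf;	/* caller executes it, then vfree()s it */
	}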
269/** 290/**
270 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 291 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
271 * @size: allocation size 292 * @size: allocation size
@@ -745,7 +766,7 @@ static unsigned long determine_vm_flags(struct file *file,
745 * it's being traced - otherwise breakpoints set in it may interfere 766 * it's being traced - otherwise breakpoints set in it may interfere
746 * with another untraced process 767 * with another untraced process
747 */ 768 */
748 if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) 769 if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
749 vm_flags &= ~VM_MAYSHARE; 770 vm_flags &= ~VM_MAYSHARE;
750 771
751 return vm_flags; 772 return vm_flags;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 94c6d8988ab3..24de8b65fdbd 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1088,7 +1088,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1088 if (!mapping) 1088 if (!mapping)
1089 return 1; 1089 return 1;
1090 1090
1091 write_lock_irq(&mapping->tree_lock); 1091 spin_lock_irq(&mapping->tree_lock);
1092 mapping2 = page_mapping(page); 1092 mapping2 = page_mapping(page);
1093 if (mapping2) { /* Race with truncate? */ 1093 if (mapping2) { /* Race with truncate? */
1094 BUG_ON(mapping2 != mapping); 1094 BUG_ON(mapping2 != mapping);
@@ -1102,7 +1102,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1102 radix_tree_tag_set(&mapping->page_tree, 1102 radix_tree_tag_set(&mapping->page_tree,
1103 page_index(page), PAGECACHE_TAG_DIRTY); 1103 page_index(page), PAGECACHE_TAG_DIRTY);
1104 } 1104 }
1105 write_unlock_irq(&mapping->tree_lock); 1105 spin_unlock_irq(&mapping->tree_lock);
1106 if (mapping->host) { 1106 if (mapping->host) {
1107 /* !PageAnon && !swapper_space */ 1107 /* !PageAnon && !swapper_space */
1108 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1108 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1258,7 +1258,7 @@ int test_clear_page_writeback(struct page *page)
1258 struct backing_dev_info *bdi = mapping->backing_dev_info; 1258 struct backing_dev_info *bdi = mapping->backing_dev_info;
1259 unsigned long flags; 1259 unsigned long flags;
1260 1260
1261 write_lock_irqsave(&mapping->tree_lock, flags); 1261 spin_lock_irqsave(&mapping->tree_lock, flags);
1262 ret = TestClearPageWriteback(page); 1262 ret = TestClearPageWriteback(page);
1263 if (ret) { 1263 if (ret) {
1264 radix_tree_tag_clear(&mapping->page_tree, 1264 radix_tree_tag_clear(&mapping->page_tree,
@@ -1269,7 +1269,7 @@ int test_clear_page_writeback(struct page *page)
1269 __bdi_writeout_inc(bdi); 1269 __bdi_writeout_inc(bdi);
1270 } 1270 }
1271 } 1271 }
1272 write_unlock_irqrestore(&mapping->tree_lock, flags); 1272 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1273 } else { 1273 } else {
1274 ret = TestClearPageWriteback(page); 1274 ret = TestClearPageWriteback(page);
1275 } 1275 }
@@ -1287,7 +1287,7 @@ int test_set_page_writeback(struct page *page)
1287 struct backing_dev_info *bdi = mapping->backing_dev_info; 1287 struct backing_dev_info *bdi = mapping->backing_dev_info;
1288 unsigned long flags; 1288 unsigned long flags;
1289 1289
1290 write_lock_irqsave(&mapping->tree_lock, flags); 1290 spin_lock_irqsave(&mapping->tree_lock, flags);
1291 ret = TestSetPageWriteback(page); 1291 ret = TestSetPageWriteback(page);
1292 if (!ret) { 1292 if (!ret) {
1293 radix_tree_tag_set(&mapping->page_tree, 1293 radix_tree_tag_set(&mapping->page_tree,
@@ -1300,7 +1300,7 @@ int test_set_page_writeback(struct page *page)
1300 radix_tree_tag_clear(&mapping->page_tree, 1300 radix_tree_tag_clear(&mapping->page_tree,
1301 page_index(page), 1301 page_index(page),
1302 PAGECACHE_TAG_DIRTY); 1302 PAGECACHE_TAG_DIRTY);
1303 write_unlock_irqrestore(&mapping->tree_lock, flags); 1303 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1304 } else { 1304 } else {
1305 ret = TestSetPageWriteback(page); 1305 ret = TestSetPageWriteback(page);
1306 } 1306 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 79ac4afc908c..af982f7cdb2a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -153,9 +153,9 @@ static unsigned long __meminitdata dma_reserve;
153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; 153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; 154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
156 unsigned long __initdata required_kernelcore; 156 static unsigned long __initdata required_kernelcore;
157 static unsigned long __initdata required_movablecore; 157 static unsigned long __initdata required_movablecore;
158 unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 158 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
159 159
160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
161 int movable_zone; 161 int movable_zone;
@@ -264,7 +264,7 @@ static void free_compound_page(struct page *page)
264 __free_pages_ok(page, compound_order(page)); 264 __free_pages_ok(page, compound_order(page));
265} 265}
266 266
267static void prep_compound_page(struct page *page, unsigned long order) 267void prep_compound_page(struct page *page, unsigned long order)
268{ 268{
269 int i; 269 int i;
270 int nr_pages = 1 << order; 270 int nr_pages = 1 << order;
@@ -432,8 +432,9 @@ static inline void __free_one_page(struct page *page,
432 432
433 buddy = __page_find_buddy(page, page_idx, order); 433 buddy = __page_find_buddy(page, page_idx, order);
434 if (!page_is_buddy(page, buddy, order)) 434 if (!page_is_buddy(page, buddy, order))
435 break; /* Move the buddy up one level. */ 435 break;
436 436
437 /* Our buddy is free, merge with it and move up one order. */
437 list_del(&buddy->lru); 438 list_del(&buddy->lru);
438 zone->free_area[order].nr_free--; 439 zone->free_area[order].nr_free--;
439 rmv_page_order(buddy); 440 rmv_page_order(buddy);
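
The merge relies on the buddy invariant that a free block of a given order has exactly one buddy, reachable by flipping a single bit of its page index, which is what __page_find_buddy() computes. An illustration of that index arithmetic (a simplified stand-in, not the kernel helper itself):

	static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
	{
		return page_idx ^ (1UL << order);	/* flip the order'th bit */
	}

	/*
	 * Example: page_idx 8 at order 2 has buddy 12; once merged, the
	 * combined order-3 block starts at 8 (page_idx with that bit cleared).
	 */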
@@ -532,7 +533,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
532/* 533/*
533 * permit the bootmem allocator to evade page validation on high-order frees 534 * permit the bootmem allocator to evade page validation on high-order frees
534 */ 535 */
535void __free_pages_bootmem(struct page *page, unsigned int order) 536void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
536{ 537{
537 if (order == 0) { 538 if (order == 0) {
538 __ClearPageReserved(page); 539 __ClearPageReserved(page);
@@ -673,9 +674,9 @@ static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
673 * Note that start_page and end_pages are not aligned on a pageblock 674 * Note that start_page and end_pages are not aligned on a pageblock
674 * boundary. If alignment is required, use move_freepages_block() 675 * boundary. If alignment is required, use move_freepages_block()
675 */ 676 */
676int move_freepages(struct zone *zone, 677static int move_freepages(struct zone *zone,
677 struct page *start_page, struct page *end_page, 678 struct page *start_page, struct page *end_page,
678 int migratetype) 679 int migratetype)
679{ 680{
680 struct page *page; 681 struct page *page;
681 unsigned long order; 682 unsigned long order;
@@ -714,7 +715,8 @@ int move_freepages(struct zone *zone,
714 return pages_moved; 715 return pages_moved;
715} 716}
716 717
717int move_freepages_block(struct zone *zone, struct page *page, int migratetype) 718static int move_freepages_block(struct zone *zone, struct page *page,
719 int migratetype)
718{ 720{
719 unsigned long start_pfn, end_pfn; 721 unsigned long start_pfn, end_pfn;
720 struct page *start_page, *end_page; 722 struct page *start_page, *end_page;
@@ -1429,7 +1431,7 @@ try_next_zone:
1429/* 1431/*
1430 * This is the 'heart' of the zoned buddy allocator. 1432 * This is the 'heart' of the zoned buddy allocator.
1431 */ 1433 */
1432static struct page * 1434struct page *
1433__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1435__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
1434 struct zonelist *zonelist, nodemask_t *nodemask) 1436 struct zonelist *zonelist, nodemask_t *nodemask)
1435{ 1437{
@@ -1632,22 +1634,7 @@ nopage:
1632got_pg: 1634got_pg:
1633 return page; 1635 return page;
1634} 1636}
1635 1637EXPORT_SYMBOL(__alloc_pages_internal);
1636struct page *
1637__alloc_pages(gfp_t gfp_mask, unsigned int order,
1638 struct zonelist *zonelist)
1639{
1640 return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
1641}
1642
1643struct page *
1644__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1645 struct zonelist *zonelist, nodemask_t *nodemask)
1646{
1647 return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
1648}
1649
1650EXPORT_SYMBOL(__alloc_pages);
1651 1638
1652/* 1639/*
1653 * Common helper functions. 1640 * Common helper functions.
@@ -1711,6 +1698,59 @@ void free_pages(unsigned long addr, unsigned int order)
1711 1698
1712EXPORT_SYMBOL(free_pages); 1699EXPORT_SYMBOL(free_pages);
1713 1700
1701/**
1702 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
1703 * @size: the number of bytes to allocate
1704 * @gfp_mask: GFP flags for the allocation
1705 *
1706 * This function is similar to alloc_pages(), except that it allocates the
1707 * minimum number of pages to satisfy the request. alloc_pages() can only
1708 * allocate memory in power-of-two pages.
1709 *
1710 * This function is also limited by MAX_ORDER.
1711 *
1712 * Memory allocated by this function must be released by free_pages_exact().
1713 */
1714void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
1715{
1716 unsigned int order = get_order(size);
1717 unsigned long addr;
1718
1719 addr = __get_free_pages(gfp_mask, order);
1720 if (addr) {
1721 unsigned long alloc_end = addr + (PAGE_SIZE << order);
1722 unsigned long used = addr + PAGE_ALIGN(size);
1723
1724 split_page(virt_to_page(addr), order);
1725 while (used < alloc_end) {
1726 free_page(used);
1727 used += PAGE_SIZE;
1728 }
1729 }
1730
1731 return (void *)addr;
1732}
1733EXPORT_SYMBOL(alloc_pages_exact);
1734
1735/**
1736 * free_pages_exact - release memory allocated via alloc_pages_exact()
1737 * @virt: the value returned by alloc_pages_exact.
1738 * @size: size of allocation, same value as passed to alloc_pages_exact().
1739 *
1740 * Release the memory allocated by a previous call to alloc_pages_exact.
1741 */
1742void free_pages_exact(void *virt, size_t size)
1743{
1744 unsigned long addr = (unsigned long)virt;
1745 unsigned long end = addr + PAGE_ALIGN(size);
1746
1747 while (addr < end) {
1748 free_page(addr);
1749 addr += PAGE_SIZE;
1750 }
1751}
1752EXPORT_SYMBOL(free_pages_exact);
1753
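
Worked example of the saving: a request for five pages makes get_order() round up to order 3 (eight pages); alloc_pages_exact() then splits the block and immediately returns the trailing three pages, so only PAGE_ALIGN(size) stays allocated. A usage sketch (helper names are arbitrary):

	static void *five_page_buffer(void)
	{
		/* alloc_pages(GFP_KERNEL, 3) would pin 8 pages for this */
		return alloc_pages_exact(5 * PAGE_SIZE, GFP_KERNEL);
	}

	static void five_page_buffer_free(void *buf)
	{
		free_pages_exact(buf, 5 * PAGE_SIZE);	/* same size as allocated */
	}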
1714static unsigned int nr_free_zone_pages(int offset) 1754static unsigned int nr_free_zone_pages(int offset)
1715{ 1755{
1716 struct zoneref *z; 1756 struct zoneref *z;
@@ -2332,7 +2372,7 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2332 2372
2333#endif /* CONFIG_NUMA */ 2373#endif /* CONFIG_NUMA */
2334 2374
2335/* return values int ....just for stop_machine_run() */ 2375/* return values int ....just for stop_machine() */
2336static int __build_all_zonelists(void *dummy) 2376static int __build_all_zonelists(void *dummy)
2337{ 2377{
2338 int nid; 2378 int nid;
@@ -2352,11 +2392,12 @@ void build_all_zonelists(void)
2352 2392
2353 if (system_state == SYSTEM_BOOTING) { 2393 if (system_state == SYSTEM_BOOTING) {
2354 __build_all_zonelists(NULL); 2394 __build_all_zonelists(NULL);
2395 mminit_verify_zonelist();
2355 cpuset_init_current_mems_allowed(); 2396 cpuset_init_current_mems_allowed();
2356 } else { 2397 } else {
2357 /* we have to stop all cpus to guarantee there is no user 2398 /* we have to stop all cpus to guarantee there is no user
2358 of zonelist */ 2399 of zonelist */
2359 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); 2400 stop_machine(__build_all_zonelists, NULL, NULL);
2360 /* cpuset refresh routine should be here */ 2401 /* cpuset refresh routine should be here */
2361 } 2402 }
2362 vm_total_pages = nr_free_pagecache_pages(); 2403 vm_total_pages = nr_free_pagecache_pages();
@@ -2534,6 +2575,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2534 } 2575 }
2535 page = pfn_to_page(pfn); 2576 page = pfn_to_page(pfn);
2536 set_page_links(page, zone, nid, pfn); 2577 set_page_links(page, zone, nid, pfn);
2578 mminit_verify_page_links(page, zone, nid, pfn);
2537 init_page_count(page); 2579 init_page_count(page);
2538 reset_page_mapcount(page); 2580 reset_page_mapcount(page);
2539 SetPageReserved(page); 2581 SetPageReserved(page);
@@ -2611,7 +2653,7 @@ static int zone_batchsize(struct zone *zone)
2611 return batch; 2653 return batch;
2612} 2654}
2613 2655
2614inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 2656static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2615{ 2657{
2616 struct per_cpu_pages *pcp; 2658 struct per_cpu_pages *pcp;
2617 2659
@@ -2836,6 +2878,12 @@ __meminit int init_currently_empty_zone(struct zone *zone,
2836 2878
2837 zone->zone_start_pfn = zone_start_pfn; 2879 zone->zone_start_pfn = zone_start_pfn;
2838 2880
2881 mminit_dprintk(MMINIT_TRACE, "memmap_init",
2882 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
2883 pgdat->node_id,
2884 (unsigned long)zone_idx(zone),
2885 zone_start_pfn, (zone_start_pfn + size));
2886
2839 zone_init_free_lists(zone); 2887 zone_init_free_lists(zone);
2840 2888
2841 return 0; 2889 return 0;
@@ -2975,7 +3023,8 @@ void __init sparse_memory_present_with_active_regions(int nid)
2975void __init push_node_boundaries(unsigned int nid, 3023void __init push_node_boundaries(unsigned int nid,
2976 unsigned long start_pfn, unsigned long end_pfn) 3024 unsigned long start_pfn, unsigned long end_pfn)
2977{ 3025{
2978 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", 3026 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3027 "Entering push_node_boundaries(%u, %lu, %lu)\n",
2979 nid, start_pfn, end_pfn); 3028 nid, start_pfn, end_pfn);
2980 3029
2981 /* Initialise the boundary for this node if necessary */ 3030 /* Initialise the boundary for this node if necessary */
@@ -2993,7 +3042,8 @@ void __init push_node_boundaries(unsigned int nid,
2993static void __meminit account_node_boundary(unsigned int nid, 3042static void __meminit account_node_boundary(unsigned int nid,
2994 unsigned long *start_pfn, unsigned long *end_pfn) 3043 unsigned long *start_pfn, unsigned long *end_pfn)
2995{ 3044{
2996 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", 3045 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3046 "Entering account_node_boundary(%u, %lu, %lu)\n",
2997 nid, *start_pfn, *end_pfn); 3047 nid, *start_pfn, *end_pfn);
2998 3048
2999 /* Return if boundary information has not been provided */ 3049 /* Return if boundary information has not been provided */
@@ -3050,7 +3100,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
3050 * assumption is made that zones within a node are ordered in monotonic 3100 * assumption is made that zones within a node are ordered in monotonic
3051 * increasing memory addresses so that the "highest" populated zone is used 3101 * increasing memory addresses so that the "highest" populated zone is used
3052 */ 3102 */
3053void __init find_usable_zone_for_movable(void) 3103static void __init find_usable_zone_for_movable(void)
3054{ 3104{
3055 int zone_index; 3105 int zone_index;
3056 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 3106 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
@@ -3076,7 +3126,7 @@ void __init find_usable_zone_for_movable(void)
3076 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 3126 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
3077 * zones within a node are in order of monotonic increases memory addresses 3127 * zones within a node are in order of monotonic increases memory addresses
3078 */ 3128 */
3079void __meminit adjust_zone_range_for_zone_movable(int nid, 3129static void __meminit adjust_zone_range_for_zone_movable(int nid,
3080 unsigned long zone_type, 3130 unsigned long zone_type,
3081 unsigned long node_start_pfn, 3131 unsigned long node_start_pfn,
3082 unsigned long node_end_pfn, 3132 unsigned long node_end_pfn,
@@ -3137,7 +3187,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3137 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3187 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3138 * then all holes in the requested range will be accounted for. 3188 * then all holes in the requested range will be accounted for.
3139 */ 3189 */
3140unsigned long __meminit __absent_pages_in_range(int nid, 3190static unsigned long __meminit __absent_pages_in_range(int nid,
3141 unsigned long range_start_pfn, 3191 unsigned long range_start_pfn,
3142 unsigned long range_end_pfn) 3192 unsigned long range_end_pfn)
3143{ 3193{
@@ -3368,8 +3418,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3368 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 3418 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
3369 if (realsize >= memmap_pages) { 3419 if (realsize >= memmap_pages) {
3370 realsize -= memmap_pages; 3420 realsize -= memmap_pages;
3371 printk(KERN_DEBUG 3421 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3372 " %s zone: %lu pages used for memmap\n", 3422 "%s zone: %lu pages used for memmap\n",
3373 zone_names[j], memmap_pages); 3423 zone_names[j], memmap_pages);
3374 } else 3424 } else
3375 printk(KERN_WARNING 3425 printk(KERN_WARNING
@@ -3379,7 +3429,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3379 /* Account for reserved pages */ 3429 /* Account for reserved pages */
3380 if (j == 0 && realsize > dma_reserve) { 3430 if (j == 0 && realsize > dma_reserve) {
3381 realsize -= dma_reserve; 3431 realsize -= dma_reserve;
3382 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 3432 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3433 "%s zone: %lu pages reserved\n",
3383 zone_names[0], dma_reserve); 3434 zone_names[0], dma_reserve);
3384 } 3435 }
3385 3436
@@ -3464,10 +3515,11 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
3464#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 3515#endif /* CONFIG_FLAT_NODE_MEM_MAP */
3465} 3516}
3466 3517
3467void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, 3518void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
3468 unsigned long *zones_size, unsigned long node_start_pfn, 3519 unsigned long node_start_pfn, unsigned long *zholes_size)
3469 unsigned long *zholes_size)
3470{ 3520{
3521 pg_data_t *pgdat = NODE_DATA(nid);
3522
3471 pgdat->node_id = nid; 3523 pgdat->node_id = nid;
3472 pgdat->node_start_pfn = node_start_pfn; 3524 pgdat->node_start_pfn = node_start_pfn;
3473 calculate_node_totalpages(pgdat, zones_size, zholes_size); 3525 calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -3520,10 +3572,13 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3520{ 3572{
3521 int i; 3573 int i;
3522 3574
3523 printk(KERN_DEBUG "Entering add_active_range(%d, %#lx, %#lx) " 3575 mminit_dprintk(MMINIT_TRACE, "memory_register",
3524 "%d entries of %d used\n", 3576 "Entering add_active_range(%d, %#lx, %#lx) "
3525 nid, start_pfn, end_pfn, 3577 "%d entries of %d used\n",
3526 nr_nodemap_entries, MAX_ACTIVE_REGIONS); 3578 nid, start_pfn, end_pfn,
3579 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
3580
3581 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
3527 3582
3528 /* Merge with existing active regions if possible */ 3583 /* Merge with existing active regions if possible */
3529 for (i = 0; i < nr_nodemap_entries; i++) { 3584 for (i = 0; i < nr_nodemap_entries; i++) {
@@ -3669,7 +3724,7 @@ static void __init sort_node_map(void)
3669} 3724}
3670 3725
3671/* Find the lowest pfn for a node */ 3726/* Find the lowest pfn for a node */
3672unsigned long __init find_min_pfn_for_node(int nid) 3727static unsigned long __init find_min_pfn_for_node(int nid)
3673{ 3728{
3674 int i; 3729 int i;
3675 unsigned long min_pfn = ULONG_MAX; 3730 unsigned long min_pfn = ULONG_MAX;
@@ -3698,23 +3753,6 @@ unsigned long __init find_min_pfn_with_active_regions(void)
3698 return find_min_pfn_for_node(MAX_NUMNODES); 3753 return find_min_pfn_for_node(MAX_NUMNODES);
3699} 3754}
3700 3755
3701/**
3702 * find_max_pfn_with_active_regions - Find the maximum PFN registered
3703 *
3704 * It returns the maximum PFN based on information provided via
3705 * add_active_range().
3706 */
3707unsigned long __init find_max_pfn_with_active_regions(void)
3708{
3709 int i;
3710 unsigned long max_pfn = 0;
3711
3712 for (i = 0; i < nr_nodemap_entries; i++)
3713 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
3714
3715 return max_pfn;
3716}
3717
3718/* 3756/*
3719 * early_calculate_totalpages() 3757 * early_calculate_totalpages()
3720 * Sum pages in active regions for movable zone. 3758 * Sum pages in active regions for movable zone.
@@ -3741,7 +3779,7 @@ static unsigned long __init early_calculate_totalpages(void)
3741 * memory. When they don't, some nodes will have more kernelcore than 3779 * memory. When they don't, some nodes will have more kernelcore than
3742 * others 3780 * others
3743 */ 3781 */
3744void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 3782static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3745{ 3783{
3746 int i, nid; 3784 int i, nid;
3747 unsigned long usable_startpfn; 3785 unsigned long usable_startpfn;
@@ -3957,10 +3995,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3957 early_node_map[i].end_pfn); 3995 early_node_map[i].end_pfn);
3958 3996
3959 /* Initialise every node */ 3997 /* Initialise every node */
3998 mminit_verify_pageflags_layout();
3960 setup_nr_node_ids(); 3999 setup_nr_node_ids();
3961 for_each_online_node(nid) { 4000 for_each_online_node(nid) {
3962 pg_data_t *pgdat = NODE_DATA(nid); 4001 pg_data_t *pgdat = NODE_DATA(nid);
3963 free_area_init_node(nid, pgdat, NULL, 4002 free_area_init_node(nid, NULL,
3964 find_min_pfn_for_node(nid), NULL); 4003 find_min_pfn_for_node(nid), NULL);
3965 4004
3966 /* Any memory on that node */ 4005 /* Any memory on that node */
@@ -4025,15 +4064,13 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4025} 4064}
4026 4065
4027#ifndef CONFIG_NEED_MULTIPLE_NODES 4066#ifndef CONFIG_NEED_MULTIPLE_NODES
4028static bootmem_data_t contig_bootmem_data; 4067struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] };
4029struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
4030
4031EXPORT_SYMBOL(contig_page_data); 4068EXPORT_SYMBOL(contig_page_data);
4032#endif 4069#endif
4033 4070
4034void __init free_area_init(unsigned long *zones_size) 4071void __init free_area_init(unsigned long *zones_size)
4035{ 4072{
4036 free_area_init_node(0, NODE_DATA(0), zones_size, 4073 free_area_init_node(0, zones_size,
4037 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 4074 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
4038} 4075}
4039 4076
@@ -4400,7 +4437,7 @@ void *__init alloc_large_system_hash(const char *tablename,
4400 do { 4437 do {
4401 size = bucketsize << log2qty; 4438 size = bucketsize << log2qty;
4402 if (flags & HASH_EARLY) 4439 if (flags & HASH_EARLY)
4403 table = alloc_bootmem(size); 4440 table = alloc_bootmem_nopanic(size);
4404 else if (hashdist) 4441 else if (hashdist)
4405 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4442 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4406 else { 4443 else {
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 9d834aa4b979..0cbe0c60c6bf 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -130,7 +130,7 @@ static int __pdflush(struct pdflush_work *my_work)
130 * Thread creation: For how long have there been zero 130 * Thread creation: For how long have there been zero
131 * available threads? 131 * available threads?
132 */ 132 */
133 if (jiffies - last_empty_jifs > 1 * HZ) { 133 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
134 /* unlocked list_empty() test is OK here */ 134 /* unlocked list_empty() test is OK here */
135 if (list_empty(&pdflush_list)) { 135 if (list_empty(&pdflush_list)) {
136 /* unlocked test is OK here */ 136 /* unlocked test is OK here */
@@ -151,7 +151,7 @@ static int __pdflush(struct pdflush_work *my_work)
151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) 151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
152 continue; 152 continue;
153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); 153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
154 if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { 154 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
155 /* Limit exit rate */ 155 /* Limit exit rate */
156 pdf->when_i_went_to_sleep = jiffies; 156 pdf->when_i_went_to_sleep = jiffies;
157 break; /* exeunt */ 157 break; /* exeunt */
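
The open-coded "jiffies - x > 1 * HZ" tests are replaced with time_after(), the kernel's standard jiffies-comparison helper, which goes through signed arithmetic and so stays robust against jiffies wraparound. A minimal sketch of the idiom:

	/* One-second idle test, safe across a jiffies wrap. */
	static int idle_longer_than_a_second(unsigned long when_i_went_to_sleep)
	{
		return time_after(jiffies, when_i_went_to_sleep + 1 * HZ);
	}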
diff --git a/mm/readahead.c b/mm/readahead.c
index d8723a5f6496..77e8ddf945e9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -382,9 +382,9 @@ ondemand_readahead(struct address_space *mapping,
382 if (hit_readahead_marker) { 382 if (hit_readahead_marker) {
383 pgoff_t start; 383 pgoff_t start;
384 384
385 read_lock_irq(&mapping->tree_lock); 385 rcu_read_lock();
386 start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); 386 start = radix_tree_next_hole(&mapping->page_tree, offset,max+1);
387 read_unlock_irq(&mapping->tree_lock); 387 rcu_read_unlock();
388 388
389 if (!start || start - offset > max) 389 if (!start || start - offset > max)
390 return 0; 390 return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index bf0a5b7cfb8e..1ea4e6fcee77 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,6 +49,7 @@
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kallsyms.h> 50#include <linux/kallsyms.h>
51#include <linux/memcontrol.h> 51#include <linux/memcontrol.h>
52#include <linux/mmu_notifier.h>
52 53
53#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
54 55
@@ -138,7 +139,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
138 anon_vma_free(anon_vma); 139 anon_vma_free(anon_vma);
139} 140}
140 141
141static void anon_vma_ctor(struct kmem_cache *cachep, void *data) 142static void anon_vma_ctor(void *data)
142{ 143{
143 struct anon_vma *anon_vma = data; 144 struct anon_vma *anon_vma = data;
144 145
@@ -287,7 +288,7 @@ static int page_referenced_one(struct page *page,
287 if (vma->vm_flags & VM_LOCKED) { 288 if (vma->vm_flags & VM_LOCKED) {
288 referenced++; 289 referenced++;
289 *mapcount = 1; /* break early from loop */ 290 *mapcount = 1; /* break early from loop */
290 } else if (ptep_clear_flush_young(vma, address, pte)) 291 } else if (ptep_clear_flush_young_notify(vma, address, pte))
291 referenced++; 292 referenced++;
292 293
293 /* Pretend the page is referenced if the task has the 294 /* Pretend the page is referenced if the task has the
@@ -421,7 +422,7 @@ int page_referenced(struct page *page, int is_locked,
421 referenced += page_referenced_anon(page, mem_cont); 422 referenced += page_referenced_anon(page, mem_cont);
422 else if (is_locked) 423 else if (is_locked)
423 referenced += page_referenced_file(page, mem_cont); 424 referenced += page_referenced_file(page, mem_cont);
424 else if (TestSetPageLocked(page)) 425 else if (!trylock_page(page))
425 referenced++; 426 referenced++;
426 else { 427 else {
427 if (page->mapping) 428 if (page->mapping)
@@ -457,7 +458,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
457 pte_t entry; 458 pte_t entry;
458 459
459 flush_cache_page(vma, address, pte_pfn(*pte)); 460 flush_cache_page(vma, address, pte_pfn(*pte));
460 entry = ptep_clear_flush(vma, address, pte); 461 entry = ptep_clear_flush_notify(vma, address, pte);
461 entry = pte_wrprotect(entry); 462 entry = pte_wrprotect(entry);
462 entry = pte_mkclean(entry); 463 entry = pte_mkclean(entry);
463 set_pte_at(mm, address, pte, entry); 464 set_pte_at(mm, address, pte, entry);
@@ -576,14 +577,8 @@ void page_add_anon_rmap(struct page *page,
576 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 577 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
577 if (atomic_inc_and_test(&page->_mapcount)) 578 if (atomic_inc_and_test(&page->_mapcount))
578 __page_set_anon_rmap(page, vma, address); 579 __page_set_anon_rmap(page, vma, address);
579 else { 580 else
580 __page_check_anon_rmap(page, vma, address); 581 __page_check_anon_rmap(page, vma, address);
581 /*
582 * We unconditionally charged during prepare, we uncharge here
583 * This takes care of balancing the reference counts
584 */
585 mem_cgroup_uncharge_page(page);
586 }
587} 582}
588 583
589/** 584/**
@@ -614,12 +609,6 @@ void page_add_file_rmap(struct page *page)
614{ 609{
615 if (atomic_inc_and_test(&page->_mapcount)) 610 if (atomic_inc_and_test(&page->_mapcount))
616 __inc_zone_page_state(page, NR_FILE_MAPPED); 611 __inc_zone_page_state(page, NR_FILE_MAPPED);
617 else
618 /*
619 * We unconditionally charged during prepare, we uncharge here
620 * This takes care of balancing the reference counts
621 */
622 mem_cgroup_uncharge_page(page);
623} 612}
624 613
625#ifdef CONFIG_DEBUG_VM 614#ifdef CONFIG_DEBUG_VM
@@ -678,7 +667,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
678 * Leaving it set also helps swapoff to reinstate ptes 667 * Leaving it set also helps swapoff to reinstate ptes
679 * faster for those pages still in swapcache. 668 * faster for those pages still in swapcache.
680 */ 669 */
681 if (page_test_dirty(page)) { 670 if ((!PageAnon(page) || PageSwapCache(page)) &&
671 page_test_dirty(page)) {
682 page_clear_dirty(page); 672 page_clear_dirty(page);
683 set_page_dirty(page); 673 set_page_dirty(page);
684 } 674 }
@@ -717,14 +707,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
717 * skipped over this mm) then we should reactivate it. 707 * skipped over this mm) then we should reactivate it.
718 */ 708 */
719 if (!migration && ((vma->vm_flags & VM_LOCKED) || 709 if (!migration && ((vma->vm_flags & VM_LOCKED) ||
720 (ptep_clear_flush_young(vma, address, pte)))) { 710 (ptep_clear_flush_young_notify(vma, address, pte)))) {
721 ret = SWAP_FAIL; 711 ret = SWAP_FAIL;
722 goto out_unmap; 712 goto out_unmap;
723 } 713 }
724 714
725 /* Nuke the page table entry. */ 715 /* Nuke the page table entry. */
726 flush_cache_page(vma, address, page_to_pfn(page)); 716 flush_cache_page(vma, address, page_to_pfn(page));
727 pteval = ptep_clear_flush(vma, address, pte); 717 pteval = ptep_clear_flush_notify(vma, address, pte);
728 718
729 /* Move the dirty bit to the physical page now the pte is gone. */ 719 /* Move the dirty bit to the physical page now the pte is gone. */
730 if (pte_dirty(pteval)) 720 if (pte_dirty(pteval))
@@ -849,12 +839,12 @@ static void try_to_unmap_cluster(unsigned long cursor,
849 page = vm_normal_page(vma, address, *pte); 839 page = vm_normal_page(vma, address, *pte);
850 BUG_ON(!page || PageAnon(page)); 840 BUG_ON(!page || PageAnon(page));
851 841
852 if (ptep_clear_flush_young(vma, address, pte)) 842 if (ptep_clear_flush_young_notify(vma, address, pte))
853 continue; 843 continue;
854 844
855 /* Nuke the page table entry. */ 845 /* Nuke the page table entry. */
856 flush_cache_page(vma, address, pte_pfn(*pte)); 846 flush_cache_page(vma, address, pte_pfn(*pte));
857 pteval = ptep_clear_flush(vma, address, pte); 847 pteval = ptep_clear_flush_notify(vma, address, pte);
858 848
859 /* If nonlinear, store the file page offset in the pte. */ 849 /* If nonlinear, store the file page offset in the pte. */
860 if (page->index != linear_page_index(vma, address)) 850 if (page->index != linear_page_index(vma, address))
diff --git a/mm/shmem.c b/mm/shmem.c
index e2a6ae1a44e9..04fb4f1ab88e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -922,20 +922,26 @@ found:
922 error = 1; 922 error = 1;
923 if (!inode) 923 if (!inode)
924 goto out; 924 goto out;
925 /* Precharge page while we can wait, compensate afterwards */ 925 /* Precharge page using GFP_KERNEL while we can wait */
926 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 926 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
927 if (error) 927 if (error)
928 goto out; 928 goto out;
929 error = radix_tree_preload(GFP_KERNEL); 929 error = radix_tree_preload(GFP_KERNEL);
930 if (error) 930 if (error) {
931 goto uncharge; 931 mem_cgroup_uncharge_cache_page(page);
932 goto out;
933 }
932 error = 1; 934 error = 1;
933 935
934 spin_lock(&info->lock); 936 spin_lock(&info->lock);
935 ptr = shmem_swp_entry(info, idx, NULL); 937 ptr = shmem_swp_entry(info, idx, NULL);
936 if (ptr && ptr->val == entry.val) 938 if (ptr && ptr->val == entry.val) {
937 error = add_to_page_cache(page, inode->i_mapping, 939 error = add_to_page_cache_locked(page, inode->i_mapping,
938 idx, GFP_NOWAIT); 940 idx, GFP_NOWAIT);
941 /* does mem_cgroup_uncharge_cache_page on error */
942 } else /* we must compensate for our precharge above */
943 mem_cgroup_uncharge_cache_page(page);
944
939 if (error == -EEXIST) { 945 if (error == -EEXIST) {
940 struct page *filepage = find_get_page(inode->i_mapping, idx); 946 struct page *filepage = find_get_page(inode->i_mapping, idx);
941 error = 1; 947 error = 1;
@@ -961,8 +967,6 @@ found:
961 shmem_swp_unmap(ptr); 967 shmem_swp_unmap(ptr);
962 spin_unlock(&info->lock); 968 spin_unlock(&info->lock);
963 radix_tree_preload_end(); 969 radix_tree_preload_end();
964uncharge:
965 mem_cgroup_uncharge_page(page);
966out: 970out:
967 unlock_page(page); 971 unlock_page(page);
968 page_cache_release(page); 972 page_cache_release(page);
@@ -1261,7 +1265,7 @@ repeat:
1261 } 1265 }
1262 1266
1263 /* We have to do this with page locked to prevent races */ 1267 /* We have to do this with page locked to prevent races */
1264 if (TestSetPageLocked(swappage)) { 1268 if (!trylock_page(swappage)) {
1265 shmem_swp_unmap(entry); 1269 shmem_swp_unmap(entry);
1266 spin_unlock(&info->lock); 1270 spin_unlock(&info->lock);
1267 wait_on_page_locked(swappage); 1271 wait_on_page_locked(swappage);
@@ -1297,8 +1301,8 @@ repeat:
1297 SetPageUptodate(filepage); 1301 SetPageUptodate(filepage);
1298 set_page_dirty(filepage); 1302 set_page_dirty(filepage);
1299 swap_free(swap); 1303 swap_free(swap);
1300 } else if (!(error = add_to_page_cache( 1304 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1301 swappage, mapping, idx, GFP_NOWAIT))) { 1305 idx, GFP_NOWAIT))) {
1302 info->flags |= SHMEM_PAGEIN; 1306 info->flags |= SHMEM_PAGEIN;
1303 shmem_swp_set(info, entry, 0); 1307 shmem_swp_set(info, entry, 0);
1304 shmem_swp_unmap(entry); 1308 shmem_swp_unmap(entry);
@@ -1311,24 +1315,21 @@ repeat:
1311 shmem_swp_unmap(entry); 1315 shmem_swp_unmap(entry);
1312 spin_unlock(&info->lock); 1316 spin_unlock(&info->lock);
1313 unlock_page(swappage); 1317 unlock_page(swappage);
1318 page_cache_release(swappage);
1314 if (error == -ENOMEM) { 1319 if (error == -ENOMEM) {
1315 /* allow reclaim from this memory cgroup */ 1320 /* allow reclaim from this memory cgroup */
1316 error = mem_cgroup_cache_charge(swappage, 1321 error = mem_cgroup_shrink_usage(current->mm,
1317 current->mm, gfp & ~__GFP_HIGHMEM); 1322 gfp);
1318 if (error) { 1323 if (error)
1319 page_cache_release(swappage);
1320 goto failed; 1324 goto failed;
1321 }
1322 mem_cgroup_uncharge_page(swappage);
1323 } 1325 }
1324 page_cache_release(swappage);
1325 goto repeat; 1326 goto repeat;
1326 } 1327 }
1327 } else if (sgp == SGP_READ && !filepage) { 1328 } else if (sgp == SGP_READ && !filepage) {
1328 shmem_swp_unmap(entry); 1329 shmem_swp_unmap(entry);
1329 filepage = find_get_page(mapping, idx); 1330 filepage = find_get_page(mapping, idx);
1330 if (filepage && 1331 if (filepage &&
1331 (!PageUptodate(filepage) || TestSetPageLocked(filepage))) { 1332 (!PageUptodate(filepage) || !trylock_page(filepage))) {
1332 spin_unlock(&info->lock); 1333 spin_unlock(&info->lock);
1333 wait_on_page_locked(filepage); 1334 wait_on_page_locked(filepage);
1334 page_cache_release(filepage); 1335 page_cache_release(filepage);
@@ -1358,6 +1359,8 @@ repeat:
1358 } 1359 }
1359 1360
1360 if (!filepage) { 1361 if (!filepage) {
1362 int ret;
1363
1361 spin_unlock(&info->lock); 1364 spin_unlock(&info->lock);
1362 filepage = shmem_alloc_page(gfp, info, idx); 1365 filepage = shmem_alloc_page(gfp, info, idx);
1363 if (!filepage) { 1366 if (!filepage) {
@@ -1386,10 +1389,18 @@ repeat:
1386 swap = *entry; 1389 swap = *entry;
1387 shmem_swp_unmap(entry); 1390 shmem_swp_unmap(entry);
1388 } 1391 }
1389 if (error || swap.val || 0 != add_to_page_cache_lru( 1392 ret = error || swap.val;
1390 filepage, mapping, idx, GFP_NOWAIT)) { 1393 if (ret)
1394 mem_cgroup_uncharge_cache_page(filepage);
1395 else
1396 ret = add_to_page_cache_lru(filepage, mapping,
1397 idx, GFP_NOWAIT);
1398 /*
1399 * At add_to_page_cache_lru() failure, uncharge will
1400 * be done automatically.
1401 */
1402 if (ret) {
1391 spin_unlock(&info->lock); 1403 spin_unlock(&info->lock);
1392 mem_cgroup_uncharge_page(filepage);
1393 page_cache_release(filepage); 1404 page_cache_release(filepage);
1394 shmem_unacct_blocks(info->flags, 1); 1405 shmem_unacct_blocks(info->flags, 1);
1395 shmem_free_blocks(inode, 1); 1406 shmem_free_blocks(inode, 1);
@@ -1398,7 +1409,6 @@ repeat:
1398 goto failed; 1409 goto failed;
1399 goto repeat; 1410 goto repeat;
1400 } 1411 }
1401 mem_cgroup_uncharge_page(filepage);
1402 info->flags |= SHMEM_PAGEIN; 1412 info->flags |= SHMEM_PAGEIN;
1403 } 1413 }
1404 1414
@@ -1503,7 +1513,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1503 inode->i_uid = current->fsuid; 1513 inode->i_uid = current->fsuid;
1504 inode->i_gid = current->fsgid; 1514 inode->i_gid = current->fsgid;
1505 inode->i_blocks = 0; 1515 inode->i_blocks = 0;
1506 inode->i_mapping->a_ops = &shmem_aops;
1507 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1516 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1508 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1517 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1509 inode->i_generation = get_seconds(); 1518 inode->i_generation = get_seconds();
@@ -1518,6 +1527,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1518 init_special_inode(inode, mode, dev); 1527 init_special_inode(inode, mode, dev);
1519 break; 1528 break;
1520 case S_IFREG: 1529 case S_IFREG:
1530 inode->i_mapping->a_ops = &shmem_aops;
1521 inode->i_op = &shmem_inode_operations; 1531 inode->i_op = &shmem_inode_operations;
1522 inode->i_fop = &shmem_file_operations; 1532 inode->i_fop = &shmem_file_operations;
1523 mpol_shared_policy_init(&info->policy, 1533 mpol_shared_policy_init(&info->policy,
@@ -1690,26 +1700,38 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1690 file_accessed(filp); 1700 file_accessed(filp);
1691} 1701}
1692 1702
1693static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) 1703static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1704 const struct iovec *iov, unsigned long nr_segs, loff_t pos)
1694{ 1705{
1695 read_descriptor_t desc; 1706 struct file *filp = iocb->ki_filp;
1707 ssize_t retval;
1708 unsigned long seg;
1709 size_t count;
1710 loff_t *ppos = &iocb->ki_pos;
1696 1711
1697 if ((ssize_t) count < 0) 1712 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1698 return -EINVAL; 1713 if (retval)
1699 if (!access_ok(VERIFY_WRITE, buf, count)) 1714 return retval;
1700 return -EFAULT;
1701 if (!count)
1702 return 0;
1703 1715
1704 desc.written = 0; 1716 for (seg = 0; seg < nr_segs; seg++) {
1705 desc.count = count; 1717 read_descriptor_t desc;
1706 desc.arg.buf = buf;
1707 desc.error = 0;
1708 1718
1709 do_shmem_file_read(filp, ppos, &desc, file_read_actor); 1719 desc.written = 0;
1710 if (desc.written) 1720 desc.arg.buf = iov[seg].iov_base;
1711 return desc.written; 1721 desc.count = iov[seg].iov_len;
1712 return desc.error; 1722 if (desc.count == 0)
1723 continue;
1724 desc.error = 0;
1725 do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1726 retval += desc.written;
1727 if (desc.error) {
1728 retval = retval ?: desc.error;
1729 break;
1730 }
1731 if (desc.count > 0)
1732 break;
1733 }
1734 return retval;
1713} 1735}
1714 1736
1715static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1737static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1907,6 +1929,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1907 return error; 1929 return error;
1908 } 1930 }
1909 unlock_page(page); 1931 unlock_page(page);
1932 inode->i_mapping->a_ops = &shmem_aops;
1910 inode->i_op = &shmem_symlink_inode_operations; 1933 inode->i_op = &shmem_symlink_inode_operations;
1911 kaddr = kmap_atomic(page, KM_USER0); 1934 kaddr = kmap_atomic(page, KM_USER0);
1912 memcpy(kaddr, symname, len); 1935 memcpy(kaddr, symname, len);
@@ -2330,7 +2353,7 @@ static void shmem_destroy_inode(struct inode *inode)
2330 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2353 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2331} 2354}
2332 2355
2333static void init_once(struct kmem_cache *cachep, void *foo) 2356static void init_once(void *foo)
2334{ 2357{
2335 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2358 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2336 2359
@@ -2369,8 +2392,9 @@ static const struct file_operations shmem_file_operations = {
2369 .mmap = shmem_mmap, 2392 .mmap = shmem_mmap,
2370#ifdef CONFIG_TMPFS 2393#ifdef CONFIG_TMPFS
2371 .llseek = generic_file_llseek, 2394 .llseek = generic_file_llseek,
2372 .read = shmem_file_read, 2395 .read = do_sync_read,
2373 .write = do_sync_write, 2396 .write = do_sync_write,
2397 .aio_read = shmem_file_aio_read,
2374 .aio_write = generic_file_aio_write, 2398 .aio_write = generic_file_aio_write,
2375 .fsync = simple_sync_file, 2399 .fsync = simple_sync_file,
2376 .splice_read = generic_file_splice_read, 2400 .splice_read = generic_file_splice_read,
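
The shmem hunks above are part of the tree-wide switch from the old TestSetPageLocked() idiom to trylock_page(), whose sense is inverted: it returns nonzero when PG_locked was actually taken. A minimal caller-side sketch of the new helper (the function name here is invented for illustration and is not part of this patch):

#include <linux/errno.h>
#include <linux/pagemap.h>

/* Do some work on a page only if its lock can be taken without sleeping. */
static int try_touch_page(struct page *page)
{
        if (!trylock_page(page))        /* somebody else holds PG_locked */
                return -EAGAIN;
        /* ... inspect or modify the locked page here ... */
        unlock_page(page);
        return 0;
}
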
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index f5664c5b9eb1..8e5aadd7dcd6 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -191,7 +191,7 @@ shmem_check_acl(struct inode *inode, int mask)
191 * shmem_permission - permission() inode operation 191 * shmem_permission - permission() inode operation
192 */ 192 */
193int 193int
194shmem_permission(struct inode *inode, int mask, struct nameidata *nd) 194shmem_permission(struct inode *inode, int mask)
195{ 195{
196 return generic_permission(inode, mask, shmem_check_acl); 196 return generic_permission(inode, mask, shmem_check_acl);
197} 197}
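
The shmem_acl.c hunk reflects the VFS change in this release that drops the struct nameidata argument from the ->permission() inode operation; generic_permission() itself keeps its (inode, mask, check_acl) form. A sketch of a filesystem hook written against the new prototype (the myfs_* names are invented for illustration):

#include <linux/errno.h>
#include <linux/fs.h>

static int myfs_check_acl(struct inode *inode, int mask)
{
        /* consult the filesystem's ACLs here: 0 grants, -EACCES denies */
        return -EAGAIN;         /* no ACL found, fall back to mode bits */
}

static int myfs_permission(struct inode *inode, int mask)
{
        return generic_permission(inode, mask, myfs_check_acl);
}
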
diff --git a/mm/slab.c b/mm/slab.c
index 052e7d64537e..e76eee466886 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -406,7 +406,7 @@ struct kmem_cache {
406 unsigned int dflags; /* dynamic flags */ 406 unsigned int dflags; /* dynamic flags */
407 407
408 /* constructor func */ 408 /* constructor func */
409 void (*ctor)(struct kmem_cache *, void *); 409 void (*ctor)(void *obj);
410 410
411/* 5) cache creation/removal */ 411/* 5) cache creation/removal */
412 const char *name; 412 const char *name;
@@ -2137,8 +2137,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2137 */ 2137 */
2138struct kmem_cache * 2138struct kmem_cache *
2139kmem_cache_create (const char *name, size_t size, size_t align, 2139kmem_cache_create (const char *name, size_t size, size_t align,
2140 unsigned long flags, 2140 unsigned long flags, void (*ctor)(void *))
2141 void (*ctor)(struct kmem_cache *, void *))
2142{ 2141{
2143 size_t left_over, slab_size, ralign; 2142 size_t left_over, slab_size, ralign;
2144 struct kmem_cache *cachep = NULL, *pc; 2143 struct kmem_cache *cachep = NULL, *pc;
@@ -2653,7 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2653 * They must also be threaded. 2652 * They must also be threaded.
2654 */ 2653 */
2655 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2654 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2656 cachep->ctor(cachep, objp + obj_offset(cachep)); 2655 cachep->ctor(objp + obj_offset(cachep));
2657 2656
2658 if (cachep->flags & SLAB_RED_ZONE) { 2657 if (cachep->flags & SLAB_RED_ZONE) {
2659 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2658 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2669,7 +2668,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2669 cachep->buffer_size / PAGE_SIZE, 0); 2668 cachep->buffer_size / PAGE_SIZE, 0);
2670#else 2669#else
2671 if (cachep->ctor) 2670 if (cachep->ctor)
2672 cachep->ctor(cachep, objp); 2671 cachep->ctor(objp);
2673#endif 2672#endif
2674 slab_bufctl(slabp)[i] = i + 1; 2673 slab_bufctl(slabp)[i] = i + 1;
2675 } 2674 }
@@ -3093,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3093#endif 3092#endif
3094 objp += obj_offset(cachep); 3093 objp += obj_offset(cachep);
3095 if (cachep->ctor && cachep->flags & SLAB_POISON) 3094 if (cachep->ctor && cachep->flags & SLAB_POISON)
3096 cachep->ctor(cachep, objp); 3095 cachep->ctor(objp);
3097#if ARCH_SLAB_MINALIGN 3096#if ARCH_SLAB_MINALIGN
3098 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3097 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3099 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3098 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
@@ -4473,4 +4472,3 @@ size_t ksize(const void *objp)
4473 4472
4474 return obj_size(virt_to_cache(objp)); 4473 return obj_size(virt_to_cache(objp));
4475} 4474}
4476EXPORT_SYMBOL(ksize);
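
The slab, slob and slub hunks in this merge all move kmem_cache_create() to a single-argument constructor, void (*ctor)(void *), dropping the struct kmem_cache pointer that constructors rarely used. A caller-side sketch against the new prototype (the foo names are invented for illustration):

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>

struct foo {
        struct list_head link;
        int refcount;
};

/* Constructors now receive only the object being initialized. */
static void foo_ctor(void *obj)
{
        struct foo *f = obj;

        INIT_LIST_HEAD(&f->link);
        f->refcount = 0;
}

static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
        foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
                                       0, SLAB_HWCACHE_ALIGN, foo_ctor);
        return foo_cachep ? 0 : -ENOMEM;
}
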
diff --git a/mm/slob.c b/mm/slob.c
index a3ad6671adf1..4c82dd41f32e 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -130,17 +130,17 @@ static LIST_HEAD(free_slob_large);
130 */ 130 */
131static inline int slob_page(struct slob_page *sp) 131static inline int slob_page(struct slob_page *sp)
132{ 132{
133 return test_bit(PG_active, &sp->flags); 133 return PageSlobPage((struct page *)sp);
134} 134}
135 135
136static inline void set_slob_page(struct slob_page *sp) 136static inline void set_slob_page(struct slob_page *sp)
137{ 137{
138 __set_bit(PG_active, &sp->flags); 138 __SetPageSlobPage((struct page *)sp);
139} 139}
140 140
141static inline void clear_slob_page(struct slob_page *sp) 141static inline void clear_slob_page(struct slob_page *sp)
142{ 142{
143 __clear_bit(PG_active, &sp->flags); 143 __ClearPageSlobPage((struct page *)sp);
144} 144}
145 145
146/* 146/*
@@ -148,19 +148,19 @@ static inline void clear_slob_page(struct slob_page *sp)
148 */ 148 */
149static inline int slob_page_free(struct slob_page *sp) 149static inline int slob_page_free(struct slob_page *sp)
150{ 150{
151 return test_bit(PG_private, &sp->flags); 151 return PageSlobFree((struct page *)sp);
152} 152}
153 153
154static void set_slob_page_free(struct slob_page *sp, struct list_head *list) 154static void set_slob_page_free(struct slob_page *sp, struct list_head *list)
155{ 155{
156 list_add(&sp->list, list); 156 list_add(&sp->list, list);
157 __set_bit(PG_private, &sp->flags); 157 __SetPageSlobFree((struct page *)sp);
158} 158}
159 159
160static inline void clear_slob_page_free(struct slob_page *sp) 160static inline void clear_slob_page_free(struct slob_page *sp)
161{ 161{
162 list_del(&sp->list); 162 list_del(&sp->list);
163 __clear_bit(PG_private, &sp->flags); 163 __ClearPageSlobFree((struct page *)sp);
164} 164}
165 165
166#define SLOB_UNIT sizeof(slob_t) 166#define SLOB_UNIT sizeof(slob_t)
@@ -519,18 +519,16 @@ size_t ksize(const void *block)
519 else 519 else
520 return sp->page.private; 520 return sp->page.private;
521} 521}
522EXPORT_SYMBOL(ksize);
523 522
524struct kmem_cache { 523struct kmem_cache {
525 unsigned int size, align; 524 unsigned int size, align;
526 unsigned long flags; 525 unsigned long flags;
527 const char *name; 526 const char *name;
528 void (*ctor)(struct kmem_cache *, void *); 527 void (*ctor)(void *);
529}; 528};
530 529
531struct kmem_cache *kmem_cache_create(const char *name, size_t size, 530struct kmem_cache *kmem_cache_create(const char *name, size_t size,
532 size_t align, unsigned long flags, 531 size_t align, unsigned long flags, void (*ctor)(void *))
533 void (*ctor)(struct kmem_cache *, void *))
534{ 532{
535 struct kmem_cache *c; 533 struct kmem_cache *c;
536 534
@@ -575,7 +573,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
575 b = slob_new_page(flags, get_order(c->size), node); 573 b = slob_new_page(flags, get_order(c->size), node);
576 574
577 if (c->ctor) 575 if (c->ctor)
578 c->ctor(c, b); 576 c->ctor(b);
579 577
580 return b; 578 return b;
581} 579}
diff --git a/mm/slub.c b/mm/slub.c
index 6d4a49c1ff2f..4f5b96149458 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -102,44 +102,12 @@
102 * the fast path and disables lockless freelists. 102 * the fast path and disables lockless freelists.
103 */ 103 */
104 104
105#define FROZEN (1 << PG_active)
106
107#ifdef CONFIG_SLUB_DEBUG 105#ifdef CONFIG_SLUB_DEBUG
108#define SLABDEBUG (1 << PG_error) 106#define SLABDEBUG 1
109#else 107#else
110#define SLABDEBUG 0 108#define SLABDEBUG 0
111#endif 109#endif
112 110
113static inline int SlabFrozen(struct page *page)
114{
115 return page->flags & FROZEN;
116}
117
118static inline void SetSlabFrozen(struct page *page)
119{
120 page->flags |= FROZEN;
121}
122
123static inline void ClearSlabFrozen(struct page *page)
124{
125 page->flags &= ~FROZEN;
126}
127
128static inline int SlabDebug(struct page *page)
129{
130 return page->flags & SLABDEBUG;
131}
132
133static inline void SetSlabDebug(struct page *page)
134{
135 page->flags |= SLABDEBUG;
136}
137
138static inline void ClearSlabDebug(struct page *page)
139{
140 page->flags &= ~SLABDEBUG;
141}
142
143/* 111/*
144 * Issues still to be resolved: 112 * Issues still to be resolved:
145 * 113 *
@@ -971,7 +939,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
971 } 939 }
972 940
973 /* Special debug activities for freeing objects */ 941 /* Special debug activities for freeing objects */
974 if (!SlabFrozen(page) && !page->freelist) 942 if (!PageSlubFrozen(page) && !page->freelist)
975 remove_full(s, page); 943 remove_full(s, page);
976 if (s->flags & SLAB_STORE_USER) 944 if (s->flags & SLAB_STORE_USER)
977 set_track(s, object, TRACK_FREE, addr); 945 set_track(s, object, TRACK_FREE, addr);
@@ -1044,7 +1012,7 @@ __setup("slub_debug", setup_slub_debug);
1044 1012
1045static unsigned long kmem_cache_flags(unsigned long objsize, 1013static unsigned long kmem_cache_flags(unsigned long objsize,
1046 unsigned long flags, const char *name, 1014 unsigned long flags, const char *name,
1047 void (*ctor)(struct kmem_cache *, void *)) 1015 void (*ctor)(void *))
1048{ 1016{
1049 /* 1017 /*
1050 * Enable debugging if selected on the kernel commandline. 1018 * Enable debugging if selected on the kernel commandline.
@@ -1072,7 +1040,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1072static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1040static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1073static inline unsigned long kmem_cache_flags(unsigned long objsize, 1041static inline unsigned long kmem_cache_flags(unsigned long objsize,
1074 unsigned long flags, const char *name, 1042 unsigned long flags, const char *name,
1075 void (*ctor)(struct kmem_cache *, void *)) 1043 void (*ctor)(void *))
1076{ 1044{
1077 return flags; 1045 return flags;
1078} 1046}
@@ -1135,7 +1103,7 @@ static void setup_object(struct kmem_cache *s, struct page *page,
1135{ 1103{
1136 setup_object_debug(s, page, object); 1104 setup_object_debug(s, page, object);
1137 if (unlikely(s->ctor)) 1105 if (unlikely(s->ctor))
1138 s->ctor(s, object); 1106 s->ctor(object);
1139} 1107}
1140 1108
1141static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1109static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1157,7 +1125,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1157 page->flags |= 1 << PG_slab; 1125 page->flags |= 1 << PG_slab;
1158 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 1126 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
1159 SLAB_STORE_USER | SLAB_TRACE)) 1127 SLAB_STORE_USER | SLAB_TRACE))
1160 SetSlabDebug(page); 1128 __SetPageSlubDebug(page);
1161 1129
1162 start = page_address(page); 1130 start = page_address(page);
1163 1131
@@ -1184,14 +1152,14 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1184 int order = compound_order(page); 1152 int order = compound_order(page);
1185 int pages = 1 << order; 1153 int pages = 1 << order;
1186 1154
1187 if (unlikely(SlabDebug(page))) { 1155 if (unlikely(SLABDEBUG && PageSlubDebug(page))) {
1188 void *p; 1156 void *p;
1189 1157
1190 slab_pad_check(s, page); 1158 slab_pad_check(s, page);
1191 for_each_object(p, s, page_address(page), 1159 for_each_object(p, s, page_address(page),
1192 page->objects) 1160 page->objects)
1193 check_object(s, page, p, 0); 1161 check_object(s, page, p, 0);
1194 ClearSlabDebug(page); 1162 __ClearPageSlubDebug(page);
1195 } 1163 }
1196 1164
1197 mod_zone_page_state(page_zone(page), 1165 mod_zone_page_state(page_zone(page),
@@ -1288,7 +1256,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
1288 if (slab_trylock(page)) { 1256 if (slab_trylock(page)) {
1289 list_del(&page->lru); 1257 list_del(&page->lru);
1290 n->nr_partial--; 1258 n->nr_partial--;
1291 SetSlabFrozen(page); 1259 __SetPageSlubFrozen(page);
1292 return 1; 1260 return 1;
1293 } 1261 }
1294 return 0; 1262 return 0;
@@ -1361,7 +1329,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1361 n = get_node(s, zone_to_nid(zone)); 1329 n = get_node(s, zone_to_nid(zone));
1362 1330
1363 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1331 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1364 n->nr_partial > MIN_PARTIAL) { 1332 n->nr_partial > n->min_partial) {
1365 page = get_partial_node(n); 1333 page = get_partial_node(n);
1366 if (page) 1334 if (page)
1367 return page; 1335 return page;
@@ -1398,7 +1366,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1398 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1366 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1399 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); 1367 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
1400 1368
1401 ClearSlabFrozen(page); 1369 __ClearPageSlubFrozen(page);
1402 if (page->inuse) { 1370 if (page->inuse) {
1403 1371
1404 if (page->freelist) { 1372 if (page->freelist) {
@@ -1406,13 +1374,14 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1406 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1374 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1407 } else { 1375 } else {
1408 stat(c, DEACTIVATE_FULL); 1376 stat(c, DEACTIVATE_FULL);
1409 if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) 1377 if (SLABDEBUG && PageSlubDebug(page) &&
1378 (s->flags & SLAB_STORE_USER))
1410 add_full(n, page); 1379 add_full(n, page);
1411 } 1380 }
1412 slab_unlock(page); 1381 slab_unlock(page);
1413 } else { 1382 } else {
1414 stat(c, DEACTIVATE_EMPTY); 1383 stat(c, DEACTIVATE_EMPTY);
1415 if (n->nr_partial < MIN_PARTIAL) { 1384 if (n->nr_partial < n->min_partial) {
1416 /* 1385 /*
1417 * Adding an empty slab to the partial slabs in order 1386 * Adding an empty slab to the partial slabs in order
1418 * to avoid page allocator overhead. This slab needs 1387 * to avoid page allocator overhead. This slab needs
@@ -1551,7 +1520,7 @@ load_freelist:
1551 object = c->page->freelist; 1520 object = c->page->freelist;
1552 if (unlikely(!object)) 1521 if (unlikely(!object))
1553 goto another_slab; 1522 goto another_slab;
1554 if (unlikely(SlabDebug(c->page))) 1523 if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
1555 goto debug; 1524 goto debug;
1556 1525
1557 c->freelist = object[c->offset]; 1526 c->freelist = object[c->offset];
@@ -1588,7 +1557,7 @@ new_slab:
1588 if (c->page) 1557 if (c->page)
1589 flush_slab(s, c); 1558 flush_slab(s, c);
1590 slab_lock(new); 1559 slab_lock(new);
1591 SetSlabFrozen(new); 1560 __SetPageSlubFrozen(new);
1592 c->page = new; 1561 c->page = new;
1593 goto load_freelist; 1562 goto load_freelist;
1594 } 1563 }
@@ -1674,7 +1643,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1674 stat(c, FREE_SLOWPATH); 1643 stat(c, FREE_SLOWPATH);
1675 slab_lock(page); 1644 slab_lock(page);
1676 1645
1677 if (unlikely(SlabDebug(page))) 1646 if (unlikely(SLABDEBUG && PageSlubDebug(page)))
1678 goto debug; 1647 goto debug;
1679 1648
1680checks_ok: 1649checks_ok:
@@ -1682,7 +1651,7 @@ checks_ok:
1682 page->freelist = object; 1651 page->freelist = object;
1683 page->inuse--; 1652 page->inuse--;
1684 1653
1685 if (unlikely(SlabFrozen(page))) { 1654 if (unlikely(PageSlubFrozen(page))) {
1686 stat(c, FREE_FROZEN); 1655 stat(c, FREE_FROZEN);
1687 goto out_unlock; 1656 goto out_unlock;
1688 } 1657 }
@@ -1944,9 +1913,21 @@ static void init_kmem_cache_cpu(struct kmem_cache *s,
1944#endif 1913#endif
1945} 1914}
1946 1915
1947static void init_kmem_cache_node(struct kmem_cache_node *n) 1916static void
1917init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
1948{ 1918{
1949 n->nr_partial = 0; 1919 n->nr_partial = 0;
1920
1921 /*
1922 * The larger the object size is, the more pages we want on the partial
1923 * list to avoid pounding the page allocator excessively.
1924 */
1925 n->min_partial = ilog2(s->size);
1926 if (n->min_partial < MIN_PARTIAL)
1927 n->min_partial = MIN_PARTIAL;
1928 else if (n->min_partial > MAX_PARTIAL)
1929 n->min_partial = MAX_PARTIAL;
1930
1950 spin_lock_init(&n->list_lock); 1931 spin_lock_init(&n->list_lock);
1951 INIT_LIST_HEAD(&n->partial); 1932 INIT_LIST_HEAD(&n->partial);
1952#ifdef CONFIG_SLUB_DEBUG 1933#ifdef CONFIG_SLUB_DEBUG
@@ -2118,7 +2099,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
2118 init_object(kmalloc_caches, n, 1); 2099 init_object(kmalloc_caches, n, 1);
2119 init_tracking(kmalloc_caches, n); 2100 init_tracking(kmalloc_caches, n);
2120#endif 2101#endif
2121 init_kmem_cache_node(n); 2102 init_kmem_cache_node(n, kmalloc_caches);
2122 inc_slabs_node(kmalloc_caches, node, page->objects); 2103 inc_slabs_node(kmalloc_caches, node, page->objects);
2123 2104
2124 /* 2105 /*
@@ -2175,7 +2156,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2175 2156
2176 } 2157 }
2177 s->node[node] = n; 2158 s->node[node] = n;
2178 init_kmem_cache_node(n); 2159 init_kmem_cache_node(n, s);
2179 } 2160 }
2180 return 1; 2161 return 1;
2181} 2162}
@@ -2186,7 +2167,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
2186 2167
2187static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 2168static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2188{ 2169{
2189 init_kmem_cache_node(&s->local_node); 2170 init_kmem_cache_node(&s->local_node, s);
2190 return 1; 2171 return 1;
2191} 2172}
2192#endif 2173#endif
@@ -2317,7 +2298,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2317static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2298static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2318 const char *name, size_t size, 2299 const char *name, size_t size,
2319 size_t align, unsigned long flags, 2300 size_t align, unsigned long flags,
2320 void (*ctor)(struct kmem_cache *, void *)) 2301 void (*ctor)(void *))
2321{ 2302{
2322 memset(s, 0, kmem_size); 2303 memset(s, 0, kmem_size);
2323 s->name = name; 2304 s->name = name;
@@ -2746,7 +2727,6 @@ size_t ksize(const void *object)
2746 */ 2727 */
2747 return s->size; 2728 return s->size;
2748} 2729}
2749EXPORT_SYMBOL(ksize);
2750 2730
2751void kfree(const void *x) 2731void kfree(const void *x)
2752{ 2732{
@@ -2921,7 +2901,7 @@ static int slab_mem_going_online_callback(void *arg)
2921 ret = -ENOMEM; 2901 ret = -ENOMEM;
2922 goto out; 2902 goto out;
2923 } 2903 }
2924 init_kmem_cache_node(n); 2904 init_kmem_cache_node(n, s);
2925 s->node[nid] = n; 2905 s->node[nid] = n;
2926 } 2906 }
2927out: 2907out:
@@ -3073,7 +3053,7 @@ static int slab_unmergeable(struct kmem_cache *s)
3073 3053
3074static struct kmem_cache *find_mergeable(size_t size, 3054static struct kmem_cache *find_mergeable(size_t size,
3075 size_t align, unsigned long flags, const char *name, 3055 size_t align, unsigned long flags, const char *name,
3076 void (*ctor)(struct kmem_cache *, void *)) 3056 void (*ctor)(void *))
3077{ 3057{
3078 struct kmem_cache *s; 3058 struct kmem_cache *s;
3079 3059
@@ -3113,8 +3093,7 @@ static struct kmem_cache *find_mergeable(size_t size,
3113} 3093}
3114 3094
3115struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3095struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3116 size_t align, unsigned long flags, 3096 size_t align, unsigned long flags, void (*ctor)(void *))
3117 void (*ctor)(struct kmem_cache *, void *))
3118{ 3097{
3119 struct kmem_cache *s; 3098 struct kmem_cache *s;
3120 3099
@@ -3317,12 +3296,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3317 s->name, page); 3296 s->name, page);
3318 3297
3319 if (s->flags & DEBUG_DEFAULT_FLAGS) { 3298 if (s->flags & DEBUG_DEFAULT_FLAGS) {
3320 if (!SlabDebug(page)) 3299 if (!PageSlubDebug(page))
3321 printk(KERN_ERR "SLUB %s: SlabDebug not set " 3300 printk(KERN_ERR "SLUB %s: SlubDebug not set "
3322 "on slab 0x%p\n", s->name, page); 3301 "on slab 0x%p\n", s->name, page);
3323 } else { 3302 } else {
3324 if (SlabDebug(page)) 3303 if (PageSlubDebug(page))
3325 printk(KERN_ERR "SLUB %s: SlabDebug set on " 3304 printk(KERN_ERR "SLUB %s: SlubDebug set on "
3326 "slab 0x%p\n", s->name, page); 3305 "slab 0x%p\n", s->name, page);
3327 } 3306 }
3328} 3307}
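
init_kmem_cache_node() now takes the cache so it can size a per-node floor for the partial list from the object size: larger objects keep more partial slabs around to avoid hammering the page allocator. A standalone sketch of that clamp (the real MIN_PARTIAL/MAX_PARTIAL live in mm/slub.c; the values 5 and 10 are assumed here purely for illustration):

#include <linux/log2.h>

#define ASSUMED_MIN_PARTIAL     5       /* stand-in for slub's MIN_PARTIAL */
#define ASSUMED_MAX_PARTIAL     10      /* stand-in for slub's MAX_PARTIAL */

static unsigned long min_partial_for_size(unsigned long size)
{
        unsigned long min = ilog2(size);        /* e.g. 192-byte objects -> 7 */

        if (min < ASSUMED_MIN_PARTIAL)
                min = ASSUMED_MIN_PARTIAL;
        else if (min > ASSUMED_MAX_PARTIAL)
                min = ASSUMED_MAX_PARTIAL;
        return min;
}
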
diff --git a/mm/sparse.c b/mm/sparse.c
index 36511c7b5e2c..39db301b920d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -147,22 +147,41 @@ static inline int sparse_early_nid(struct mem_section *section)
147 return (section->section_mem_map >> SECTION_NID_SHIFT); 147 return (section->section_mem_map >> SECTION_NID_SHIFT);
148} 148}
149 149
150/* Record a memory area against a node. */ 150/* Validate the physical addressing limitations of the model */
151void __init memory_present(int nid, unsigned long start, unsigned long end) 151void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
152 unsigned long *end_pfn)
152{ 153{
153 unsigned long max_arch_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); 154 unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
154 unsigned long pfn;
155 155
156 /* 156 /*
157 * Sanity checks - do not allow an architecture to pass 157 * Sanity checks - do not allow an architecture to pass
158 * in larger pfns than the maximum scope of sparsemem: 158 * in larger pfns than the maximum scope of sparsemem:
159 */ 159 */
160 if (start >= max_arch_pfn) 160 if (*start_pfn > max_sparsemem_pfn) {
161 return; 161 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
162 if (end >= max_arch_pfn) 162 "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
163 end = max_arch_pfn; 163 *start_pfn, *end_pfn, max_sparsemem_pfn);
164 WARN_ON_ONCE(1);
165 *start_pfn = max_sparsemem_pfn;
166 *end_pfn = max_sparsemem_pfn;
167 }
168
169 if (*end_pfn > max_sparsemem_pfn) {
170 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
171 "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
172 *start_pfn, *end_pfn, max_sparsemem_pfn);
173 WARN_ON_ONCE(1);
174 *end_pfn = max_sparsemem_pfn;
175 }
176}
177
178/* Record a memory area against a node. */
179void __init memory_present(int nid, unsigned long start, unsigned long end)
180{
181 unsigned long pfn;
164 182
165 start &= PAGE_SECTION_MASK; 183 start &= PAGE_SECTION_MASK;
184 mminit_validate_memmodel_limits(&start, &end);
166 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { 185 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
167 unsigned long section = pfn_to_section_nr(pfn); 186 unsigned long section = pfn_to_section_nr(pfn);
168 struct mem_section *ms; 187 struct mem_section *ms;
@@ -187,6 +206,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
187 unsigned long pfn; 206 unsigned long pfn;
188 unsigned long nr_pages = 0; 207 unsigned long nr_pages = 0;
189 208
209 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
190 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 210 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
191 if (nid != early_pfn_to_nid(pfn)) 211 if (nid != early_pfn_to_nid(pfn))
192 continue; 212 continue;
@@ -248,16 +268,92 @@ static unsigned long *__kmalloc_section_usemap(void)
248} 268}
249#endif /* CONFIG_MEMORY_HOTPLUG */ 269#endif /* CONFIG_MEMORY_HOTPLUG */
250 270
271#ifdef CONFIG_MEMORY_HOTREMOVE
272static unsigned long * __init
273sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
274{
275 unsigned long section_nr;
276
277 /*
278 * A page may contain usemaps for other sections preventing the
279 * page being freed and making a section unremovable while
280 * other sections referencing the usemap retmain active. Similarly,
281 * a pgdat can prevent a section being removed. If section A
282 * contains a pgdat and section B contains the usemap, both
283 * sections become inter-dependent. This allocates usemaps
284 * from the same section as the pgdat where possible to avoid
285 * this problem.
286 */
287 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
288 return alloc_bootmem_section(usemap_size(), section_nr);
289}
290
291static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
292{
293 unsigned long usemap_snr, pgdat_snr;
294 static unsigned long old_usemap_snr = NR_MEM_SECTIONS;
295 static unsigned long old_pgdat_snr = NR_MEM_SECTIONS;
296 struct pglist_data *pgdat = NODE_DATA(nid);
297 int usemap_nid;
298
299 usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
300 pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
301 if (usemap_snr == pgdat_snr)
302 return;
303
304 if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
305 /* skip redundant message */
306 return;
307
308 old_usemap_snr = usemap_snr;
309 old_pgdat_snr = pgdat_snr;
310
311 usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
312 if (usemap_nid != nid) {
313 printk(KERN_INFO
314 "node %d must be removed before remove section %ld\n",
315 nid, usemap_snr);
316 return;
317 }
318 /*
319 * There is a circular dependency.
320 * Some platforms allow un-removable section because they will just
321 * gather other removable sections for dynamic partitioning.
322 * Just notify un-removable section's number here.
323 */
324 printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr,
325 pgdat_snr, nid);
326 printk(KERN_CONT
327 " have a circular dependency on usemap and pgdat allocations\n");
328}
329#else
330static unsigned long * __init
331sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
332{
333 return NULL;
334}
335
336static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
337{
338}
339#endif /* CONFIG_MEMORY_HOTREMOVE */
340
251static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) 341static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
252{ 342{
253 unsigned long *usemap; 343 unsigned long *usemap;
254 struct mem_section *ms = __nr_to_section(pnum); 344 struct mem_section *ms = __nr_to_section(pnum);
255 int nid = sparse_early_nid(ms); 345 int nid = sparse_early_nid(ms);
256 346
257 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); 347 usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
258 if (usemap) 348 if (usemap)
259 return usemap; 349 return usemap;
260 350
351 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
352 if (usemap) {
353 check_usemap_section_nr(nid, usemap);
354 return usemap;
355 }
356
261 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ 357 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
262 nid = 0; 358 nid = 0;
263 359
@@ -280,7 +376,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
280} 376}
281#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 377#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
282 378
283struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 379static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
284{ 380{
285 struct page *map; 381 struct page *map;
286 struct mem_section *ms = __nr_to_section(pnum); 382 struct mem_section *ms = __nr_to_section(pnum);
diff --git a/mm/swap.c b/mm/swap.c
index 45c9f25a8a3b..9e0cb3118079 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,9 +34,9 @@
34/* How many pages do we try to swap or page in/out together? */ 34/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 35int page_cluster;
36 36
37static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; 37static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs);
38static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; 38static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs);
39static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; 39static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
40 40
41/* 41/*
42 * This path almost never happens for VM activity - pages are normally 42 * This path almost never happens for VM activity - pages are normally
@@ -278,9 +278,10 @@ int lru_add_drain_all(void)
278 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it 278 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
279 * for the remainder of the operation. 279 * for the remainder of the operation.
280 * 280 *
281 * The locking in this function is against shrink_cache(): we recheck the 281 * The locking in this function is against shrink_inactive_list(): we recheck
282 * page count inside the lock to see whether shrink_cache grabbed the page 282 * the page count inside the lock to see whether shrink_inactive_list()
283 * via the LRU. If it did, give up: shrink_cache will free it. 283 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
284 * will free it.
284 */ 285 */
285void release_pages(struct page **pages, int nr, int cold) 286void release_pages(struct page **pages, int nr, int cold)
286{ 287{
@@ -443,7 +444,7 @@ void pagevec_strip(struct pagevec *pvec)
443 for (i = 0; i < pagevec_count(pvec); i++) { 444 for (i = 0; i < pagevec_count(pvec); i++) {
444 struct page *page = pvec->pages[i]; 445 struct page *page = pvec->pages[i];
445 446
446 if (PagePrivate(page) && !TestSetPageLocked(page)) { 447 if (PagePrivate(page) && trylock_page(page)) {
447 if (PagePrivate(page)) 448 if (PagePrivate(page))
448 try_to_release_page(page, 0); 449 try_to_release_page(page, 0);
449 unlock_page(page); 450 unlock_page(page);
@@ -493,7 +494,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
493 */ 494 */
494#define ACCT_THRESHOLD max(16, NR_CPUS * 2) 495#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
495 496
496static DEFINE_PER_CPU(long, committed_space) = 0; 497static DEFINE_PER_CPU(long, committed_space);
497 498
498void vm_acct_memory(long pages) 499void vm_acct_memory(long pages)
499{ 500{
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d8aadaf2a0ba..167cf2dc8a03 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -39,7 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = {
39 39
40struct address_space swapper_space = { 40struct address_space swapper_space = {
41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
42 .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), 42 .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
43 .a_ops = &swap_aops, 43 .a_ops = &swap_aops,
44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), 44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
45 .backing_dev_info = &swap_backing_dev_info, 45 .backing_dev_info = &swap_backing_dev_info,
@@ -56,7 +56,8 @@ static struct {
56 56
57void show_swap_cache_info(void) 57void show_swap_cache_info(void)
58{ 58{
59 printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", 59 printk("%lu pages in swap cache\n", total_swapcache_pages);
60 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
60 swap_cache_info.add_total, swap_cache_info.del_total, 61 swap_cache_info.add_total, swap_cache_info.del_total,
61 swap_cache_info.find_success, swap_cache_info.find_total); 62 swap_cache_info.find_success, swap_cache_info.find_total);
62 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 63 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
@@ -64,7 +65,7 @@ void show_swap_cache_info(void)
64} 65}
65 66
66/* 67/*
67 * add_to_swap_cache resembles add_to_page_cache on swapper_space, 68 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
68 * but sets SwapCache flag and private instead of mapping and index. 69 * but sets SwapCache flag and private instead of mapping and index.
69 */ 70 */
70int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) 71int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
@@ -76,19 +77,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
76 BUG_ON(PagePrivate(page)); 77 BUG_ON(PagePrivate(page));
77 error = radix_tree_preload(gfp_mask); 78 error = radix_tree_preload(gfp_mask);
78 if (!error) { 79 if (!error) {
79 write_lock_irq(&swapper_space.tree_lock); 80 page_cache_get(page);
81 SetPageSwapCache(page);
82 set_page_private(page, entry.val);
83
84 spin_lock_irq(&swapper_space.tree_lock);
80 error = radix_tree_insert(&swapper_space.page_tree, 85 error = radix_tree_insert(&swapper_space.page_tree,
81 entry.val, page); 86 entry.val, page);
82 if (!error) { 87 if (likely(!error)) {
83 page_cache_get(page);
84 SetPageSwapCache(page);
85 set_page_private(page, entry.val);
86 total_swapcache_pages++; 88 total_swapcache_pages++;
87 __inc_zone_page_state(page, NR_FILE_PAGES); 89 __inc_zone_page_state(page, NR_FILE_PAGES);
88 INC_CACHE_INFO(add_total); 90 INC_CACHE_INFO(add_total);
89 } 91 }
90 write_unlock_irq(&swapper_space.tree_lock); 92 spin_unlock_irq(&swapper_space.tree_lock);
91 radix_tree_preload_end(); 93 radix_tree_preload_end();
94
95 if (unlikely(error)) {
96 set_page_private(page, 0UL);
97 ClearPageSwapCache(page);
98 page_cache_release(page);
99 }
92 } 100 }
93 return error; 101 return error;
94} 102}
@@ -175,9 +183,9 @@ void delete_from_swap_cache(struct page *page)
175 183
176 entry.val = page_private(page); 184 entry.val = page_private(page);
177 185
178 write_lock_irq(&swapper_space.tree_lock); 186 spin_lock_irq(&swapper_space.tree_lock);
179 __delete_from_swap_cache(page); 187 __delete_from_swap_cache(page);
180 write_unlock_irq(&swapper_space.tree_lock); 188 spin_unlock_irq(&swapper_space.tree_lock);
181 189
182 swap_free(entry); 190 swap_free(entry);
183 page_cache_release(page); 191 page_cache_release(page);
@@ -193,7 +201,7 @@ void delete_from_swap_cache(struct page *page)
193 */ 201 */
194static inline void free_swap_cache(struct page *page) 202static inline void free_swap_cache(struct page *page)
195{ 203{
196 if (PageSwapCache(page) && !TestSetPageLocked(page)) { 204 if (PageSwapCache(page) && trylock_page(page)) {
197 remove_exclusive_swap_page(page); 205 remove_exclusive_swap_page(page);
198 unlock_page(page); 206 unlock_page(page);
199 } 207 }
@@ -294,9 +302,9 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
294 * re-using the just freed swap entry for an existing page. 302 * re-using the just freed swap entry for an existing page.
295 * May fail (-ENOMEM) if radix-tree node allocation failed. 303 * May fail (-ENOMEM) if radix-tree node allocation failed.
296 */ 304 */
297 SetPageLocked(new_page); 305 set_page_locked(new_page);
298 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); 306 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
299 if (!err) { 307 if (likely(!err)) {
300 /* 308 /*
301 * Initiate read into locked page and return. 309 * Initiate read into locked page and return.
302 */ 310 */
@@ -304,7 +312,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
304 swap_readpage(NULL, new_page); 312 swap_readpage(NULL, new_page);
305 return new_page; 313 return new_page;
306 } 314 }
307 ClearPageLocked(new_page); 315 clear_page_locked(new_page);
308 swap_free(entry); 316 swap_free(entry);
309 } while (err != -ENOMEM); 317 } while (err != -ENOMEM);
310 318
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bd1bb5920306..1e330f2998fa 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,17 +33,18 @@
33#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
34#include <linux/swapops.h> 34#include <linux/swapops.h>
35 35
36DEFINE_SPINLOCK(swap_lock); 36static DEFINE_SPINLOCK(swap_lock);
37unsigned int nr_swapfiles; 37static unsigned int nr_swapfiles;
38long total_swap_pages; 38long total_swap_pages;
39static int swap_overflow; 39static int swap_overflow;
40static int least_priority;
40 41
41static const char Bad_file[] = "Bad swap file entry "; 42static const char Bad_file[] = "Bad swap file entry ";
42static const char Unused_file[] = "Unused swap file entry "; 43static const char Unused_file[] = "Unused swap file entry ";
43static const char Bad_offset[] = "Bad swap offset entry "; 44static const char Bad_offset[] = "Bad swap offset entry ";
44static const char Unused_offset[] = "Unused swap offset entry "; 45static const char Unused_offset[] = "Unused swap offset entry ";
45 46
46struct swap_list_t swap_list = {-1, -1}; 47static struct swap_list_t swap_list = {-1, -1};
47 48
48static struct swap_info_struct swap_info[MAX_SWAPFILES]; 49static struct swap_info_struct swap_info[MAX_SWAPFILES];
49 50
@@ -368,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page)
368 retval = 0; 369 retval = 0;
369 if (p->swap_map[swp_offset(entry)] == 1) { 370 if (p->swap_map[swp_offset(entry)] == 1) {
370 /* Recheck the page count with the swapcache lock held.. */ 371 /* Recheck the page count with the swapcache lock held.. */
371 write_lock_irq(&swapper_space.tree_lock); 372 spin_lock_irq(&swapper_space.tree_lock);
372 if ((page_count(page) == 2) && !PageWriteback(page)) { 373 if ((page_count(page) == 2) && !PageWriteback(page)) {
373 __delete_from_swap_cache(page); 374 __delete_from_swap_cache(page);
374 SetPageDirty(page); 375 SetPageDirty(page);
375 retval = 1; 376 retval = 1;
376 } 377 }
377 write_unlock_irq(&swapper_space.tree_lock); 378 spin_unlock_irq(&swapper_space.tree_lock);
378 } 379 }
379 spin_unlock(&swap_lock); 380 spin_unlock(&swap_lock);
380 381
@@ -402,7 +403,7 @@ void free_swap_and_cache(swp_entry_t entry)
402 if (p) { 403 if (p) {
403 if (swap_entry_free(p, swp_offset(entry)) == 1) { 404 if (swap_entry_free(p, swp_offset(entry)) == 1) {
404 page = find_get_page(&swapper_space, entry.val); 405 page = find_get_page(&swapper_space, entry.val);
405 if (page && unlikely(TestSetPageLocked(page))) { 406 if (page && unlikely(!trylock_page(page))) {
406 page_cache_release(page); 407 page_cache_release(page);
407 page = NULL; 408 page = NULL;
408 } 409 }
@@ -655,8 +656,8 @@ static int unuse_mm(struct mm_struct *mm,
655 656
656 if (!down_read_trylock(&mm->mmap_sem)) { 657 if (!down_read_trylock(&mm->mmap_sem)) {
657 /* 658 /*
658 * Activate page so shrink_cache is unlikely to unmap its 659 * Activate page so shrink_inactive_list is unlikely to unmap
659 * ptes while lock is dropped, so swapoff can make progress. 660 * its ptes while lock is dropped, so swapoff can make progress.
660 */ 661 */
661 activate_page(page); 662 activate_page(page);
662 unlock_page(page); 663 unlock_page(page);
@@ -1260,6 +1261,11 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1260 /* just pick something that's safe... */ 1261 /* just pick something that's safe... */
1261 swap_list.next = swap_list.head; 1262 swap_list.next = swap_list.head;
1262 } 1263 }
1264 if (p->prio < 0) {
1265 for (i = p->next; i >= 0; i = swap_info[i].next)
1266 swap_info[i].prio = p->prio--;
1267 least_priority++;
1268 }
1263 nr_swap_pages -= p->pages; 1269 nr_swap_pages -= p->pages;
1264 total_swap_pages -= p->pages; 1270 total_swap_pages -= p->pages;
1265 p->flags &= ~SWP_WRITEOK; 1271 p->flags &= ~SWP_WRITEOK;
@@ -1272,9 +1278,14 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1272 if (err) { 1278 if (err) {
1273 /* re-insert swap space back into swap_list */ 1279 /* re-insert swap space back into swap_list */
1274 spin_lock(&swap_lock); 1280 spin_lock(&swap_lock);
1275 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) 1281 if (p->prio < 0)
1282 p->prio = --least_priority;
1283 prev = -1;
1284 for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
1276 if (p->prio >= swap_info[i].prio) 1285 if (p->prio >= swap_info[i].prio)
1277 break; 1286 break;
1287 prev = i;
1288 }
1278 p->next = i; 1289 p->next = i;
1279 if (prev < 0) 1290 if (prev < 0)
1280 swap_list.head = swap_list.next = p - swap_info; 1291 swap_list.head = swap_list.next = p - swap_info;
@@ -1447,7 +1458,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1447 unsigned int type; 1458 unsigned int type;
1448 int i, prev; 1459 int i, prev;
1449 int error; 1460 int error;
1450 static int least_priority;
1451 union swap_header *swap_header = NULL; 1461 union swap_header *swap_header = NULL;
1452 int swap_header_version; 1462 int swap_header_version;
1453 unsigned int nr_good_pages = 0; 1463 unsigned int nr_good_pages = 0;
@@ -1455,7 +1465,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1455 sector_t span; 1465 sector_t span;
1456 unsigned long maxpages = 1; 1466 unsigned long maxpages = 1;
1457 int swapfilesize; 1467 int swapfilesize;
1458 unsigned short *swap_map; 1468 unsigned short *swap_map = NULL;
1459 struct page *page = NULL; 1469 struct page *page = NULL;
1460 struct inode *inode = NULL; 1470 struct inode *inode = NULL;
1461 int did_down = 0; 1471 int did_down = 0;
@@ -1474,22 +1484,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1474 } 1484 }
1475 if (type >= nr_swapfiles) 1485 if (type >= nr_swapfiles)
1476 nr_swapfiles = type+1; 1486 nr_swapfiles = type+1;
1487 memset(p, 0, sizeof(*p));
1477 INIT_LIST_HEAD(&p->extent_list); 1488 INIT_LIST_HEAD(&p->extent_list);
1478 p->flags = SWP_USED; 1489 p->flags = SWP_USED;
1479 p->swap_file = NULL;
1480 p->old_block_size = 0;
1481 p->swap_map = NULL;
1482 p->lowest_bit = 0;
1483 p->highest_bit = 0;
1484 p->cluster_nr = 0;
1485 p->inuse_pages = 0;
1486 p->next = -1; 1490 p->next = -1;
1487 if (swap_flags & SWAP_FLAG_PREFER) {
1488 p->prio =
1489 (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
1490 } else {
1491 p->prio = --least_priority;
1492 }
1493 spin_unlock(&swap_lock); 1491 spin_unlock(&swap_lock);
1494 name = getname(specialfile); 1492 name = getname(specialfile);
1495 error = PTR_ERR(name); 1493 error = PTR_ERR(name);
@@ -1632,19 +1630,20 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1632 goto bad_swap; 1630 goto bad_swap;
1633 1631
1634 /* OK, set up the swap map and apply the bad block list */ 1632 /* OK, set up the swap map and apply the bad block list */
1635 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { 1633 swap_map = vmalloc(maxpages * sizeof(short));
1634 if (!swap_map) {
1636 error = -ENOMEM; 1635 error = -ENOMEM;
1637 goto bad_swap; 1636 goto bad_swap;
1638 } 1637 }
1639 1638
1640 error = 0; 1639 error = 0;
1641 memset(p->swap_map, 0, maxpages * sizeof(short)); 1640 memset(swap_map, 0, maxpages * sizeof(short));
1642 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1641 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1643 int page_nr = swap_header->info.badpages[i]; 1642 int page_nr = swap_header->info.badpages[i];
1644 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) 1643 if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
1645 error = -EINVAL; 1644 error = -EINVAL;
1646 else 1645 else
1647 p->swap_map[page_nr] = SWAP_MAP_BAD; 1646 swap_map[page_nr] = SWAP_MAP_BAD;
1648 } 1647 }
1649 nr_good_pages = swap_header->info.last_page - 1648 nr_good_pages = swap_header->info.last_page -
1650 swap_header->info.nr_badpages - 1649 swap_header->info.nr_badpages -
@@ -1654,7 +1653,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1654 } 1653 }
1655 1654
1656 if (nr_good_pages) { 1655 if (nr_good_pages) {
1657 p->swap_map[0] = SWAP_MAP_BAD; 1656 swap_map[0] = SWAP_MAP_BAD;
1658 p->max = maxpages; 1657 p->max = maxpages;
1659 p->pages = nr_good_pages; 1658 p->pages = nr_good_pages;
1660 nr_extents = setup_swap_extents(p, &span); 1659 nr_extents = setup_swap_extents(p, &span);
@@ -1672,6 +1671,12 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1672 1671
1673 mutex_lock(&swapon_mutex); 1672 mutex_lock(&swapon_mutex);
1674 spin_lock(&swap_lock); 1673 spin_lock(&swap_lock);
1674 if (swap_flags & SWAP_FLAG_PREFER)
1675 p->prio =
1676 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
1677 else
1678 p->prio = --least_priority;
1679 p->swap_map = swap_map;
1675 p->flags = SWP_ACTIVE; 1680 p->flags = SWP_ACTIVE;
1676 nr_swap_pages += nr_good_pages; 1681 nr_swap_pages += nr_good_pages;
1677 total_swap_pages += nr_good_pages; 1682 total_swap_pages += nr_good_pages;
@@ -1707,12 +1712,8 @@ bad_swap:
1707 destroy_swap_extents(p); 1712 destroy_swap_extents(p);
1708bad_swap_2: 1713bad_swap_2:
1709 spin_lock(&swap_lock); 1714 spin_lock(&swap_lock);
1710 swap_map = p->swap_map;
1711 p->swap_file = NULL; 1715 p->swap_file = NULL;
1712 p->swap_map = NULL;
1713 p->flags = 0; 1716 p->flags = 0;
1714 if (!(swap_flags & SWAP_FLAG_PREFER))
1715 ++least_priority;
1716 spin_unlock(&swap_lock); 1717 spin_unlock(&swap_lock);
1717 vfree(swap_map); 1718 vfree(swap_map);
1718 if (swap_file) 1719 if (swap_file)
diff --git a/mm/truncate.c b/mm/truncate.c
index b8961cb63414..250505091d37 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -104,7 +104,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
104 cancel_dirty_page(page, PAGE_CACHE_SIZE); 104 cancel_dirty_page(page, PAGE_CACHE_SIZE);
105 105
106 remove_from_page_cache(page); 106 remove_from_page_cache(page);
107 ClearPageUptodate(page);
108 ClearPageMappedToDisk(page); 107 ClearPageMappedToDisk(page);
109 page_cache_release(page); /* pagecache ref */ 108 page_cache_release(page); /* pagecache ref */
110} 109}
@@ -188,7 +187,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
188 if (page_index > next) 187 if (page_index > next)
189 next = page_index; 188 next = page_index;
190 next++; 189 next++;
191 if (TestSetPageLocked(page)) 190 if (!trylock_page(page))
192 continue; 191 continue;
193 if (PageWriteback(page)) { 192 if (PageWriteback(page)) {
194 unlock_page(page); 193 unlock_page(page);
@@ -281,7 +280,7 @@ unsigned long __invalidate_mapping_pages(struct address_space *mapping,
281 pgoff_t index; 280 pgoff_t index;
282 int lock_failed; 281 int lock_failed;
283 282
284 lock_failed = TestSetPageLocked(page); 283 lock_failed = !trylock_page(page);
285 284
286 /* 285 /*
287 * We really shouldn't be looking at the ->index of an 286 * We really shouldn't be looking at the ->index of an
@@ -349,18 +348,17 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
349 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 348 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
350 return 0; 349 return 0;
351 350
352 write_lock_irq(&mapping->tree_lock); 351 spin_lock_irq(&mapping->tree_lock);
353 if (PageDirty(page)) 352 if (PageDirty(page))
354 goto failed; 353 goto failed;
355 354
356 BUG_ON(PagePrivate(page)); 355 BUG_ON(PagePrivate(page));
357 __remove_from_page_cache(page); 356 __remove_from_page_cache(page);
358 write_unlock_irq(&mapping->tree_lock); 357 spin_unlock_irq(&mapping->tree_lock);
359 ClearPageUptodate(page);
360 page_cache_release(page); /* pagecache ref */ 358 page_cache_release(page); /* pagecache ref */
361 return 1; 359 return 1;
362failed: 360failed:
363 write_unlock_irq(&mapping->tree_lock); 361 spin_unlock_irq(&mapping->tree_lock);
364 return 0; 362 return 0;
365} 363}
366 364
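
Like swap_state.c above (and vmscan.c below), truncate.c picks up the conversion of address_space->tree_lock from a rwlock to a spinlock, so radix-tree updates now sit under spin_lock_irq(). A minimal sketch of the resulting pattern, modelled on the invalidate_complete_page2() hunk (the helper name is invented; the caller is assumed to hold the page lock):

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static void drop_clean_page(struct address_space *mapping, struct page *page)
{
        spin_lock_irq(&mapping->tree_lock);     /* was write_lock_irq() */
        __remove_from_page_cache(page);
        spin_unlock_irq(&mapping->tree_lock);
        page_cache_release(page);               /* the pagecache reference */
}
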
diff --git a/mm/util.c b/mm/util.c
index 8f18683825bc..cb00b748ce47 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,7 +1,9 @@
1#include <linux/mm.h>
1#include <linux/slab.h> 2#include <linux/slab.h>
2#include <linux/string.h> 3#include <linux/string.h>
3#include <linux/module.h> 4#include <linux/module.h>
4#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h>
5#include <asm/uaccess.h> 7#include <asm/uaccess.h>
6 8
7/** 9/**
@@ -68,25 +70,22 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
68EXPORT_SYMBOL(kmemdup); 70EXPORT_SYMBOL(kmemdup);
69 71
70/** 72/**
71 * krealloc - reallocate memory. The contents will remain unchanged. 73 * __krealloc - like krealloc() but don't free @p.
72 * @p: object to reallocate memory for. 74 * @p: object to reallocate memory for.
73 * @new_size: how many bytes of memory are required. 75 * @new_size: how many bytes of memory are required.
74 * @flags: the type of memory to allocate. 76 * @flags: the type of memory to allocate.
75 * 77 *
76 * The contents of the object pointed to are preserved up to the 78 * This function is like krealloc() except it never frees the originally
77 * lesser of the new and old sizes. If @p is %NULL, krealloc() 79 * allocated buffer. Use this if you don't want to free the buffer immediately
78 * behaves exactly like kmalloc(). If @size is 0 and @p is not a 80 * like, for example, with RCU.
79 * %NULL pointer, the object pointed to is freed.
80 */ 81 */
81void *krealloc(const void *p, size_t new_size, gfp_t flags) 82void *__krealloc(const void *p, size_t new_size, gfp_t flags)
82{ 83{
83 void *ret; 84 void *ret;
84 size_t ks = 0; 85 size_t ks = 0;
85 86
86 if (unlikely(!new_size)) { 87 if (unlikely(!new_size))
87 kfree(p);
88 return ZERO_SIZE_PTR; 88 return ZERO_SIZE_PTR;
89 }
90 89
91 if (p) 90 if (p)
92 ks = ksize(p); 91 ks = ksize(p);
@@ -95,10 +94,37 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
95 return (void *)p; 94 return (void *)p;
96 95
97 ret = kmalloc_track_caller(new_size, flags); 96 ret = kmalloc_track_caller(new_size, flags);
98 if (ret && p) { 97 if (ret && p)
99 memcpy(ret, p, ks); 98 memcpy(ret, p, ks);
99
100 return ret;
101}
102EXPORT_SYMBOL(__krealloc);
103
104/**
105 * krealloc - reallocate memory. The contents will remain unchanged.
106 * @p: object to reallocate memory for.
107 * @new_size: how many bytes of memory are required.
108 * @flags: the type of memory to allocate.
109 *
110 * The contents of the object pointed to are preserved up to the
111 * lesser of the new and old sizes. If @p is %NULL, krealloc()
112 * behaves exactly like kmalloc(). If @size is 0 and @p is not a
113 * %NULL pointer, the object pointed to is freed.
114 */
115void *krealloc(const void *p, size_t new_size, gfp_t flags)
116{
117 void *ret;
118
119 if (unlikely(!new_size)) {
100 kfree(p); 120 kfree(p);
121 return ZERO_SIZE_PTR;
101 } 122 }
123
124 ret = __krealloc(p, new_size, flags);
125 if (ret && p != ret)
126 kfree(p);
127
102 return ret; 128 return ret;
103} 129}
104EXPORT_SYMBOL(krealloc); 130EXPORT_SYMBOL(krealloc);
@@ -136,3 +162,27 @@ char *strndup_user(const char __user *s, long n)
136 return p; 162 return p;
137} 163}
138EXPORT_SYMBOL(strndup_user); 164EXPORT_SYMBOL(strndup_user);
165
166#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
167void arch_pick_mmap_layout(struct mm_struct *mm)
168{
169 mm->mmap_base = TASK_UNMAPPED_BASE;
170 mm->get_unmapped_area = arch_get_unmapped_area;
171 mm->unmap_area = arch_unmap_area;
172}
173#endif
174
175int __attribute__((weak)) get_user_pages_fast(unsigned long start,
176 int nr_pages, int write, struct page **pages)
177{
178 struct mm_struct *mm = current->mm;
179 int ret;
180
181 down_read(&mm->mmap_sem);
182 ret = get_user_pages(current, mm, start, nr_pages,
183 write, 0, pages, NULL);
184 up_read(&mm->mmap_sem);
185
186 return ret;
187}
188EXPORT_SYMBOL_GPL(get_user_pages_fast);
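
__krealloc() is split out above precisely because krealloc() frees the old buffer, which is unsafe while concurrent readers may still be dereferencing it. A hedged sketch of the RCU-style usage the new kerneldoc hints at (struct cfg and cfg_resize() are invented for illustration):

#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg {
        char *buf;              /* dereferenced under rcu_read_lock() */
};

static int cfg_resize(struct cfg *c, size_t new_size, gfp_t gfp)
{
        char *old = c->buf;
        char *new = __krealloc(old, new_size, gfp);

        if (!new)
                return -ENOMEM;
        rcu_assign_pointer(c->buf, new);
        synchronize_rcu();              /* wait out readers of the old buffer */
        if (new != old)
                kfree(old);             /* __krealloc() deliberately kept it */
        return 0;
}
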
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6e45b0f3d125..85b9a0d2c877 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -381,16 +381,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
381 return; 381 return;
382 382
383 if ((PAGE_SIZE-1) & (unsigned long)addr) { 383 if ((PAGE_SIZE-1) & (unsigned long)addr) {
384 printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); 384 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
385 WARN_ON(1);
386 return; 385 return;
387 } 386 }
388 387
389 area = remove_vm_area(addr); 388 area = remove_vm_area(addr);
390 if (unlikely(!area)) { 389 if (unlikely(!area)) {
391 printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 390 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
392 addr); 391 addr);
393 WARN_ON(1);
394 return; 392 return;
395 } 393 }
396 394
@@ -931,6 +929,25 @@ static void s_stop(struct seq_file *m, void *p)
931 read_unlock(&vmlist_lock); 929 read_unlock(&vmlist_lock);
932} 930}
933 931
932static void show_numa_info(struct seq_file *m, struct vm_struct *v)
933{
934 if (NUMA_BUILD) {
935 unsigned int nr, *counters = m->private;
936
937 if (!counters)
938 return;
939
940 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
941
942 for (nr = 0; nr < v->nr_pages; nr++)
943 counters[page_to_nid(v->pages[nr])]++;
944
945 for_each_node_state(nr, N_HIGH_MEMORY)
946 if (counters[nr])
947 seq_printf(m, " N%u=%u", nr, counters[nr]);
948 }
949}
950
934static int s_show(struct seq_file *m, void *p) 951static int s_show(struct seq_file *m, void *p)
935{ 952{
936 struct vm_struct *v = p; 953 struct vm_struct *v = p;
@@ -967,6 +984,7 @@ static int s_show(struct seq_file *m, void *p)
967 if (v->flags & VM_VPAGES) 984 if (v->flags & VM_VPAGES)
968 seq_printf(m, " vpages"); 985 seq_printf(m, " vpages");
969 986
987 show_numa_info(m, v);
970 seq_putc(m, '\n'); 988 seq_putc(m, '\n');
971 return 0; 989 return 0;
972} 990}
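
Effect of the hunk above, shown on a hypothetical /proc/vmallocinfo line (address range, caller and counts are made up; only the trailing N<node>=<pages> tokens come from show_numa_info()):

0xffffc20000016000-0xffffc2000001d000   28672 alloc_large_system_hash+0x1c2/0x2c0 pages=6 vmalloc N0=4 N1=2
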
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 967d30ccd92b..1ff1a58e7c10 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -38,6 +38,7 @@
38#include <linux/kthread.h> 38#include <linux/kthread.h>
39#include <linux/freezer.h> 39#include <linux/freezer.h>
40#include <linux/memcontrol.h> 40#include <linux/memcontrol.h>
41#include <linux/delayacct.h>
41 42
42#include <asm/tlbflush.h> 43#include <asm/tlbflush.h>
43#include <asm/div64.h> 44#include <asm/div64.h>
@@ -390,17 +391,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
390} 391}
391 392
392/* 393/*
393 * Attempt to detach a locked page from its ->mapping. If it is dirty or if 394 * Same as remove_mapping, but if the page is removed from the mapping, it
394 * someone else has a ref on the page, abort and return 0. If it was 395 * gets returned with a refcount of 0.
395 * successfully detached, return 1. Assumes the caller has a single ref on
396 * this page.
397 */ 396 */
398int remove_mapping(struct address_space *mapping, struct page *page) 397static int __remove_mapping(struct address_space *mapping, struct page *page)
399{ 398{
400 BUG_ON(!PageLocked(page)); 399 BUG_ON(!PageLocked(page));
401 BUG_ON(mapping != page_mapping(page)); 400 BUG_ON(mapping != page_mapping(page));
402 401
403 write_lock_irq(&mapping->tree_lock); 402 spin_lock_irq(&mapping->tree_lock);
404 /* 403 /*
405 * The non racy check for a busy page. 404 * The non racy check for a busy page.
406 * 405 *
@@ -426,28 +425,48 @@ int remove_mapping(struct address_space *mapping, struct page *page)
426 * Note that if SetPageDirty is always performed via set_page_dirty, 425 * Note that if SetPageDirty is always performed via set_page_dirty,
427 * and thus under tree_lock, then this ordering is not required. 426 * and thus under tree_lock, then this ordering is not required.
428 */ 427 */
429 if (unlikely(page_count(page) != 2)) 428 if (!page_freeze_refs(page, 2))
430 goto cannot_free; 429 goto cannot_free;
431 smp_rmb(); 430 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
432 if (unlikely(PageDirty(page))) 431 if (unlikely(PageDirty(page))) {
432 page_unfreeze_refs(page, 2);
433 goto cannot_free; 433 goto cannot_free;
434 }
434 435
435 if (PageSwapCache(page)) { 436 if (PageSwapCache(page)) {
436 swp_entry_t swap = { .val = page_private(page) }; 437 swp_entry_t swap = { .val = page_private(page) };
437 __delete_from_swap_cache(page); 438 __delete_from_swap_cache(page);
438 write_unlock_irq(&mapping->tree_lock); 439 spin_unlock_irq(&mapping->tree_lock);
439 swap_free(swap); 440 swap_free(swap);
440 __put_page(page); /* The pagecache ref */ 441 } else {
441 return 1; 442 __remove_from_page_cache(page);
443 spin_unlock_irq(&mapping->tree_lock);
442 } 444 }
443 445
444 __remove_from_page_cache(page);
445 write_unlock_irq(&mapping->tree_lock);
446 __put_page(page);
447 return 1; 446 return 1;
448 447
449cannot_free: 448cannot_free:
450 write_unlock_irq(&mapping->tree_lock); 449 spin_unlock_irq(&mapping->tree_lock);
450 return 0;
451}
452
453/*
454 * Attempt to detach a locked page from its ->mapping. If it is dirty or if
455 * someone else has a ref on the page, abort and return 0. If it was
456 * successfully detached, return 1. Assumes the caller has a single ref on
457 * this page.
458 */
459int remove_mapping(struct address_space *mapping, struct page *page)
460{
461 if (__remove_mapping(mapping, page)) {
462 /*
463 * Unfreezing the refcount with 1 rather than 2 effectively
464 * drops the pagecache ref for us without requiring another
465 * atomic operation.
466 */
467 page_unfreeze_refs(page, 1);
468 return 1;
469 }
451 return 0; 470 return 0;
452} 471}
453 472
@@ -477,7 +496,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
477 page = lru_to_page(page_list); 496 page = lru_to_page(page_list);
478 list_del(&page->lru); 497 list_del(&page->lru);
479 498
480 if (TestSetPageLocked(page)) 499 if (!trylock_page(page))
481 goto keep; 500 goto keep;
482 501
483 VM_BUG_ON(PageActive(page)); 502 VM_BUG_ON(PageActive(page));
@@ -563,7 +582,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
563 * A synchronous write - probably a ramdisk. Go 582 * A synchronous write - probably a ramdisk. Go
564 * ahead and try to reclaim the page. 583 * ahead and try to reclaim the page.
565 */ 584 */
566 if (TestSetPageLocked(page)) 585 if (!trylock_page(page))
567 goto keep; 586 goto keep;
568 if (PageDirty(page) || PageWriteback(page)) 587 if (PageDirty(page) || PageWriteback(page))
569 goto keep_locked; 588 goto keep_locked;
@@ -597,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list,
597 if (PagePrivate(page)) { 616 if (PagePrivate(page)) {
598 if (!try_to_release_page(page, sc->gfp_mask)) 617 if (!try_to_release_page(page, sc->gfp_mask))
599 goto activate_locked; 618 goto activate_locked;
600 if (!mapping && page_count(page) == 1) 619 if (!mapping && page_count(page) == 1) {
601 goto free_it; 620 unlock_page(page);
621 if (put_page_testzero(page))
622 goto free_it;
623 else {
624 /*
625 * rare race with speculative reference.
626 * the speculative reference will free
627 * this page shortly, so we may
628 * increment nr_reclaimed here (and
629 * leave it off the LRU).
630 */
631 nr_reclaimed++;
632 continue;
633 }
634 }
602 } 635 }
603 636
604 if (!mapping || !remove_mapping(mapping, page)) 637 if (!mapping || !__remove_mapping(mapping, page))
605 goto keep_locked; 638 goto keep_locked;
606 639
607free_it:
608 unlock_page(page); 640 unlock_page(page);
641free_it:
609 nr_reclaimed++; 642 nr_reclaimed++;
610 if (!pagevec_add(&freed_pvec, page)) 643 if (!pagevec_add(&freed_pvec, page)) {
611 __pagevec_release_nonlru(&freed_pvec); 644 __pagevec_free(&freed_pvec);
645 pagevec_reinit(&freed_pvec);
646 }
612 continue; 647 continue;
613 648
614activate_locked: 649activate_locked:
@@ -622,7 +657,7 @@ keep:
622 } 657 }
623 list_splice(&ret_pages, page_list); 658 list_splice(&ret_pages, page_list);
624 if (pagevec_count(&freed_pvec)) 659 if (pagevec_count(&freed_pvec))
625 __pagevec_release_nonlru(&freed_pvec); 660 __pagevec_free(&freed_pvec);
626 count_vm_events(PGACTIVATE, pgactivate); 661 count_vm_events(PGACTIVATE, pgactivate);
627 return nr_reclaimed; 662 return nr_reclaimed;
628} 663}
@@ -1316,6 +1351,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1316 struct zone *zone; 1351 struct zone *zone;
1317 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1352 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1318 1353
1354 delayacct_freepages_start();
1355
1319 if (scan_global_lru(sc)) 1356 if (scan_global_lru(sc))
1320 count_vm_event(ALLOCSTALL); 1357 count_vm_event(ALLOCSTALL);
1321 /* 1358 /*
@@ -1371,7 +1408,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1371 if (sc->nr_scanned && priority < DEF_PRIORITY - 2) 1408 if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
1372 congestion_wait(WRITE, HZ/10); 1409 congestion_wait(WRITE, HZ/10);
1373 } 1410 }
1374 /* top priority shrink_caches still had more to do? don't OOM, then */ 1411 /* top priority shrink_zones still had more to do? don't OOM, then */
1375 if (!sc->all_unreclaimable && scan_global_lru(sc)) 1412 if (!sc->all_unreclaimable && scan_global_lru(sc))
1376 ret = nr_reclaimed; 1413 ret = nr_reclaimed;
1377out: 1414out:
@@ -1396,6 +1433,8 @@ out:
1396 } else 1433 } else
1397 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); 1434 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1398 1435
1436 delayacct_freepages_end();
1437
1399 return ret; 1438 return ret;
1400} 1439}
1401 1440
@@ -1940,7 +1979,7 @@ module_init(kswapd_init)
1940int zone_reclaim_mode __read_mostly; 1979int zone_reclaim_mode __read_mostly;
1941 1980
1942#define RECLAIM_OFF 0 1981#define RECLAIM_OFF 0
1943#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ 1982#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
1944#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 1983#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
1945#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 1984#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
1946 1985
diff --git a/mm/vmstat.c b/mm/vmstat.c
index db9eabb2c5b3..b0d08e667ece 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -13,6 +13,7 @@
13#include <linux/err.h> 13#include <linux/err.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/vmstat.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17 18
18#ifdef CONFIG_VM_EVENT_COUNTERS 19#ifdef CONFIG_VM_EVENT_COUNTERS
@@ -26,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
26 27
27 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); 28 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
28 29
29 for_each_cpu_mask(cpu, *cpumask) { 30 for_each_cpu_mask_nr(cpu, *cpumask) {
30 struct vm_event_state *this = &per_cpu(vm_event_states, cpu); 31 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
31 32
32 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 33 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)