path: root/mm
author     Ingo Molnar <mingo@elte.hu>    2008-10-28 11:26:12 -0400
committer  Ingo Molnar <mingo@elte.hu>    2008-10-28 11:26:12 -0400
commit     7a9787e1eba95a166265e6a260cf30af04ef0a99 (patch)
tree       e730a4565e0318140d2fbd2f0415d18a339d7336 /mm
parent     41b9eb264c8407655db57b60b4457fe1b2ec9977 (diff)
parent     0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff)
Merge commit 'v2.6.28-rc2' into x86/pci-ioapic-boot-irq-quirks
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |   23
-rw-r--r--  mm/Makefile          |    6
-rw-r--r--  mm/allocpercpu.c     |   24
-rw-r--r--  mm/bootmem.c         |  948
-rw-r--r--  mm/bounce.c          |    2
-rw-r--r--  mm/fadvise.c         |    2
-rw-r--r--  mm/filemap.c         |  474
-rw-r--r--  mm/filemap_xip.c     |   70
-rw-r--r--  mm/fremap.c          |   30
-rw-r--r--  mm/highmem.c         |    6
-rw-r--r--  mm/hugetlb.c         | 1733
-rw-r--r--  mm/internal.h        |  192
-rw-r--r--  mm/madvise.c         |    4
-rw-r--r--  mm/memcontrol.c      |  772
-rw-r--r--  mm/memory.c          |  421
-rw-r--r--  mm/memory_hotplug.c  |   99
-rw-r--r--  mm/mempolicy.c       |   21
-rw-r--r--  mm/migrate.c         |  323
-rw-r--r--  mm/mlock.c           |  445
-rw-r--r--  mm/mm_init.c         |  152
-rw-r--r--  mm/mmap.c            |  267
-rw-r--r--  mm/mmu_notifier.c    |  277
-rw-r--r--  mm/mmzone.c          |    2
-rw-r--r--  mm/mprotect.c        |    9
-rw-r--r--  mm/mremap.c          |   14
-rw-r--r--  mm/nommu.c           |   69
-rw-r--r--  mm/oom_kill.c        |    6
-rw-r--r--  mm/page-writeback.c  |   34
-rw-r--r--  mm/page_alloc.c      |  304
-rw-r--r--  mm/page_cgroup.c     |  256
-rw-r--r--  mm/page_isolation.c  |   13
-rw-r--r--  mm/pdflush.c         |    6
-rw-r--r--  mm/quicklist.c       |    9
-rw-r--r--  mm/readahead.c       |   10
-rw-r--r--  mm/rmap.c            |  380
-rw-r--r--  mm/shmem.c           |  118
-rw-r--r--  mm/shmem_acl.c       |    2
-rw-r--r--  mm/slab.c            |   64
-rw-r--r--  mm/slob.c            |   28
-rw-r--r--  mm/slub.c            |  149
-rw-r--r--  mm/sparse.c          |  116
-rw-r--r--  mm/swap.c            |  183
-rw-r--r--  mm/swap_state.c      |   47
-rw-r--r--  mm/swapfile.c        |   90
-rw-r--r--  mm/tiny-shmem.c      |   27
-rw-r--r--  mm/truncate.c        |   22
-rw-r--r--  mm/util.c            |   70
-rw-r--r--  mm/vmalloc.c         | 1056
-rw-r--r--  mm/vmscan.c          | 1117
-rw-r--r--  mm/vmstat.c          |  124
50 files changed, 7869 insertions(+), 2747 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index c4de85285bb4..5b5790f8a816 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -101,7 +101,7 @@ config HAVE_MEMORY_PRESENT
101# with gcc 3.4 and later. 101# with gcc 3.4 and later.
102# 102#
103config SPARSEMEM_STATIC 103config SPARSEMEM_STATIC
104 def_bool n 104 bool
105 105
106# 106#
107# Architecture platforms which require a two level mem_section in SPARSEMEM 107# Architecture platforms which require a two level mem_section in SPARSEMEM
@@ -113,7 +113,7 @@ config SPARSEMEM_EXTREME
113 depends on SPARSEMEM && !SPARSEMEM_STATIC 113 depends on SPARSEMEM && !SPARSEMEM_STATIC
114 114
115config SPARSEMEM_VMEMMAP_ENABLE 115config SPARSEMEM_VMEMMAP_ENABLE
116 def_bool n 116 bool
117 117
118config SPARSEMEM_VMEMMAP 118config SPARSEMEM_VMEMMAP
119 bool "Sparse Memory virtual memmap" 119 bool "Sparse Memory virtual memmap"
@@ -174,7 +174,7 @@ config SPLIT_PTLOCK_CPUS
174config MIGRATION 174config MIGRATION
175 bool "Page migration" 175 bool "Page migration"
176 def_bool y 176 def_bool y
177 depends on NUMA 177 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
178 help 178 help
179 Allows the migration of the physical location of pages of processes 179 Allows the migration of the physical location of pages of processes
180 while the virtual addresses are not changed. This is useful for 180 while the virtual addresses are not changed. This is useful for
@@ -187,6 +187,9 @@ config RESOURCES_64BIT
187 help 187 help
188 This option allows memory and IO resources to be 64 bit. 188 This option allows memory and IO resources to be 64 bit.
189 189
190config PHYS_ADDR_T_64BIT
191 def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
192
190config ZONE_DMA_FLAG 193config ZONE_DMA_FLAG
191 int 194 int
192 default "0" if !ZONE_DMA 195 default "0" if !ZONE_DMA
@@ -205,3 +208,17 @@ config NR_QUICK
205config VIRT_TO_BUS 208config VIRT_TO_BUS
206 def_bool y 209 def_bool y
207 depends on !ARCH_NO_VIRT_TO_BUS 210 depends on !ARCH_NO_VIRT_TO_BUS
211
212config UNEVICTABLE_LRU
213 bool "Add LRU list to track non-evictable pages"
214 default y
215 depends on MMU
216 help
217 Keeps unevictable pages off of the active and inactive pageout
218 lists, so kswapd will not waste CPU time or have its balancing
219 algorithms thrown off by scanning these pages. Selecting this
220 will use one page flag and increase the code size a little,
221 say Y unless you know what you are doing.
222
223config MMU_NOTIFIER
224 bool
diff --git a/mm/Makefile b/mm/Makefile
index 18c143b3c46c..c06b45a1ff5f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o pdflush.o \ 11 maccess.o page_alloc.o page-writeback.o pdflush.o \
12 readahead.o swap.o truncate.o vmscan.o \ 12 readahead.o swap.o truncate.o vmscan.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o $(mmu-y) 14 page_isolation.o mm_init.o $(mmu-y)
15 15
16obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o 16obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
17obj-$(CONFIG_BOUNCE) += bounce.o 17obj-$(CONFIG_BOUNCE) += bounce.o
@@ -25,6 +25,7 @@ obj-$(CONFIG_SHMEM) += shmem.o
25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o 25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
26obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 26obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
27obj-$(CONFIG_SLOB) += slob.o 27obj-$(CONFIG_SLOB) += slob.o
28obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
28obj-$(CONFIG_SLAB) += slab.o 29obj-$(CONFIG_SLAB) += slab.o
29obj-$(CONFIG_SLUB) += slub.o 30obj-$(CONFIG_SLUB) += slub.o
30obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 31obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
@@ -32,5 +33,4 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
32obj-$(CONFIG_MIGRATION) += migrate.o 33obj-$(CONFIG_MIGRATION) += migrate.o
33obj-$(CONFIG_SMP) += allocpercpu.o 34obj-$(CONFIG_SMP) += allocpercpu.o
34obj-$(CONFIG_QUICKLIST) += quicklist.o 35obj-$(CONFIG_QUICKLIST) += quicklist.o
35obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o 36obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
36
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 05f2b4009ccc..4297bc41bfd2 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -18,27 +18,28 @@
18 * Depopulating per-cpu data for a cpu going offline would be a typical 18 * Depopulating per-cpu data for a cpu going offline would be a typical
19 * use case. You need to register a cpu hotplug handler for that purpose. 19 * use case. You need to register a cpu hotplug handler for that purpose.
20 */ 20 */
21void percpu_depopulate(void *__pdata, int cpu) 21static void percpu_depopulate(void *__pdata, int cpu)
22{ 22{
23 struct percpu_data *pdata = __percpu_disguise(__pdata); 23 struct percpu_data *pdata = __percpu_disguise(__pdata);
24 24
25 kfree(pdata->ptrs[cpu]); 25 kfree(pdata->ptrs[cpu]);
26 pdata->ptrs[cpu] = NULL; 26 pdata->ptrs[cpu] = NULL;
27} 27}
28EXPORT_SYMBOL_GPL(percpu_depopulate);
29 28
30/** 29/**
31 * percpu_depopulate_mask - depopulate per-cpu data for some cpu's 30 * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
32 * @__pdata: per-cpu data to depopulate 31 * @__pdata: per-cpu data to depopulate
33 * @mask: depopulate per-cpu data for cpu's selected through mask bits 32 * @mask: depopulate per-cpu data for cpu's selected through mask bits
34 */ 33 */
35void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) 34static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
36{ 35{
37 int cpu; 36 int cpu;
38 for_each_cpu_mask(cpu, *mask) 37 for_each_cpu_mask_nr(cpu, *mask)
39 percpu_depopulate(__pdata, cpu); 38 percpu_depopulate(__pdata, cpu);
40} 39}
41EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); 40
41#define percpu_depopulate_mask(__pdata, mask) \
42 __percpu_depopulate_mask((__pdata), &(mask))
42 43
43/** 44/**
44 * percpu_populate - populate per-cpu data for given cpu 45 * percpu_populate - populate per-cpu data for given cpu
@@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
51 * use case. You need to register a cpu hotplug handler for that purpose. 52 * use case. You need to register a cpu hotplug handler for that purpose.
52 * Per-cpu object is populated with zeroed buffer. 53 * Per-cpu object is populated with zeroed buffer.
53 */ 54 */
54void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) 55static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
55{ 56{
56 struct percpu_data *pdata = __percpu_disguise(__pdata); 57 struct percpu_data *pdata = __percpu_disguise(__pdata);
57 int node = cpu_to_node(cpu); 58 int node = cpu_to_node(cpu);
@@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
68 pdata->ptrs[cpu] = kzalloc(size, gfp); 69 pdata->ptrs[cpu] = kzalloc(size, gfp);
69 return pdata->ptrs[cpu]; 70 return pdata->ptrs[cpu];
70} 71}
71EXPORT_SYMBOL_GPL(percpu_populate);
72 72
73/** 73/**
74 * percpu_populate_mask - populate per-cpu data for more cpu's 74 * percpu_populate_mask - populate per-cpu data for more cpu's
@@ -79,14 +79,14 @@ EXPORT_SYMBOL_GPL(percpu_populate);
79 * 79 *
80 * Per-cpu objects are populated with zeroed buffers. 80 * Per-cpu objects are populated with zeroed buffers.
81 */ 81 */
82int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, 82static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
83 cpumask_t *mask) 83 cpumask_t *mask)
84{ 84{
85 cpumask_t populated; 85 cpumask_t populated;
86 int cpu; 86 int cpu;
87 87
88 cpus_clear(populated); 88 cpus_clear(populated);
89 for_each_cpu_mask(cpu, *mask) 89 for_each_cpu_mask_nr(cpu, *mask)
90 if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { 90 if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
91 __percpu_depopulate_mask(__pdata, &populated); 91 __percpu_depopulate_mask(__pdata, &populated);
92 return -ENOMEM; 92 return -ENOMEM;
@@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
94 cpu_set(cpu, populated); 94 cpu_set(cpu, populated);
95 return 0; 95 return 0;
96} 96}
97EXPORT_SYMBOL_GPL(__percpu_populate_mask); 97
98#define percpu_populate_mask(__pdata, size, gfp, mask) \
99 __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
98 100
99/** 101/**
100 * percpu_alloc_mask - initial setup of per-cpu data 102 * percpu_alloc_mask - initial setup of per-cpu data
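
Aside, as an illustration only (not part of the patch): a userspace sketch of the populate/rollback idea behind __percpu_populate_mask() above. A plain unsigned long bitmask and calloc() stand in for cpumask_t and kzalloc(); NR_CPUS and the helper names are local to this sketch.

#include <errno.h>
#include <stdlib.h>

#define NR_CPUS	8

struct percpu_data {
	void *ptrs[NR_CPUS];
};

static void percpu_depopulate(struct percpu_data *pdata, int cpu)
{
	free(pdata->ptrs[cpu]);
	pdata->ptrs[cpu] = NULL;
}

static void percpu_depopulate_mask(struct percpu_data *pdata,
				   unsigned long mask)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		if (mask & (1UL << cpu))
			percpu_depopulate(pdata, cpu);
}

static int percpu_populate_mask(struct percpu_data *pdata, size_t size,
				unsigned long mask)
{
	unsigned long populated = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!(mask & (1UL << cpu)))
			continue;
		pdata->ptrs[cpu] = calloc(1, size);	/* kzalloc() analogue */
		if (!pdata->ptrs[cpu]) {
			/* roll back only the CPUs populated so far */
			percpu_depopulate_mask(pdata, populated);
			return -ENOMEM;
		}
		populated |= 1UL << cpu;
	}
	return 0;
}

int main(void)
{
	struct percpu_data pd = { { NULL } };

	if (percpu_populate_mask(&pd, 64, 0x5) == 0)	/* CPUs 0 and 2 */
		percpu_depopulate_mask(&pd, 0x5);
	return 0;
}
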
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 8d9f60e06f62..ac5a891f142a 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * linux/mm/bootmem.c 2 * bootmem - A boot-time physical memory allocator and configurator
3 * 3 *
4 * Copyright (C) 1999 Ingo Molnar 4 * Copyright (C) 1999 Ingo Molnar
5 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 5 * 1999 Kanoj Sarcar, SGI
6 * 2008 Johannes Weiner
6 * 7 *
7 * simple boot-time physical memory area allocator and 8 * Access to this subsystem has to be serialized externally (which is true
8 * free memory collector. It's used to deal with reserved 9 * for the boot process anyway).
9 * system memory and memory holes as well.
10 */ 10 */
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/pfn.h> 12#include <linux/pfn.h>
@@ -19,15 +19,10 @@
19 19
20#include "internal.h" 20#include "internal.h"
21 21
22/*
23 * Access to this subsystem has to be serialized externally. (this is
24 * true for the boot process anyway)
25 */
26unsigned long max_low_pfn; 22unsigned long max_low_pfn;
27unsigned long min_low_pfn; 23unsigned long min_low_pfn;
28unsigned long max_pfn; 24unsigned long max_pfn;
29 25
30static LIST_HEAD(bdata_list);
31#ifdef CONFIG_CRASH_DUMP 26#ifdef CONFIG_CRASH_DUMP
32/* 27/*
33 * If we have booted due to a crash, max_pfn will be a very low value. We need 28 * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -36,63 +31,72 @@ static LIST_HEAD(bdata_list);
36unsigned long saved_max_pfn; 31unsigned long saved_max_pfn;
37#endif 32#endif
38 33
39/* return the number of _pages_ that will be allocated for the boot bitmap */ 34bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
40unsigned long __init bootmem_bootmap_pages(unsigned long pages) 35
36static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
37
38static int bootmem_debug;
39
40static int __init bootmem_debug_setup(char *buf)
41{ 41{
42 unsigned long mapsize; 42 bootmem_debug = 1;
43 return 0;
44}
45early_param("bootmem_debug", bootmem_debug_setup);
43 46
44 mapsize = (pages+7)/8; 47#define bdebug(fmt, args...) ({ \
45 mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; 48 if (unlikely(bootmem_debug)) \
46 mapsize >>= PAGE_SHIFT; 49 printk(KERN_INFO \
50 "bootmem::%s " fmt, \
51 __func__, ## args); \
52})
47 53
48 return mapsize; 54static unsigned long __init bootmap_bytes(unsigned long pages)
55{
56 unsigned long bytes = (pages + 7) / 8;
57
58 return ALIGN(bytes, sizeof(long));
49} 59}
50 60
51/* 61/**
52 * link bdata in order 62 * bootmem_bootmap_pages - calculate bitmap size in pages
63 * @pages: number of pages the bitmap has to represent
53 */ 64 */
54static void __init link_bootmem(bootmem_data_t *bdata) 65unsigned long __init bootmem_bootmap_pages(unsigned long pages)
55{ 66{
56 bootmem_data_t *ent; 67 unsigned long bytes = bootmap_bytes(pages);
57 68
58 if (list_empty(&bdata_list)) { 69 return PAGE_ALIGN(bytes) >> PAGE_SHIFT;
59 list_add(&bdata->list, &bdata_list);
60 return;
61 }
62 /* insert in order */
63 list_for_each_entry(ent, &bdata_list, list) {
64 if (bdata->node_boot_start < ent->node_boot_start) {
65 list_add_tail(&bdata->list, &ent->list);
66 return;
67 }
68 }
69 list_add_tail(&bdata->list, &bdata_list);
70} 70}
71 71
72/* 72/*
73 * Given an initialised bdata, it returns the size of the boot bitmap 73 * link bdata in order
74 */ 74 */
75static unsigned long __init get_mapsize(bootmem_data_t *bdata) 75static void __init link_bootmem(bootmem_data_t *bdata)
76{ 76{
77 unsigned long mapsize; 77 struct list_head *iter;
78 unsigned long start = PFN_DOWN(bdata->node_boot_start);
79 unsigned long end = bdata->node_low_pfn;
80 78
81 mapsize = ((end - start) + 7) / 8; 79 list_for_each(iter, &bdata_list) {
82 return ALIGN(mapsize, sizeof(long)); 80 bootmem_data_t *ent;
81
82 ent = list_entry(iter, bootmem_data_t, list);
83 if (bdata->node_min_pfn < ent->node_min_pfn)
84 break;
85 }
86 list_add_tail(&bdata->list, iter);
83} 87}
84 88
85/* 89/*
86 * Called once to set up the allocator itself. 90 * Called once to set up the allocator itself.
87 */ 91 */
88static unsigned long __init init_bootmem_core(pg_data_t *pgdat, 92static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
89 unsigned long mapstart, unsigned long start, unsigned long end) 93 unsigned long mapstart, unsigned long start, unsigned long end)
90{ 94{
91 bootmem_data_t *bdata = pgdat->bdata;
92 unsigned long mapsize; 95 unsigned long mapsize;
93 96
97 mminit_validate_memmodel_limits(&start, &end);
94 bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); 98 bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
95 bdata->node_boot_start = PFN_PHYS(start); 99 bdata->node_min_pfn = start;
96 bdata->node_low_pfn = end; 100 bdata->node_low_pfn = end;
97 link_bootmem(bdata); 101 link_bootmem(bdata);
98 102
@@ -100,429 +104,484 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
100 * Initially all pages are reserved - setup_arch() has to 104 * Initially all pages are reserved - setup_arch() has to
101 * register free RAM areas explicitly. 105 * register free RAM areas explicitly.
102 */ 106 */
103 mapsize = get_mapsize(bdata); 107 mapsize = bootmap_bytes(end - start);
104 memset(bdata->node_bootmem_map, 0xff, mapsize); 108 memset(bdata->node_bootmem_map, 0xff, mapsize);
105 109
110 bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
111 bdata - bootmem_node_data, start, mapstart, end, mapsize);
112
106 return mapsize; 113 return mapsize;
107} 114}
108 115
109/* 116/**
110 * Marks a particular physical memory range as unallocatable. Usable RAM 117 * init_bootmem_node - register a node as boot memory
111 * might be used for boot-time allocations - or it might get added 118 * @pgdat: node to register
112 * to the free page pool later on. 119 * @freepfn: pfn where the bitmap for this node is to be placed
120 * @startpfn: first pfn on the node
121 * @endpfn: first pfn after the node
122 *
123 * Returns the number of bytes needed to hold the bitmap for this node.
113 */ 124 */
114static int __init can_reserve_bootmem_core(bootmem_data_t *bdata, 125unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
115 unsigned long addr, unsigned long size, int flags) 126 unsigned long startpfn, unsigned long endpfn)
116{ 127{
117 unsigned long sidx, eidx; 128 return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
118 unsigned long i; 129}
119 130
120 BUG_ON(!size); 131/**
132 * init_bootmem - register boot memory
133 * @start: pfn where the bitmap is to be placed
134 * @pages: number of available physical pages
135 *
136 * Returns the number of bytes needed to hold the bitmap.
137 */
138unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
139{
140 max_low_pfn = pages;
141 min_low_pfn = start;
142 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
143}
144
145static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
146{
147 int aligned;
148 struct page *page;
149 unsigned long start, end, pages, count = 0;
121 150
122 /* out of range, don't hold other */ 151 if (!bdata->node_bootmem_map)
123 if (addr + size < bdata->node_boot_start ||
124 PFN_DOWN(addr) > bdata->node_low_pfn)
125 return 0; 152 return 0;
126 153
154 start = bdata->node_min_pfn;
155 end = bdata->node_low_pfn;
156
127 /* 157 /*
128 * Round up to index to the range. 158 * If the start is aligned to the machines wordsize, we might
159 * be able to free pages in bulks of that order.
129 */ 160 */
130 if (addr > bdata->node_boot_start) 161 aligned = !(start & (BITS_PER_LONG - 1));
131 sidx= PFN_DOWN(addr - bdata->node_boot_start);
132 else
133 sidx = 0;
134 162
135 eidx = PFN_UP(addr + size - bdata->node_boot_start); 163 bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
136 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 164 bdata - bootmem_node_data, start, end, aligned);
137 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
138 165
139 for (i = sidx; i < eidx; i++) { 166 while (start < end) {
140 if (test_bit(i, bdata->node_bootmem_map)) { 167 unsigned long *map, idx, vec;
141 if (flags & BOOTMEM_EXCLUSIVE) 168
142 return -EBUSY; 169 map = bdata->node_bootmem_map;
170 idx = start - bdata->node_min_pfn;
171 vec = ~map[idx / BITS_PER_LONG];
172
173 if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
174 int order = ilog2(BITS_PER_LONG);
175
176 __free_pages_bootmem(pfn_to_page(start), order);
177 count += BITS_PER_LONG;
178 } else {
179 unsigned long off = 0;
180
181 while (vec && off < BITS_PER_LONG) {
182 if (vec & 1) {
183 page = pfn_to_page(start + off);
184 __free_pages_bootmem(page, 0);
185 count++;
186 }
187 vec >>= 1;
188 off++;
189 }
143 } 190 }
191 start += BITS_PER_LONG;
144 } 192 }
145 193
146 return 0; 194 page = virt_to_page(bdata->node_bootmem_map);
195 pages = bdata->node_low_pfn - bdata->node_min_pfn;
196 pages = bootmem_bootmap_pages(pages);
197 count += pages;
198 while (pages--)
199 __free_pages_bootmem(page++, 0);
147 200
201 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
202
203 return count;
148} 204}
149 205
150static void __init reserve_bootmem_core(bootmem_data_t *bdata, 206/**
151 unsigned long addr, unsigned long size, int flags) 207 * free_all_bootmem_node - release a node's free pages to the buddy allocator
208 * @pgdat: node to be released
209 *
210 * Returns the number of pages actually released.
211 */
212unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
152{ 213{
153 unsigned long sidx, eidx; 214 register_page_bootmem_info_node(pgdat);
154 unsigned long i; 215 return free_all_bootmem_core(pgdat->bdata);
216}
155 217
156 BUG_ON(!size); 218/**
219 * free_all_bootmem - release free pages to the buddy allocator
220 *
221 * Returns the number of pages actually released.
222 */
223unsigned long __init free_all_bootmem(void)
224{
225 return free_all_bootmem_core(NODE_DATA(0)->bdata);
226}
157 227
158 /* out of range */ 228static void __init __free(bootmem_data_t *bdata,
159 if (addr + size < bdata->node_boot_start || 229 unsigned long sidx, unsigned long eidx)
160 PFN_DOWN(addr) > bdata->node_low_pfn) 230{
161 return; 231 unsigned long idx;
162 232
163 /* 233 bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
164 * Round up to index to the range. 234 sidx + bdata->node_min_pfn,
165 */ 235 eidx + bdata->node_min_pfn);
166 if (addr > bdata->node_boot_start)
167 sidx= PFN_DOWN(addr - bdata->node_boot_start);
168 else
169 sidx = 0;
170 236
171 eidx = PFN_UP(addr + size - bdata->node_boot_start); 237 if (bdata->hint_idx > sidx)
172 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 238 bdata->hint_idx = sidx;
173 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
174 239
175 for (i = sidx; i < eidx; i++) { 240 for (idx = sidx; idx < eidx; idx++)
176 if (test_and_set_bit(i, bdata->node_bootmem_map)) { 241 if (!test_and_clear_bit(idx, bdata->node_bootmem_map))
177#ifdef CONFIG_DEBUG_BOOTMEM 242 BUG();
178 printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); 243}
179#endif 244
245static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
246 unsigned long eidx, int flags)
247{
248 unsigned long idx;
249 int exclusive = flags & BOOTMEM_EXCLUSIVE;
250
251 bdebug("nid=%td start=%lx end=%lx flags=%x\n",
252 bdata - bootmem_node_data,
253 sidx + bdata->node_min_pfn,
254 eidx + bdata->node_min_pfn,
255 flags);
256
257 for (idx = sidx; idx < eidx; idx++)
258 if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
259 if (exclusive) {
260 __free(bdata, sidx, idx);
261 return -EBUSY;
262 }
263 bdebug("silent double reserve of PFN %lx\n",
264 idx + bdata->node_min_pfn);
180 } 265 }
181 } 266 return 0;
182} 267}
183 268
184static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, 269static int __init mark_bootmem_node(bootmem_data_t *bdata,
185 unsigned long size) 270 unsigned long start, unsigned long end,
271 int reserve, int flags)
186{ 272{
187 unsigned long sidx, eidx; 273 unsigned long sidx, eidx;
188 unsigned long i;
189 274
190 BUG_ON(!size); 275 bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
276 bdata - bootmem_node_data, start, end, reserve, flags);
191 277
192 /* out range */ 278 BUG_ON(start < bdata->node_min_pfn);
193 if (addr + size < bdata->node_boot_start || 279 BUG_ON(end > bdata->node_low_pfn);
194 PFN_DOWN(addr) > bdata->node_low_pfn)
195 return;
196 /*
197 * round down end of usable mem, partially free pages are
198 * considered reserved.
199 */
200 280
201 if (addr >= bdata->node_boot_start && addr < bdata->last_success) 281 sidx = start - bdata->node_min_pfn;
202 bdata->last_success = addr; 282 eidx = end - bdata->node_min_pfn;
203 283
204 /* 284 if (reserve)
205 * Round up to index to the range. 285 return __reserve(bdata, sidx, eidx, flags);
206 */
207 if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start))
208 sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
209 else 286 else
210 sidx = 0; 287 __free(bdata, sidx, eidx);
288 return 0;
289}
211 290
212 eidx = PFN_DOWN(addr + size - bdata->node_boot_start); 291static int __init mark_bootmem(unsigned long start, unsigned long end,
213 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 292 int reserve, int flags)
214 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); 293{
294 unsigned long pos;
295 bootmem_data_t *bdata;
215 296
216 for (i = sidx; i < eidx; i++) { 297 pos = start;
217 if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) 298 list_for_each_entry(bdata, &bdata_list, list) {
218 BUG(); 299 int err;
300 unsigned long max;
301
302 if (pos < bdata->node_min_pfn ||
303 pos >= bdata->node_low_pfn) {
304 BUG_ON(pos != start);
305 continue;
306 }
307
308 max = min(bdata->node_low_pfn, end);
309
310 err = mark_bootmem_node(bdata, pos, max, reserve, flags);
311 if (reserve && err) {
312 mark_bootmem(start, pos, 0, 0);
313 return err;
314 }
315
316 if (max == end)
317 return 0;
318 pos = bdata->node_low_pfn;
219 } 319 }
320 BUG();
220} 321}
221 322
222/* 323/**
223 * We 'merge' subsequent allocations to save space. We might 'lose' 324 * free_bootmem_node - mark a page range as usable
224 * some fraction of a page if allocations cannot be satisfied due to 325 * @pgdat: node the range resides on
225 * size constraints on boxes where there is physical RAM space 326 * @physaddr: starting address of the range
226 * fragmentation - in these cases (mostly large memory boxes) this 327 * @size: size of the range in bytes
227 * is not a problem.
228 *
229 * On low memory boxes we get it right in 100% of the cases.
230 * 328 *
231 * alignment has to be a power of 2 value. 329 * Partial pages will be considered reserved and left as they are.
232 * 330 *
233 * NOTE: This function is _not_ reentrant. 331 * The range must reside completely on the specified node.
234 */ 332 */
235void * __init 333void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
236__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, 334 unsigned long size)
237 unsigned long align, unsigned long goal, unsigned long limit)
238{ 335{
239 unsigned long areasize, preferred; 336 unsigned long start, end;
240 unsigned long i, start = 0, incr, eidx, end_pfn;
241 void *ret;
242 unsigned long node_boot_start;
243 void *node_bootmem_map;
244
245 if (!size) {
246 printk("__alloc_bootmem_core(): zero-sized request\n");
247 BUG();
248 }
249 BUG_ON(align & (align-1));
250 337
251 /* on nodes without memory - bootmem_map is NULL */ 338 start = PFN_UP(physaddr);
252 if (!bdata->node_bootmem_map) 339 end = PFN_DOWN(physaddr + size);
253 return NULL;
254
255 /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */
256 node_boot_start = bdata->node_boot_start;
257 node_bootmem_map = bdata->node_bootmem_map;
258 if (align) {
259 node_boot_start = ALIGN(bdata->node_boot_start, align);
260 if (node_boot_start > bdata->node_boot_start)
261 node_bootmem_map = (unsigned long *)bdata->node_bootmem_map +
262 PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG;
263 }
264
265 if (limit && node_boot_start >= limit)
266 return NULL;
267 340
268 end_pfn = bdata->node_low_pfn; 341 mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
269 limit = PFN_DOWN(limit); 342}
270 if (limit && end_pfn > limit)
271 end_pfn = limit;
272 343
273 eidx = end_pfn - PFN_DOWN(node_boot_start); 344/**
345 * free_bootmem - mark a page range as usable
346 * @addr: starting address of the range
347 * @size: size of the range in bytes
348 *
349 * Partial pages will be considered reserved and left as they are.
350 *
351 * The range must be contiguous but may span node boundaries.
352 */
353void __init free_bootmem(unsigned long addr, unsigned long size)
354{
355 unsigned long start, end;
274 356
275 /* 357 start = PFN_UP(addr);
276 * We try to allocate bootmem pages above 'goal' 358 end = PFN_DOWN(addr + size);
277 * first, then we try to allocate lower pages.
278 */
279 preferred = 0;
280 if (goal && PFN_DOWN(goal) < end_pfn) {
281 if (goal > node_boot_start)
282 preferred = goal - node_boot_start;
283
284 if (bdata->last_success > node_boot_start &&
285 bdata->last_success - node_boot_start >= preferred)
286 if (!limit || (limit && limit > bdata->last_success))
287 preferred = bdata->last_success - node_boot_start;
288 }
289 359
290 preferred = PFN_DOWN(ALIGN(preferred, align)); 360 mark_bootmem(start, end, 0, 0);
291 areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; 361}
292 incr = align >> PAGE_SHIFT ? : 1;
293 362
294restart_scan: 363/**
295 for (i = preferred; i < eidx;) { 364 * reserve_bootmem_node - mark a page range as reserved
296 unsigned long j; 365 * @pgdat: node the range resides on
366 * @physaddr: starting address of the range
367 * @size: size of the range in bytes
368 * @flags: reservation flags (see linux/bootmem.h)
369 *
370 * Partial pages will be reserved.
371 *
372 * The range must reside completely on the specified node.
373 */
374int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
375 unsigned long size, int flags)
376{
377 unsigned long start, end;
297 378
298 i = find_next_zero_bit(node_bootmem_map, eidx, i); 379 start = PFN_DOWN(physaddr);
299 i = ALIGN(i, incr); 380 end = PFN_UP(physaddr + size);
300 if (i >= eidx)
301 break;
302 if (test_bit(i, node_bootmem_map)) {
303 i += incr;
304 continue;
305 }
306 for (j = i + 1; j < i + areasize; ++j) {
307 if (j >= eidx)
308 goto fail_block;
309 if (test_bit(j, node_bootmem_map))
310 goto fail_block;
311 }
312 start = i;
313 goto found;
314 fail_block:
315 i = ALIGN(j, incr);
316 if (i == j)
317 i += incr;
318 }
319 381
320 if (preferred > 0) { 382 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
321 preferred = 0; 383}
322 goto restart_scan;
323 }
324 return NULL;
325 384
326found: 385#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
327 bdata->last_success = PFN_PHYS(start) + node_boot_start; 386/**
328 BUG_ON(start >= eidx); 387 * reserve_bootmem - mark a page range as usable
388 * @addr: starting address of the range
389 * @size: size of the range in bytes
390 * @flags: reservation flags (see linux/bootmem.h)
391 *
392 * Partial pages will be reserved.
393 *
394 * The range must be contiguous but may span node boundaries.
395 */
396int __init reserve_bootmem(unsigned long addr, unsigned long size,
397 int flags)
398{
399 unsigned long start, end;
329 400
330 /* 401 start = PFN_DOWN(addr);
331 * Is the next page of the previous allocation-end the start 402 end = PFN_UP(addr + size);
332 * of this allocation's buffer? If yes then we can 'merge'
333 * the previous partial page with this allocation.
334 */
335 if (align < PAGE_SIZE &&
336 bdata->last_offset && bdata->last_pos+1 == start) {
337 unsigned long offset, remaining_size;
338 offset = ALIGN(bdata->last_offset, align);
339 BUG_ON(offset > PAGE_SIZE);
340 remaining_size = PAGE_SIZE - offset;
341 if (size < remaining_size) {
342 areasize = 0;
343 /* last_pos unchanged */
344 bdata->last_offset = offset + size;
345 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
346 offset + node_boot_start);
347 } else {
348 remaining_size = size - remaining_size;
349 areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
350 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
351 offset + node_boot_start);
352 bdata->last_pos = start + areasize - 1;
353 bdata->last_offset = remaining_size;
354 }
355 bdata->last_offset &= ~PAGE_MASK;
356 } else {
357 bdata->last_pos = start + areasize - 1;
358 bdata->last_offset = size & ~PAGE_MASK;
359 ret = phys_to_virt(start * PAGE_SIZE + node_boot_start);
360 }
361 403
362 /* 404 return mark_bootmem(start, end, 1, flags);
363 * Reserve the area now:
364 */
365 for (i = start; i < start + areasize; i++)
366 if (unlikely(test_and_set_bit(i, node_bootmem_map)))
367 BUG();
368 memset(ret, 0, size);
369 return ret;
370} 405}
406#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
371 407
372static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) 408static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
409 unsigned long step)
373{ 410{
374 struct page *page; 411 unsigned long base = bdata->node_min_pfn;
375 unsigned long pfn;
376 bootmem_data_t *bdata = pgdat->bdata;
377 unsigned long i, count, total = 0;
378 unsigned long idx;
379 unsigned long *map;
380 int gofast = 0;
381
382 BUG_ON(!bdata->node_bootmem_map);
383
384 count = 0;
385 /* first extant page of the node */
386 pfn = PFN_DOWN(bdata->node_boot_start);
387 idx = bdata->node_low_pfn - pfn;
388 map = bdata->node_bootmem_map;
389 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
390 if (bdata->node_boot_start == 0 ||
391 ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
392 gofast = 1;
393 for (i = 0; i < idx; ) {
394 unsigned long v = ~map[i / BITS_PER_LONG];
395
396 if (gofast && v == ~0UL) {
397 int order;
398
399 page = pfn_to_page(pfn);
400 count += BITS_PER_LONG;
401 order = ffs(BITS_PER_LONG) - 1;
402 __free_pages_bootmem(page, order);
403 i += BITS_PER_LONG;
404 page += BITS_PER_LONG;
405 } else if (v) {
406 unsigned long m;
407
408 page = pfn_to_page(pfn);
409 for (m = 1; m && i < idx; m<<=1, page++, i++) {
410 if (v & m) {
411 count++;
412 __free_pages_bootmem(page, 0);
413 }
414 }
415 } else {
416 i += BITS_PER_LONG;
417 }
418 pfn += BITS_PER_LONG;
419 }
420 total += count;
421 412
422 /* 413 /*
423 * Now free the allocator bitmap itself, it's not 414 * Align the index with respect to the node start so that the
424 * needed anymore: 415 * combination of both satisfies the requested alignment.
425 */ 416 */
426 page = virt_to_page(bdata->node_bootmem_map);
427 count = 0;
428 idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
429 for (i = 0; i < idx; i++, page++) {
430 __free_pages_bootmem(page, 0);
431 count++;
432 }
433 total += count;
434 bdata->node_bootmem_map = NULL;
435 417
436 return total; 418 return ALIGN(base + idx, step) - base;
437} 419}
438 420
439unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, 421static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
440 unsigned long startpfn, unsigned long endpfn) 422 unsigned long align)
441{ 423{
442 return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); 424 unsigned long base = PFN_PHYS(bdata->node_min_pfn);
425
426 /* Same as align_idx for byte offsets */
427
428 return ALIGN(base + off, align) - base;
443} 429}
444 430
445int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 431static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
446 unsigned long size, int flags) 432 unsigned long size, unsigned long align,
433 unsigned long goal, unsigned long limit)
447{ 434{
448 int ret; 435 unsigned long fallback = 0;
436 unsigned long min, max, start, sidx, midx, step;
449 437
450 ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); 438 BUG_ON(!size);
451 if (ret < 0) 439 BUG_ON(align & (align - 1));
452 return -ENOMEM; 440 BUG_ON(limit && goal + size > limit);
453 reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
454 441
455 return 0; 442 if (!bdata->node_bootmem_map)
456} 443 return NULL;
457 444
458void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 445 bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
459 unsigned long size) 446 bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
460{ 447 align, goal, limit);
461 free_bootmem_core(pgdat->bdata, physaddr, size);
462}
463 448
464unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 449 min = bdata->node_min_pfn;
465{ 450 max = bdata->node_low_pfn;
466 register_page_bootmem_info_node(pgdat);
467 return free_all_bootmem_core(pgdat);
468}
469 451
470unsigned long __init init_bootmem(unsigned long start, unsigned long pages) 452 goal >>= PAGE_SHIFT;
471{ 453 limit >>= PAGE_SHIFT;
472 max_low_pfn = pages;
473 min_low_pfn = start;
474 return init_bootmem_core(NODE_DATA(0), start, 0, pages);
475}
476 454
477#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 455 if (limit && max > limit)
478int __init reserve_bootmem(unsigned long addr, unsigned long size, 456 max = limit;
479 int flags) 457 if (max <= min)
480{ 458 return NULL;
481 bootmem_data_t *bdata;
482 int ret;
483 459
484 list_for_each_entry(bdata, &bdata_list, list) { 460 step = max(align >> PAGE_SHIFT, 1UL);
485 ret = can_reserve_bootmem_core(bdata, addr, size, flags); 461
486 if (ret < 0) 462 if (goal && min < goal && goal < max)
487 return ret; 463 start = ALIGN(goal, step);
464 else
465 start = ALIGN(min, step);
466
467 sidx = start - bdata->node_min_pfn;
468 midx = max - bdata->node_min_pfn;
469
470 if (bdata->hint_idx > sidx) {
471 /*
472 * Handle the valid case of sidx being zero and still
473 * catch the fallback below.
474 */
475 fallback = sidx + 1;
476 sidx = align_idx(bdata, bdata->hint_idx, step);
488 } 477 }
489 list_for_each_entry(bdata, &bdata_list, list)
490 reserve_bootmem_core(bdata, addr, size, flags);
491 478
492 return 0; 479 while (1) {
493} 480 int merge;
494#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 481 void *region;
482 unsigned long eidx, i, start_off, end_off;
483find_block:
484 sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
485 sidx = align_idx(bdata, sidx, step);
486 eidx = sidx + PFN_UP(size);
495 487
496void __init free_bootmem(unsigned long addr, unsigned long size) 488 if (sidx >= midx || eidx > midx)
497{ 489 break;
498 bootmem_data_t *bdata;
499 list_for_each_entry(bdata, &bdata_list, list)
500 free_bootmem_core(bdata, addr, size);
501}
502 490
503unsigned long __init free_all_bootmem(void) 491 for (i = sidx; i < eidx; i++)
504{ 492 if (test_bit(i, bdata->node_bootmem_map)) {
505 return free_all_bootmem_core(NODE_DATA(0)); 493 sidx = align_idx(bdata, i, step);
494 if (sidx == i)
495 sidx += step;
496 goto find_block;
497 }
498
499 if (bdata->last_end_off & (PAGE_SIZE - 1) &&
500 PFN_DOWN(bdata->last_end_off) + 1 == sidx)
501 start_off = align_off(bdata, bdata->last_end_off, align);
502 else
503 start_off = PFN_PHYS(sidx);
504
505 merge = PFN_DOWN(start_off) < sidx;
506 end_off = start_off + size;
507
508 bdata->last_end_off = end_off;
509 bdata->hint_idx = PFN_UP(end_off);
510
511 /*
512 * Reserve the area now:
513 */
514 if (__reserve(bdata, PFN_DOWN(start_off) + merge,
515 PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
516 BUG();
517
518 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
519 start_off);
520 memset(region, 0, size);
521 return region;
522 }
523
524 if (fallback) {
525 sidx = align_idx(bdata, fallback - 1, step);
526 fallback = 0;
527 goto find_block;
528 }
529
530 return NULL;
506} 531}
507 532
508void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, 533static void * __init ___alloc_bootmem_nopanic(unsigned long size,
509 unsigned long goal) 534 unsigned long align,
535 unsigned long goal,
536 unsigned long limit)
510{ 537{
511 bootmem_data_t *bdata; 538 bootmem_data_t *bdata;
512 void *ptr;
513 539
540restart:
514 list_for_each_entry(bdata, &bdata_list, list) { 541 list_for_each_entry(bdata, &bdata_list, list) {
515 ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); 542 void *region;
516 if (ptr) 543
517 return ptr; 544 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
545 continue;
546 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
547 break;
548
549 region = alloc_bootmem_core(bdata, size, align, goal, limit);
550 if (region)
551 return region;
552 }
553
554 if (goal) {
555 goal = 0;
556 goto restart;
518 } 557 }
558
519 return NULL; 559 return NULL;
520} 560}
521 561
522void * __init __alloc_bootmem(unsigned long size, unsigned long align, 562/**
523 unsigned long goal) 563 * __alloc_bootmem_nopanic - allocate boot memory without panicking
564 * @size: size of the request in bytes
565 * @align: alignment of the region
566 * @goal: preferred starting address of the region
567 *
568 * The goal is dropped if it can not be satisfied and the allocation will
569 * fall back to memory below @goal.
570 *
571 * Allocation may happen on any node in the system.
572 *
573 * Returns NULL on failure.
574 */
575void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
576 unsigned long goal)
524{ 577{
525 void *mem = __alloc_bootmem_nopanic(size,align,goal); 578 return ___alloc_bootmem_nopanic(size, align, goal, 0);
579}
580
581static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
582 unsigned long goal, unsigned long limit)
583{
584 void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
526 585
527 if (mem) 586 if (mem)
528 return mem; 587 return mem;
@@ -534,78 +593,135 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
534 return NULL; 593 return NULL;
535} 594}
536 595
596/**
597 * __alloc_bootmem - allocate boot memory
598 * @size: size of the request in bytes
599 * @align: alignment of the region
600 * @goal: preferred starting address of the region
601 *
602 * The goal is dropped if it can not be satisfied and the allocation will
603 * fall back to memory below @goal.
604 *
605 * Allocation may happen on any node in the system.
606 *
607 * The function panics if the request can not be satisfied.
608 */
609void * __init __alloc_bootmem(unsigned long size, unsigned long align,
610 unsigned long goal)
611{
612 return ___alloc_bootmem(size, align, goal, 0);
613}
537 614
538void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 615static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
539 unsigned long align, unsigned long goal) 616 unsigned long size, unsigned long align,
617 unsigned long goal, unsigned long limit)
540{ 618{
541 void *ptr; 619 void *ptr;
542 620
543 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 621 ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
544 if (ptr) 622 if (ptr)
545 return ptr; 623 return ptr;
546 624
547 return __alloc_bootmem(size, align, goal); 625 return ___alloc_bootmem(size, align, goal, limit);
626}
627
628/**
629 * __alloc_bootmem_node - allocate boot memory from a specific node
630 * @pgdat: node to allocate from
631 * @size: size of the request in bytes
632 * @align: alignment of the region
633 * @goal: preferred starting address of the region
634 *
635 * The goal is dropped if it can not be satisfied and the allocation will
636 * fall back to memory below @goal.
637 *
638 * Allocation may fall back to any node in the system if the specified node
639 * can not hold the requested memory.
640 *
641 * The function panics if the request can not be satisfied.
642 */
643void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
644 unsigned long align, unsigned long goal)
645{
646 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
548} 647}
549 648
550#ifdef CONFIG_SPARSEMEM 649#ifdef CONFIG_SPARSEMEM
650/**
651 * alloc_bootmem_section - allocate boot memory from a specific section
652 * @size: size of the request in bytes
653 * @section_nr: sparse map section to allocate from
654 *
655 * Return NULL on failure.
656 */
551void * __init alloc_bootmem_section(unsigned long size, 657void * __init alloc_bootmem_section(unsigned long size,
552 unsigned long section_nr) 658 unsigned long section_nr)
553{ 659{
554 void *ptr; 660 bootmem_data_t *bdata;
555 unsigned long limit, goal, start_nr, end_nr, pfn; 661 unsigned long pfn, goal, limit;
556 struct pglist_data *pgdat;
557 662
558 pfn = section_nr_to_pfn(section_nr); 663 pfn = section_nr_to_pfn(section_nr);
559 goal = PFN_PHYS(pfn); 664 goal = pfn << PAGE_SHIFT;
560 limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1; 665 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
561 pgdat = NODE_DATA(early_pfn_to_nid(pfn)); 666 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
562 ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal,
563 limit);
564 667
565 if (!ptr) 668 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
566 return NULL; 669}
670#endif
567 671
568 start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr))); 672void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
569 end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size)); 673 unsigned long align, unsigned long goal)
570 if (start_nr != section_nr || end_nr != section_nr) { 674{
571 printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n", 675 void *ptr;
572 section_nr);
573 free_bootmem_core(pgdat->bdata, __pa(ptr), size);
574 ptr = NULL;
575 }
576 676
577 return ptr; 677 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
678 if (ptr)
679 return ptr;
680
681 return __alloc_bootmem_nopanic(size, align, goal);
578} 682}
579#endif
580 683
581#ifndef ARCH_LOW_ADDRESS_LIMIT 684#ifndef ARCH_LOW_ADDRESS_LIMIT
582#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL 685#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
583#endif 686#endif
584 687
688/**
689 * __alloc_bootmem_low - allocate low boot memory
690 * @size: size of the request in bytes
691 * @align: alignment of the region
692 * @goal: preferred starting address of the region
693 *
694 * The goal is dropped if it can not be satisfied and the allocation will
695 * fall back to memory below @goal.
696 *
697 * Allocation may happen on any node in the system.
698 *
699 * The function panics if the request can not be satisfied.
700 */
585void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, 701void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
586 unsigned long goal) 702 unsigned long goal)
587{ 703{
588 bootmem_data_t *bdata; 704 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
589 void *ptr;
590
591 list_for_each_entry(bdata, &bdata_list, list) {
592 ptr = __alloc_bootmem_core(bdata, size, align, goal,
593 ARCH_LOW_ADDRESS_LIMIT);
594 if (ptr)
595 return ptr;
596 }
597
598 /*
599 * Whoops, we cannot satisfy the allocation request.
600 */
601 printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
602 panic("Out of low memory");
603 return NULL;
604} 705}
605 706
707/**
708 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
709 * @pgdat: node to allocate from
710 * @size: size of the request in bytes
711 * @align: alignment of the region
712 * @goal: preferred starting address of the region
713 *
714 * The goal is dropped if it can not be satisfied and the allocation will
715 * fall back to memory below @goal.
716 *
717 * Allocation may fall back to any node in the system if the specified node
718 * can not hold the requested memory.
719 *
720 * The function panics if the request can not be satisfied.
721 */
606void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 722void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
607 unsigned long align, unsigned long goal) 723 unsigned long align, unsigned long goal)
608{ 724{
609 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, 725 return ___alloc_bootmem_node(pgdat->bdata, size, align,
610 ARCH_LOW_ADDRESS_LIMIT); 726 goal, ARCH_LOW_ADDRESS_LIMIT);
611} 727}
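
Aside, as an illustration only (not part of the patch): how the new bootmap_bytes()/bootmem_bootmap_pages() pair sizes the boot bitmap, open-coded for a userspace build. PAGE_SHIFT, PAGE_SIZE and ALIGN() are redefined locally and are assumptions of this sketch, not kernel headers.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))
#define PAGE_ALIGN(x)	ALIGN((x), PAGE_SIZE)

/* One bit per page, rounded up to a whole word so the free loop in
 * free_all_bootmem_core() can release BITS_PER_LONG pages at a time. */
static unsigned long bootmap_bytes(unsigned long pages)
{
	unsigned long bytes = (pages + 7) / 8;

	return ALIGN(bytes, sizeof(long));
}

/* Number of whole pages needed to hold that bitmap. */
static unsigned long bootmem_bootmap_pages(unsigned long pages)
{
	unsigned long bytes = bootmap_bytes(pages);

	return PAGE_ALIGN(bytes) >> PAGE_SHIFT;
}

int main(void)
{
	unsigned long pages = 1UL << 20;	/* 4 GiB worth of 4 KiB pages */

	printf("%lu pages -> %lu bitmap bytes -> %lu bitmap pages\n",
	       pages, bootmap_bytes(pages), bootmem_bootmap_pages(pages));
	return 0;
}
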
diff --git a/mm/bounce.c b/mm/bounce.c
index b6d2d0f1019b..06722c403058 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
267 /* 267 /*
268 * Data-less bio, nothing to bounce 268 * Data-less bio, nothing to bounce
269 */ 269 */
270 if (bio_empty_barrier(*bio_orig)) 270 if (!bio_has_data(*bio_orig))
271 return; 271 return;
272 272
273 /* 273 /*
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 343cfdfebd9e..a1da969bd980 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds 4 * Copyright (C) 2002, Linus Torvalds
5 * 5 *
6 * 11Jan2003 akpm@digeo.com 6 * 11Jan2003 Andrew Morton
7 * Initial version. 7 * Initial version.
8 */ 8 */
9 9
diff --git a/mm/filemap.c b/mm/filemap.c
index 65d9d9e2b755..ab8553658af3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
33#include <linux/cpuset.h> 33#include <linux/cpuset.h>
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 35#include <linux/memcontrol.h>
36#include <linux/mm_inline.h> /* for page_is_file_cache() */
36#include "internal.h" 37#include "internal.h"
37 38
38/* 39/*
@@ -42,9 +43,6 @@
42 43
43#include <asm/mman.h> 44#include <asm/mman.h>
44 45
45static ssize_t
46generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
47 loff_t offset, unsigned long nr_segs);
48 46
49/* 47/*
50 * Shared mappings implemented 30.11.1994. It's not fully working yet, 48 * Shared mappings implemented 30.11.1994. It's not fully working yet,
@@ -112,18 +110,18 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
112/* 110/*
113 * Remove a page from the page cache and free it. Caller has to make 111 * Remove a page from the page cache and free it. Caller has to make
114 * sure the page is locked and that nobody else uses it - or that usage 112 * sure the page is locked and that nobody else uses it - or that usage
115 * is safe. The caller must hold a write_lock on the mapping's tree_lock. 113 * is safe. The caller must hold the mapping's tree_lock.
116 */ 114 */
117void __remove_from_page_cache(struct page *page) 115void __remove_from_page_cache(struct page *page)
118{ 116{
119 struct address_space *mapping = page->mapping; 117 struct address_space *mapping = page->mapping;
120 118
121 mem_cgroup_uncharge_page(page);
122 radix_tree_delete(&mapping->page_tree, page->index); 119 radix_tree_delete(&mapping->page_tree, page->index);
123 page->mapping = NULL; 120 page->mapping = NULL;
124 mapping->nrpages--; 121 mapping->nrpages--;
125 __dec_zone_page_state(page, NR_FILE_PAGES); 122 __dec_zone_page_state(page, NR_FILE_PAGES);
126 BUG_ON(page_mapped(page)); 123 BUG_ON(page_mapped(page));
124 mem_cgroup_uncharge_cache_page(page);
127 125
128 /* 126 /*
129 * Some filesystems seem to re-dirty the page even after 127 * Some filesystems seem to re-dirty the page even after
@@ -144,9 +142,9 @@ void remove_from_page_cache(struct page *page)
144 142
145 BUG_ON(!PageLocked(page)); 143 BUG_ON(!PageLocked(page));
146 144
147 write_lock_irq(&mapping->tree_lock); 145 spin_lock_irq(&mapping->tree_lock);
148 __remove_from_page_cache(page); 146 __remove_from_page_cache(page);
149 write_unlock_irq(&mapping->tree_lock); 147 spin_unlock_irq(&mapping->tree_lock);
150} 148}
151 149
152static int sync_page(void *word) 150static int sync_page(void *word)
@@ -445,55 +443,74 @@ int filemap_write_and_wait_range(struct address_space *mapping,
445} 443}
446 444
447/** 445/**
448 * add_to_page_cache - add newly allocated pagecache pages 446 * add_to_page_cache_locked - add a locked page to the pagecache
449 * @page: page to add 447 * @page: page to add
450 * @mapping: the page's address_space 448 * @mapping: the page's address_space
451 * @offset: page index 449 * @offset: page index
452 * @gfp_mask: page allocation mode 450 * @gfp_mask: page allocation mode
453 * 451 *
454 * This function is used to add newly allocated pagecache pages; 452 * This function is used to add a page to the pagecache. It must be locked.
455 * the page is new, so we can just run SetPageLocked() against it.
456 * The other page state flags were set by rmqueue().
457 *
458 * This function does not add the page to the LRU. The caller must do that. 453 * This function does not add the page to the LRU. The caller must do that.
459 */ 454 */
460int add_to_page_cache(struct page *page, struct address_space *mapping, 455int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
461 pgoff_t offset, gfp_t gfp_mask) 456 pgoff_t offset, gfp_t gfp_mask)
462{ 457{
463 int error = mem_cgroup_cache_charge(page, current->mm, 458 int error;
459
460 VM_BUG_ON(!PageLocked(page));
461
462 error = mem_cgroup_cache_charge(page, current->mm,
464 gfp_mask & ~__GFP_HIGHMEM); 463 gfp_mask & ~__GFP_HIGHMEM);
465 if (error) 464 if (error)
466 goto out; 465 goto out;
467 466
468 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 467 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
469 if (error == 0) { 468 if (error == 0) {
470 write_lock_irq(&mapping->tree_lock); 469 page_cache_get(page);
470 page->mapping = mapping;
471 page->index = offset;
472
473 spin_lock_irq(&mapping->tree_lock);
471 error = radix_tree_insert(&mapping->page_tree, offset, page); 474 error = radix_tree_insert(&mapping->page_tree, offset, page);
472 if (!error) { 475 if (likely(!error)) {
473 page_cache_get(page);
474 SetPageLocked(page);
475 page->mapping = mapping;
476 page->index = offset;
477 mapping->nrpages++; 476 mapping->nrpages++;
478 __inc_zone_page_state(page, NR_FILE_PAGES); 477 __inc_zone_page_state(page, NR_FILE_PAGES);
479 } else 478 } else {
480 mem_cgroup_uncharge_page(page); 479 page->mapping = NULL;
480 mem_cgroup_uncharge_cache_page(page);
481 page_cache_release(page);
482 }
481 483
482 write_unlock_irq(&mapping->tree_lock); 484 spin_unlock_irq(&mapping->tree_lock);
483 radix_tree_preload_end(); 485 radix_tree_preload_end();
484 } else 486 } else
485 mem_cgroup_uncharge_page(page); 487 mem_cgroup_uncharge_cache_page(page);
486out: 488out:
487 return error; 489 return error;
488} 490}
489EXPORT_SYMBOL(add_to_page_cache); 491EXPORT_SYMBOL(add_to_page_cache_locked);
490 492
491int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 493int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
492 pgoff_t offset, gfp_t gfp_mask) 494 pgoff_t offset, gfp_t gfp_mask)
493{ 495{
494 int ret = add_to_page_cache(page, mapping, offset, gfp_mask); 496 int ret;
495 if (ret == 0) 497
496 lru_cache_add(page); 498 /*
499 * Splice_read and readahead add shmem/tmpfs pages into the page cache
500 * before shmem_readpage has a chance to mark them as SwapBacked: they
501 * need to go on the active_anon lru below, and mem_cgroup_cache_charge
502 * (called in add_to_page_cache) needs to know where they're going too.
503 */
504 if (mapping_cap_swap_backed(mapping))
505 SetPageSwapBacked(page);
506
507 ret = add_to_page_cache(page, mapping, offset, gfp_mask);
508 if (ret == 0) {
509 if (page_is_file_cache(page))
510 lru_cache_add_file(page);
511 else
512 lru_cache_add_active_anon(page);
513 }
497 return ret; 514 return ret;
498} 515}
499 516
@@ -556,17 +573,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
556 * mechananism between PageLocked pages and PageWriteback pages is shared. 573 * mechananism between PageLocked pages and PageWriteback pages is shared.
557 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. 574 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
558 * 575 *
559 * The first mb is necessary to safely close the critical section opened by the 576 * The mb is necessary to enforce ordering between the clear_bit and the read
560 * TestSetPageLocked(), the second mb is necessary to enforce ordering between 577 * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
561 * the clear_bit and the read of the waitqueue (to avoid SMP races with a
562 * parallel wait_on_page_locked()).
563 */ 578 */
564void unlock_page(struct page *page) 579void unlock_page(struct page *page)
565{ 580{
566 smp_mb__before_clear_bit(); 581 VM_BUG_ON(!PageLocked(page));
567 if (!TestClearPageLocked(page)) 582 clear_bit_unlock(PG_locked, &page->flags);
568 BUG(); 583 smp_mb__after_clear_bit();
569 smp_mb__after_clear_bit();
570 wake_up_page(page, PG_locked); 584 wake_up_page(page, PG_locked);
571} 585}
572EXPORT_SYMBOL(unlock_page); 586EXPORT_SYMBOL(unlock_page);
@@ -636,15 +650,35 @@ void __lock_page_nosync(struct page *page)
636 * Is there a pagecache struct page at the given (mapping, offset) tuple? 650 * Is there a pagecache struct page at the given (mapping, offset) tuple?
637 * If yes, increment its refcount and return it; if no, return NULL. 651 * If yes, increment its refcount and return it; if no, return NULL.
638 */ 652 */
639struct page * find_get_page(struct address_space *mapping, pgoff_t offset) 653struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
640{ 654{
655 void **pagep;
641 struct page *page; 656 struct page *page;
642 657
643 read_lock_irq(&mapping->tree_lock); 658 rcu_read_lock();
644 page = radix_tree_lookup(&mapping->page_tree, offset); 659repeat:
645 if (page) 660 page = NULL;
646 page_cache_get(page); 661 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
647 read_unlock_irq(&mapping->tree_lock); 662 if (pagep) {
663 page = radix_tree_deref_slot(pagep);
664 if (unlikely(!page || page == RADIX_TREE_RETRY))
665 goto repeat;
666
667 if (!page_cache_get_speculative(page))
668 goto repeat;
669
670 /*
671 * Has the page moved?
672 * This is part of the lockless pagecache protocol. See
673 * include/linux/pagemap.h for details.
674 */
675 if (unlikely(page != *pagep)) {
676 page_cache_release(page);
677 goto repeat;
678 }
679 }
680 rcu_read_unlock();
681
648 return page; 682 return page;
649} 683}
650EXPORT_SYMBOL(find_get_page); 684EXPORT_SYMBOL(find_get_page);
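
Aside, as an illustration only (not part of the patch): the speculative-get/re-check pattern that the lockless find_get_page() above relies on, reduced to a generic refcounted object with C11 atomics. The RCU read-side protection that keeps a kernel page from being freed and reused mid-lookup is omitted here, and try_get_ref() is only a stand-in for page_cache_get_speculative().

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct obj {
	atomic_int refcount;
};

/* Stand-in for page_cache_get_speculative(): take a reference only if the
 * object still has one, i.e. never resurrect an object that hit zero. */
static bool try_get_ref(struct obj *obj)
{
	int old = atomic_load(&obj->refcount);

	while (old > 0)
		if (atomic_compare_exchange_weak(&obj->refcount, &old, old + 1))
			return true;
	return false;
}

/* Lockless lookup skeleton: read the slot, take a speculative reference,
 * then re-check that the slot still points at the same object. */
static struct obj *lockless_lookup(struct obj * _Atomic *slot)
{
	struct obj *obj;

repeat:
	obj = atomic_load(slot);
	if (!obj)
		return NULL;
	if (!try_get_ref(obj))
		goto repeat;			/* object was going away, retry */
	if (obj != atomic_load(slot)) {		/* "has the page moved?" */
		atomic_fetch_sub(&obj->refcount, 1);
		goto repeat;
	}
	return obj;
}

int main(void)
{
	static struct obj o = { 1 };		/* one reference held by the "tree" */
	struct obj * _Atomic slot = &o;
	struct obj *found = lockless_lookup(&slot);

	if (found)
		atomic_fetch_sub(&found->refcount, 1);	/* drop our reference */
	return 0;
}
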
@@ -659,32 +693,22 @@ EXPORT_SYMBOL(find_get_page);
659 * 693 *
660 * Returns zero if the page was not present. find_lock_page() may sleep. 694 * Returns zero if the page was not present. find_lock_page() may sleep.
661 */ 695 */
662struct page *find_lock_page(struct address_space *mapping, 696struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
663 pgoff_t offset)
664{ 697{
665 struct page *page; 698 struct page *page;
666 699
667repeat: 700repeat:
668 read_lock_irq(&mapping->tree_lock); 701 page = find_get_page(mapping, offset);
669 page = radix_tree_lookup(&mapping->page_tree, offset);
670 if (page) { 702 if (page) {
671 page_cache_get(page); 703 lock_page(page);
672 if (TestSetPageLocked(page)) { 704 /* Has the page been truncated? */
673 read_unlock_irq(&mapping->tree_lock); 705 if (unlikely(page->mapping != mapping)) {
674 __lock_page(page); 706 unlock_page(page);
675 707 page_cache_release(page);
676 /* Has the page been truncated while we slept? */ 708 goto repeat;
677 if (unlikely(page->mapping != mapping)) {
678 unlock_page(page);
679 page_cache_release(page);
680 goto repeat;
681 }
682 VM_BUG_ON(page->index != offset);
683 goto out;
684 } 709 }
710 VM_BUG_ON(page->index != offset);
685 } 711 }
686 read_unlock_irq(&mapping->tree_lock);
687out:
688 return page; 712 return page;
689} 713}
690EXPORT_SYMBOL(find_lock_page); 714EXPORT_SYMBOL(find_lock_page);
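
With the tree lock gone, find_lock_page() becomes find_get_page() plus lock_page() plus a re-check of page->mapping: between the lookup and acquiring the page lock the page may have been truncated, in which case the reference is dropped and the whole lookup restarted. A toy, single-threaded rendering of that retry shape (the helpers are stand-ins, not the kernel API):

#include <stdio.h>

struct object { void *owner; int locked; int refs; };

static char mapping;                        /* the "address_space" we expect */
static struct object obj = { &mapping, 0, 1 };

static struct object *lookup_get(void)   { obj.refs++; return &obj; }
static void lock_obj(struct object *o)   { o->locked = 1; }
static void unlock_obj(struct object *o) { o->locked = 0; }
static void put_obj(struct object *o)    { o->refs--; }

static struct object *find_lock(void *owner)
{
    struct object *o;
repeat:
    o = lookup_get();
    if (!o)
        return NULL;
    lock_obj(o);                  /* may sleep in the real code */
    if (o->owner != owner) {      /* "truncated" while we slept */
        unlock_obj(o);
        put_obj(o);
        goto repeat;
    }
    return o;                     /* locked and still attached to owner */
}

int main(void)
{
    printf("locked=%d\n", find_lock(&mapping)->locked);
    return 0;
}
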
@@ -750,13 +774,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
750{ 774{
751 unsigned int i; 775 unsigned int i;
752 unsigned int ret; 776 unsigned int ret;
777 unsigned int nr_found;
778
779 rcu_read_lock();
780restart:
781 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
782 (void ***)pages, start, nr_pages);
783 ret = 0;
784 for (i = 0; i < nr_found; i++) {
785 struct page *page;
786repeat:
787 page = radix_tree_deref_slot((void **)pages[i]);
788 if (unlikely(!page))
789 continue;
790 /*
791 * this can only trigger if nr_found == 1, making livelock
792 * a non issue.
793 */
794 if (unlikely(page == RADIX_TREE_RETRY))
795 goto restart;
753 796
754 read_lock_irq(&mapping->tree_lock); 797 if (!page_cache_get_speculative(page))
755 ret = radix_tree_gang_lookup(&mapping->page_tree, 798 goto repeat;
756 (void **)pages, start, nr_pages); 799
757 for (i = 0; i < ret; i++) 800 /* Has the page moved? */
758 page_cache_get(pages[i]); 801 if (unlikely(page != *((void **)pages[i]))) {
759 read_unlock_irq(&mapping->tree_lock); 802 page_cache_release(page);
803 goto repeat;
804 }
805
806 pages[ret] = page;
807 ret++;
808 }
809 rcu_read_unlock();
760 return ret; 810 return ret;
761} 811}
762 812
@@ -777,19 +827,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
777{ 827{
778 unsigned int i; 828 unsigned int i;
779 unsigned int ret; 829 unsigned int ret;
830 unsigned int nr_found;
831
832 rcu_read_lock();
833restart:
834 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
835 (void ***)pages, index, nr_pages);
836 ret = 0;
837 for (i = 0; i < nr_found; i++) {
838 struct page *page;
839repeat:
840 page = radix_tree_deref_slot((void **)pages[i]);
841 if (unlikely(!page))
842 continue;
843 /*
844 * this can only trigger if nr_found == 1, making livelock
845 * a non issue.
846 */
847 if (unlikely(page == RADIX_TREE_RETRY))
848 goto restart;
780 849
781 read_lock_irq(&mapping->tree_lock); 850 if (page->mapping == NULL || page->index != index)
782 ret = radix_tree_gang_lookup(&mapping->page_tree,
783 (void **)pages, index, nr_pages);
784 for (i = 0; i < ret; i++) {
785 if (pages[i]->mapping == NULL || pages[i]->index != index)
786 break; 851 break;
787 852
788 page_cache_get(pages[i]); 853 if (!page_cache_get_speculative(page))
854 goto repeat;
855
856 /* Has the page moved? */
857 if (unlikely(page != *((void **)pages[i]))) {
858 page_cache_release(page);
859 goto repeat;
860 }
861
862 pages[ret] = page;
863 ret++;
789 index++; 864 index++;
790 } 865 }
791 read_unlock_irq(&mapping->tree_lock); 866 rcu_read_unlock();
792 return i; 867 return ret;
793} 868}
794EXPORT_SYMBOL(find_get_pages_contig); 869EXPORT_SYMBOL(find_get_pages_contig);
795 870
@@ -809,15 +884,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
809{ 884{
810 unsigned int i; 885 unsigned int i;
811 unsigned int ret; 886 unsigned int ret;
887 unsigned int nr_found;
888
889 rcu_read_lock();
890restart:
891 nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
892 (void ***)pages, *index, nr_pages, tag);
893 ret = 0;
894 for (i = 0; i < nr_found; i++) {
895 struct page *page;
896repeat:
897 page = radix_tree_deref_slot((void **)pages[i]);
898 if (unlikely(!page))
899 continue;
900 /*
901 * this can only trigger if nr_found == 1, making livelock
902 * a non issue.
903 */
904 if (unlikely(page == RADIX_TREE_RETRY))
905 goto restart;
906
907 if (!page_cache_get_speculative(page))
908 goto repeat;
909
910 /* Has the page moved? */
911 if (unlikely(page != *((void **)pages[i]))) {
912 page_cache_release(page);
913 goto repeat;
914 }
915
916 pages[ret] = page;
917 ret++;
918 }
919 rcu_read_unlock();
812 920
813 read_lock_irq(&mapping->tree_lock);
814 ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
815 (void **)pages, *index, nr_pages, tag);
816 for (i = 0; i < ret; i++)
817 page_cache_get(pages[i]);
818 if (ret) 921 if (ret)
819 *index = pages[ret - 1]->index + 1; 922 *index = pages[ret - 1]->index + 1;
820 read_unlock_irq(&mapping->tree_lock); 923
821 return ret; 924 return ret;
822} 925}
823EXPORT_SYMBOL(find_get_pages_tag); 926EXPORT_SYMBOL(find_get_pages_tag);
@@ -841,7 +944,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
841 struct page *page = find_get_page(mapping, index); 944 struct page *page = find_get_page(mapping, index);
842 945
843 if (page) { 946 if (page) {
844 if (!TestSetPageLocked(page)) 947 if (trylock_page(page))
845 return page; 948 return page;
846 page_cache_release(page); 949 page_cache_release(page);
847 return NULL; 950 return NULL;
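
grab_cache_page_nowait() now uses trylock_page(): take the page only if its lock can be acquired without sleeping, otherwise drop the reference and report failure so the caller can carry on. The same non-blocking pattern with a POSIX mutex, purely as an illustration of trylock semantics:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int resource = 42;

/* Hand back the resource only if it can be locked right now;
 * NULL means "busy, do not wait here". */
static int *grab_nowait(void)
{
    if (pthread_mutex_trylock(&lock) == 0)
        return &resource;
    return NULL;
}

int main(void)
{
    int *r = grab_nowait();
    printf("got=%d\n", r ? *r : -1);
    if (r)
        pthread_mutex_unlock(&lock);
    return 0;
}
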
@@ -933,8 +1036,17 @@ find_page:
933 ra, filp, page, 1036 ra, filp, page,
934 index, last_index - index); 1037 index, last_index - index);
935 } 1038 }
936 if (!PageUptodate(page)) 1039 if (!PageUptodate(page)) {
937 goto page_not_up_to_date; 1040 if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
1041 !mapping->a_ops->is_partially_uptodate)
1042 goto page_not_up_to_date;
1043 if (!trylock_page(page))
1044 goto page_not_up_to_date;
1045 if (!mapping->a_ops->is_partially_uptodate(page,
1046 desc, offset))
1047 goto page_not_up_to_date_locked;
1048 unlock_page(page);
1049 }
938page_ok: 1050page_ok:
939 /* 1051 /*
940 * i_size must be checked after we know the page is Uptodate. 1052 * i_size must be checked after we know the page is Uptodate.
@@ -1001,9 +1113,11 @@ page_ok:
1001 1113
1002page_not_up_to_date: 1114page_not_up_to_date:
1003 /* Get exclusive access to the page ... */ 1115 /* Get exclusive access to the page ... */
1004 if (lock_page_killable(page)) 1116 error = lock_page_killable(page);
1005 goto readpage_eio; 1117 if (unlikely(error))
1118 goto readpage_error;
1006 1119
1120page_not_up_to_date_locked:
1007 /* Did it get truncated before we got the lock? */ 1121 /* Did it get truncated before we got the lock? */
1008 if (!page->mapping) { 1122 if (!page->mapping) {
1009 unlock_page(page); 1123 unlock_page(page);
@@ -1030,8 +1144,9 @@ readpage:
1030 } 1144 }
1031 1145
1032 if (!PageUptodate(page)) { 1146 if (!PageUptodate(page)) {
1033 if (lock_page_killable(page)) 1147 error = lock_page_killable(page);
1034 goto readpage_eio; 1148 if (unlikely(error))
1149 goto readpage_error;
1035 if (!PageUptodate(page)) { 1150 if (!PageUptodate(page)) {
1036 if (page->mapping == NULL) { 1151 if (page->mapping == NULL) {
1037 /* 1152 /*
@@ -1043,15 +1158,14 @@ readpage:
1043 } 1158 }
1044 unlock_page(page); 1159 unlock_page(page);
1045 shrink_readahead_size_eio(filp, ra); 1160 shrink_readahead_size_eio(filp, ra);
1046 goto readpage_eio; 1161 error = -EIO;
1162 goto readpage_error;
1047 } 1163 }
1048 unlock_page(page); 1164 unlock_page(page);
1049 } 1165 }
1050 1166
1051 goto page_ok; 1167 goto page_ok;
1052 1168
1053readpage_eio:
1054 error = -EIO;
1055readpage_error: 1169readpage_error:
1056 /* UHHUH! A synchronous read error occurred. Report it */ 1170 /* UHHUH! A synchronous read error occurred. Report it */
1057 desc->error = error; 1171 desc->error = error;
@@ -1086,8 +1200,7 @@ out:
1086 ra->prev_pos |= prev_offset; 1200 ra->prev_pos |= prev_offset;
1087 1201
1088 *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; 1202 *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
1089 if (filp) 1203 file_accessed(filp);
1090 file_accessed(filp);
1091} 1204}
1092 1205
1093int file_read_actor(read_descriptor_t *desc, struct page *page, 1206int file_read_actor(read_descriptor_t *desc, struct page *page,
@@ -1200,42 +1313,41 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1200 1313
1201 mapping = filp->f_mapping; 1314 mapping = filp->f_mapping;
1202 inode = mapping->host; 1315 inode = mapping->host;
1203 retval = 0;
1204 if (!count) 1316 if (!count)
1205 goto out; /* skip atime */ 1317 goto out; /* skip atime */
1206 size = i_size_read(inode); 1318 size = i_size_read(inode);
1207 if (pos < size) { 1319 if (pos < size) {
1208 retval = generic_file_direct_IO(READ, iocb, 1320 retval = filemap_write_and_wait(mapping);
1209 iov, pos, nr_segs); 1321 if (!retval) {
1322 retval = mapping->a_ops->direct_IO(READ, iocb,
1323 iov, pos, nr_segs);
1324 }
1210 if (retval > 0) 1325 if (retval > 0)
1211 *ppos = pos + retval; 1326 *ppos = pos + retval;
1212 } 1327 if (retval) {
1213 if (likely(retval != 0)) { 1328 file_accessed(filp);
1214 file_accessed(filp); 1329 goto out;
1215 goto out; 1330 }
1216 } 1331 }
1217 } 1332 }
1218 1333
1219 retval = 0; 1334 for (seg = 0; seg < nr_segs; seg++) {
1220 if (count) { 1335 read_descriptor_t desc;
1221 for (seg = 0; seg < nr_segs; seg++) {
1222 read_descriptor_t desc;
1223 1336
1224 desc.written = 0; 1337 desc.written = 0;
1225 desc.arg.buf = iov[seg].iov_base; 1338 desc.arg.buf = iov[seg].iov_base;
1226 desc.count = iov[seg].iov_len; 1339 desc.count = iov[seg].iov_len;
1227 if (desc.count == 0) 1340 if (desc.count == 0)
1228 continue; 1341 continue;
1229 desc.error = 0; 1342 desc.error = 0;
1230 do_generic_file_read(filp,ppos,&desc,file_read_actor); 1343 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1231 retval += desc.written; 1344 retval += desc.written;
1232 if (desc.error) { 1345 if (desc.error) {
1233 retval = retval ?: desc.error; 1346 retval = retval ?: desc.error;
1234 break; 1347 break;
1235 }
1236 if (desc.count > 0)
1237 break;
1238 } 1348 }
1349 if (desc.count > 0)
1350 break;
1239 } 1351 }
1240out: 1352out:
1241 return retval; 1353 return retval;
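
generic_file_aio_read() now performs the O_DIRECT sequencing inline instead of calling the removed generic_file_direct_IO(): flush dirty pagecache with filemap_write_and_wait(), invoke ->direct_IO(READ, ...), and fall back to the buffered path only when the direct read produced neither data nor an error. A compressed sketch of that decision flow; the helpers below are stubs, not the kernel API:

#include <stdio.h>

/* Stubs: the flush succeeds and the direct read returns nothing,
 * so the buffered path ends up servicing the request. */
static long flush_dirty(void)                 { return 0; }
static long direct_read(long pos, long len)   { (void)pos; (void)len; return 0; }
static long buffered_read(long pos, long len) { (void)pos; return len; }

static long read_prefer_direct(long pos, long len)
{
    long ret = flush_dirty();          /* filemap_write_and_wait()  */
    if (ret)
        return ret;                    /* flush failed: report it   */
    ret = direct_read(pos, len);       /* ->direct_IO(READ, ...)    */
    if (ret)
        return ret;                    /* data or error: we're done */
    return buffered_read(pos, len);    /* do_generic_file_read()    */
}

int main(void)
{
    printf("read %ld bytes\n", read_prefer_direct(0, 4096));
    return 0;
}
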
@@ -1669,8 +1781,9 @@ static int __remove_suid(struct dentry *dentry, int kill)
1669 return notify_change(dentry, &newattrs); 1781 return notify_change(dentry, &newattrs);
1670} 1782}
1671 1783
1672int remove_suid(struct dentry *dentry) 1784int file_remove_suid(struct file *file)
1673{ 1785{
1786 struct dentry *dentry = file->f_path.dentry;
1674 int killsuid = should_remove_suid(dentry); 1787 int killsuid = should_remove_suid(dentry);
1675 int killpriv = security_inode_need_killpriv(dentry); 1788 int killpriv = security_inode_need_killpriv(dentry);
1676 int error = 0; 1789 int error = 0;
@@ -1684,7 +1797,7 @@ int remove_suid(struct dentry *dentry)
1684 1797
1685 return error; 1798 return error;
1686} 1799}
1687EXPORT_SYMBOL(remove_suid); 1800EXPORT_SYMBOL(file_remove_suid);
1688 1801
1689static size_t __iovec_copy_from_user_inatomic(char *vaddr, 1802static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1690 const struct iovec *iov, size_t base, size_t bytes) 1803 const struct iovec *iov, size_t base, size_t bytes)
@@ -1779,7 +1892,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
1779 * The !iov->iov_len check ensures we skip over unlikely 1892 * The !iov->iov_len check ensures we skip over unlikely
1780 * zero-length segments (without overruning the iovec). 1893 * zero-length segments (without overruning the iovec).
1781 */ 1894 */
1782 while (bytes || unlikely(!iov->iov_len && i->count)) { 1895 while (bytes || unlikely(i->count && !iov->iov_len)) {
1783 int copy; 1896 int copy;
1784 1897
1785 copy = min(bytes, iov->iov_len - base); 1898 copy = min(bytes, iov->iov_len - base);
@@ -2004,11 +2117,62 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2004 struct address_space *mapping = file->f_mapping; 2117 struct address_space *mapping = file->f_mapping;
2005 struct inode *inode = mapping->host; 2118 struct inode *inode = mapping->host;
2006 ssize_t written; 2119 ssize_t written;
2120 size_t write_len;
2121 pgoff_t end;
2007 2122
2008 if (count != ocount) 2123 if (count != ocount)
2009 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); 2124 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2010 2125
2011 written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); 2126 /*
2127 * Unmap all mmappings of the file up-front.
2128 *
2129 * This will cause any pte dirty bits to be propagated into the
2130 * pageframes for the subsequent filemap_write_and_wait().
2131 */
2132 write_len = iov_length(iov, *nr_segs);
2133 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
2134 if (mapping_mapped(mapping))
2135 unmap_mapping_range(mapping, pos, write_len, 0);
2136
2137 written = filemap_write_and_wait(mapping);
2138 if (written)
2139 goto out;
2140
2141 /*
2142 * After a write we want buffered reads to be sure to go to disk to get
2143 * the new data. We invalidate clean cached page from the region we're
2144 * about to write. We do this *before* the write so that we can return
2145 * without clobbering -EIOCBQUEUED from ->direct_IO().
2146 */
2147 if (mapping->nrpages) {
2148 written = invalidate_inode_pages2_range(mapping,
2149 pos >> PAGE_CACHE_SHIFT, end);
2150 /*
2151 * If a page can not be invalidated, return 0 to fall back
2152 * to buffered write.
2153 */
2154 if (written) {
2155 if (written == -EBUSY)
2156 return 0;
2157 goto out;
2158 }
2159 }
2160
2161 written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
2162
2163 /*
2164 * Finally, try again to invalidate clean pages which might have been
2165 * cached by non-direct readahead, or faulted in by get_user_pages()
2166 * if the source of the write was an mmap'ed region of the file
2167 * we're writing. Either one is a pretty crazy thing to do,
2168 * so we don't support it 100%. If this invalidation
2169 * fails, tough, the write still worked...
2170 */
2171 if (mapping->nrpages) {
2172 invalidate_inode_pages2_range(mapping,
2173 pos >> PAGE_CACHE_SHIFT, end);
2174 }
2175
2012 if (written > 0) { 2176 if (written > 0) {
2013 loff_t end = pos + written; 2177 loff_t end = pos + written;
2014 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2178 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
@@ -2024,6 +2188,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2024 * i_mutex is held, which protects generic_osync_inode() from 2188 * i_mutex is held, which protects generic_osync_inode() from
2025 * livelocking. AIO O_DIRECT ops attempt to sync metadata here. 2189 * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
2026 */ 2190 */
2191out:
2027 if ((written >= 0 || written == -EIOCBQUEUED) && 2192 if ((written >= 0 || written == -EIOCBQUEUED) &&
2028 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2193 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2029 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); 2194 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
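
The write-side sequencing that used to live in generic_file_direct_IO() is now inlined in generic_file_direct_write(): unmap the file's mappings so dirty PTE bits reach the pageframes, flush, invalidate the target range before the write so a failure there cannot clobber -EIOCBQUEUED from ->direct_IO() (an -EBUSY turns into "return 0, fall back to buffered"), issue ->direct_IO(WRITE, ...), then invalidate once more to drop pages that raced in via readahead or get_user_pages(). A flow sketch with placeholder callbacks, not the kernel API:

#include <stdio.h>

#define TOY_EBUSY (-16)

/* Stubs for unmap_mapping_range(), filemap_write_and_wait(),
 * invalidate_inode_pages2_range() and ->direct_IO(WRITE, ...). */
static void unmap_range(long pos, long len)      { (void)pos; (void)len; }
static long flush_dirty(void)                    { return 0; }
static long invalidate_range(long pos, long len) { (void)pos; (void)len; return 0; }
static long direct_write(long pos, long len)     { (void)pos; return len; }

static long write_direct(long pos, long len)
{
    long ret;

    unmap_range(pos, len);             /* propagate pte dirty bits */
    ret = flush_dirty();
    if (ret)
        return ret;

    ret = invalidate_range(pos, len);  /* before the write, so the write's
                                        * own return value is never clobbered */
    if (ret == TOY_EBUSY)
        return 0;                      /* caller falls back to buffered write */
    if (ret)
        return ret;

    ret = direct_write(pos, len);      /* ->direct_IO(WRITE, ...) */

    invalidate_range(pos, len);        /* best effort: drop pages cached or
                                        * faulted in while we were writing */
    return ret;
}

int main(void)
{
    printf("wrote %ld bytes\n", write_direct(0, 4096));
    return 0;
}
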
@@ -2395,7 +2560,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2395 if (count == 0) 2560 if (count == 0)
2396 goto out; 2561 goto out;
2397 2562
2398 err = remove_suid(file->f_path.dentry); 2563 err = file_remove_suid(file);
2399 if (err) 2564 if (err)
2400 goto out; 2565 goto out;
2401 2566
@@ -2511,66 +2676,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2511} 2676}
2512EXPORT_SYMBOL(generic_file_aio_write); 2677EXPORT_SYMBOL(generic_file_aio_write);
2513 2678
2514/*
2515 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
2516 * went wrong during pagecache shootdown.
2517 */
2518static ssize_t
2519generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2520 loff_t offset, unsigned long nr_segs)
2521{
2522 struct file *file = iocb->ki_filp;
2523 struct address_space *mapping = file->f_mapping;
2524 ssize_t retval;
2525 size_t write_len;
2526 pgoff_t end = 0; /* silence gcc */
2527
2528 /*
2529 * If it's a write, unmap all mmappings of the file up-front. This
2530 * will cause any pte dirty bits to be propagated into the pageframes
2531 * for the subsequent filemap_write_and_wait().
2532 */
2533 if (rw == WRITE) {
2534 write_len = iov_length(iov, nr_segs);
2535 end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
2536 if (mapping_mapped(mapping))
2537 unmap_mapping_range(mapping, offset, write_len, 0);
2538 }
2539
2540 retval = filemap_write_and_wait(mapping);
2541 if (retval)
2542 goto out;
2543
2544 /*
2545 * After a write we want buffered reads to be sure to go to disk to get
2546 * the new data. We invalidate clean cached page from the region we're
2547 * about to write. We do this *before* the write so that we can return
2548 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
2549 */
2550 if (rw == WRITE && mapping->nrpages) {
2551 retval = invalidate_inode_pages2_range(mapping,
2552 offset >> PAGE_CACHE_SHIFT, end);
2553 if (retval)
2554 goto out;
2555 }
2556
2557 retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
2558
2559 /*
2560 * Finally, try again to invalidate clean pages which might have been
2561 * cached by non-direct readahead, or faulted in by get_user_pages()
2562 * if the source of the write was an mmap'ed region of the file
2563 * we're writing. Either one is a pretty crazy thing to do,
2564 * so we don't support it 100%. If this invalidation
2565 * fails, tough, the write still worked...
2566 */
2567 if (rw == WRITE && mapping->nrpages) {
2568 invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end);
2569 }
2570out:
2571 return retval;
2572}
2573
2574/** 2679/**
2575 * try_to_release_page() - release old fs-specific metadata on a page 2680 * try_to_release_page() - release old fs-specific metadata on a page
2576 * 2681 *
@@ -2582,9 +2687,8 @@ out:
2582 * Otherwise return zero. 2687 * Otherwise return zero.
2583 * 2688 *
2584 * The @gfp_mask argument specifies whether I/O may be performed to release 2689 * The @gfp_mask argument specifies whether I/O may be performed to release
2585 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). 2690 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
2586 * 2691 *
2587 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
2588 */ 2692 */
2589int try_to_release_page(struct page *page, gfp_t gfp_mask) 2693int try_to_release_page(struct page *page, gfp_t gfp_mask)
2590{ 2694{
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 3e744abcce9d..b5167dfb2f2d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -13,7 +13,10 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/uio.h> 14#include <linux/uio.h>
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/mmu_notifier.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/seqlock.h>
19#include <linux/mutex.h>
17#include <asm/tlbflush.h> 20#include <asm/tlbflush.h>
18#include <asm/io.h> 21#include <asm/io.h>
19 22
@@ -21,22 +24,18 @@
21 * We do use our own empty page to avoid interference with other users 24 * We do use our own empty page to avoid interference with other users
22 * of ZERO_PAGE(), such as /dev/zero 25 * of ZERO_PAGE(), such as /dev/zero
23 */ 26 */
27static DEFINE_MUTEX(xip_sparse_mutex);
28static seqcount_t xip_sparse_seq = SEQCNT_ZERO;
24static struct page *__xip_sparse_page; 29static struct page *__xip_sparse_page;
25 30
31/* called under xip_sparse_mutex */
26static struct page *xip_sparse_page(void) 32static struct page *xip_sparse_page(void)
27{ 33{
28 if (!__xip_sparse_page) { 34 if (!__xip_sparse_page) {
29 struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); 35 struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
30 36
31 if (page) { 37 if (page)
32 static DEFINE_SPINLOCK(xip_alloc_lock); 38 __xip_sparse_page = page;
33 spin_lock(&xip_alloc_lock);
34 if (!__xip_sparse_page)
35 __xip_sparse_page = page;
36 else
37 __free_page(page);
38 spin_unlock(&xip_alloc_lock);
39 }
40 } 39 }
41 return __xip_sparse_page; 40 return __xip_sparse_page;
42} 41}
@@ -173,22 +172,27 @@ __xip_unmap (struct address_space * mapping,
173 pte_t pteval; 172 pte_t pteval;
174 spinlock_t *ptl; 173 spinlock_t *ptl;
175 struct page *page; 174 struct page *page;
175 unsigned count;
176 int locked = 0;
177
178 count = read_seqcount_begin(&xip_sparse_seq);
176 179
177 page = __xip_sparse_page; 180 page = __xip_sparse_page;
178 if (!page) 181 if (!page)
179 return; 182 return;
180 183
184retry:
181 spin_lock(&mapping->i_mmap_lock); 185 spin_lock(&mapping->i_mmap_lock);
182 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 186 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
183 mm = vma->vm_mm; 187 mm = vma->vm_mm;
184 address = vma->vm_start + 188 address = vma->vm_start +
185 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 189 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
186 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 190 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
187 pte = page_check_address(page, mm, address, &ptl); 191 pte = page_check_address(page, mm, address, &ptl, 1);
188 if (pte) { 192 if (pte) {
189 /* Nuke the page table entry. */ 193 /* Nuke the page table entry. */
190 flush_cache_page(vma, address, pte_pfn(*pte)); 194 flush_cache_page(vma, address, pte_pfn(*pte));
191 pteval = ptep_clear_flush(vma, address, pte); 195 pteval = ptep_clear_flush_notify(vma, address, pte);
192 page_remove_rmap(page, vma); 196 page_remove_rmap(page, vma);
193 dec_mm_counter(mm, file_rss); 197 dec_mm_counter(mm, file_rss);
194 BUG_ON(pte_dirty(pteval)); 198 BUG_ON(pte_dirty(pteval));
@@ -197,6 +201,14 @@ __xip_unmap (struct address_space * mapping,
197 } 201 }
198 } 202 }
199 spin_unlock(&mapping->i_mmap_lock); 203 spin_unlock(&mapping->i_mmap_lock);
204
205 if (locked) {
206 mutex_unlock(&xip_sparse_mutex);
207 } else if (read_seqcount_retry(&xip_sparse_seq, count)) {
208 mutex_lock(&xip_sparse_mutex);
209 locked = 1;
210 goto retry;
211 }
200} 212}
201 213
202/* 214/*
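
__xip_unmap() has become an optimistic reader of xip_sparse_seq: it samples the count, does an unlocked unmap pass, and if read_seqcount_retry() shows that the fault path (which bumps the count under xip_sparse_mutex) ran concurrently, it takes the mutex and retries once. The seqcount discipline itself, reduced to self-contained C11 (seq_cst atomics for simplicity; the kernel primitives are weaker and cheaper):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;                 /* odd while a writer is mid-update */
static atomic_int value_a, value_b;     /* writer keeps a + b == 0 */

static void write_pair(int v)
{
    atomic_fetch_add(&seq, 1);          /* write_seqcount_begin(): odd  */
    atomic_store(&value_a, v);
    atomic_store(&value_b, -v);
    atomic_fetch_add(&seq, 1);          /* write_seqcount_end(): even   */
}

static int read_pair(int *a, int *b)
{
    for (;;) {
        unsigned start = atomic_load(&seq);   /* read_seqcount_begin() */
        if (start & 1)
            continue;                         /* writer active, retry  */
        *a = atomic_load(&value_a);
        *b = atomic_load(&value_b);
        if (atomic_load(&seq) == start)       /* read_seqcount_retry() */
            return *a + *b;                   /* consistent: always 0  */
    }
}

int main(void)
{
    int a, b;
    write_pair(7);
    int sum = read_pair(&a, &b);
    printf("sum=%d (a=%d, b=%d)\n", sum, a, b);
    return 0;
}
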
@@ -217,7 +229,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
217 int error; 229 int error;
218 230
219 /* XXX: are VM_FAULT_ codes OK? */ 231 /* XXX: are VM_FAULT_ codes OK? */
220 232again:
221 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 233 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
222 if (vmf->pgoff >= size) 234 if (vmf->pgoff >= size)
223 return VM_FAULT_SIGBUS; 235 return VM_FAULT_SIGBUS;
@@ -236,8 +248,10 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
236 int err; 248 int err;
237 249
238 /* maybe shared writable, allocate new block */ 250 /* maybe shared writable, allocate new block */
251 mutex_lock(&xip_sparse_mutex);
239 error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, 252 error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
240 &xip_mem, &xip_pfn); 253 &xip_mem, &xip_pfn);
254 mutex_unlock(&xip_sparse_mutex);
241 if (error) 255 if (error)
242 return VM_FAULT_SIGBUS; 256 return VM_FAULT_SIGBUS;
243 /* unmap sparse mappings at pgoff from all other vmas */ 257 /* unmap sparse mappings at pgoff from all other vmas */
@@ -251,14 +265,34 @@ found:
251 BUG_ON(err); 265 BUG_ON(err);
252 return VM_FAULT_NOPAGE; 266 return VM_FAULT_NOPAGE;
253 } else { 267 } else {
268 int err, ret = VM_FAULT_OOM;
269
270 mutex_lock(&xip_sparse_mutex);
271 write_seqcount_begin(&xip_sparse_seq);
272 error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
273 &xip_mem, &xip_pfn);
274 if (unlikely(!error)) {
275 write_seqcount_end(&xip_sparse_seq);
276 mutex_unlock(&xip_sparse_mutex);
277 goto again;
278 }
279 if (error != -ENODATA)
280 goto out;
254 /* not shared and writable, use xip_sparse_page() */ 281 /* not shared and writable, use xip_sparse_page() */
255 page = xip_sparse_page(); 282 page = xip_sparse_page();
256 if (!page) 283 if (!page)
257 return VM_FAULT_OOM; 284 goto out;
285 err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
286 page);
287 if (err == -ENOMEM)
288 goto out;
258 289
259 page_cache_get(page); 290 ret = VM_FAULT_NOPAGE;
260 vmf->page = page; 291out:
261 return 0; 292 write_seqcount_end(&xip_sparse_seq);
293 mutex_unlock(&xip_sparse_mutex);
294
295 return ret;
262 } 296 }
263} 297}
264 298
@@ -307,8 +341,10 @@ __xip_file_write(struct file *filp, const char __user *buf,
307 &xip_mem, &xip_pfn); 341 &xip_mem, &xip_pfn);
308 if (status == -ENODATA) { 342 if (status == -ENODATA) {
309 /* we allocate a new page unmap it */ 343 /* we allocate a new page unmap it */
344 mutex_lock(&xip_sparse_mutex);
310 status = a_ops->get_xip_mem(mapping, index, 1, 345 status = a_ops->get_xip_mem(mapping, index, 1,
311 &xip_mem, &xip_pfn); 346 &xip_mem, &xip_pfn);
347 mutex_unlock(&xip_sparse_mutex);
312 if (!status) 348 if (!status)
313 /* unmap page at pgoff from all other vmas */ 349 /* unmap page at pgoff from all other vmas */
314 __xip_unmap(mapping, index); 350 __xip_unmap(mapping, index);
@@ -380,7 +416,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
380 if (count == 0) 416 if (count == 0)
381 goto out_backing; 417 goto out_backing;
382 418
383 ret = remove_suid(filp->f_path.dentry); 419 ret = file_remove_suid(filp);
384 if (ret) 420 if (ret)
385 goto out_backing; 421 goto out_backing;
386 422
diff --git a/mm/fremap.c b/mm/fremap.c
index 07a9c82ce1a3..7d12ca70ef7b 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -15,11 +15,14 @@
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/mmu_notifier.h>
18 19
19#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
20#include <asm/cacheflush.h> 21#include <asm/cacheflush.h>
21#include <asm/tlbflush.h> 22#include <asm/tlbflush.h>
22 23
24#include "internal.h"
25
23static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, 26static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
24 unsigned long addr, pte_t *ptep) 27 unsigned long addr, pte_t *ptep)
25{ 28{
@@ -214,13 +217,31 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
214 spin_unlock(&mapping->i_mmap_lock); 217 spin_unlock(&mapping->i_mmap_lock);
215 } 218 }
216 219
220 if (vma->vm_flags & VM_LOCKED) {
221 /*
222 * drop PG_Mlocked flag for over-mapped range
223 */
224 unsigned int saved_flags = vma->vm_flags;
225 munlock_vma_pages_range(vma, start, start + size);
226 vma->vm_flags = saved_flags;
227 }
228
229 mmu_notifier_invalidate_range_start(mm, start, start + size);
217 err = populate_range(mm, vma, start, size, pgoff); 230 err = populate_range(mm, vma, start, size, pgoff);
231 mmu_notifier_invalidate_range_end(mm, start, start + size);
218 if (!err && !(flags & MAP_NONBLOCK)) { 232 if (!err && !(flags & MAP_NONBLOCK)) {
219 if (unlikely(has_write_lock)) { 233 if (vma->vm_flags & VM_LOCKED) {
220 downgrade_write(&mm->mmap_sem); 234 /*
221 has_write_lock = 0; 235 * might be mapping previously unmapped range of file
236 */
237 mlock_vma_pages_range(vma, start, start + size);
238 } else {
239 if (unlikely(has_write_lock)) {
240 downgrade_write(&mm->mmap_sem);
241 has_write_lock = 0;
242 }
243 make_pages_present(start, start+size);
222 } 244 }
223 make_pages_present(start, start+size);
224 } 245 }
225 246
226 /* 247 /*
@@ -237,4 +258,3 @@ out:
237 258
238 return err; 259 return err;
239} 260}
240
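
The sys_remap_file_pages() hunk brackets populate_range() with mmu_notifier_invalidate_range_start()/end(), so secondary MMUs that mirror the address space are told before and after a range is rewritten, and it drops and re-establishes mlock state around the over-mapped range. The begin/end bracketing pattern on its own, as a small callback-based toy (names and types are made up):

#include <stdio.h>

struct range_listener {
    void (*range_start)(long start, long end);
    void (*range_end)(long start, long end);
};

static void log_start(long s, long e) { printf("invalidate start [%ld, %ld)\n", s, e); }
static void log_end(long s, long e)   { printf("invalidate end   [%ld, %ld)\n", s, e); }

static struct range_listener listeners[] = { { log_start, log_end } };
#define NLISTENERS (sizeof(listeners) / sizeof(listeners[0]))

static void remap_range(long start, long end)
{
    for (unsigned i = 0; i < NLISTENERS; i++)
        listeners[i].range_start(start, end);   /* ..._invalidate_range_start() */

    printf("rewriting mappings for [%ld, %ld)\n", start, end);

    for (unsigned i = 0; i < NLISTENERS; i++)
        listeners[i].range_end(start, end);     /* ..._invalidate_range_end() */
}

int main(void)
{
    remap_range(4096, 8192);
    return 0;
}
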
diff --git a/mm/highmem.c b/mm/highmem.c
index 7da4a7b6af11..b36b83b920ff 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -40,6 +40,7 @@
40#ifdef CONFIG_HIGHMEM 40#ifdef CONFIG_HIGHMEM
41 41
42unsigned long totalhigh_pages __read_mostly; 42unsigned long totalhigh_pages __read_mostly;
43EXPORT_SYMBOL(totalhigh_pages);
43 44
44unsigned int nr_free_highpages (void) 45unsigned int nr_free_highpages (void)
45{ 46{
@@ -69,6 +70,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
69static void flush_all_zero_pkmaps(void) 70static void flush_all_zero_pkmaps(void)
70{ 71{
71 int i; 72 int i;
73 int need_flush = 0;
72 74
73 flush_cache_kmaps(); 75 flush_cache_kmaps();
74 76
@@ -100,8 +102,10 @@ static void flush_all_zero_pkmaps(void)
100 &pkmap_page_table[i]); 102 &pkmap_page_table[i]);
101 103
102 set_page_address(page, NULL); 104 set_page_address(page, NULL);
105 need_flush = 1;
103 } 106 }
104 flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); 107 if (need_flush)
108 flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
105} 109}
106 110
107/** 111/**
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ab171274ef21..421aee99b84a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7,45 +7,360 @@
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/seq_file.h>
10#include <linux/sysctl.h> 11#include <linux/sysctl.h>
11#include <linux/highmem.h> 12#include <linux/highmem.h>
13#include <linux/mmu_notifier.h>
12#include <linux/nodemask.h> 14#include <linux/nodemask.h>
13#include <linux/pagemap.h> 15#include <linux/pagemap.h>
14#include <linux/mempolicy.h> 16#include <linux/mempolicy.h>
15#include <linux/cpuset.h> 17#include <linux/cpuset.h>
16#include <linux/mutex.h> 18#include <linux/mutex.h>
19#include <linux/bootmem.h>
20#include <linux/sysfs.h>
17 21
18#include <asm/page.h> 22#include <asm/page.h>
19#include <asm/pgtable.h> 23#include <asm/pgtable.h>
24#include <asm/io.h>
20 25
21#include <linux/hugetlb.h> 26#include <linux/hugetlb.h>
22#include "internal.h" 27#include "internal.h"
23 28
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 29const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26static unsigned long surplus_huge_pages;
27static unsigned long nr_overcommit_huge_pages;
28unsigned long max_huge_pages;
29unsigned long sysctl_overcommit_huge_pages;
30static struct list_head hugepage_freelists[MAX_NUMNODES];
31static unsigned int nr_huge_pages_node[MAX_NUMNODES];
32static unsigned int free_huge_pages_node[MAX_NUMNODES];
33static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
34static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 30static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35unsigned long hugepages_treat_as_movable; 31unsigned long hugepages_treat_as_movable;
36static int hugetlb_next_nid; 32
33static int max_hstate;
34unsigned int default_hstate_idx;
35struct hstate hstates[HUGE_MAX_HSTATE];
36
37__initdata LIST_HEAD(huge_boot_pages);
38
39/* for command line parsing */
40static struct hstate * __initdata parsed_hstate;
41static unsigned long __initdata default_hstate_max_huge_pages;
42static unsigned long __initdata default_hstate_size;
43
44#define for_each_hstate(h) \
45 for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
37 46
38/* 47/*
39 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 48 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
40 */ 49 */
41static DEFINE_SPINLOCK(hugetlb_lock); 50static DEFINE_SPINLOCK(hugetlb_lock);
42 51
43static void clear_huge_page(struct page *page, unsigned long addr) 52/*
53 * Region tracking -- allows tracking of reservations and instantiated pages
54 * across the pages in a mapping.
55 *
56 * The region data structures are protected by a combination of the mmap_sem
 57 * and the hugetlb_instantiation_mutex. To access or modify a region the caller
58 * must either hold the mmap_sem for write, or the mmap_sem for read and
59 * the hugetlb_instantiation mutex:
60 *
61 * down_write(&mm->mmap_sem);
62 * or
63 * down_read(&mm->mmap_sem);
64 * mutex_lock(&hugetlb_instantiation_mutex);
65 */
66struct file_region {
67 struct list_head link;
68 long from;
69 long to;
70};
71
72static long region_add(struct list_head *head, long f, long t)
73{
74 struct file_region *rg, *nrg, *trg;
75
76 /* Locate the region we are either in or before. */
77 list_for_each_entry(rg, head, link)
78 if (f <= rg->to)
79 break;
80
81 /* Round our left edge to the current segment if it encloses us. */
82 if (f > rg->from)
83 f = rg->from;
84
85 /* Check for and consume any regions we now overlap with. */
86 nrg = rg;
87 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
88 if (&rg->link == head)
89 break;
90 if (rg->from > t)
91 break;
92
93 /* If this area reaches higher then extend our area to
94 * include it completely. If this is not the first area
95 * which we intend to reuse, free it. */
96 if (rg->to > t)
97 t = rg->to;
98 if (rg != nrg) {
99 list_del(&rg->link);
100 kfree(rg);
101 }
102 }
103 nrg->from = f;
104 nrg->to = t;
105 return 0;
106}
107
108static long region_chg(struct list_head *head, long f, long t)
109{
110 struct file_region *rg, *nrg;
111 long chg = 0;
112
113 /* Locate the region we are before or in. */
114 list_for_each_entry(rg, head, link)
115 if (f <= rg->to)
116 break;
117
118 /* If we are below the current region then a new region is required.
119 * Subtle, allocate a new region at the position but make it zero
120 * size such that we can guarantee to record the reservation. */
121 if (&rg->link == head || t < rg->from) {
122 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
123 if (!nrg)
124 return -ENOMEM;
125 nrg->from = f;
126 nrg->to = f;
127 INIT_LIST_HEAD(&nrg->link);
128 list_add(&nrg->link, rg->link.prev);
129
130 return t - f;
131 }
132
133 /* Round our left edge to the current segment if it encloses us. */
134 if (f > rg->from)
135 f = rg->from;
136 chg = t - f;
137
138 /* Check for and consume any regions we now overlap with. */
139 list_for_each_entry(rg, rg->link.prev, link) {
140 if (&rg->link == head)
141 break;
142 if (rg->from > t)
143 return chg;
144
 145 /* We overlap with this area, if it extends further than
146 * us then we must extend ourselves. Account for its
147 * existing reservation. */
148 if (rg->to > t) {
149 chg += rg->to - t;
150 t = rg->to;
151 }
152 chg -= rg->to - rg->from;
153 }
154 return chg;
155}
156
157static long region_truncate(struct list_head *head, long end)
158{
159 struct file_region *rg, *trg;
160 long chg = 0;
161
162 /* Locate the region we are either in or before. */
163 list_for_each_entry(rg, head, link)
164 if (end <= rg->to)
165 break;
166 if (&rg->link == head)
167 return 0;
168
169 /* If we are in the middle of a region then adjust it. */
170 if (end > rg->from) {
171 chg = rg->to - end;
172 rg->to = end;
173 rg = list_entry(rg->link.next, typeof(*rg), link);
174 }
175
176 /* Drop any remaining regions. */
177 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
178 if (&rg->link == head)
179 break;
180 chg += rg->to - rg->from;
181 list_del(&rg->link);
182 kfree(rg);
183 }
184 return chg;
185}
186
187static long region_count(struct list_head *head, long f, long t)
188{
189 struct file_region *rg;
190 long chg = 0;
191
192 /* Locate each segment we overlap with, and count that overlap. */
193 list_for_each_entry(rg, head, link) {
194 int seg_from;
195 int seg_to;
196
197 if (rg->to <= f)
198 continue;
199 if (rg->from >= t)
200 break;
201
202 seg_from = max(rg->from, f);
203 seg_to = min(rg->to, t);
204
205 chg += seg_to - seg_from;
206 }
207
208 return chg;
209}
210
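
The region_* helpers above keep, per mapping, a sorted list of disjoint [from, to) spans. region_chg() reports how many pages of a requested range are not yet covered (that is what gets charged against quota and reservations), and region_add() then merges the range into the map. A self-contained toy with the same semantics, using a small flat array instead of a list_head (illustrative only):

#include <stdio.h>

#define MAXR 16
static long rfrom[MAXR], rto[MAXR];     /* disjoint [from, to) spans */
static int nr;

/* How many units of [f, t) are not yet covered? (region_chg() analogue) */
static long chg(long f, long t)
{
    long covered = 0;
    for (int i = 0; i < nr; i++) {
        long lo = f > rfrom[i] ? f : rfrom[i];
        long hi = t < rto[i] ? t : rto[i];
        if (hi > lo)
            covered += hi - lo;
    }
    return (t - f) - covered;
}

/* Merge [f, t) into the set of spans. (region_add() analogue) */
static void add(long f, long t)
{
    long nf = f, nt = t;
    int j = 0;
    for (int i = 0; i < nr; i++) {
        if (rto[i] < nf || rfrom[i] > nt) {     /* disjoint: keep as-is */
            rfrom[j] = rfrom[i];
            rto[j] = rto[i];
            j++;
        } else {                                /* overlapping or adjacent: absorb */
            if (rfrom[i] < nf) nf = rfrom[i];
            if (rto[i] > nt)   nt = rto[i];
        }
    }
    rfrom[j] = nf;
    rto[j] = nt;
    nr = j + 1;
}

int main(void)
{
    printf("chg(0,3) = %ld\n", chg(0, 3));   /* 3: nothing reserved yet     */
    add(0, 3);
    printf("chg(2,5) = %ld\n", chg(2, 5));   /* 2: only offsets 3,4 are new */
    add(2, 5);
    printf("chg(0,5) = %ld\n", chg(0, 5));   /* 0: fully covered            */
    return 0;
}
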
211/*
212 * Convert the address within this vma to the page offset within
213 * the mapping, in pagecache page units; huge pages here.
214 */
215static pgoff_t vma_hugecache_offset(struct hstate *h,
216 struct vm_area_struct *vma, unsigned long address)
217{
218 return ((address - vma->vm_start) >> huge_page_shift(h)) +
219 (vma->vm_pgoff >> huge_page_order(h));
220}
221
222/*
223 * Flags for MAP_PRIVATE reservations. These are stored in the bottom
224 * bits of the reservation map pointer, which are always clear due to
225 * alignment.
226 */
227#define HPAGE_RESV_OWNER (1UL << 0)
228#define HPAGE_RESV_UNMAPPED (1UL << 1)
229#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
230
231/*
232 * These helpers are used to track how many pages are reserved for
233 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
234 * is guaranteed to have their future faults succeed.
235 *
236 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
237 * the reserve counters are updated with the hugetlb_lock held. It is safe
238 * to reset the VMA at fork() time as it is not in use yet and there is no
239 * chance of the global counters getting corrupted as a result of the values.
240 *
241 * The private mapping reservation is represented in a subtly different
242 * manner to a shared mapping. A shared mapping has a region map associated
243 * with the underlying file, this region map represents the backing file
244 * pages which have ever had a reservation assigned which this persists even
245 * after the page is instantiated. A private mapping has a region map
246 * associated with the original mmap which is attached to all VMAs which
247 * reference it, this region map represents those offsets which have consumed
248 * reservation ie. where pages have been instantiated.
249 */
250static unsigned long get_vma_private_data(struct vm_area_struct *vma)
251{
252 return (unsigned long)vma->vm_private_data;
253}
254
255static void set_vma_private_data(struct vm_area_struct *vma,
256 unsigned long value)
257{
258 vma->vm_private_data = (void *)value;
259}
260
261struct resv_map {
262 struct kref refs;
263 struct list_head regions;
264};
265
266static struct resv_map *resv_map_alloc(void)
267{
268 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
269 if (!resv_map)
270 return NULL;
271
272 kref_init(&resv_map->refs);
273 INIT_LIST_HEAD(&resv_map->regions);
274
275 return resv_map;
276}
277
278static void resv_map_release(struct kref *ref)
279{
280 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
281
282 /* Clear out any active regions before we release the map. */
283 region_truncate(&resv_map->regions, 0);
284 kfree(resv_map);
285}
286
287static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
288{
289 VM_BUG_ON(!is_vm_hugetlb_page(vma));
290 if (!(vma->vm_flags & VM_SHARED))
291 return (struct resv_map *)(get_vma_private_data(vma) &
292 ~HPAGE_RESV_MASK);
293 return NULL;
294}
295
296static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
297{
298 VM_BUG_ON(!is_vm_hugetlb_page(vma));
299 VM_BUG_ON(vma->vm_flags & VM_SHARED);
300
301 set_vma_private_data(vma, (get_vma_private_data(vma) &
302 HPAGE_RESV_MASK) | (unsigned long)map);
303}
304
305static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
306{
307 VM_BUG_ON(!is_vm_hugetlb_page(vma));
308 VM_BUG_ON(vma->vm_flags & VM_SHARED);
309
310 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
311}
312
313static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
314{
315 VM_BUG_ON(!is_vm_hugetlb_page(vma));
316
317 return (get_vma_private_data(vma) & flag) != 0;
318}
319
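
Because the resv_map is allocated with normal kmalloc() alignment, its address always has the bottom bits clear, so HPAGE_RESV_OWNER and HPAGE_RESV_UNMAPPED can ride in the low bits of vm_private_data alongside the pointer. The tagging trick in isolation, as standard C (the flag names mirror the patch, everything else is illustrative):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RESV_OWNER    ((uintptr_t)1 << 0)
#define RESV_UNMAPPED ((uintptr_t)1 << 1)
#define RESV_MASK     (RESV_OWNER | RESV_UNMAPPED)

int main(void)
{
    void *map = malloc(64);                    /* malloc returns aligned memory */
    uintptr_t priv = (uintptr_t)map;

    assert((priv & RESV_MASK) == 0);           /* low bits are free for flags   */
    priv |= RESV_OWNER;                        /* set_vma_resv_flags() analogue */

    void *back = (void *)(priv & ~RESV_MASK);  /* vma_resv_map() analogue       */
    printf("owner=%d, pointer intact=%d\n",
           (int)(priv & RESV_OWNER), back == map);
    free(back);
    return 0;
}
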
320/* Decrement the reserved pages in the hugepage pool by one */
321static void decrement_hugepage_resv_vma(struct hstate *h,
322 struct vm_area_struct *vma)
323{
324 if (vma->vm_flags & VM_NORESERVE)
325 return;
326
327 if (vma->vm_flags & VM_SHARED) {
328 /* Shared mappings always use reserves */
329 h->resv_huge_pages--;
330 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
331 /*
332 * Only the process that called mmap() has reserves for
333 * private mappings.
334 */
335 h->resv_huge_pages--;
336 }
337}
338
339/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
340void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
341{
342 VM_BUG_ON(!is_vm_hugetlb_page(vma));
343 if (!(vma->vm_flags & VM_SHARED))
344 vma->vm_private_data = (void *)0;
345}
346
347/* Returns true if the VMA has associated reserve pages */
348static int vma_has_reserves(struct vm_area_struct *vma)
349{
350 if (vma->vm_flags & VM_SHARED)
351 return 1;
352 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
353 return 1;
354 return 0;
355}
356
357static void clear_huge_page(struct page *page,
358 unsigned long addr, unsigned long sz)
44{ 359{
45 int i; 360 int i;
46 361
47 might_sleep(); 362 might_sleep();
48 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { 363 for (i = 0; i < sz/PAGE_SIZE; i++) {
49 cond_resched(); 364 cond_resched();
50 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 365 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
51 } 366 }
@@ -55,42 +370,44 @@ static void copy_huge_page(struct page *dst, struct page *src,
55 unsigned long addr, struct vm_area_struct *vma) 370 unsigned long addr, struct vm_area_struct *vma)
56{ 371{
57 int i; 372 int i;
373 struct hstate *h = hstate_vma(vma);
58 374
59 might_sleep(); 375 might_sleep();
60 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { 376 for (i = 0; i < pages_per_huge_page(h); i++) {
61 cond_resched(); 377 cond_resched();
62 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 378 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
63 } 379 }
64} 380}
65 381
66static void enqueue_huge_page(struct page *page) 382static void enqueue_huge_page(struct hstate *h, struct page *page)
67{ 383{
68 int nid = page_to_nid(page); 384 int nid = page_to_nid(page);
69 list_add(&page->lru, &hugepage_freelists[nid]); 385 list_add(&page->lru, &h->hugepage_freelists[nid]);
70 free_huge_pages++; 386 h->free_huge_pages++;
71 free_huge_pages_node[nid]++; 387 h->free_huge_pages_node[nid]++;
72} 388}
73 389
74static struct page *dequeue_huge_page(void) 390static struct page *dequeue_huge_page(struct hstate *h)
75{ 391{
76 int nid; 392 int nid;
77 struct page *page = NULL; 393 struct page *page = NULL;
78 394
79 for (nid = 0; nid < MAX_NUMNODES; ++nid) { 395 for (nid = 0; nid < MAX_NUMNODES; ++nid) {
80 if (!list_empty(&hugepage_freelists[nid])) { 396 if (!list_empty(&h->hugepage_freelists[nid])) {
81 page = list_entry(hugepage_freelists[nid].next, 397 page = list_entry(h->hugepage_freelists[nid].next,
82 struct page, lru); 398 struct page, lru);
83 list_del(&page->lru); 399 list_del(&page->lru);
84 free_huge_pages--; 400 h->free_huge_pages--;
85 free_huge_pages_node[nid]--; 401 h->free_huge_pages_node[nid]--;
86 break; 402 break;
87 } 403 }
88 } 404 }
89 return page; 405 return page;
90} 406}
91 407
92static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, 408static struct page *dequeue_huge_page_vma(struct hstate *h,
93 unsigned long address) 409 struct vm_area_struct *vma,
410 unsigned long address, int avoid_reserve)
94{ 411{
95 int nid; 412 int nid;
96 struct page *page = NULL; 413 struct page *page = NULL;
@@ -101,18 +418,33 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
101 struct zone *zone; 418 struct zone *zone;
102 struct zoneref *z; 419 struct zoneref *z;
103 420
421 /*
422 * A child process with MAP_PRIVATE mappings created by their parent
423 * have no page reserves. This check ensures that reservations are
424 * not "stolen". The child may still get SIGKILLed
425 */
426 if (!vma_has_reserves(vma) &&
427 h->free_huge_pages - h->resv_huge_pages == 0)
428 return NULL;
429
430 /* If reserves cannot be used, ensure enough pages are in the pool */
431 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
432 return NULL;
433
104 for_each_zone_zonelist_nodemask(zone, z, zonelist, 434 for_each_zone_zonelist_nodemask(zone, z, zonelist,
105 MAX_NR_ZONES - 1, nodemask) { 435 MAX_NR_ZONES - 1, nodemask) {
106 nid = zone_to_nid(zone); 436 nid = zone_to_nid(zone);
107 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && 437 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
108 !list_empty(&hugepage_freelists[nid])) { 438 !list_empty(&h->hugepage_freelists[nid])) {
109 page = list_entry(hugepage_freelists[nid].next, 439 page = list_entry(h->hugepage_freelists[nid].next,
110 struct page, lru); 440 struct page, lru);
111 list_del(&page->lru); 441 list_del(&page->lru);
112 free_huge_pages--; 442 h->free_huge_pages--;
113 free_huge_pages_node[nid]--; 443 h->free_huge_pages_node[nid]--;
114 if (vma && vma->vm_flags & VM_MAYSHARE) 444
115 resv_huge_pages--; 445 if (!avoid_reserve)
446 decrement_hugepage_resv_vma(h, vma);
447
116 break; 448 break;
117 } 449 }
118 } 450 }
@@ -120,12 +452,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
120 return page; 452 return page;
121} 453}
122 454
123static void update_and_free_page(struct page *page) 455static void update_and_free_page(struct hstate *h, struct page *page)
124{ 456{
125 int i; 457 int i;
126 nr_huge_pages--; 458
127 nr_huge_pages_node[page_to_nid(page)]--; 459 h->nr_huge_pages--;
128 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { 460 h->nr_huge_pages_node[page_to_nid(page)]--;
461 for (i = 0; i < pages_per_huge_page(h); i++) {
129 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 462 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
130 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 463 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
131 1 << PG_private | 1<< PG_writeback); 464 1 << PG_private | 1<< PG_writeback);
@@ -133,11 +466,27 @@ static void update_and_free_page(struct page *page)
133 set_compound_page_dtor(page, NULL); 466 set_compound_page_dtor(page, NULL);
134 set_page_refcounted(page); 467 set_page_refcounted(page);
135 arch_release_hugepage(page); 468 arch_release_hugepage(page);
136 __free_pages(page, HUGETLB_PAGE_ORDER); 469 __free_pages(page, huge_page_order(h));
470}
471
472struct hstate *size_to_hstate(unsigned long size)
473{
474 struct hstate *h;
475
476 for_each_hstate(h) {
477 if (huge_page_size(h) == size)
478 return h;
479 }
480 return NULL;
137} 481}
138 482
139static void free_huge_page(struct page *page) 483static void free_huge_page(struct page *page)
140{ 484{
485 /*
486 * Can't pass hstate in here because it is called from the
487 * compound page destructor.
488 */
489 struct hstate *h = page_hstate(page);
141 int nid = page_to_nid(page); 490 int nid = page_to_nid(page);
142 struct address_space *mapping; 491 struct address_space *mapping;
143 492
@@ -147,12 +496,12 @@ static void free_huge_page(struct page *page)
147 INIT_LIST_HEAD(&page->lru); 496 INIT_LIST_HEAD(&page->lru);
148 497
149 spin_lock(&hugetlb_lock); 498 spin_lock(&hugetlb_lock);
150 if (surplus_huge_pages_node[nid]) { 499 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
151 update_and_free_page(page); 500 update_and_free_page(h, page);
152 surplus_huge_pages--; 501 h->surplus_huge_pages--;
153 surplus_huge_pages_node[nid]--; 502 h->surplus_huge_pages_node[nid]--;
154 } else { 503 } else {
155 enqueue_huge_page(page); 504 enqueue_huge_page(h, page);
156 } 505 }
157 spin_unlock(&hugetlb_lock); 506 spin_unlock(&hugetlb_lock);
158 if (mapping) 507 if (mapping)
@@ -164,7 +513,7 @@ static void free_huge_page(struct page *page)
164 * balanced by operating on them in a round-robin fashion. 513 * balanced by operating on them in a round-robin fashion.
165 * Returns 1 if an adjustment was made. 514 * Returns 1 if an adjustment was made.
166 */ 515 */
167static int adjust_pool_surplus(int delta) 516static int adjust_pool_surplus(struct hstate *h, int delta)
168{ 517{
169 static int prev_nid; 518 static int prev_nid;
170 int nid = prev_nid; 519 int nid = prev_nid;
@@ -177,15 +526,15 @@ static int adjust_pool_surplus(int delta)
177 nid = first_node(node_online_map); 526 nid = first_node(node_online_map);
178 527
179 /* To shrink on this node, there must be a surplus page */ 528 /* To shrink on this node, there must be a surplus page */
180 if (delta < 0 && !surplus_huge_pages_node[nid]) 529 if (delta < 0 && !h->surplus_huge_pages_node[nid])
181 continue; 530 continue;
182 /* Surplus cannot exceed the total number of pages */ 531 /* Surplus cannot exceed the total number of pages */
183 if (delta > 0 && surplus_huge_pages_node[nid] >= 532 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
184 nr_huge_pages_node[nid]) 533 h->nr_huge_pages_node[nid])
185 continue; 534 continue;
186 535
187 surplus_huge_pages += delta; 536 h->surplus_huge_pages += delta;
188 surplus_huge_pages_node[nid] += delta; 537 h->surplus_huge_pages_node[nid] += delta;
189 ret = 1; 538 ret = 1;
190 break; 539 break;
191 } while (nid != prev_nid); 540 } while (nid != prev_nid);
@@ -194,59 +543,74 @@ static int adjust_pool_surplus(int delta)
194 return ret; 543 return ret;
195} 544}
196 545
197static struct page *alloc_fresh_huge_page_node(int nid) 546static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
547{
548 set_compound_page_dtor(page, free_huge_page);
549 spin_lock(&hugetlb_lock);
550 h->nr_huge_pages++;
551 h->nr_huge_pages_node[nid]++;
552 spin_unlock(&hugetlb_lock);
553 put_page(page); /* free it into the hugepage allocator */
554}
555
556static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
198{ 557{
199 struct page *page; 558 struct page *page;
200 559
560 if (h->order >= MAX_ORDER)
561 return NULL;
562
201 page = alloc_pages_node(nid, 563 page = alloc_pages_node(nid,
202 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 564 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
203 __GFP_REPEAT|__GFP_NOWARN, 565 __GFP_REPEAT|__GFP_NOWARN,
204 HUGETLB_PAGE_ORDER); 566 huge_page_order(h));
205 if (page) { 567 if (page) {
206 if (arch_prepare_hugepage(page)) { 568 if (arch_prepare_hugepage(page)) {
207 __free_pages(page, HUGETLB_PAGE_ORDER); 569 __free_pages(page, huge_page_order(h));
208 return NULL; 570 return NULL;
209 } 571 }
210 set_compound_page_dtor(page, free_huge_page); 572 prep_new_huge_page(h, page, nid);
211 spin_lock(&hugetlb_lock);
212 nr_huge_pages++;
213 nr_huge_pages_node[nid]++;
214 spin_unlock(&hugetlb_lock);
215 put_page(page); /* free it into the hugepage allocator */
216 } 573 }
217 574
218 return page; 575 return page;
219} 576}
220 577
221static int alloc_fresh_huge_page(void) 578/*
579 * Use a helper variable to find the next node and then
580 * copy it back to hugetlb_next_nid afterwards:
581 * otherwise there's a window in which a racer might
582 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
583 * But we don't need to use a spin_lock here: it really
584 * doesn't matter if occasionally a racer chooses the
585 * same nid as we do. Move nid forward in the mask even
586 * if we just successfully allocated a hugepage so that
587 * the next caller gets hugepages on the next node.
588 */
589static int hstate_next_node(struct hstate *h)
590{
591 int next_nid;
592 next_nid = next_node(h->hugetlb_next_nid, node_online_map);
593 if (next_nid == MAX_NUMNODES)
594 next_nid = first_node(node_online_map);
595 h->hugetlb_next_nid = next_nid;
596 return next_nid;
597}
598
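
hstate_next_node() factors the round-robin walk over node_online_map out of alloc_fresh_huge_page(), computing the next node first and writing it back in one step; as the original comment notes, a racing caller may at worst pick the same node twice, which is harmless. The wrap-around walk over an online mask, as a tiny standalone example (the mask value is invented):

#include <stdio.h>

#define MAX_NODES 8
static unsigned online_mask = 0x2d;     /* nodes 0, 2, 3, 5 "online" */

/* Next online node after nid, wrapping around at MAX_NODES. */
static int next_online_node(int nid)
{
    do {
        nid = (nid + 1) % MAX_NODES;
    } while (!(online_mask & (1u << nid)));
    return nid;
}

int main(void)
{
    int nid = 0;
    for (int i = 0; i < 6; i++) {
        nid = next_online_node(nid);
        printf("%d ", nid);             /* prints: 2 3 5 0 2 3 */
    }
    printf("\n");
    return 0;
}
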
599static int alloc_fresh_huge_page(struct hstate *h)
222{ 600{
223 struct page *page; 601 struct page *page;
224 int start_nid; 602 int start_nid;
225 int next_nid; 603 int next_nid;
226 int ret = 0; 604 int ret = 0;
227 605
228 start_nid = hugetlb_next_nid; 606 start_nid = h->hugetlb_next_nid;
229 607
230 do { 608 do {
231 page = alloc_fresh_huge_page_node(hugetlb_next_nid); 609 page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
232 if (page) 610 if (page)
233 ret = 1; 611 ret = 1;
234 /* 612 next_nid = hstate_next_node(h);
235 * Use a helper variable to find the next node and then 613 } while (!page && h->hugetlb_next_nid != start_nid);
236 * copy it back to hugetlb_next_nid afterwards:
237 * otherwise there's a window in which a racer might
238 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
239 * But we don't need to use a spin_lock here: it really
240 * doesn't matter if occasionally a racer chooses the
241 * same nid as we do. Move nid forward in the mask even
242 * if we just successfully allocated a hugepage so that
243 * the next caller gets hugepages on the next node.
244 */
245 next_nid = next_node(hugetlb_next_nid, node_online_map);
246 if (next_nid == MAX_NUMNODES)
247 next_nid = first_node(node_online_map);
248 hugetlb_next_nid = next_nid;
249 } while (!page && hugetlb_next_nid != start_nid);
250 614
251 if (ret) 615 if (ret)
252 count_vm_event(HTLB_BUDDY_PGALLOC); 616 count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -256,12 +620,15 @@ static int alloc_fresh_huge_page(void)
256 return ret; 620 return ret;
257} 621}
258 622
259static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, 623static struct page *alloc_buddy_huge_page(struct hstate *h,
260 unsigned long address) 624 struct vm_area_struct *vma, unsigned long address)
261{ 625{
262 struct page *page; 626 struct page *page;
263 unsigned int nid; 627 unsigned int nid;
264 628
629 if (h->order >= MAX_ORDER)
630 return NULL;
631
265 /* 632 /*
266 * Assume we will successfully allocate the surplus page to 633 * Assume we will successfully allocate the surplus page to
267 * prevent racing processes from causing the surplus to exceed 634 * prevent racing processes from causing the surplus to exceed
@@ -286,18 +653,23 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
286 * per-node value is checked there. 653 * per-node value is checked there.
287 */ 654 */
288 spin_lock(&hugetlb_lock); 655 spin_lock(&hugetlb_lock);
289 if (surplus_huge_pages >= nr_overcommit_huge_pages) { 656 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
290 spin_unlock(&hugetlb_lock); 657 spin_unlock(&hugetlb_lock);
291 return NULL; 658 return NULL;
292 } else { 659 } else {
293 nr_huge_pages++; 660 h->nr_huge_pages++;
294 surplus_huge_pages++; 661 h->surplus_huge_pages++;
295 } 662 }
296 spin_unlock(&hugetlb_lock); 663 spin_unlock(&hugetlb_lock);
297 664
298 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 665 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
299 __GFP_REPEAT|__GFP_NOWARN, 666 __GFP_REPEAT|__GFP_NOWARN,
300 HUGETLB_PAGE_ORDER); 667 huge_page_order(h));
668
669 if (page && arch_prepare_hugepage(page)) {
670 __free_pages(page, huge_page_order(h));
671 return NULL;
672 }
301 673
302 spin_lock(&hugetlb_lock); 674 spin_lock(&hugetlb_lock);
303 if (page) { 675 if (page) {
@@ -312,12 +684,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
312 /* 684 /*
313 * We incremented the global counters already 685 * We incremented the global counters already
314 */ 686 */
315 nr_huge_pages_node[nid]++; 687 h->nr_huge_pages_node[nid]++;
316 surplus_huge_pages_node[nid]++; 688 h->surplus_huge_pages_node[nid]++;
317 __count_vm_event(HTLB_BUDDY_PGALLOC); 689 __count_vm_event(HTLB_BUDDY_PGALLOC);
318 } else { 690 } else {
319 nr_huge_pages--; 691 h->nr_huge_pages--;
320 surplus_huge_pages--; 692 h->surplus_huge_pages--;
321 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 693 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
322 } 694 }
323 spin_unlock(&hugetlb_lock); 695 spin_unlock(&hugetlb_lock);
@@ -329,16 +701,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
329 * Increase the hugetlb pool such that it can accomodate a reservation 701 * Increase the hugetlb pool such that it can accomodate a reservation
330 * of size 'delta'. 702 * of size 'delta'.
331 */ 703 */
332static int gather_surplus_pages(int delta) 704static int gather_surplus_pages(struct hstate *h, int delta)
333{ 705{
334 struct list_head surplus_list; 706 struct list_head surplus_list;
335 struct page *page, *tmp; 707 struct page *page, *tmp;
336 int ret, i; 708 int ret, i;
337 int needed, allocated; 709 int needed, allocated;
338 710
339 needed = (resv_huge_pages + delta) - free_huge_pages; 711 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
340 if (needed <= 0) { 712 if (needed <= 0) {
341 resv_huge_pages += delta; 713 h->resv_huge_pages += delta;
342 return 0; 714 return 0;
343 } 715 }
344 716
@@ -349,7 +721,7 @@ static int gather_surplus_pages(int delta)
349retry: 721retry:
350 spin_unlock(&hugetlb_lock); 722 spin_unlock(&hugetlb_lock);
351 for (i = 0; i < needed; i++) { 723 for (i = 0; i < needed; i++) {
352 page = alloc_buddy_huge_page(NULL, 0); 724 page = alloc_buddy_huge_page(h, NULL, 0);
353 if (!page) { 725 if (!page) {
354 /* 726 /*
355 * We were not able to allocate enough pages to 727 * We were not able to allocate enough pages to
@@ -370,7 +742,8 @@ retry:
370 * because either resv_huge_pages or free_huge_pages may have changed. 742 * because either resv_huge_pages or free_huge_pages may have changed.
371 */ 743 */
372 spin_lock(&hugetlb_lock); 744 spin_lock(&hugetlb_lock);
373 needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); 745 needed = (h->resv_huge_pages + delta) -
746 (h->free_huge_pages + allocated);
374 if (needed > 0) 747 if (needed > 0)
375 goto retry; 748 goto retry;
376 749
@@ -383,7 +756,7 @@ retry:
383 * before they are reserved. 756 * before they are reserved.
384 */ 757 */
385 needed += allocated; 758 needed += allocated;
386 resv_huge_pages += delta; 759 h->resv_huge_pages += delta;
387 ret = 0; 760 ret = 0;
388free: 761free:
389 /* Free the needed pages to the hugetlb pool */ 762 /* Free the needed pages to the hugetlb pool */
@@ -391,7 +764,7 @@ free:
391 if ((--needed) < 0) 764 if ((--needed) < 0)
392 break; 765 break;
393 list_del(&page->lru); 766 list_del(&page->lru);
394 enqueue_huge_page(page); 767 enqueue_huge_page(h, page);
395 } 768 }
396 769
397 /* Free unnecessary surplus pages to the buddy allocator */ 770 /* Free unnecessary surplus pages to the buddy allocator */
@@ -419,7 +792,8 @@ free:
419 * allocated to satisfy the reservation must be explicitly freed if they were 792 * allocated to satisfy the reservation must be explicitly freed if they were
420 * never used. 793 * never used.
421 */ 794 */
422static void return_unused_surplus_pages(unsigned long unused_resv_pages) 795static void return_unused_surplus_pages(struct hstate *h,
796 unsigned long unused_resv_pages)
423{ 797{
424 static int nid = -1; 798 static int nid = -1;
425 struct page *page; 799 struct page *page;
@@ -434,157 +808,269 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
434 unsigned long remaining_iterations = num_online_nodes(); 808 unsigned long remaining_iterations = num_online_nodes();
435 809
436 /* Uncommit the reservation */ 810 /* Uncommit the reservation */
437 resv_huge_pages -= unused_resv_pages; 811 h->resv_huge_pages -= unused_resv_pages;
438 812
439 nr_pages = min(unused_resv_pages, surplus_huge_pages); 813 /* Cannot return gigantic pages currently */
814 if (h->order >= MAX_ORDER)
815 return;
816
817 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
440 818
441 while (remaining_iterations-- && nr_pages) { 819 while (remaining_iterations-- && nr_pages) {
442 nid = next_node(nid, node_online_map); 820 nid = next_node(nid, node_online_map);
443 if (nid == MAX_NUMNODES) 821 if (nid == MAX_NUMNODES)
444 nid = first_node(node_online_map); 822 nid = first_node(node_online_map);
445 823
446 if (!surplus_huge_pages_node[nid]) 824 if (!h->surplus_huge_pages_node[nid])
447 continue; 825 continue;
448 826
449 if (!list_empty(&hugepage_freelists[nid])) { 827 if (!list_empty(&h->hugepage_freelists[nid])) {
450 page = list_entry(hugepage_freelists[nid].next, 828 page = list_entry(h->hugepage_freelists[nid].next,
451 struct page, lru); 829 struct page, lru);
452 list_del(&page->lru); 830 list_del(&page->lru);
453 update_and_free_page(page); 831 update_and_free_page(h, page);
454 free_huge_pages--; 832 h->free_huge_pages--;
455 free_huge_pages_node[nid]--; 833 h->free_huge_pages_node[nid]--;
456 surplus_huge_pages--; 834 h->surplus_huge_pages--;
457 surplus_huge_pages_node[nid]--; 835 h->surplus_huge_pages_node[nid]--;
458 nr_pages--; 836 nr_pages--;
459 remaining_iterations = num_online_nodes(); 837 remaining_iterations = num_online_nodes();
460 } 838 }
461 } 839 }
462} 840}
463 841
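
return_unused_surplus_pages() spreads the trimming across nodes with a static cursor so no single node is drained first. A rough userspace model of that round-robin (the node mask and surplus counts below are invented):

/* Sketch of the node round-robin used when trimming surplus pages:
 * a cursor walks the nodes and wraps, returning at most one surplus
 * page per node per pass.  The online map is a plain array here. */
#include <stdio.h>

#define NR_NODES 4

int main(void)
{
	int online[NR_NODES] = { 1, 1, 0, 1 };	/* node 2 offline (assumed) */
	int surplus[NR_NODES] = { 0, 2, 0, 1 };	/* surplus pages per node */
	int nid = -1, remaining = NR_NODES, to_return = 3;

	while (remaining-- && to_return) {
		nid = (nid + 1) % NR_NODES;	/* next_node with wrap-around */
		if (!online[nid] || !surplus[nid])
			continue;

		surplus[nid]--;			/* free one page from this node */
		to_return--;
		remaining = NR_NODES;		/* progress made: restart count */
		printf("returned a surplus page from node %d\n", nid);
	}
	return 0;
}
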
842/*
843 * Determine if the huge page at addr within the vma has an associated
 844 * reservation. Where it does not, we will need to logically increase the
845 * reservation and actually increase quota before an allocation can occur.
846 * Where any new reservation would be required the reservation change is
 847 * prepared, but not committed. Once the page has been quota'd, allocated
 848 * and instantiated, the change should be committed via vma_commit_reservation.
849 * No action is required on failure.
850 */
851static int vma_needs_reservation(struct hstate *h,
852 struct vm_area_struct *vma, unsigned long addr)
853{
854 struct address_space *mapping = vma->vm_file->f_mapping;
855 struct inode *inode = mapping->host;
856
857 if (vma->vm_flags & VM_SHARED) {
858 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
859 return region_chg(&inode->i_mapping->private_list,
860 idx, idx + 1);
464 861
465static struct page *alloc_huge_page_shared(struct vm_area_struct *vma, 862 } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
466 unsigned long addr) 863 return 1;
864
865 } else {
866 int err;
867 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
868 struct resv_map *reservations = vma_resv_map(vma);
869
870 err = region_chg(&reservations->regions, idx, idx + 1);
871 if (err < 0)
872 return err;
873 return 0;
874 }
875}
876static void vma_commit_reservation(struct hstate *h,
877 struct vm_area_struct *vma, unsigned long addr)
467{ 878{
468 struct page *page; 879 struct address_space *mapping = vma->vm_file->f_mapping;
880 struct inode *inode = mapping->host;
469 881
470 spin_lock(&hugetlb_lock); 882 if (vma->vm_flags & VM_SHARED) {
471 page = dequeue_huge_page_vma(vma, addr); 883 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
472 spin_unlock(&hugetlb_lock); 884 region_add(&inode->i_mapping->private_list, idx, idx + 1);
473 return page ? page : ERR_PTR(-VM_FAULT_OOM); 885
886 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
887 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
888 struct resv_map *reservations = vma_resv_map(vma);
889
890 /* Mark this page used in the map. */
891 region_add(&reservations->regions, idx, idx + 1);
892 }
474} 893}
475 894
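
The new helpers split reservation handling into a prepare step (vma_needs_reservation) and a commit step (vma_commit_reservation), with nothing to undo on failure. A sketch of the ordering alloc_huge_page() follows, with stubs standing in for the kernel helpers:

/* Sketch only: charge is computed first, quota taken for the uncovered
 * part, the page allocated, and the reservation committed last. */
#include <stdio.h>
#include <stdlib.h>

static int vma_needs_reservation_stub(void)   { return 1; }  /* 1 page uncovered */
static int get_quota_stub(int pages)          { return 0; }  /* 0 on success */
static void *dequeue_or_buddy_alloc_stub(void){ return malloc(1); }
static void vma_commit_reservation_stub(void) { puts("reservation committed"); }

int main(void)
{
	int chg = vma_needs_reservation_stub();
	void *page;

	if (chg && get_quota_stub(chg))
		return 1;			/* -ENOSPC in the kernel */

	page = dequeue_or_buddy_alloc_stub();
	if (!page)
		return 1;			/* quota would be released here */

	vma_commit_reservation_stub();		/* only after the page exists */
	free(page);
	return 0;
}

Committing only after the page exists is what keeps the failure paths above safe: an aborted allocation leaves the reservation map untouched.
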
476static struct page *alloc_huge_page_private(struct vm_area_struct *vma, 895static struct page *alloc_huge_page(struct vm_area_struct *vma,
477 unsigned long addr) 896 unsigned long addr, int avoid_reserve)
478{ 897{
479 struct page *page = NULL; 898 struct hstate *h = hstate_vma(vma);
899 struct page *page;
900 struct address_space *mapping = vma->vm_file->f_mapping;
901 struct inode *inode = mapping->host;
902 unsigned int chg;
480 903
481 if (hugetlb_get_quota(vma->vm_file->f_mapping, 1)) 904 /*
482 return ERR_PTR(-VM_FAULT_SIGBUS); 905 * Processes that did not create the mapping will have no reserves and
906 * will not have accounted against quota. Check that the quota can be
907 * made before satisfying the allocation
908 * MAP_NORESERVE mappings may also need pages and quota allocated
909 * if no reserve mapping overlaps.
910 */
911 chg = vma_needs_reservation(h, vma, addr);
912 if (chg < 0)
913 return ERR_PTR(chg);
914 if (chg)
915 if (hugetlb_get_quota(inode->i_mapping, chg))
916 return ERR_PTR(-ENOSPC);
483 917
484 spin_lock(&hugetlb_lock); 918 spin_lock(&hugetlb_lock);
485 if (free_huge_pages > resv_huge_pages) 919 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
486 page = dequeue_huge_page_vma(vma, addr);
487 spin_unlock(&hugetlb_lock); 920 spin_unlock(&hugetlb_lock);
921
488 if (!page) { 922 if (!page) {
489 page = alloc_buddy_huge_page(vma, addr); 923 page = alloc_buddy_huge_page(h, vma, addr);
490 if (!page) { 924 if (!page) {
491 hugetlb_put_quota(vma->vm_file->f_mapping, 1); 925 hugetlb_put_quota(inode->i_mapping, chg);
492 return ERR_PTR(-VM_FAULT_OOM); 926 return ERR_PTR(-VM_FAULT_OOM);
493 } 927 }
494 } 928 }
929
930 set_page_refcounted(page);
931 set_page_private(page, (unsigned long) mapping);
932
933 vma_commit_reservation(h, vma, addr);
934
495 return page; 935 return page;
496} 936}
497 937
498static struct page *alloc_huge_page(struct vm_area_struct *vma, 938__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
499 unsigned long addr)
500{ 939{
501 struct page *page; 940 struct huge_bootmem_page *m;
502 struct address_space *mapping = vma->vm_file->f_mapping; 941 int nr_nodes = nodes_weight(node_online_map);
503 942
504 if (vma->vm_flags & VM_MAYSHARE) 943 while (nr_nodes) {
505 page = alloc_huge_page_shared(vma, addr); 944 void *addr;
506 else 945
507 page = alloc_huge_page_private(vma, addr); 946 addr = __alloc_bootmem_node_nopanic(
947 NODE_DATA(h->hugetlb_next_nid),
948 huge_page_size(h), huge_page_size(h), 0);
508 949
509 if (!IS_ERR(page)) { 950 if (addr) {
510 set_page_refcounted(page); 951 /*
511 set_page_private(page, (unsigned long) mapping); 952 * Use the beginning of the huge page to store the
953 * huge_bootmem_page struct (until gather_bootmem
954 * puts them into the mem_map).
955 */
956 m = addr;
957 if (m)
958 goto found;
959 }
960 hstate_next_node(h);
961 nr_nodes--;
512 } 962 }
513 return page; 963 return 0;
964
965found:
966 BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
967 /* Put them into a private list first because mem_map is not up yet */
968 list_add(&m->list, &huge_boot_pages);
969 m->hstate = h;
970 return 1;
514} 971}
515 972
516static int __init hugetlb_init(void) 973/* Put bootmem huge pages into the standard lists after mem_map is up */
974static void __init gather_bootmem_prealloc(void)
517{ 975{
518 unsigned long i; 976 struct huge_bootmem_page *m;
519 977
520 if (HPAGE_SHIFT == 0) 978 list_for_each_entry(m, &huge_boot_pages, list) {
521 return 0; 979 struct page *page = virt_to_page(m);
522 980 struct hstate *h = m->hstate;
523 for (i = 0; i < MAX_NUMNODES; ++i) 981 __ClearPageReserved(page);
524 INIT_LIST_HEAD(&hugepage_freelists[i]); 982 WARN_ON(page_count(page) != 1);
983 prep_compound_page(page, h->order);
984 prep_new_huge_page(h, page, page_to_nid(page));
985 }
986}
525 987
526 hugetlb_next_nid = first_node(node_online_map); 988static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
989{
990 unsigned long i;
527 991
528 for (i = 0; i < max_huge_pages; ++i) { 992 for (i = 0; i < h->max_huge_pages; ++i) {
529 if (!alloc_fresh_huge_page()) 993 if (h->order >= MAX_ORDER) {
994 if (!alloc_bootmem_huge_page(h))
995 break;
996 } else if (!alloc_fresh_huge_page(h))
530 break; 997 break;
531 } 998 }
532 max_huge_pages = free_huge_pages = nr_huge_pages = i; 999 h->max_huge_pages = i;
533 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
534 return 0;
535} 1000}
536module_init(hugetlb_init);
537 1001
538static int __init hugetlb_setup(char *s) 1002static void __init hugetlb_init_hstates(void)
539{ 1003{
540 if (sscanf(s, "%lu", &max_huge_pages) <= 0) 1004 struct hstate *h;
541 max_huge_pages = 0; 1005
542 return 1; 1006 for_each_hstate(h) {
1007 /* oversize hugepages were init'ed in early boot */
1008 if (h->order < MAX_ORDER)
1009 hugetlb_hstate_alloc_pages(h);
1010 }
543} 1011}
544__setup("hugepages=", hugetlb_setup);
545 1012
546static unsigned int cpuset_mems_nr(unsigned int *array) 1013static char * __init memfmt(char *buf, unsigned long n)
547{ 1014{
548 int node; 1015 if (n >= (1UL << 30))
549 unsigned int nr = 0; 1016 sprintf(buf, "%lu GB", n >> 30);
550 1017 else if (n >= (1UL << 20))
551 for_each_node_mask(node, cpuset_current_mems_allowed) 1018 sprintf(buf, "%lu MB", n >> 20);
552 nr += array[node]; 1019 else
1020 sprintf(buf, "%lu KB", n >> 10);
1021 return buf;
1022}
553 1023
554 return nr; 1024static void __init report_hugepages(void)
1025{
1026 struct hstate *h;
1027
1028 for_each_hstate(h) {
1029 char buf[32];
1030 printk(KERN_INFO "HugeTLB registered %s page size, "
1031 "pre-allocated %ld pages\n",
1032 memfmt(buf, huge_page_size(h)),
1033 h->free_huge_pages);
1034 }
555} 1035}
556 1036
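
memfmt() and report_hugepages() above produce the boot-time banner for each registered page size. A userspace copy of memfmt(), fed the common 2 MB and 1 GB sizes (example values, not probed from hardware):

/* Userspace copy of memfmt() from the hunk above, just to show the
 * strings the "HugeTLB registered ... page size" banner would carry. */
#include <stdio.h>

static char *memfmt(char *buf, unsigned long n)
{
	if (n >= (1UL << 30))
		sprintf(buf, "%lu GB", n >> 30);
	else if (n >= (1UL << 20))
		sprintf(buf, "%lu MB", n >> 20);
	else
		sprintf(buf, "%lu KB", n >> 10);
	return buf;
}

int main(void)
{
	char buf[32];

	printf("%s\n", memfmt(buf, 2UL << 20));		/* prints "2 MB" */
	printf("%s\n", memfmt(buf, 1UL << 30));		/* prints "1 GB" */
	return 0;
}
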
557#ifdef CONFIG_SYSCTL
558#ifdef CONFIG_HIGHMEM 1037#ifdef CONFIG_HIGHMEM
559static void try_to_free_low(unsigned long count) 1038static void try_to_free_low(struct hstate *h, unsigned long count)
560{ 1039{
561 int i; 1040 int i;
562 1041
1042 if (h->order >= MAX_ORDER)
1043 return;
1044
563 for (i = 0; i < MAX_NUMNODES; ++i) { 1045 for (i = 0; i < MAX_NUMNODES; ++i) {
564 struct page *page, *next; 1046 struct page *page, *next;
565 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 1047 struct list_head *freel = &h->hugepage_freelists[i];
566 if (count >= nr_huge_pages) 1048 list_for_each_entry_safe(page, next, freel, lru) {
1049 if (count >= h->nr_huge_pages)
567 return; 1050 return;
568 if (PageHighMem(page)) 1051 if (PageHighMem(page))
569 continue; 1052 continue;
570 list_del(&page->lru); 1053 list_del(&page->lru);
571 update_and_free_page(page); 1054 update_and_free_page(h, page);
572 free_huge_pages--; 1055 h->free_huge_pages--;
573 free_huge_pages_node[page_to_nid(page)]--; 1056 h->free_huge_pages_node[page_to_nid(page)]--;
574 } 1057 }
575 } 1058 }
576} 1059}
577#else 1060#else
578static inline void try_to_free_low(unsigned long count) 1061static inline void try_to_free_low(struct hstate *h, unsigned long count)
579{ 1062{
580} 1063}
581#endif 1064#endif
582 1065
583#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) 1066#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
584static unsigned long set_max_huge_pages(unsigned long count) 1067static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
585{ 1068{
586 unsigned long min_count, ret; 1069 unsigned long min_count, ret;
587 1070
1071 if (h->order >= MAX_ORDER)
1072 return h->max_huge_pages;
1073
588 /* 1074 /*
589 * Increase the pool size 1075 * Increase the pool size
590 * First take pages out of surplus state. Then make up the 1076 * First take pages out of surplus state. Then make up the
@@ -597,20 +1083,19 @@ static unsigned long set_max_huge_pages(unsigned long count)
597 * within all the constraints specified by the sysctls. 1083 * within all the constraints specified by the sysctls.
598 */ 1084 */
599 spin_lock(&hugetlb_lock); 1085 spin_lock(&hugetlb_lock);
600 while (surplus_huge_pages && count > persistent_huge_pages) { 1086 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
601 if (!adjust_pool_surplus(-1)) 1087 if (!adjust_pool_surplus(h, -1))
602 break; 1088 break;
603 } 1089 }
604 1090
605 while (count > persistent_huge_pages) { 1091 while (count > persistent_huge_pages(h)) {
606 int ret;
607 /* 1092 /*
608 * If this allocation races such that we no longer need the 1093 * If this allocation races such that we no longer need the
609 * page, free_huge_page will handle it by freeing the page 1094 * page, free_huge_page will handle it by freeing the page
610 * and reducing the surplus. 1095 * and reducing the surplus.
611 */ 1096 */
612 spin_unlock(&hugetlb_lock); 1097 spin_unlock(&hugetlb_lock);
613 ret = alloc_fresh_huge_page(); 1098 ret = alloc_fresh_huge_page(h);
614 spin_lock(&hugetlb_lock); 1099 spin_lock(&hugetlb_lock);
615 if (!ret) 1100 if (!ret)
616 goto out; 1101 goto out;
@@ -632,31 +1117,305 @@ static unsigned long set_max_huge_pages(unsigned long count)
632 * and won't grow the pool anywhere else. Not until one of the 1117 * and won't grow the pool anywhere else. Not until one of the
633 * sysctls are changed, or the surplus pages go out of use. 1118 * sysctls are changed, or the surplus pages go out of use.
634 */ 1119 */
635 min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; 1120 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
636 min_count = max(count, min_count); 1121 min_count = max(count, min_count);
637 try_to_free_low(min_count); 1122 try_to_free_low(h, min_count);
638 while (min_count < persistent_huge_pages) { 1123 while (min_count < persistent_huge_pages(h)) {
639 struct page *page = dequeue_huge_page(); 1124 struct page *page = dequeue_huge_page(h);
640 if (!page) 1125 if (!page)
641 break; 1126 break;
642 update_and_free_page(page); 1127 update_and_free_page(h, page);
643 } 1128 }
644 while (count < persistent_huge_pages) { 1129 while (count < persistent_huge_pages(h)) {
645 if (!adjust_pool_surplus(1)) 1130 if (!adjust_pool_surplus(h, 1))
646 break; 1131 break;
647 } 1132 }
648out: 1133out:
649 ret = persistent_huge_pages; 1134 ret = persistent_huge_pages(h);
650 spin_unlock(&hugetlb_lock); 1135 spin_unlock(&hugetlb_lock);
651 return ret; 1136 return ret;
652} 1137}
653 1138
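
When shrinking the pool, set_max_huge_pages() must never free pages that are in use or reserved, hence the min_count clamp. The arithmetic in isolation, with invented counters:

/* Sketch of the shrink clamp: the target is raised to at least
 * resv + (nr - free), i.e. reservations plus pages currently in use. */
#include <stdio.h>

int main(void)
{
	unsigned long nr_huge_pages = 10, free_huge_pages = 4, resv_huge_pages = 2;
	unsigned long count = 1;		/* admin asked for a pool of 1 */
	unsigned long min_count;

	/* in-use pages (nr - free) plus outstanding reservations */
	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
	if (count < min_count)
		count = min_count;		/* max(count, min_count) */

	printf("pool can only shrink to %lu pages\n", count);	/* 8 here */
	return 0;
}
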
1139#define HSTATE_ATTR_RO(_name) \
1140 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1141
1142#define HSTATE_ATTR(_name) \
1143 static struct kobj_attribute _name##_attr = \
1144 __ATTR(_name, 0644, _name##_show, _name##_store)
1145
1146static struct kobject *hugepages_kobj;
1147static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1148
1149static struct hstate *kobj_to_hstate(struct kobject *kobj)
1150{
1151 int i;
1152 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1153 if (hstate_kobjs[i] == kobj)
1154 return &hstates[i];
1155 BUG();
1156 return NULL;
1157}
1158
1159static ssize_t nr_hugepages_show(struct kobject *kobj,
1160 struct kobj_attribute *attr, char *buf)
1161{
1162 struct hstate *h = kobj_to_hstate(kobj);
1163 return sprintf(buf, "%lu\n", h->nr_huge_pages);
1164}
1165static ssize_t nr_hugepages_store(struct kobject *kobj,
1166 struct kobj_attribute *attr, const char *buf, size_t count)
1167{
1168 int err;
1169 unsigned long input;
1170 struct hstate *h = kobj_to_hstate(kobj);
1171
1172 err = strict_strtoul(buf, 10, &input);
1173 if (err)
1174 return 0;
1175
1176 h->max_huge_pages = set_max_huge_pages(h, input);
1177
1178 return count;
1179}
1180HSTATE_ATTR(nr_hugepages);
1181
1182static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1183 struct kobj_attribute *attr, char *buf)
1184{
1185 struct hstate *h = kobj_to_hstate(kobj);
1186 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1187}
1188static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1189 struct kobj_attribute *attr, const char *buf, size_t count)
1190{
1191 int err;
1192 unsigned long input;
1193 struct hstate *h = kobj_to_hstate(kobj);
1194
1195 err = strict_strtoul(buf, 10, &input);
1196 if (err)
1197 return 0;
1198
1199 spin_lock(&hugetlb_lock);
1200 h->nr_overcommit_huge_pages = input;
1201 spin_unlock(&hugetlb_lock);
1202
1203 return count;
1204}
1205HSTATE_ATTR(nr_overcommit_hugepages);
1206
1207static ssize_t free_hugepages_show(struct kobject *kobj,
1208 struct kobj_attribute *attr, char *buf)
1209{
1210 struct hstate *h = kobj_to_hstate(kobj);
1211 return sprintf(buf, "%lu\n", h->free_huge_pages);
1212}
1213HSTATE_ATTR_RO(free_hugepages);
1214
1215static ssize_t resv_hugepages_show(struct kobject *kobj,
1216 struct kobj_attribute *attr, char *buf)
1217{
1218 struct hstate *h = kobj_to_hstate(kobj);
1219 return sprintf(buf, "%lu\n", h->resv_huge_pages);
1220}
1221HSTATE_ATTR_RO(resv_hugepages);
1222
1223static ssize_t surplus_hugepages_show(struct kobject *kobj,
1224 struct kobj_attribute *attr, char *buf)
1225{
1226 struct hstate *h = kobj_to_hstate(kobj);
1227 return sprintf(buf, "%lu\n", h->surplus_huge_pages);
1228}
1229HSTATE_ATTR_RO(surplus_hugepages);
1230
1231static struct attribute *hstate_attrs[] = {
1232 &nr_hugepages_attr.attr,
1233 &nr_overcommit_hugepages_attr.attr,
1234 &free_hugepages_attr.attr,
1235 &resv_hugepages_attr.attr,
1236 &surplus_hugepages_attr.attr,
1237 NULL,
1238};
1239
1240static struct attribute_group hstate_attr_group = {
1241 .attrs = hstate_attrs,
1242};
1243
1244static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
1245{
1246 int retval;
1247
1248 hstate_kobjs[h - hstates] = kobject_create_and_add(h->name,
1249 hugepages_kobj);
1250 if (!hstate_kobjs[h - hstates])
1251 return -ENOMEM;
1252
1253 retval = sysfs_create_group(hstate_kobjs[h - hstates],
1254 &hstate_attr_group);
1255 if (retval)
1256 kobject_put(hstate_kobjs[h - hstates]);
1257
1258 return retval;
1259}
1260
1261static void __init hugetlb_sysfs_init(void)
1262{
1263 struct hstate *h;
1264 int err;
1265
1266 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
1267 if (!hugepages_kobj)
1268 return;
1269
1270 for_each_hstate(h) {
1271 err = hugetlb_sysfs_add_hstate(h);
1272 if (err)
1273 printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
1274 h->name);
1275 }
1276}
1277
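
The per-hstate sysfs directories registered above can be read like any other attribute file; on x86 the 2 MB hstate usually shows up as hugepages-2048kB, though the name depends on which page sizes the architecture registered. A small reader (the path is only an example):

/* Reads the nr_hugepages attribute created by hugetlb_sysfs_add_hstate().
 * "hugepages-2048kB" is the common x86 2 MB hstate, used here as an
 * example directory name. */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages";
	unsigned long nr;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);		/* kernel too old or no such hstate */
		return 1;
	}
	if (fscanf(f, "%lu", &nr) == 1)
		printf("%lu huge pages in the pool\n", nr);
	fclose(f);
	return 0;
}
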
1278static void __exit hugetlb_exit(void)
1279{
1280 struct hstate *h;
1281
1282 for_each_hstate(h) {
1283 kobject_put(hstate_kobjs[h - hstates]);
1284 }
1285
1286 kobject_put(hugepages_kobj);
1287}
1288module_exit(hugetlb_exit);
1289
1290static int __init hugetlb_init(void)
1291{
 1292 /* Some platforms decide whether they support huge pages at boot
 1293 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
 1294 * there is no such support.
1295 */
1296 if (HPAGE_SHIFT == 0)
1297 return 0;
1298
1299 if (!size_to_hstate(default_hstate_size)) {
1300 default_hstate_size = HPAGE_SIZE;
1301 if (!size_to_hstate(default_hstate_size))
1302 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1303 }
1304 default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
1305 if (default_hstate_max_huge_pages)
1306 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1307
1308 hugetlb_init_hstates();
1309
1310 gather_bootmem_prealloc();
1311
1312 report_hugepages();
1313
1314 hugetlb_sysfs_init();
1315
1316 return 0;
1317}
1318module_init(hugetlb_init);
1319
1320/* Should be called on processing a hugepagesz=... option */
1321void __init hugetlb_add_hstate(unsigned order)
1322{
1323 struct hstate *h;
1324 unsigned long i;
1325
1326 if (size_to_hstate(PAGE_SIZE << order)) {
1327 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1328 return;
1329 }
1330 BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
1331 BUG_ON(order == 0);
1332 h = &hstates[max_hstate++];
1333 h->order = order;
1334 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1335 h->nr_huge_pages = 0;
1336 h->free_huge_pages = 0;
1337 for (i = 0; i < MAX_NUMNODES; ++i)
1338 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1339 h->hugetlb_next_nid = first_node(node_online_map);
1340 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1341 huge_page_size(h)/1024);
1342
1343 parsed_hstate = h;
1344}
1345
1346static int __init hugetlb_nrpages_setup(char *s)
1347{
1348 unsigned long *mhp;
1349 static unsigned long *last_mhp;
1350
1351 /*
1352 * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
1353 * so this hugepages= parameter goes to the "default hstate".
1354 */
1355 if (!max_hstate)
1356 mhp = &default_hstate_max_huge_pages;
1357 else
1358 mhp = &parsed_hstate->max_huge_pages;
1359
1360 if (mhp == last_mhp) {
1361 printk(KERN_WARNING "hugepages= specified twice without "
1362 "interleaving hugepagesz=, ignoring\n");
1363 return 1;
1364 }
1365
1366 if (sscanf(s, "%lu", mhp) <= 0)
1367 *mhp = 0;
1368
1369 /*
1370 * Global state is always initialized later in hugetlb_init.
1371 * But we need to allocate >= MAX_ORDER hstates here early to still
1372 * use the bootmem allocator.
1373 */
1374 if (max_hstate && parsed_hstate->order >= MAX_ORDER)
1375 hugetlb_hstate_alloc_pages(parsed_hstate);
1376
1377 last_mhp = mhp;
1378
1379 return 1;
1380}
1381__setup("hugepages=", hugetlb_nrpages_setup);
1382
1383static int __init hugetlb_default_setup(char *s)
1384{
1385 default_hstate_size = memparse(s, &s);
1386 return 1;
1387}
1388__setup("default_hugepagesz=", hugetlb_default_setup);
1389
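
hugepagesz= and hugepages= pair up positionally: each count applies to the most recently parsed size, and a repeated hugepages= without a new hugepagesz= in between is dropped by the last_mhp check. A naive userspace parser of an example command line (the options shown are invented):

/* Sketch of the pairing rule only; real parsing uses memparse() and
 * the hstate machinery above. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* example command line, not taken from a real system */
	const char *opts[] = { "hugepagesz=1G", "hugepages=4",
			       "hugepagesz=2M", "hugepages=512",
			       "hugepages=9" /* ignored: no new hugepagesz= */ };
	const char *cur_size = "default";	/* no hugepagesz= seen yet */
	const char *last_counted = NULL;	/* plays the role of last_mhp */
	int i;

	for (i = 0; i < 5; i++) {
		if (!strncmp(opts[i], "hugepagesz=", 11)) {
			cur_size = opts[i] + 11;
		} else if (!strncmp(opts[i], "hugepages=", 10)) {
			if (cur_size == last_counted) {
				printf("hugepages= specified twice, ignoring\n");
				continue;
			}
			printf("%s pages of size %s\n",
			       opts[i] + 10, cur_size);
			last_counted = cur_size;
		}
	}
	return 0;
}
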
1390static unsigned int cpuset_mems_nr(unsigned int *array)
1391{
1392 int node;
1393 unsigned int nr = 0;
1394
1395 for_each_node_mask(node, cpuset_current_mems_allowed)
1396 nr += array[node];
1397
1398 return nr;
1399}
1400
1401#ifdef CONFIG_SYSCTL
654int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1402int hugetlb_sysctl_handler(struct ctl_table *table, int write,
655 struct file *file, void __user *buffer, 1403 struct file *file, void __user *buffer,
656 size_t *length, loff_t *ppos) 1404 size_t *length, loff_t *ppos)
657{ 1405{
1406 struct hstate *h = &default_hstate;
1407 unsigned long tmp;
1408
1409 if (!write)
1410 tmp = h->max_huge_pages;
1411
1412 table->data = &tmp;
1413 table->maxlen = sizeof(unsigned long);
658 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1414 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
659 max_huge_pages = set_max_huge_pages(max_huge_pages); 1415
1416 if (write)
1417 h->max_huge_pages = set_max_huge_pages(h, tmp);
1418
660 return 0; 1419 return 0;
661} 1420}
662 1421
@@ -676,45 +1435,141 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
676 struct file *file, void __user *buffer, 1435 struct file *file, void __user *buffer,
677 size_t *length, loff_t *ppos) 1436 size_t *length, loff_t *ppos)
678{ 1437{
1438 struct hstate *h = &default_hstate;
1439 unsigned long tmp;
1440
1441 if (!write)
1442 tmp = h->nr_overcommit_huge_pages;
1443
1444 table->data = &tmp;
1445 table->maxlen = sizeof(unsigned long);
679 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1446 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
680 spin_lock(&hugetlb_lock); 1447
681 nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; 1448 if (write) {
682 spin_unlock(&hugetlb_lock); 1449 spin_lock(&hugetlb_lock);
1450 h->nr_overcommit_huge_pages = tmp;
1451 spin_unlock(&hugetlb_lock);
1452 }
1453
683 return 0; 1454 return 0;
684} 1455}
685 1456
686#endif /* CONFIG_SYSCTL */ 1457#endif /* CONFIG_SYSCTL */
687 1458
688int hugetlb_report_meminfo(char *buf) 1459void hugetlb_report_meminfo(struct seq_file *m)
689{ 1460{
690 return sprintf(buf, 1461 struct hstate *h = &default_hstate;
691 "HugePages_Total: %5lu\n" 1462 seq_printf(m,
692 "HugePages_Free: %5lu\n" 1463 "HugePages_Total: %5lu\n"
693 "HugePages_Rsvd: %5lu\n" 1464 "HugePages_Free: %5lu\n"
694 "HugePages_Surp: %5lu\n" 1465 "HugePages_Rsvd: %5lu\n"
695 "Hugepagesize: %5lu kB\n", 1466 "HugePages_Surp: %5lu\n"
696 nr_huge_pages, 1467 "Hugepagesize: %8lu kB\n",
697 free_huge_pages, 1468 h->nr_huge_pages,
698 resv_huge_pages, 1469 h->free_huge_pages,
699 surplus_huge_pages, 1470 h->resv_huge_pages,
700 HPAGE_SIZE/1024); 1471 h->surplus_huge_pages,
1472 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
701} 1473}
702 1474
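
hugetlb_report_meminfo() now writes the HugePages_* lines through seq_file. A trivial reader that filters those lines back out of /proc/meminfo:

/* Prints the HugePages_* and Hugepagesize lines from /proc/meminfo. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f) {
		perror("/proc/meminfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "HugePages_", 10) ||
		    !strncmp(line, "Hugepagesize", 12))
			fputs(line, stdout);
	fclose(f);
	return 0;
}
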
703int hugetlb_report_node_meminfo(int nid, char *buf) 1475int hugetlb_report_node_meminfo(int nid, char *buf)
704{ 1476{
1477 struct hstate *h = &default_hstate;
705 return sprintf(buf, 1478 return sprintf(buf,
706 "Node %d HugePages_Total: %5u\n" 1479 "Node %d HugePages_Total: %5u\n"
707 "Node %d HugePages_Free: %5u\n" 1480 "Node %d HugePages_Free: %5u\n"
708 "Node %d HugePages_Surp: %5u\n", 1481 "Node %d HugePages_Surp: %5u\n",
709 nid, nr_huge_pages_node[nid], 1482 nid, h->nr_huge_pages_node[nid],
710 nid, free_huge_pages_node[nid], 1483 nid, h->free_huge_pages_node[nid],
711 nid, surplus_huge_pages_node[nid]); 1484 nid, h->surplus_huge_pages_node[nid]);
712} 1485}
713 1486
714/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ 1487
715unsigned long hugetlb_total_pages(void) 1488unsigned long hugetlb_total_pages(void)
716{ 1489{
717 return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); 1490 struct hstate *h = &default_hstate;
1491 return h->nr_huge_pages * pages_per_huge_page(h);
1492}
1493
1494static int hugetlb_acct_memory(struct hstate *h, long delta)
1495{
1496 int ret = -ENOMEM;
1497
1498 spin_lock(&hugetlb_lock);
1499 /*
1500 * When cpuset is configured, it breaks the strict hugetlb page
1501 * reservation as the accounting is done on a global variable. Such
1502 * reservation is completely rubbish in the presence of cpuset because
1503 * the reservation is not checked against page availability for the
1504 * current cpuset. Application can still potentially OOM'ed by kernel
1505 * with lack of free htlb page in cpuset that the task is in.
1506 * Attempt to enforce strict accounting with cpuset is almost
1507 * impossible (or too ugly) because cpuset is too fluid that
1508 * task or memory node can be dynamically moved between cpusets.
1509 *
1510 * The change of semantics for shared hugetlb mapping with cpuset is
1511 * undesirable. However, in order to preserve some of the semantics,
1512 * we fall back to check against current free page availability as
1513 * a best attempt and hopefully to minimize the impact of changing
1514 * semantics that cpuset has.
1515 */
1516 if (delta > 0) {
1517 if (gather_surplus_pages(h, delta) < 0)
1518 goto out;
1519
1520 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
1521 return_unused_surplus_pages(h, delta);
1522 goto out;
1523 }
1524 }
1525
1526 ret = 0;
1527 if (delta < 0)
1528 return_unused_surplus_pages(h, (unsigned long) -delta);
1529
1530out:
1531 spin_unlock(&hugetlb_lock);
1532 return ret;
1533}
1534
1535static void hugetlb_vm_op_open(struct vm_area_struct *vma)
1536{
1537 struct resv_map *reservations = vma_resv_map(vma);
1538
1539 /*
 1540 * This new VMA should share its sibling's reservation map if present.
 1541 * The VMA will only ever have a valid reservation map pointer where
 1542 * it is being copied for another still existing VMA. As that VMA
 1543 * has a reference to the reservation map it cannot disappear until
1544 * after this open call completes. It is therefore safe to take a
1545 * new reference here without additional locking.
1546 */
1547 if (reservations)
1548 kref_get(&reservations->refs);
1549}
1550
1551static void hugetlb_vm_op_close(struct vm_area_struct *vma)
1552{
1553 struct hstate *h = hstate_vma(vma);
1554 struct resv_map *reservations = vma_resv_map(vma);
1555 unsigned long reserve;
1556 unsigned long start;
1557 unsigned long end;
1558
1559 if (reservations) {
1560 start = vma_hugecache_offset(h, vma, vma->vm_start);
1561 end = vma_hugecache_offset(h, vma, vma->vm_end);
1562
1563 reserve = (end - start) -
1564 region_count(&reservations->regions, start, end);
1565
1566 kref_put(&reservations->refs, resv_map_release);
1567
1568 if (reserve) {
1569 hugetlb_acct_memory(h, -reserve);
1570 hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
1571 }
1572 }
718} 1573}
719 1574
720/* 1575/*
@@ -731,6 +1586,8 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
731 1586
732struct vm_operations_struct hugetlb_vm_ops = { 1587struct vm_operations_struct hugetlb_vm_ops = {
733 .fault = hugetlb_vm_op_fault, 1588 .fault = hugetlb_vm_op_fault,
1589 .open = hugetlb_vm_op_open,
1590 .close = hugetlb_vm_op_close,
734}; 1591};
735 1592
736static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 1593static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -769,14 +1626,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
769 struct page *ptepage; 1626 struct page *ptepage;
770 unsigned long addr; 1627 unsigned long addr;
771 int cow; 1628 int cow;
1629 struct hstate *h = hstate_vma(vma);
1630 unsigned long sz = huge_page_size(h);
772 1631
773 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 1632 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
774 1633
775 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 1634 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
776 src_pte = huge_pte_offset(src, addr); 1635 src_pte = huge_pte_offset(src, addr);
777 if (!src_pte) 1636 if (!src_pte)
778 continue; 1637 continue;
779 dst_pte = huge_pte_alloc(dst, addr); 1638 dst_pte = huge_pte_alloc(dst, addr, sz);
780 if (!dst_pte) 1639 if (!dst_pte)
781 goto nomem; 1640 goto nomem;
782 1641
@@ -804,7 +1663,7 @@ nomem:
804} 1663}
805 1664
806void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 1665void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
807 unsigned long end) 1666 unsigned long end, struct page *ref_page)
808{ 1667{
809 struct mm_struct *mm = vma->vm_mm; 1668 struct mm_struct *mm = vma->vm_mm;
810 unsigned long address; 1669 unsigned long address;
@@ -812,6 +1671,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
812 pte_t pte; 1671 pte_t pte;
813 struct page *page; 1672 struct page *page;
814 struct page *tmp; 1673 struct page *tmp;
1674 struct hstate *h = hstate_vma(vma);
1675 unsigned long sz = huge_page_size(h);
1676
815 /* 1677 /*
816 * A page gathering list, protected by per file i_mmap_lock. The 1678 * A page gathering list, protected by per file i_mmap_lock. The
817 * lock is used to avoid list corruption from multiple unmapping 1679 * lock is used to avoid list corruption from multiple unmapping
@@ -820,11 +1682,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
820 LIST_HEAD(page_list); 1682 LIST_HEAD(page_list);
821 1683
822 WARN_ON(!is_vm_hugetlb_page(vma)); 1684 WARN_ON(!is_vm_hugetlb_page(vma));
823 BUG_ON(start & ~HPAGE_MASK); 1685 BUG_ON(start & ~huge_page_mask(h));
824 BUG_ON(end & ~HPAGE_MASK); 1686 BUG_ON(end & ~huge_page_mask(h));
825 1687
1688 mmu_notifier_invalidate_range_start(mm, start, end);
826 spin_lock(&mm->page_table_lock); 1689 spin_lock(&mm->page_table_lock);
827 for (address = start; address < end; address += HPAGE_SIZE) { 1690 for (address = start; address < end; address += sz) {
828 ptep = huge_pte_offset(mm, address); 1691 ptep = huge_pte_offset(mm, address);
829 if (!ptep) 1692 if (!ptep)
830 continue; 1693 continue;
@@ -832,6 +1695,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
832 if (huge_pmd_unshare(mm, &address, ptep)) 1695 if (huge_pmd_unshare(mm, &address, ptep))
833 continue; 1696 continue;
834 1697
1698 /*
1699 * If a reference page is supplied, it is because a specific
1700 * page is being unmapped, not a range. Ensure the page we
1701 * are about to unmap is the actual page of interest.
1702 */
1703 if (ref_page) {
1704 pte = huge_ptep_get(ptep);
1705 if (huge_pte_none(pte))
1706 continue;
1707 page = pte_page(pte);
1708 if (page != ref_page)
1709 continue;
1710
1711 /*
1712 * Mark the VMA as having unmapped its page so that
1713 * future faults in this VMA will fail rather than
1714 * looking like data was lost
1715 */
1716 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
1717 }
1718
835 pte = huge_ptep_get_and_clear(mm, address, ptep); 1719 pte = huge_ptep_get_and_clear(mm, address, ptep);
836 if (huge_pte_none(pte)) 1720 if (huge_pte_none(pte))
837 continue; 1721 continue;
@@ -843,6 +1727,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
843 } 1727 }
844 spin_unlock(&mm->page_table_lock); 1728 spin_unlock(&mm->page_table_lock);
845 flush_tlb_range(vma, start, end); 1729 flush_tlb_range(vma, start, end);
1730 mmu_notifier_invalidate_range_end(mm, start, end);
846 list_for_each_entry_safe(page, tmp, &page_list, lru) { 1731 list_for_each_entry_safe(page, tmp, &page_list, lru) {
847 list_del(&page->lru); 1732 list_del(&page->lru);
848 put_page(page); 1733 put_page(page);
@@ -850,31 +1735,69 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
850} 1735}
851 1736
852void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 1737void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
853 unsigned long end) 1738 unsigned long end, struct page *ref_page)
854{ 1739{
1740 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1741 __unmap_hugepage_range(vma, start, end, ref_page);
1742 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
1743}
1744
1745/*
1746 * This is called when the original mapper is failing to COW a MAP_PRIVATE
 1747 * mapping it owns the reserve page for. The intention is to unmap the page
1748 * from other VMAs and let the children be SIGKILLed if they are faulting the
1749 * same region.
1750 */
1751static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
1752 struct page *page, unsigned long address)
1753{
1754 struct vm_area_struct *iter_vma;
1755 struct address_space *mapping;
1756 struct prio_tree_iter iter;
1757 pgoff_t pgoff;
1758
855 /* 1759 /*
856 * It is undesirable to test vma->vm_file as it should be non-null 1760 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
857 * for valid hugetlb area. However, vm_file will be NULL in the error 1761 * from page cache lookup which is in HPAGE_SIZE units.
858 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
859 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
860 * to clean up. Since no pte has actually been setup, it is safe to
861 * do nothing in this case.
862 */ 1762 */
863 if (vma->vm_file) { 1763 address = address & huge_page_mask(hstate_vma(vma));
864 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 1764 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
865 __unmap_hugepage_range(vma, start, end); 1765 + (vma->vm_pgoff >> PAGE_SHIFT);
866 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 1766 mapping = (struct address_space *)page_private(page);
1767
1768 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1769 /* Do not unmap the current VMA */
1770 if (iter_vma == vma)
1771 continue;
1772
1773 /*
1774 * Unmap the page from other VMAs without their own reserves.
1775 * They get marked to be SIGKILLed if they fault in these
1776 * areas. This is because a future no-page fault on this VMA
1777 * could insert a zeroed page instead of the data existing
 1778 * from the time of fork. This would look like data corruption.
1779 */
1780 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
1781 unmap_hugepage_range(iter_vma,
1782 address, address + HPAGE_SIZE,
1783 page);
867 } 1784 }
1785
1786 return 1;
868} 1787}
869 1788
870static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 1789static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
871 unsigned long address, pte_t *ptep, pte_t pte) 1790 unsigned long address, pte_t *ptep, pte_t pte,
1791 struct page *pagecache_page)
872{ 1792{
1793 struct hstate *h = hstate_vma(vma);
873 struct page *old_page, *new_page; 1794 struct page *old_page, *new_page;
874 int avoidcopy; 1795 int avoidcopy;
1796 int outside_reserve = 0;
875 1797
876 old_page = pte_page(pte); 1798 old_page = pte_page(pte);
877 1799
1800retry_avoidcopy:
878 /* If no-one else is actually using this page, avoid the copy 1801 /* If no-one else is actually using this page, avoid the copy
879 * and just make the page writable */ 1802 * and just make the page writable */
880 avoidcopy = (page_count(old_page) == 1); 1803 avoidcopy = (page_count(old_page) == 1);
@@ -883,11 +1806,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
883 return 0; 1806 return 0;
884 } 1807 }
885 1808
1809 /*
1810 * If the process that created a MAP_PRIVATE mapping is about to
1811 * perform a COW due to a shared page count, attempt to satisfy
1812 * the allocation without using the existing reserves. The pagecache
1813 * page is used to determine if the reserve at this address was
1814 * consumed or not. If reserves were used, a partial faulted mapping
1815 * at the time of fork() could consume its reserves on COW instead
1816 * of the full address range.
1817 */
1818 if (!(vma->vm_flags & VM_SHARED) &&
1819 is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
1820 old_page != pagecache_page)
1821 outside_reserve = 1;
1822
886 page_cache_get(old_page); 1823 page_cache_get(old_page);
887 new_page = alloc_huge_page(vma, address); 1824 new_page = alloc_huge_page(vma, address, outside_reserve);
888 1825
889 if (IS_ERR(new_page)) { 1826 if (IS_ERR(new_page)) {
890 page_cache_release(old_page); 1827 page_cache_release(old_page);
1828
1829 /*
1830 * If a process owning a MAP_PRIVATE mapping fails to COW,
1831 * it is due to references held by a child and an insufficient
 1832 * huge page pool. To guarantee the original mapper's
1833 * reliability, unmap the page from child processes. The child
1834 * may get SIGKILLed if it later faults.
1835 */
1836 if (outside_reserve) {
1837 BUG_ON(huge_pte_none(pte));
1838 if (unmap_ref_private(mm, vma, old_page, address)) {
1839 BUG_ON(page_count(old_page) != 1);
1840 BUG_ON(huge_pte_none(pte));
1841 goto retry_avoidcopy;
1842 }
1843 WARN_ON_ONCE(1);
1844 }
1845
891 return -PTR_ERR(new_page); 1846 return -PTR_ERR(new_page);
892 } 1847 }
893 1848
@@ -896,7 +1851,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
896 __SetPageUptodate(new_page); 1851 __SetPageUptodate(new_page);
897 spin_lock(&mm->page_table_lock); 1852 spin_lock(&mm->page_table_lock);
898 1853
899 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 1854 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
900 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 1855 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
901 /* Break COW */ 1856 /* Break COW */
902 huge_ptep_clear_flush(vma, address, ptep); 1857 huge_ptep_clear_flush(vma, address, ptep);
@@ -910,19 +1865,44 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
910 return 0; 1865 return 0;
911} 1866}
912 1867
1868/* Return the pagecache page at a given address within a VMA */
1869static struct page *hugetlbfs_pagecache_page(struct hstate *h,
1870 struct vm_area_struct *vma, unsigned long address)
1871{
1872 struct address_space *mapping;
1873 pgoff_t idx;
1874
1875 mapping = vma->vm_file->f_mapping;
1876 idx = vma_hugecache_offset(h, vma, address);
1877
1878 return find_lock_page(mapping, idx);
1879}
1880
913static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 1881static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
914 unsigned long address, pte_t *ptep, int write_access) 1882 unsigned long address, pte_t *ptep, int write_access)
915{ 1883{
1884 struct hstate *h = hstate_vma(vma);
916 int ret = VM_FAULT_SIGBUS; 1885 int ret = VM_FAULT_SIGBUS;
917 unsigned long idx; 1886 pgoff_t idx;
918 unsigned long size; 1887 unsigned long size;
919 struct page *page; 1888 struct page *page;
920 struct address_space *mapping; 1889 struct address_space *mapping;
921 pte_t new_pte; 1890 pte_t new_pte;
922 1891
1892 /*
1893 * Currently, we are forced to kill the process in the event the
1894 * original mapper has unmapped pages from the child due to a failed
 1895 * COW. Warn that such a situation has occurred as it may not be obvious
1896 */
1897 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
1898 printk(KERN_WARNING
1899 "PID %d killed due to inadequate hugepage pool\n",
1900 current->pid);
1901 return ret;
1902 }
1903
923 mapping = vma->vm_file->f_mapping; 1904 mapping = vma->vm_file->f_mapping;
924 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 1905 idx = vma_hugecache_offset(h, vma, address);
925 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
926 1906
927 /* 1907 /*
928 * Use page lock to guard against racing truncation 1908 * Use page lock to guard against racing truncation
@@ -931,15 +1911,15 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
931retry: 1911retry:
932 page = find_lock_page(mapping, idx); 1912 page = find_lock_page(mapping, idx);
933 if (!page) { 1913 if (!page) {
934 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1914 size = i_size_read(mapping->host) >> huge_page_shift(h);
935 if (idx >= size) 1915 if (idx >= size)
936 goto out; 1916 goto out;
937 page = alloc_huge_page(vma, address); 1917 page = alloc_huge_page(vma, address, 0);
938 if (IS_ERR(page)) { 1918 if (IS_ERR(page)) {
939 ret = -PTR_ERR(page); 1919 ret = -PTR_ERR(page);
940 goto out; 1920 goto out;
941 } 1921 }
942 clear_huge_page(page, address); 1922 clear_huge_page(page, address, huge_page_size(h));
943 __SetPageUptodate(page); 1923 __SetPageUptodate(page);
944 1924
945 if (vma->vm_flags & VM_SHARED) { 1925 if (vma->vm_flags & VM_SHARED) {
@@ -955,14 +1935,26 @@ retry:
955 } 1935 }
956 1936
957 spin_lock(&inode->i_lock); 1937 spin_lock(&inode->i_lock);
958 inode->i_blocks += BLOCKS_PER_HUGEPAGE; 1938 inode->i_blocks += blocks_per_huge_page(h);
959 spin_unlock(&inode->i_lock); 1939 spin_unlock(&inode->i_lock);
960 } else 1940 } else
961 lock_page(page); 1941 lock_page(page);
962 } 1942 }
963 1943
1944 /*
1945 * If we are going to COW a private mapping later, we examine the
1946 * pending reservations for this page now. This will ensure that
1947 * any allocations necessary to record that reservation occur outside
1948 * the spinlock.
1949 */
1950 if (write_access && !(vma->vm_flags & VM_SHARED))
1951 if (vma_needs_reservation(h, vma, address) < 0) {
1952 ret = VM_FAULT_OOM;
1953 goto backout_unlocked;
1954 }
1955
964 spin_lock(&mm->page_table_lock); 1956 spin_lock(&mm->page_table_lock);
965 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1957 size = i_size_read(mapping->host) >> huge_page_shift(h);
966 if (idx >= size) 1958 if (idx >= size)
967 goto backout; 1959 goto backout;
968 1960
@@ -976,7 +1968,7 @@ retry:
976 1968
977 if (write_access && !(vma->vm_flags & VM_SHARED)) { 1969 if (write_access && !(vma->vm_flags & VM_SHARED)) {
978 /* Optimization, do the COW without a second fault */ 1970 /* Optimization, do the COW without a second fault */
979 ret = hugetlb_cow(mm, vma, address, ptep, new_pte); 1971 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
980 } 1972 }
981 1973
982 spin_unlock(&mm->page_table_lock); 1974 spin_unlock(&mm->page_table_lock);
@@ -986,6 +1978,7 @@ out:
986 1978
987backout: 1979backout:
988 spin_unlock(&mm->page_table_lock); 1980 spin_unlock(&mm->page_table_lock);
1981backout_unlocked:
989 unlock_page(page); 1982 unlock_page(page);
990 put_page(page); 1983 put_page(page);
991 goto out; 1984 goto out;
@@ -997,9 +1990,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
997 pte_t *ptep; 1990 pte_t *ptep;
998 pte_t entry; 1991 pte_t entry;
999 int ret; 1992 int ret;
1993 struct page *pagecache_page = NULL;
1000 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 1994 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
1995 struct hstate *h = hstate_vma(vma);
1001 1996
1002 ptep = huge_pte_alloc(mm, address); 1997 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
1003 if (!ptep) 1998 if (!ptep)
1004 return VM_FAULT_OOM; 1999 return VM_FAULT_OOM;
1005 2000
@@ -1012,23 +2007,79 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
1012 entry = huge_ptep_get(ptep); 2007 entry = huge_ptep_get(ptep);
1013 if (huge_pte_none(entry)) { 2008 if (huge_pte_none(entry)) {
1014 ret = hugetlb_no_page(mm, vma, address, ptep, write_access); 2009 ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
1015 mutex_unlock(&hugetlb_instantiation_mutex); 2010 goto out_mutex;
1016 return ret;
1017 } 2011 }
1018 2012
1019 ret = 0; 2013 ret = 0;
1020 2014
2015 /*
2016 * If we are going to COW the mapping later, we examine the pending
2017 * reservations for this page now. This will ensure that any
2018 * allocations necessary to record that reservation occur outside the
2019 * spinlock. For private mappings, we also lookup the pagecache
2020 * page now as it is used to determine if a reservation has been
2021 * consumed.
2022 */
2023 if (write_access && !pte_write(entry)) {
2024 if (vma_needs_reservation(h, vma, address) < 0) {
2025 ret = VM_FAULT_OOM;
2026 goto out_mutex;
2027 }
2028
2029 if (!(vma->vm_flags & VM_SHARED))
2030 pagecache_page = hugetlbfs_pagecache_page(h,
2031 vma, address);
2032 }
2033
1021 spin_lock(&mm->page_table_lock); 2034 spin_lock(&mm->page_table_lock);
1022 /* Check for a racing update before calling hugetlb_cow */ 2035 /* Check for a racing update before calling hugetlb_cow */
1023 if (likely(pte_same(entry, huge_ptep_get(ptep)))) 2036 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
1024 if (write_access && !pte_write(entry)) 2037 goto out_page_table_lock;
1025 ret = hugetlb_cow(mm, vma, address, ptep, entry); 2038
2039
2040 if (write_access) {
2041 if (!pte_write(entry)) {
2042 ret = hugetlb_cow(mm, vma, address, ptep, entry,
2043 pagecache_page);
2044 goto out_page_table_lock;
2045 }
2046 entry = pte_mkdirty(entry);
2047 }
2048 entry = pte_mkyoung(entry);
2049 if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access))
2050 update_mmu_cache(vma, address, entry);
2051
2052out_page_table_lock:
1026 spin_unlock(&mm->page_table_lock); 2053 spin_unlock(&mm->page_table_lock);
2054
2055 if (pagecache_page) {
2056 unlock_page(pagecache_page);
2057 put_page(pagecache_page);
2058 }
2059
2060out_mutex:
1027 mutex_unlock(&hugetlb_instantiation_mutex); 2061 mutex_unlock(&hugetlb_instantiation_mutex);
1028 2062
1029 return ret; 2063 return ret;
1030} 2064}
1031 2065
2066/* Can be overriden by architectures */
2067__attribute__((weak)) struct page *
2068follow_huge_pud(struct mm_struct *mm, unsigned long address,
2069 pud_t *pud, int write)
2070{
2071 BUG();
2072 return NULL;
2073}
2074
2075static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
2076{
2077 if (!ptep || write || shared)
2078 return 0;
2079 else
2080 return huge_pte_none(huge_ptep_get(ptep));
2081}
2082
1032int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2083int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1033 struct page **pages, struct vm_area_struct **vmas, 2084 struct page **pages, struct vm_area_struct **vmas,
1034 unsigned long *position, int *length, int i, 2085 unsigned long *position, int *length, int i,
@@ -1037,6 +2088,9 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1037 unsigned long pfn_offset; 2088 unsigned long pfn_offset;
1038 unsigned long vaddr = *position; 2089 unsigned long vaddr = *position;
1039 int remainder = *length; 2090 int remainder = *length;
2091 struct hstate *h = hstate_vma(vma);
2092 int zeropage_ok = 0;
2093 int shared = vma->vm_flags & VM_SHARED;
1040 2094
1041 spin_lock(&mm->page_table_lock); 2095 spin_lock(&mm->page_table_lock);
1042 while (vaddr < vma->vm_end && remainder) { 2096 while (vaddr < vma->vm_end && remainder) {
@@ -1048,9 +2102,12 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 1048 * each hugepage. We have to make sure we get the 2102
1049 * first, for the page indexing below to work. 2103 * first, for the page indexing below to work.
1050 */ 2104 */
1051 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); 2105 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
2106 if (huge_zeropage_ok(pte, write, shared))
2107 zeropage_ok = 1;
1052 2108
1053 if (!pte || huge_pte_none(huge_ptep_get(pte)) || 2109 if (!pte ||
2110 (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
1054 (write && !pte_write(huge_ptep_get(pte)))) { 2111 (write && !pte_write(huge_ptep_get(pte)))) {
1055 int ret; 2112 int ret;
1056 2113
@@ -1066,12 +2123,15 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1066 break; 2123 break;
1067 } 2124 }
1068 2125
1069 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; 2126 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
1070 page = pte_page(huge_ptep_get(pte)); 2127 page = pte_page(huge_ptep_get(pte));
1071same_page: 2128same_page:
1072 if (pages) { 2129 if (pages) {
1073 get_page(page); 2130 if (zeropage_ok)
1074 pages[i] = page + pfn_offset; 2131 pages[i] = ZERO_PAGE(0);
2132 else
2133 pages[i] = page + pfn_offset;
2134 get_page(pages[i]);
1075 } 2135 }
1076 2136
1077 if (vmas) 2137 if (vmas)
@@ -1082,7 +2142,7 @@ same_page:
1082 --remainder; 2142 --remainder;
1083 ++i; 2143 ++i;
1084 if (vaddr < vma->vm_end && remainder && 2144 if (vaddr < vma->vm_end && remainder &&
1085 pfn_offset < HPAGE_SIZE/PAGE_SIZE) { 2145 pfn_offset < pages_per_huge_page(h)) {
1086 /* 2146 /*
1087 * We use pfn_offset to avoid touching the pageframes 2147 * We use pfn_offset to avoid touching the pageframes
1088 * of this compound page. 2148 * of this compound page.
@@ -1104,13 +2164,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1104 unsigned long start = address; 2164 unsigned long start = address;
1105 pte_t *ptep; 2165 pte_t *ptep;
1106 pte_t pte; 2166 pte_t pte;
2167 struct hstate *h = hstate_vma(vma);
1107 2168
1108 BUG_ON(address >= end); 2169 BUG_ON(address >= end);
1109 flush_cache_range(vma, address, end); 2170 flush_cache_range(vma, address, end);
1110 2171
1111 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2172 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1112 spin_lock(&mm->page_table_lock); 2173 spin_lock(&mm->page_table_lock);
1113 for (; address < end; address += HPAGE_SIZE) { 2174 for (; address < end; address += huge_page_size(h)) {
1114 ptep = huge_pte_offset(mm, address); 2175 ptep = huge_pte_offset(mm, address);
1115 if (!ptep) 2176 if (!ptep)
1116 continue; 2177 continue;
@@ -1128,195 +2189,59 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1128 flush_tlb_range(vma, start, end); 2189 flush_tlb_range(vma, start, end);
1129} 2190}
1130 2191
1131struct file_region { 2192int hugetlb_reserve_pages(struct inode *inode,
1132 struct list_head link; 2193 long from, long to,
1133 long from; 2194 struct vm_area_struct *vma)
1134 long to;
1135};
1136
1137static long region_add(struct list_head *head, long f, long t)
1138{
1139 struct file_region *rg, *nrg, *trg;
1140
1141 /* Locate the region we are either in or before. */
1142 list_for_each_entry(rg, head, link)
1143 if (f <= rg->to)
1144 break;
1145
1146 /* Round our left edge to the current segment if it encloses us. */
1147 if (f > rg->from)
1148 f = rg->from;
1149
1150 /* Check for and consume any regions we now overlap with. */
1151 nrg = rg;
1152 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1153 if (&rg->link == head)
1154 break;
1155 if (rg->from > t)
1156 break;
1157
1158 /* If this area reaches higher then extend our area to
1159 * include it completely. If this is not the first area
1160 * which we intend to reuse, free it. */
1161 if (rg->to > t)
1162 t = rg->to;
1163 if (rg != nrg) {
1164 list_del(&rg->link);
1165 kfree(rg);
1166 }
1167 }
1168 nrg->from = f;
1169 nrg->to = t;
1170 return 0;
1171}
1172
1173static long region_chg(struct list_head *head, long f, long t)
1174{
1175 struct file_region *rg, *nrg;
1176 long chg = 0;
1177
1178 /* Locate the region we are before or in. */
1179 list_for_each_entry(rg, head, link)
1180 if (f <= rg->to)
1181 break;
1182
1183 /* If we are below the current region then a new region is required.
1184 * Subtle, allocate a new region at the position but make it zero
1185 * size such that we can guarantee to record the reservation. */
1186 if (&rg->link == head || t < rg->from) {
1187 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
1188 if (!nrg)
1189 return -ENOMEM;
1190 nrg->from = f;
1191 nrg->to = f;
1192 INIT_LIST_HEAD(&nrg->link);
1193 list_add(&nrg->link, rg->link.prev);
1194
1195 return t - f;
1196 }
1197
1198 /* Round our left edge to the current segment if it encloses us. */
1199 if (f > rg->from)
1200 f = rg->from;
1201 chg = t - f;
1202
1203 /* Check for and consume any regions we now overlap with. */
1204 list_for_each_entry(rg, rg->link.prev, link) {
1205 if (&rg->link == head)
1206 break;
1207 if (rg->from > t)
1208 return chg;
1209
 1210 /* We overlap with this area, if it extends further than
1211 * us then we must extend ourselves. Account for its
1212 * existing reservation. */
1213 if (rg->to > t) {
1214 chg += rg->to - t;
1215 t = rg->to;
1216 }
1217 chg -= rg->to - rg->from;
1218 }
1219 return chg;
1220}
1221
1222static long region_truncate(struct list_head *head, long end)
1223{ 2195{
1224 struct file_region *rg, *trg; 2196 long ret, chg;
1225 long chg = 0; 2197 struct hstate *h = hstate_inode(inode);
1226 2198
1227 /* Locate the region we are either in or before. */ 2199 if (vma && vma->vm_flags & VM_NORESERVE)
1228 list_for_each_entry(rg, head, link)
1229 if (end <= rg->to)
1230 break;
1231 if (&rg->link == head)
1232 return 0; 2200 return 0;
1233 2201
1234 /* If we are in the middle of a region then adjust it. */
1235 if (end > rg->from) {
1236 chg = rg->to - end;
1237 rg->to = end;
1238 rg = list_entry(rg->link.next, typeof(*rg), link);
1239 }
1240
1241 /* Drop any remaining regions. */
1242 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1243 if (&rg->link == head)
1244 break;
1245 chg += rg->to - rg->from;
1246 list_del(&rg->link);
1247 kfree(rg);
1248 }
1249 return chg;
1250}
1251
1252static int hugetlb_acct_memory(long delta)
1253{
1254 int ret = -ENOMEM;
1255
1256 spin_lock(&hugetlb_lock);
1257 /* 2202 /*
1258 * When cpuset is configured, it breaks the strict hugetlb page 2203 * Shared mappings base their reservation on the number of pages that
1259 * reservation as the accounting is done on a global variable. Such 2204 * are already allocated on behalf of the file. Private mappings need
1260 * reservation is completely rubbish in the presence of cpuset because 2205 * to reserve the full area even if read-only as mprotect() may be
1261 * the reservation is not checked against page availability for the 2206 * called to make the mapping read-write. Assume !vma is a shm mapping
1262 * current cpuset. Application can still potentially OOM'ed by kernel
1263 * with lack of free htlb page in cpuset that the task is in.
1264 * Attempt to enforce strict accounting with cpuset is almost
1265 * impossible (or too ugly) because cpuset is too fluid that
1266 * task or memory node can be dynamically moved between cpusets.
1267 *
1268 * The change of semantics for shared hugetlb mapping with cpuset is
1269 * undesirable. However, in order to preserve some of the semantics,
1270 * we fall back to check against current free page availability as
1271 * a best attempt and hopefully to minimize the impact of changing
1272 * semantics that cpuset has.
1273 */ 2207 */
1274 if (delta > 0) { 2208 if (!vma || vma->vm_flags & VM_SHARED)
1275 if (gather_surplus_pages(delta) < 0) 2209 chg = region_chg(&inode->i_mapping->private_list, from, to);
1276 goto out; 2210 else {
1277 2211 struct resv_map *resv_map = resv_map_alloc();
1278 if (delta > cpuset_mems_nr(free_huge_pages_node)) { 2212 if (!resv_map)
1279 return_unused_surplus_pages(delta); 2213 return -ENOMEM;
1280 goto out;
1281 }
1282 }
1283 2214
1284 ret = 0; 2215 chg = to - from;
1285 if (delta < 0)
1286 return_unused_surplus_pages((unsigned long) -delta);
1287 2216
1288out: 2217 set_vma_resv_map(vma, resv_map);
1289 spin_unlock(&hugetlb_lock); 2218 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
1290 return ret; 2219 }
1291}
1292
1293int hugetlb_reserve_pages(struct inode *inode, long from, long to)
1294{
1295 long ret, chg;
1296 2220
1297 chg = region_chg(&inode->i_mapping->private_list, from, to);
1298 if (chg < 0) 2221 if (chg < 0)
1299 return chg; 2222 return chg;
1300 2223
1301 if (hugetlb_get_quota(inode->i_mapping, chg)) 2224 if (hugetlb_get_quota(inode->i_mapping, chg))
1302 return -ENOSPC; 2225 return -ENOSPC;
1303 ret = hugetlb_acct_memory(chg); 2226 ret = hugetlb_acct_memory(h, chg);
1304 if (ret < 0) { 2227 if (ret < 0) {
1305 hugetlb_put_quota(inode->i_mapping, chg); 2228 hugetlb_put_quota(inode->i_mapping, chg);
1306 return ret; 2229 return ret;
1307 } 2230 }
1308 region_add(&inode->i_mapping->private_list, from, to); 2231 if (!vma || vma->vm_flags & VM_SHARED)
2232 region_add(&inode->i_mapping->private_list, from, to);
1309 return 0; 2233 return 0;
1310} 2234}
1311 2235
1312void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 2236void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
1313{ 2237{
2238 struct hstate *h = hstate_inode(inode);
1314 long chg = region_truncate(&inode->i_mapping->private_list, offset); 2239 long chg = region_truncate(&inode->i_mapping->private_list, offset);
1315 2240
1316 spin_lock(&inode->i_lock); 2241 spin_lock(&inode->i_lock);
1317 inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; 2242 inode->i_blocks -= blocks_per_huge_page(h);
1318 spin_unlock(&inode->i_lock); 2243 spin_unlock(&inode->i_lock);
1319 2244
1320 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2245 hugetlb_put_quota(inode->i_mapping, (chg - freed));
1321 hugetlb_acct_memory(-(chg - freed)); 2246 hugetlb_acct_memory(h, -(chg - freed));
1322} 2247}
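
The region_chg()/region_add()/region_truncate() hunks above all manipulate the same structure: a list of non-overlapping [from, to) page ranges recording which parts of a hugetlbfs file already carry a reservation. As a rough illustration of that bookkeeping, here is a minimal userspace sketch using a simplified singly linked list and illustrative names; it is not the kernel code and omits locking and the zero-size placeholder trick.

/*
 * Minimal model of the reservation map: region_needed() plays the role
 * of region_chg() ("how many pages in [f, t) are not yet covered?") and
 * region_merge() plays the role of region_add() (fold [f, t) into the map).
 */
#include <stdio.h>
#include <stdlib.h>

struct region {
	long from, to;
	struct region *next;
};

/* Pages in [f, t) not already covered by the map. */
static long region_needed(struct region *head, long f, long t)
{
	long covered = 0;

	for (struct region *rg = head; rg; rg = rg->next) {
		long lo = rg->from > f ? rg->from : f;
		long hi = rg->to < t ? rg->to : t;

		if (lo < hi)
			covered += hi - lo;
	}
	return (t - f) - covered;
}

/* Fold [f, t) into the map, absorbing any overlapping or adjacent region. */
static struct region *region_merge(struct region *head, long f, long t)
{
	struct region *rg = head, *nrg, **prev = &head;

	while (rg) {
		if (rg->to >= f && rg->from <= t) {
			if (rg->from < f)
				f = rg->from;
			if (rg->to > t)
				t = rg->to;
			*prev = rg->next;
			free(rg);
			rg = *prev;
		} else {
			prev = &rg->next;
			rg = rg->next;
		}
	}
	nrg = malloc(sizeof(*nrg));
	if (!nrg)
		abort();	/* sketch only: no graceful -ENOMEM path */
	nrg->from = f;
	nrg->to = t;
	nrg->next = head;
	return nrg;
}

int main(void)
{
	struct region *map = NULL;

	printf("need %ld\n", region_needed(map, 0, 4));	/* 4 */
	map = region_merge(map, 0, 4);
	printf("need %ld\n", region_needed(map, 2, 8));	/* 4 */
	map = region_merge(map, 2, 8);
	printf("need %ld\n", region_needed(map, 0, 8));	/* 0 */
	return 0;
}

The kernel version differs in one important way: region_chg() pre-allocates a zero-size placeholder entry, so the later region_add() can update the list without allocating and therefore cannot fail.
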
diff --git a/mm/internal.h b/mm/internal.h
index 0034e947e4bc..e4e728bdf324 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,6 +13,11 @@
13 13
14#include <linux/mm.h> 14#include <linux/mm.h>
15 15
16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
17 unsigned long floor, unsigned long ceiling);
18
19extern void prep_compound_page(struct page *page, unsigned long order);
20
16static inline void set_page_count(struct page *page, int v) 21static inline void set_page_count(struct page *page, int v)
17{ 22{
18 atomic_set(&page->_count, v); 23 atomic_set(&page->_count, v);
@@ -34,6 +39,15 @@ static inline void __put_page(struct page *page)
34 atomic_dec(&page->_count); 39 atomic_dec(&page->_count);
35} 40}
36 41
42/*
43 * in mm/vmscan.c:
44 */
45extern int isolate_lru_page(struct page *page);
46extern void putback_lru_page(struct page *page);
47
48/*
49 * in mm/page_alloc.c
50 */
37extern void __free_pages_bootmem(struct page *page, unsigned int order); 51extern void __free_pages_bootmem(struct page *page, unsigned int order);
38 52
39/* 53/*
@@ -47,6 +61,120 @@ static inline unsigned long page_order(struct page *page)
47 return page_private(page); 61 return page_private(page);
48} 62}
49 63
64extern long mlock_vma_pages_range(struct vm_area_struct *vma,
65 unsigned long start, unsigned long end);
66extern void munlock_vma_pages_range(struct vm_area_struct *vma,
67 unsigned long start, unsigned long end);
68static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
69{
70 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
71}
72
73#ifdef CONFIG_UNEVICTABLE_LRU
74/*
75 * unevictable_migrate_page() is called only from migrate_page_copy() to
76 * migrate the unevictable flag to the new page.
77 * Note that the old page has been isolated from the LRU lists at this
78 * point so we don't need to worry about LRU statistics.
79 */
80static inline void unevictable_migrate_page(struct page *new, struct page *old)
81{
82 if (TestClearPageUnevictable(old))
83 SetPageUnevictable(new);
84}
85#else
86static inline void unevictable_migrate_page(struct page *new, struct page *old)
87{
88}
89#endif
90
91#ifdef CONFIG_UNEVICTABLE_LRU
92/*
93 * Called only in fault path via page_evictable() for a new page
94 * to determine if it's being mapped into a LOCKED vma.
95 * If so, mark page as mlocked.
96 */
97static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
98{
99 VM_BUG_ON(PageLRU(page));
100
101 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
102 return 0;
103
104 if (!TestSetPageMlocked(page)) {
105 inc_zone_page_state(page, NR_MLOCK);
106 count_vm_event(UNEVICTABLE_PGMLOCKED);
107 }
108 return 1;
109}
110
111/*
112 * must be called with vma's mmap_sem held for read, and page locked.
113 */
114extern void mlock_vma_page(struct page *page);
115
116/*
117 * Clear the page's PageMlocked(). This can be useful in a situation where
118 * we want to unconditionally remove a page from the pagecache -- e.g.,
119 * on truncation or freeing.
120 *
121 * It is legal to call this function for any page, mlocked or not.
122 * If called for a page that is still mapped by mlocked vmas, all we do
123 * is revert to lazy LRU behaviour -- semantics are not broken.
124 */
125extern void __clear_page_mlock(struct page *page);
126static inline void clear_page_mlock(struct page *page)
127{
128 if (unlikely(TestClearPageMlocked(page)))
129 __clear_page_mlock(page);
130}
131
132/*
133 * mlock_migrate_page - called only from migrate_page_copy() to
134 * migrate the Mlocked page flag; update statistics.
135 */
136static inline void mlock_migrate_page(struct page *newpage, struct page *page)
137{
138 if (TestClearPageMlocked(page)) {
139 unsigned long flags;
140
141 local_irq_save(flags);
142 __dec_zone_page_state(page, NR_MLOCK);
143 SetPageMlocked(newpage);
144 __inc_zone_page_state(newpage, NR_MLOCK);
145 local_irq_restore(flags);
146 }
147}
148
149/*
150 * free_page_mlock() -- clean up attempts to free an mlocked() page.
151 * Page should not be on lru, so no need to fix that up.
152 * free_pages_check() will verify...
153 */
154static inline void free_page_mlock(struct page *page)
155{
156 if (unlikely(TestClearPageMlocked(page))) {
157 unsigned long flags;
158
159 local_irq_save(flags);
160 __dec_zone_page_state(page, NR_MLOCK);
161 __count_vm_event(UNEVICTABLE_MLOCKFREED);
162 local_irq_restore(flags);
163 }
164}
165
166#else /* CONFIG_UNEVICTABLE_LRU */
167static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
168{
169 return 0;
170}
171static inline void clear_page_mlock(struct page *page) { }
172static inline void mlock_vma_page(struct page *page) { }
173static inline void mlock_migrate_page(struct page *new, struct page *old) { }
174static inline void free_page_mlock(struct page *page) { }
175
176#endif /* CONFIG_UNEVICTABLE_LRU */
177
50/* 178/*
51 * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, 179 * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
52 * so all functions starting at paging_init should be marked __init 180 * so all functions starting at paging_init should be marked __init
@@ -59,4 +187,68 @@ static inline unsigned long page_order(struct page *page)
59#define __paginginit __init 187#define __paginginit __init
60#endif 188#endif
61 189
190/* Memory initialisation debug and verification */
191enum mminit_level {
192 MMINIT_WARNING,
193 MMINIT_VERIFY,
194 MMINIT_TRACE
195};
196
197#ifdef CONFIG_DEBUG_MEMORY_INIT
198
199extern int mminit_loglevel;
200
201#define mminit_dprintk(level, prefix, fmt, arg...) \
202do { \
203 if (level < mminit_loglevel) { \
204 printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
205 printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
206 } \
207} while (0)
208
209extern void mminit_verify_pageflags_layout(void);
210extern void mminit_verify_page_links(struct page *page,
211 enum zone_type zone, unsigned long nid, unsigned long pfn);
212extern void mminit_verify_zonelist(void);
213
214#else
215
216static inline void mminit_dprintk(enum mminit_level level,
217 const char *prefix, const char *fmt, ...)
218{
219}
220
221static inline void mminit_verify_pageflags_layout(void)
222{
223}
224
225static inline void mminit_verify_page_links(struct page *page,
226 enum zone_type zone, unsigned long nid, unsigned long pfn)
227{
228}
229
230static inline void mminit_verify_zonelist(void)
231{
232}
233#endif /* CONFIG_DEBUG_MEMORY_INIT */
234
235/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
236#if defined(CONFIG_SPARSEMEM)
237extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
238 unsigned long *end_pfn);
239#else
240static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
241 unsigned long *end_pfn)
242{
243}
244#endif /* CONFIG_SPARSEMEM */
245
246#define GUP_FLAGS_WRITE 0x1
247#define GUP_FLAGS_FORCE 0x2
248#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
249
250int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
251 unsigned long start, int len, int flags,
252 struct page **pages, struct vm_area_struct **vmas);
253
62#endif 254#endif
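
The is_mlocked_vma() helper added above hinges on one flag test: masking vm_flags with (VM_LOCKED | VM_SPECIAL) and comparing the result against VM_LOCKED alone, so a VMA only counts as mlocked when VM_LOCKED is set and none of the special bits are. A standalone sketch of just that test, using made-up flag values rather than the kernel's headers:

#include <stdio.h>

#define VM_LOCKED  0x00002000UL		/* illustrative values only */
#define VM_IO      0x00004000UL
#define VM_PFNMAP  0x00000400UL
#define VM_SPECIAL (VM_IO | VM_PFNMAP)	/* simplified; the real mask has more bits */

static int counts_as_mlocked(unsigned long vm_flags)
{
	return (vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
}

int main(void)
{
	printf("%d\n", counts_as_mlocked(VM_LOCKED));		/* 1 */
	printf("%d\n", counts_as_mlocked(VM_LOCKED | VM_IO));	/* 0 */
	printf("%d\n", counts_as_mlocked(VM_IO));		/* 0 */
	return 0;
}
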
diff --git a/mm/madvise.c b/mm/madvise.c
index 23a0ec3e0ea0..f9349c18a1b5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -132,10 +132,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
132 * Application no longer needs these pages. If the pages are dirty, 132 * Application no longer needs these pages. If the pages are dirty,
133 * it's OK to just throw them away. The app will be more careful about 133 * it's OK to just throw them away. The app will be more careful about
134 * data it wants to keep. Be sure to free swap resources too. The 134 * data it wants to keep. Be sure to free swap resources too. The
135 * zap_page_range call sets things up for refill_inactive to actually free 135 * zap_page_range call sets things up for shrink_active_list to actually free
136 * these pages later if no one else has touched them in the meantime, 136 * these pages later if no one else has touched them in the meantime,
137 * although we could add these pages to a global reuse list for 137 * although we could add these pages to a global reuse list for
138 * refill_inactive to pick up before reclaiming other pages. 138 * shrink_active_list to pick up before reclaiming other pages.
139 * 139 *
140 * NB: This interface discards data rather than pushes it out to swap, 140 * NB: This interface discards data rather than pushes it out to swap,
141 * as some implementations do. This has performance implications for 141 * as some implementations do. This has performance implications for
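
The madvise_dontneed() comment updated above describes MADV_DONTNEED's discard semantics, which are easy to observe from userspace: after MADV_DONTNEED on a private anonymous mapping, the next touch faults in fresh zero pages instead of the old data. A small Linux-specific demonstration (error handling kept minimal):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 'x', len);
	/* Tell the kernel the contents are disposable... */
	if (madvise(p, len, MADV_DONTNEED))
		return 1;
	/* ...and a later read faults in zero pages, not the old 'x' bytes. */
	printf("after MADV_DONTNEED: first byte = %d\n", p[0]);	/* 0 */
	munmap(p, len);
	return 0;
}
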
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e46451e1d9b7..866dcc7eeb0c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -32,12 +32,13 @@
32#include <linux/fs.h> 32#include <linux/fs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/vmalloc.h> 34#include <linux/vmalloc.h>
35#include <linux/mm_inline.h>
36#include <linux/page_cgroup.h>
35 37
36#include <asm/uaccess.h> 38#include <asm/uaccess.h>
37 39
38struct cgroup_subsys mem_cgroup_subsys; 40struct cgroup_subsys mem_cgroup_subsys __read_mostly;
39static const int MEM_CGROUP_RECLAIM_RETRIES = 5; 41#define MEM_CGROUP_RECLAIM_RETRIES 5
40static struct kmem_cache *page_cgroup_cache;
41 42
42/* 43/*
43 * Statistics for memory cgroup. 44 * Statistics for memory cgroup.
@@ -65,11 +66,10 @@ struct mem_cgroup_stat {
65/* 66/*
66 * For accounting under irq disable, no need for increment preempt count. 67 * For accounting under irq disable, no need for increment preempt count.
67 */ 68 */
68static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat, 69static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
69 enum mem_cgroup_stat_index idx, int val) 70 enum mem_cgroup_stat_index idx, int val)
70{ 71{
71 int cpu = smp_processor_id(); 72 stat->count[idx] += val;
72 stat->cpustat[cpu].count[idx] += val;
73} 73}
74 74
75static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, 75static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
@@ -85,22 +85,13 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
85/* 85/*
86 * per-zone information in memory controller. 86 * per-zone information in memory controller.
87 */ 87 */
88
89enum mem_cgroup_zstat_index {
90 MEM_CGROUP_ZSTAT_ACTIVE,
91 MEM_CGROUP_ZSTAT_INACTIVE,
92
93 NR_MEM_CGROUP_ZSTAT,
94};
95
96struct mem_cgroup_per_zone { 88struct mem_cgroup_per_zone {
97 /* 89 /*
98 * spin_lock to protect the per cgroup LRU 90 * spin_lock to protect the per cgroup LRU
99 */ 91 */
100 spinlock_t lru_lock; 92 spinlock_t lru_lock;
101 struct list_head active_list; 93 struct list_head lists[NR_LRU_LISTS];
102 struct list_head inactive_list; 94 unsigned long count[NR_LRU_LISTS];
103 unsigned long count[NR_MEM_CGROUP_ZSTAT];
104}; 95};
105/* Macro for accessing counter */ 96/* Macro for accessing counter */
106#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 97#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -144,69 +135,52 @@ struct mem_cgroup {
144}; 135};
145static struct mem_cgroup init_mem_cgroup; 136static struct mem_cgroup init_mem_cgroup;
146 137
147/*
148 * We use the lower bit of the page->page_cgroup pointer as a bit spin
149 * lock. We need to ensure that page->page_cgroup is at least two
150 * byte aligned (based on comments from Nick Piggin). But since
151 * bit_spin_lock doesn't actually set that lock bit in a non-debug
152 * uniprocessor kernel, we should avoid setting it here too.
153 */
154#define PAGE_CGROUP_LOCK_BIT 0x0
155#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
156#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
157#else
158#define PAGE_CGROUP_LOCK 0x0
159#endif
160
161/*
162 * A page_cgroup page is associated with every page descriptor. The
163 * page_cgroup helps us identify information about the cgroup
164 */
165struct page_cgroup {
166 struct list_head lru; /* per cgroup LRU list */
167 struct page *page;
168 struct mem_cgroup *mem_cgroup;
169 int ref_cnt; /* cached, mapped, migrating */
170 int flags;
171};
172#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
173#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
174
175static int page_cgroup_nid(struct page_cgroup *pc)
176{
177 return page_to_nid(pc->page);
178}
179
180static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
181{
182 return page_zonenum(pc->page);
183}
184
185enum charge_type { 138enum charge_type {
186 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 139 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
187 MEM_CGROUP_CHARGE_TYPE_MAPPED, 140 MEM_CGROUP_CHARGE_TYPE_MAPPED,
141 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
142 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
143 NR_CHARGE_TYPE,
144};
145
146/* only for here (for easy reading.) */
147#define PCGF_CACHE (1UL << PCG_CACHE)
148#define PCGF_USED (1UL << PCG_USED)
149#define PCGF_ACTIVE (1UL << PCG_ACTIVE)
150#define PCGF_LOCK (1UL << PCG_LOCK)
151#define PCGF_FILE (1UL << PCG_FILE)
152static const unsigned long
153pcg_default_flags[NR_CHARGE_TYPE] = {
154 PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
155 PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
156 PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
157 0, /* FORCE */
188}; 158};
189 159
190/* 160/*
191 * Always modified under lru lock. Then, not necessary to preempt_disable() 161 * Always modified under lru lock. Then, not necessary to preempt_disable()
192 */ 162 */
193static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, 163static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
194 bool charge) 164 struct page_cgroup *pc,
165 bool charge)
195{ 166{
196 int val = (charge)? 1 : -1; 167 int val = (charge)? 1 : -1;
197 struct mem_cgroup_stat *stat = &mem->stat; 168 struct mem_cgroup_stat *stat = &mem->stat;
169 struct mem_cgroup_stat_cpu *cpustat;
198 170
199 VM_BUG_ON(!irqs_disabled()); 171 VM_BUG_ON(!irqs_disabled());
200 if (flags & PAGE_CGROUP_FLAG_CACHE) 172
201 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val); 173 cpustat = &stat->cpustat[smp_processor_id()];
174 if (PageCgroupCache(pc))
175 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
202 else 176 else
203 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); 177 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
204 178
205 if (charge) 179 if (charge)
206 __mem_cgroup_stat_add_safe(stat, 180 __mem_cgroup_stat_add_safe(cpustat,
207 MEM_CGROUP_STAT_PGPGIN_COUNT, 1); 181 MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
208 else 182 else
209 __mem_cgroup_stat_add_safe(stat, 183 __mem_cgroup_stat_add_safe(cpustat,
210 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 184 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
211} 185}
212 186
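
The hunk above makes __mem_cgroup_stat_add_safe() operate on a single per-CPU slot, with the caller picking the slot via stat->cpustat[smp_processor_id()]. A plain-C model of that layout, with a fixed fake CPU count and illustrative names rather than the kernel structures:

#include <stdio.h>

#define NR_FAKE_CPUS 4
enum stat_index { STAT_CACHE, STAT_RSS, NR_STATS };

struct stat_cpu { long count[NR_STATS]; };
struct stat_all { struct stat_cpu cpustat[NR_FAKE_CPUS]; };

/* Like __mem_cgroup_stat_add_safe(): bump a counter in one CPU's slot. */
static void stat_add(struct stat_cpu *stat, enum stat_index idx, long val)
{
	stat->count[idx] += val;
}

/* Reads sum over all per-CPU slots. */
static long stat_read(const struct stat_all *s, enum stat_index idx)
{
	long sum = 0;

	for (int cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
		sum += s->cpustat[cpu].count[idx];
	return sum;
}

int main(void)
{
	struct stat_all s = { { { { 0 } } } };

	stat_add(&s.cpustat[0], STAT_RSS, 1);	/* "CPU 0" charges a page */
	stat_add(&s.cpustat[2], STAT_RSS, 1);	/* "CPU 2" charges another */
	printf("rss = %ld\n", stat_read(&s, STAT_RSS));	/* 2 */
	return 0;
}
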
@@ -227,7 +201,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
227} 201}
228 202
229static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, 203static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
230 enum mem_cgroup_zstat_index idx) 204 enum lru_list idx)
231{ 205{
232 int nid, zid; 206 int nid, zid;
233 struct mem_cgroup_per_zone *mz; 207 struct mem_cgroup_per_zone *mz;
@@ -250,89 +224,89 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
250 224
251struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 225struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
252{ 226{
227 /*
228 * mm_update_next_owner() may clear mm->owner to NULL
229 * if it races with swapoff, page migration, etc.
230 * So this can be called with p == NULL.
231 */
232 if (unlikely(!p))
233 return NULL;
234
253 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 235 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
254 struct mem_cgroup, css); 236 struct mem_cgroup, css);
255} 237}
256 238
257static inline int page_cgroup_locked(struct page *page)
258{
259 return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
260}
261
262static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
263{
264 VM_BUG_ON(!page_cgroup_locked(page));
265 page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
266}
267
268struct page_cgroup *page_get_page_cgroup(struct page *page)
269{
270 return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
271}
272
273static void lock_page_cgroup(struct page *page)
274{
275 bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
276}
277
278static int try_lock_page_cgroup(struct page *page)
279{
280 return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
281}
282
283static void unlock_page_cgroup(struct page *page)
284{
285 bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
286}
287
288static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, 239static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
289 struct page_cgroup *pc) 240 struct page_cgroup *pc)
290{ 241{
291 int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; 242 int lru = LRU_BASE;
243
244 if (PageCgroupUnevictable(pc))
245 lru = LRU_UNEVICTABLE;
246 else {
247 if (PageCgroupActive(pc))
248 lru += LRU_ACTIVE;
249 if (PageCgroupFile(pc))
250 lru += LRU_FILE;
251 }
292 252
293 if (from) 253 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
294 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
295 else
296 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
297 254
298 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); 255 mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
299 list_del_init(&pc->lru); 256 list_del(&pc->lru);
300} 257}
301 258
302static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, 259static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
303 struct page_cgroup *pc) 260 struct page_cgroup *pc)
304{ 261{
305 int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; 262 int lru = LRU_BASE;
306 263
307 if (!to) { 264 if (PageCgroupUnevictable(pc))
308 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; 265 lru = LRU_UNEVICTABLE;
309 list_add(&pc->lru, &mz->inactive_list); 266 else {
310 } else { 267 if (PageCgroupActive(pc))
311 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; 268 lru += LRU_ACTIVE;
312 list_add(&pc->lru, &mz->active_list); 269 if (PageCgroupFile(pc))
270 lru += LRU_FILE;
313 } 271 }
314 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true); 272
273 MEM_CGROUP_ZSTAT(mz, lru) += 1;
274 list_add(&pc->lru, &mz->lists[lru]);
275
276 mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
315} 277}
316 278
317static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) 279static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
318{ 280{
319 int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
320 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); 281 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
282 int active = PageCgroupActive(pc);
283 int file = PageCgroupFile(pc);
284 int unevictable = PageCgroupUnevictable(pc);
285 enum lru_list from = unevictable ? LRU_UNEVICTABLE :
286 (LRU_FILE * !!file + !!active);
321 287
322 if (from) 288 if (lru == from)
323 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; 289 return;
324 else
325 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
326 290
327 if (active) { 291 MEM_CGROUP_ZSTAT(mz, from) -= 1;
328 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; 292 /*
329 pc->flags |= PAGE_CGROUP_FLAG_ACTIVE; 293 * However this is done under mz->lru_lock, another flags, which
330 list_move(&pc->lru, &mz->active_list); 294 * are not related to LRU, will be modified from out-of-lock.
295 * We have to use atomic set/clear flags.
296 */
297 if (is_unevictable_lru(lru)) {
298 ClearPageCgroupActive(pc);
299 SetPageCgroupUnevictable(pc);
331 } else { 300 } else {
332 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; 301 if (is_active_lru(lru))
333 pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; 302 SetPageCgroupActive(pc);
334 list_move(&pc->lru, &mz->inactive_list); 303 else
304 ClearPageCgroupActive(pc);
305 ClearPageCgroupUnevictable(pc);
335 } 306 }
307
308 MEM_CGROUP_ZSTAT(mz, lru) += 1;
309 list_move(&pc->lru, &mz->lists[lru]);
336} 310}
337 311
338int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 312int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
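
Several hunks above compute an LRU index as "LRU_FILE * !!file + !!active", falling back to LRU_UNEVICTABLE for unevictable pages. A standalone sketch of that arithmetic; the enum below mirrors the split-LRU ordering this series introduces (inactive/active anon, then inactive/active file, then unevictable), written out here purely for illustration:

#include <stdio.h>

enum lru_list {
	LRU_INACTIVE_ANON,	/* 0: base */
	LRU_ACTIVE_ANON,	/* 1: base + active */
	LRU_INACTIVE_FILE,	/* 2: base + file */
	LRU_ACTIVE_FILE,	/* 3: base + file + active */
	LRU_UNEVICTABLE,	/* 4 */
	NR_LRU_LISTS
};
#define LRU_BASE	LRU_INACTIVE_ANON
#define LRU_ACTIVE	1
#define LRU_FILE	2

static int pick_lru(int active, int file, int unevictable)
{
	if (unevictable)
		return LRU_UNEVICTABLE;
	return LRU_BASE + LRU_FILE * !!file + LRU_ACTIVE * !!active;
}

int main(void)
{
	printf("anon inactive -> %d\n", pick_lru(0, 0, 0));	/* 0 */
	printf("anon active   -> %d\n", pick_lru(1, 0, 0));	/* 1 */
	printf("file inactive -> %d\n", pick_lru(0, 1, 0));	/* 2 */
	printf("file active   -> %d\n", pick_lru(1, 1, 0));	/* 3 */
	printf("unevictable   -> %d\n", pick_lru(0, 0, 1));	/* 4 */
	return 0;
}
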
@@ -348,12 +322,15 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
348/* 322/*
349 * This routine assumes that the appropriate zone's lru lock is already held 323 * This routine assumes that the appropriate zone's lru lock is already held
350 */ 324 */
351void mem_cgroup_move_lists(struct page *page, bool active) 325void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
352{ 326{
353 struct page_cgroup *pc; 327 struct page_cgroup *pc;
354 struct mem_cgroup_per_zone *mz; 328 struct mem_cgroup_per_zone *mz;
355 unsigned long flags; 329 unsigned long flags;
356 330
331 if (mem_cgroup_subsys.disabled)
332 return;
333
357 /* 334 /*
358 * We cannot lock_page_cgroup while holding zone's lru_lock, 335 * We cannot lock_page_cgroup while holding zone's lru_lock,
359 * because other holders of lock_page_cgroup can be interrupted 336 * because other holders of lock_page_cgroup can be interrupted
@@ -361,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, bool active)
361 * safely get to page_cgroup without it, so just try_lock it: 338 * safely get to page_cgroup without it, so just try_lock it:
362 * mem_cgroup_isolate_pages allows for page left on wrong list. 339 * mem_cgroup_isolate_pages allows for page left on wrong list.
363 */ 340 */
364 if (!try_lock_page_cgroup(page)) 341 pc = lookup_page_cgroup(page);
342 if (!trylock_page_cgroup(pc))
365 return; 343 return;
366 344 if (pc && PageCgroupUsed(pc)) {
367 pc = page_get_page_cgroup(page);
368 if (pc) {
369 mz = page_cgroup_zoneinfo(pc); 345 mz = page_cgroup_zoneinfo(pc);
370 spin_lock_irqsave(&mz->lru_lock, flags); 346 spin_lock_irqsave(&mz->lru_lock, flags);
371 __mem_cgroup_move_lists(pc, active); 347 __mem_cgroup_move_lists(pc, lru);
372 spin_unlock_irqrestore(&mz->lru_lock, flags); 348 spin_unlock_irqrestore(&mz->lru_lock, flags);
373 } 349 }
374 unlock_page_cgroup(page); 350 unlock_page_cgroup(pc);
375} 351}
376 352
377/* 353/*
@@ -392,21 +368,6 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
392} 368}
393 369
394/* 370/*
395 * This function is called from vmscan.c. In page reclaiming loop. balance
396 * between active and inactive list is calculated. For memory controller
397 * page reclaiming, we should use using mem_cgroup's imbalance rather than
398 * zone's global lru imbalance.
399 */
400long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
401{
402 unsigned long active, inactive;
403 /* active and inactive are the number of pages. 'long' is ok.*/
404 active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
405 inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
406 return (long) (active / (inactive + 1));
407}
408
409/*
410 * prev_priority control...this will be used in memory reclaim path. 371 * prev_priority control...this will be used in memory reclaim path.
411 */ 372 */
412int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 373int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
@@ -433,28 +394,17 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
433 * (see include/linux/mmzone.h) 394 * (see include/linux/mmzone.h)
434 */ 395 */
435 396
436long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, 397long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
437 struct zone *zone, int priority) 398 int priority, enum lru_list lru)
438{ 399{
439 long nr_active; 400 long nr_pages;
440 int nid = zone->zone_pgdat->node_id; 401 int nid = zone->zone_pgdat->node_id;
441 int zid = zone_idx(zone); 402 int zid = zone_idx(zone);
442 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); 403 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
443 404
444 nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE); 405 nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
445 return (nr_active >> priority);
446}
447 406
448long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, 407 return (nr_pages >> priority);
449 struct zone *zone, int priority)
450{
451 long nr_inactive;
452 int nid = zone->zone_pgdat->node_id;
453 int zid = zone_idx(zone);
454 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
455
456 nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
457 return (nr_inactive >> priority);
458} 408}
459 409
460unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 410unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
@@ -462,7 +412,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
462 unsigned long *scanned, int order, 412 unsigned long *scanned, int order,
463 int mode, struct zone *z, 413 int mode, struct zone *z,
464 struct mem_cgroup *mem_cont, 414 struct mem_cgroup *mem_cont,
465 int active) 415 int active, int file)
466{ 416{
467 unsigned long nr_taken = 0; 417 unsigned long nr_taken = 0;
468 struct page *page; 418 struct page *page;
@@ -473,38 +423,38 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
473 int nid = z->zone_pgdat->node_id; 423 int nid = z->zone_pgdat->node_id;
474 int zid = zone_idx(z); 424 int zid = zone_idx(z);
475 struct mem_cgroup_per_zone *mz; 425 struct mem_cgroup_per_zone *mz;
426 int lru = LRU_FILE * !!file + !!active;
476 427
477 BUG_ON(!mem_cont); 428 BUG_ON(!mem_cont);
478 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 429 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
479 if (active) 430 src = &mz->lists[lru];
480 src = &mz->active_list;
481 else
482 src = &mz->inactive_list;
483
484 431
485 spin_lock(&mz->lru_lock); 432 spin_lock(&mz->lru_lock);
486 scan = 0; 433 scan = 0;
487 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 434 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
488 if (scan >= nr_to_scan) 435 if (scan >= nr_to_scan)
489 break; 436 break;
437 if (unlikely(!PageCgroupUsed(pc)))
438 continue;
490 page = pc->page; 439 page = pc->page;
491 440
492 if (unlikely(!PageLRU(page))) 441 if (unlikely(!PageLRU(page)))
493 continue; 442 continue;
494 443
495 if (PageActive(page) && !active) { 444 /*
496 __mem_cgroup_move_lists(pc, true); 445 * TODO: play better with lumpy reclaim, grabbing anything.
497 continue; 446 */
498 } 447 if (PageUnevictable(page) ||
499 if (!PageActive(page) && active) { 448 (PageActive(page) && !active) ||
500 __mem_cgroup_move_lists(pc, false); 449 (!PageActive(page) && active)) {
450 __mem_cgroup_move_lists(pc, page_lru(page));
501 continue; 451 continue;
502 } 452 }
503 453
504 scan++; 454 scan++;
505 list_move(&pc->lru, &pc_list); 455 list_move(&pc->lru, &pc_list);
506 456
507 if (__isolate_lru_page(page, mode) == 0) { 457 if (__isolate_lru_page(page, mode, file) == 0) {
508 list_move(&page->lru, dst); 458 list_move(&page->lru, dst);
509 nr_taken++; 459 nr_taken++;
510 } 460 }
@@ -524,63 +474,45 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
524 * < 0 if the cgroup is over its limit 474 * < 0 if the cgroup is over its limit
525 */ 475 */
526static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 476static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
527 gfp_t gfp_mask, enum charge_type ctype) 477 gfp_t gfp_mask, enum charge_type ctype,
478 struct mem_cgroup *memcg)
528{ 479{
529 struct mem_cgroup *mem; 480 struct mem_cgroup *mem;
530 struct page_cgroup *pc; 481 struct page_cgroup *pc;
531 unsigned long flags;
532 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 482 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
533 struct mem_cgroup_per_zone *mz; 483 struct mem_cgroup_per_zone *mz;
484 unsigned long flags;
534 485
535 if (mem_cgroup_subsys.disabled) 486 pc = lookup_page_cgroup(page);
487 /* can happen at boot */
488 if (unlikely(!pc))
536 return 0; 489 return 0;
537 490 prefetchw(pc);
538 /*
539 * Should page_cgroup's go to their own slab?
540 * One could optimize the performance of the charging routine
541 * by saving a bit in the page_flags and using it as a lock
542 * to see if the cgroup page already has a page_cgroup associated
543 * with it
544 */
545retry:
546 lock_page_cgroup(page);
547 pc = page_get_page_cgroup(page);
548 /*
549 * The page_cgroup exists and
550 * the page has already been accounted.
551 */
552 if (pc) {
553 VM_BUG_ON(pc->page != page);
554 VM_BUG_ON(pc->ref_cnt <= 0);
555
556 pc->ref_cnt++;
557 unlock_page_cgroup(page);
558 goto done;
559 }
560 unlock_page_cgroup(page);
561
562 pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask);
563 if (pc == NULL)
564 goto err;
565
566 /* 491 /*
567 * We always charge the cgroup the mm_struct belongs to. 492 * We always charge the cgroup the mm_struct belongs to.
568 * The mm_struct's mem_cgroup changes on task migration if the 493 * The mm_struct's mem_cgroup changes on task migration if the
569 * thread group leader migrates. It's possible that mm is not 494 * thread group leader migrates. It's possible that mm is not
570 * set, if so charge the init_mm (happens for pagecache usage). 495 * set, if so charge the init_mm (happens for pagecache usage).
571 */ 496 */
572 if (!mm)
573 mm = &init_mm;
574 497
575 rcu_read_lock(); 498 if (likely(!memcg)) {
576 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 499 rcu_read_lock();
577 /* 500 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
578 * For every charge from the cgroup, increment reference count 501 if (unlikely(!mem)) {
579 */ 502 rcu_read_unlock();
580 css_get(&mem->css); 503 return 0;
581 rcu_read_unlock(); 504 }
505 /*
506 * For every charge from the cgroup, increment reference count
507 */
508 css_get(&mem->css);
509 rcu_read_unlock();
510 } else {
511 mem = memcg;
512 css_get(&memcg->css);
513 }
582 514
583 while (res_counter_charge(&mem->res, PAGE_SIZE)) { 515 while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
584 if (!(gfp_mask & __GFP_WAIT)) 516 if (!(gfp_mask & __GFP_WAIT))
585 goto out; 517 goto out;
586 518
@@ -603,63 +535,104 @@ retry:
603 } 535 }
604 } 536 }
605 537
606 pc->ref_cnt = 1; 538
607 pc->mem_cgroup = mem; 539 lock_page_cgroup(pc);
608 pc->page = page; 540 if (unlikely(PageCgroupUsed(pc))) {
609 pc->flags = PAGE_CGROUP_FLAG_ACTIVE; 541 unlock_page_cgroup(pc);
610 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
611 pc->flags = PAGE_CGROUP_FLAG_CACHE;
612
613 lock_page_cgroup(page);
614 if (page_get_page_cgroup(page)) {
615 unlock_page_cgroup(page);
616 /*
617 * Another charge has been added to this page already.
618 * We take lock_page_cgroup(page) again and read
619 * page->cgroup, increment refcnt.... just retry is OK.
620 */
621 res_counter_uncharge(&mem->res, PAGE_SIZE); 542 res_counter_uncharge(&mem->res, PAGE_SIZE);
622 css_put(&mem->css); 543 css_put(&mem->css);
623 kmem_cache_free(page_cgroup_cache, pc); 544
624 goto retry; 545 goto done;
625 } 546 }
626 page_assign_page_cgroup(page, pc); 547 pc->mem_cgroup = mem;
548 /*
549 * If a page is accounted as a page cache, insert to inactive list.
550 * If anon, insert to active list.
551 */
552 pc->flags = pcg_default_flags[ctype];
627 553
628 mz = page_cgroup_zoneinfo(pc); 554 mz = page_cgroup_zoneinfo(pc);
555
629 spin_lock_irqsave(&mz->lru_lock, flags); 556 spin_lock_irqsave(&mz->lru_lock, flags);
630 __mem_cgroup_add_list(mz, pc); 557 __mem_cgroup_add_list(mz, pc);
631 spin_unlock_irqrestore(&mz->lru_lock, flags); 558 spin_unlock_irqrestore(&mz->lru_lock, flags);
559 unlock_page_cgroup(pc);
632 560
633 unlock_page_cgroup(page);
634done: 561done:
635 return 0; 562 return 0;
636out: 563out:
637 css_put(&mem->css); 564 css_put(&mem->css);
638 kmem_cache_free(page_cgroup_cache, pc);
639err:
640 return -ENOMEM; 565 return -ENOMEM;
641} 566}
642 567
643int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) 568int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
644{ 569{
570 if (mem_cgroup_subsys.disabled)
571 return 0;
572 if (PageCompound(page))
573 return 0;
574 /*
575 * If already mapped, we don't have to account.
576 * If page cache, page->mapping has address_space.
577	 * But page->mapping may have an out-of-use anon_vma pointer;
578	 * detect it by a PageAnon() check. A newly-mapped anon page's
579	 * page->mapping is NULL.
580 */
581 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
582 return 0;
583 if (unlikely(!mm))
584 mm = &init_mm;
645 return mem_cgroup_charge_common(page, mm, gfp_mask, 585 return mem_cgroup_charge_common(page, mm, gfp_mask,
646 MEM_CGROUP_CHARGE_TYPE_MAPPED); 586 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
647} 587}
648 588
649int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 589int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
650 gfp_t gfp_mask) 590 gfp_t gfp_mask)
651{ 591{
652 if (!mm) 592 if (mem_cgroup_subsys.disabled)
593 return 0;
594 if (PageCompound(page))
595 return 0;
596 /*
597	 * Corner case handling. This is usually called from add_to_page_cache(),
598	 * but some filesystems (shmem) precharge the page before calling it
599	 * and then call add_to_page_cache() with GFP_NOWAIT.
600	 *
601	 * In the GFP_NOWAIT case, the page may be pre-charged before calling
602	 * add_to_page_cache() (see shmem.c); check it here and avoid charging
603	 * twice. (It works but has to pay a slightly larger cost.)
604 */
605 if (!(gfp_mask & __GFP_WAIT)) {
606 struct page_cgroup *pc;
607
608
609 pc = lookup_page_cgroup(page);
610 if (!pc)
611 return 0;
612 lock_page_cgroup(pc);
613 if (PageCgroupUsed(pc)) {
614 unlock_page_cgroup(pc);
615 return 0;
616 }
617 unlock_page_cgroup(pc);
618 }
619
620 if (unlikely(!mm))
653 mm = &init_mm; 621 mm = &init_mm;
654 return mem_cgroup_charge_common(page, mm, gfp_mask, 622
655 MEM_CGROUP_CHARGE_TYPE_CACHE); 623 if (page_is_file_cache(page))
624 return mem_cgroup_charge_common(page, mm, gfp_mask,
625 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
626 else
627 return mem_cgroup_charge_common(page, mm, gfp_mask,
628 MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
656} 629}
657 630
658/* 631/*
659 * Uncharging is always a welcome operation, we never complain, simply 632 * uncharge if !page_mapped(page)
660 * uncharge.
661 */ 633 */
662void mem_cgroup_uncharge_page(struct page *page) 634static void
635__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
663{ 636{
664 struct page_cgroup *pc; 637 struct page_cgroup *pc;
665 struct mem_cgroup *mem; 638 struct mem_cgroup *mem;
@@ -672,106 +645,172 @@ void mem_cgroup_uncharge_page(struct page *page)
672 /* 645 /*
673 * Check if our page_cgroup is valid 646 * Check if our page_cgroup is valid
674 */ 647 */
675 lock_page_cgroup(page); 648 pc = lookup_page_cgroup(page);
676 pc = page_get_page_cgroup(page); 649 if (unlikely(!pc || !PageCgroupUsed(pc)))
677 if (!pc) 650 return;
678 goto unlock;
679 651
680 VM_BUG_ON(pc->page != page); 652 lock_page_cgroup(pc);
681 VM_BUG_ON(pc->ref_cnt <= 0); 653 if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
654 || !PageCgroupUsed(pc)) {
655 /* This happens at race in zap_pte_range() and do_swap_page()*/
656 unlock_page_cgroup(pc);
657 return;
658 }
659 ClearPageCgroupUsed(pc);
660 mem = pc->mem_cgroup;
682 661
683 if (--(pc->ref_cnt) == 0) { 662 mz = page_cgroup_zoneinfo(pc);
684 mz = page_cgroup_zoneinfo(pc); 663 spin_lock_irqsave(&mz->lru_lock, flags);
685 spin_lock_irqsave(&mz->lru_lock, flags); 664 __mem_cgroup_remove_list(mz, pc);
686 __mem_cgroup_remove_list(mz, pc); 665 spin_unlock_irqrestore(&mz->lru_lock, flags);
687 spin_unlock_irqrestore(&mz->lru_lock, flags); 666 unlock_page_cgroup(pc);
688 667
689 page_assign_page_cgroup(page, NULL); 668 res_counter_uncharge(&mem->res, PAGE_SIZE);
690 unlock_page_cgroup(page); 669 css_put(&mem->css);
691 670
692 mem = pc->mem_cgroup; 671 return;
693 res_counter_uncharge(&mem->res, PAGE_SIZE); 672}
694 css_put(&mem->css);
695 673
696 kmem_cache_free(page_cgroup_cache, pc); 674void mem_cgroup_uncharge_page(struct page *page)
675{
676 /* early check. */
677 if (page_mapped(page))
697 return; 678 return;
698 } 679 if (page->mapping && !PageAnon(page))
680 return;
681 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
682}
699 683
700unlock: 684void mem_cgroup_uncharge_cache_page(struct page *page)
701 unlock_page_cgroup(page); 685{
686 VM_BUG_ON(page_mapped(page));
687 VM_BUG_ON(page->mapping);
688 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
702} 689}
703 690
704/* 691/*
705 * Returns non-zero if a page (under migration) has valid page_cgroup member. 692 * Before starting migration, account against new page.
706 * Refcnt of page_cgroup is incremented.
707 */ 693 */
708int mem_cgroup_prepare_migration(struct page *page) 694int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
709{ 695{
710 struct page_cgroup *pc; 696 struct page_cgroup *pc;
697 struct mem_cgroup *mem = NULL;
698 enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
699 int ret = 0;
711 700
712 if (mem_cgroup_subsys.disabled) 701 if (mem_cgroup_subsys.disabled)
713 return 0; 702 return 0;
714 703
715 lock_page_cgroup(page); 704 pc = lookup_page_cgroup(page);
716 pc = page_get_page_cgroup(page); 705 lock_page_cgroup(pc);
717 if (pc) 706 if (PageCgroupUsed(pc)) {
718 pc->ref_cnt++; 707 mem = pc->mem_cgroup;
719 unlock_page_cgroup(page); 708 css_get(&mem->css);
720 return pc != NULL; 709 if (PageCgroupCache(pc)) {
710 if (page_is_file_cache(page))
711 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
712 else
713 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
714 }
715 }
716 unlock_page_cgroup(pc);
717 if (mem) {
718 ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
719 ctype, mem);
720 css_put(&mem->css);
721 }
722 return ret;
721} 723}
722 724
723void mem_cgroup_end_migration(struct page *page) 725/* remove redundant charge if migration failed*/
726void mem_cgroup_end_migration(struct page *newpage)
724{ 727{
725 mem_cgroup_uncharge_page(page); 728 /*
729	 * On success, page->mapping is not NULL.
730	 * Special rollback care is necessary when
731	 * 1. migration fails (newpage->mapping is cleared in this case), or
732	 * 2. the newpage was moved but not remapped again because the task
733	 * exits and the newpage is obsolete. In this case, the new page
734	 * may be a swapcache. So, we always call mem_cgroup_uncharge_page()
735	 * to avoid a mess; the page_cgroup will be removed if it turns out
736	 * to be unnecessary. File cache pages are still on the radix tree,
737	 * so don't worry about them.
738 */
739 if (!newpage->mapping)
740 __mem_cgroup_uncharge_common(newpage,
741 MEM_CGROUP_CHARGE_TYPE_FORCE);
742 else if (PageAnon(newpage))
743 mem_cgroup_uncharge_page(newpage);
726} 744}
727 745
728/* 746/*
729 * We know both *page* and *newpage* are now not-on-LRU and PG_locked. 747 * A call to try to shrink memory usage under specified resource controller.
730 * And no race with uncharge() routines because page_cgroup for *page* 748 * This is typically used for page reclaiming for shmem for reducing side
731 * has extra one reference by mem_cgroup_prepare_migration. 749 * effect of page allocation from shmem, which is used by some mem_cgroup.
732 */ 750 */
733void mem_cgroup_page_migration(struct page *page, struct page *newpage) 751int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
734{ 752{
735 struct page_cgroup *pc; 753 struct mem_cgroup *mem;
736 struct mem_cgroup_per_zone *mz; 754 int progress = 0;
737 unsigned long flags; 755 int retry = MEM_CGROUP_RECLAIM_RETRIES;
738 756
739 lock_page_cgroup(page); 757 if (mem_cgroup_subsys.disabled)
740 pc = page_get_page_cgroup(page); 758 return 0;
741 if (!pc) { 759 if (!mm)
742 unlock_page_cgroup(page); 760 return 0;
743 return; 761
762 rcu_read_lock();
763 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
764 if (unlikely(!mem)) {
765 rcu_read_unlock();
766 return 0;
744 } 767 }
768 css_get(&mem->css);
769 rcu_read_unlock();
745 770
746 mz = page_cgroup_zoneinfo(pc); 771 do {
747 spin_lock_irqsave(&mz->lru_lock, flags); 772 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
748 __mem_cgroup_remove_list(mz, pc); 773 progress += res_counter_check_under_limit(&mem->res);
749 spin_unlock_irqrestore(&mz->lru_lock, flags); 774 } while (!progress && --retry);
750 775
751 page_assign_page_cgroup(page, NULL); 776 css_put(&mem->css);
752 unlock_page_cgroup(page); 777 if (!retry)
778 return -ENOMEM;
779 return 0;
780}
753 781
754 pc->page = newpage; 782int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
755 lock_page_cgroup(newpage); 783{
756 page_assign_page_cgroup(newpage, pc);
757 784
758 mz = page_cgroup_zoneinfo(pc); 785 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
759 spin_lock_irqsave(&mz->lru_lock, flags); 786 int progress;
760 __mem_cgroup_add_list(mz, pc); 787 int ret = 0;
761 spin_unlock_irqrestore(&mz->lru_lock, flags);
762 788
763 unlock_page_cgroup(newpage); 789 while (res_counter_set_limit(&memcg->res, val)) {
790 if (signal_pending(current)) {
791 ret = -EINTR;
792 break;
793 }
794 if (!retry_count) {
795 ret = -EBUSY;
796 break;
797 }
798 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL);
799 if (!progress)
800 retry_count--;
801 }
802 return ret;
764} 803}
765 804
805
766/* 806/*
767 * This routine traverses the page_cgroups in the given list and drops them all. 807 * This routine traverses the page_cgroups in the given list and drops them all.
768 * This routine ignores page_cgroup->ref_cnt.
769 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 808 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
770 */ 809 */
771#define FORCE_UNCHARGE_BATCH (128) 810#define FORCE_UNCHARGE_BATCH (128)
772static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, 811static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
773 struct mem_cgroup_per_zone *mz, 812 struct mem_cgroup_per_zone *mz,
774 int active) 813 enum lru_list lru)
775{ 814{
776 struct page_cgroup *pc; 815 struct page_cgroup *pc;
777 struct page *page; 816 struct page *page;
@@ -779,22 +818,31 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
779 unsigned long flags; 818 unsigned long flags;
780 struct list_head *list; 819 struct list_head *list;
781 820
782 if (active) 821 list = &mz->lists[lru];
783 list = &mz->active_list;
784 else
785 list = &mz->inactive_list;
786 822
787 spin_lock_irqsave(&mz->lru_lock, flags); 823 spin_lock_irqsave(&mz->lru_lock, flags);
788 while (!list_empty(list)) { 824 while (!list_empty(list)) {
789 pc = list_entry(list->prev, struct page_cgroup, lru); 825 pc = list_entry(list->prev, struct page_cgroup, lru);
790 page = pc->page; 826 page = pc->page;
827 if (!PageCgroupUsed(pc))
828 break;
791 get_page(page); 829 get_page(page);
792 spin_unlock_irqrestore(&mz->lru_lock, flags); 830 spin_unlock_irqrestore(&mz->lru_lock, flags);
793 mem_cgroup_uncharge_page(page); 831 /*
794 put_page(page); 832 * Check if this page is on LRU. !LRU page can be found
795 if (--count <= 0) { 833 * if it's under page migration.
796 count = FORCE_UNCHARGE_BATCH; 834 */
797 cond_resched(); 835 if (PageLRU(page)) {
836 __mem_cgroup_uncharge_common(page,
837 MEM_CGROUP_CHARGE_TYPE_FORCE);
838 put_page(page);
839 if (--count <= 0) {
840 count = FORCE_UNCHARGE_BATCH;
841 cond_resched();
842 }
843 } else {
844 spin_lock_irqsave(&mz->lru_lock, flags);
845 break;
798 } 846 }
799 spin_lock_irqsave(&mz->lru_lock, flags); 847 spin_lock_irqsave(&mz->lru_lock, flags);
800 } 848 }
@@ -810,9 +858,6 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
810 int ret = -EBUSY; 858 int ret = -EBUSY;
811 int node, zid; 859 int node, zid;
812 860
813 if (mem_cgroup_subsys.disabled)
814 return 0;
815
816 css_get(&mem->css); 861 css_get(&mem->css);
817 /* 862 /*
818 * page reclaim code (kswapd etc..) will move pages between 863 * page reclaim code (kswapd etc..) will move pages between
@@ -822,15 +867,17 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
822 while (mem->res.usage > 0) { 867 while (mem->res.usage > 0) {
823 if (atomic_read(&mem->css.cgroup->count) > 0) 868 if (atomic_read(&mem->css.cgroup->count) > 0)
824 goto out; 869 goto out;
870 /* This is for making all *used* pages to be on LRU. */
871 lru_add_drain_all();
825 for_each_node_state(node, N_POSSIBLE) 872 for_each_node_state(node, N_POSSIBLE)
826 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 873 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
827 struct mem_cgroup_per_zone *mz; 874 struct mem_cgroup_per_zone *mz;
875 enum lru_list l;
828 mz = mem_cgroup_zoneinfo(mem, node, zid); 876 mz = mem_cgroup_zoneinfo(mem, node, zid);
829 /* drop all page_cgroup in active_list */ 877 for_each_lru(l)
830 mem_cgroup_force_empty_list(mem, mz, 1); 878 mem_cgroup_force_empty_list(mem, mz, l);
831 /* drop all page_cgroup in inactive_list */
832 mem_cgroup_force_empty_list(mem, mz, 0);
833 } 879 }
880 cond_resched();
834 } 881 }
835 ret = 0; 882 ret = 0;
836out: 883out:
@@ -838,32 +885,34 @@ out:
838 return ret; 885 return ret;
839} 886}
840 887
841static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
842{
843 *tmp = memparse(buf, &buf);
844 if (*buf != '\0')
845 return -EINVAL;
846
847 /*
848 * Round up the value to the closest page size
849 */
850 *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
851 return 0;
852}
853
854static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 888static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
855{ 889{
856 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, 890 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
857 cft->private); 891 cft->private);
858} 892}
859 893/*
860static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 894 * The user of this function is...
861 struct file *file, const char __user *userbuf, 895 * RES_LIMIT.
862 size_t nbytes, loff_t *ppos) 896 */
897static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
898 const char *buffer)
863{ 899{
864 return res_counter_write(&mem_cgroup_from_cont(cont)->res, 900 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
865 cft->private, userbuf, nbytes, ppos, 901 unsigned long long val;
866 mem_cgroup_write_strategy); 902 int ret;
903
904 switch (cft->private) {
905 case RES_LIMIT:
906 /* This function does all necessary parse...reuse it */
907 ret = res_counter_memparse_write_strategy(buffer, &val);
908 if (!ret)
909 ret = mem_cgroup_resize_limit(memcg, val);
910 break;
911 default:
912 ret = -EINVAL; /* should be BUG() ? */
913 break;
914 }
915 return ret;
867} 916}
868 917
869static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 918static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
@@ -913,14 +962,27 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
913 } 962 }
914 /* showing # of active pages */ 963 /* showing # of active pages */
915 { 964 {
916 unsigned long active, inactive; 965 unsigned long active_anon, inactive_anon;
917 966 unsigned long active_file, inactive_file;
918 inactive = mem_cgroup_get_all_zonestat(mem_cont, 967 unsigned long unevictable;
919 MEM_CGROUP_ZSTAT_INACTIVE); 968
920 active = mem_cgroup_get_all_zonestat(mem_cont, 969 inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
921 MEM_CGROUP_ZSTAT_ACTIVE); 970 LRU_INACTIVE_ANON);
922 cb->fill(cb, "active", (active) * PAGE_SIZE); 971 active_anon = mem_cgroup_get_all_zonestat(mem_cont,
923 cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); 972 LRU_ACTIVE_ANON);
973 inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
974 LRU_INACTIVE_FILE);
975 active_file = mem_cgroup_get_all_zonestat(mem_cont,
976 LRU_ACTIVE_FILE);
977 unevictable = mem_cgroup_get_all_zonestat(mem_cont,
978 LRU_UNEVICTABLE);
979
980 cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
981 cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
982 cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
983 cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
984 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
985
924 } 986 }
925 return 0; 987 return 0;
926} 988}
@@ -940,7 +1002,7 @@ static struct cftype mem_cgroup_files[] = {
940 { 1002 {
941 .name = "limit_in_bytes", 1003 .name = "limit_in_bytes",
942 .private = RES_LIMIT, 1004 .private = RES_LIMIT,
943 .write = mem_cgroup_write, 1005 .write_string = mem_cgroup_write,
944 .read_u64 = mem_cgroup_read, 1006 .read_u64 = mem_cgroup_read,
945 }, 1007 },
946 { 1008 {
@@ -963,6 +1025,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
963{ 1025{
964 struct mem_cgroup_per_node *pn; 1026 struct mem_cgroup_per_node *pn;
965 struct mem_cgroup_per_zone *mz; 1027 struct mem_cgroup_per_zone *mz;
1028 enum lru_list l;
966 int zone, tmp = node; 1029 int zone, tmp = node;
967 /* 1030 /*
968 * This routine is called against possible nodes. 1031 * This routine is called against possible nodes.
@@ -983,9 +1046,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
983 1046
984 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 1047 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
985 mz = &pn->zoneinfo[zone]; 1048 mz = &pn->zoneinfo[zone];
986 INIT_LIST_HEAD(&mz->active_list);
987 INIT_LIST_HEAD(&mz->inactive_list);
988 spin_lock_init(&mz->lru_lock); 1049 spin_lock_init(&mz->lru_lock);
1050 for_each_lru(l)
1051 INIT_LIST_HEAD(&mz->lists[l]);
989 } 1052 }
990 return 0; 1053 return 0;
991} 1054}
@@ -1026,7 +1089,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
1026 1089
1027 if (unlikely((cont->parent) == NULL)) { 1090 if (unlikely((cont->parent) == NULL)) {
1028 mem = &init_mem_cgroup; 1091 mem = &init_mem_cgroup;
1029 page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
1030 } else { 1092 } else {
1031 mem = mem_cgroup_alloc(); 1093 mem = mem_cgroup_alloc();
1032 if (!mem) 1094 if (!mem)
@@ -1070,8 +1132,6 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1070static int mem_cgroup_populate(struct cgroup_subsys *ss, 1132static int mem_cgroup_populate(struct cgroup_subsys *ss,
1071 struct cgroup *cont) 1133 struct cgroup *cont)
1072{ 1134{
1073 if (mem_cgroup_subsys.disabled)
1074 return 0;
1075 return cgroup_add_files(cont, ss, mem_cgroup_files, 1135 return cgroup_add_files(cont, ss, mem_cgroup_files,
1076 ARRAY_SIZE(mem_cgroup_files)); 1136 ARRAY_SIZE(mem_cgroup_files));
1077} 1137}
@@ -1084,9 +1144,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1084 struct mm_struct *mm; 1144 struct mm_struct *mm;
1085 struct mem_cgroup *mem, *old_mem; 1145 struct mem_cgroup *mem, *old_mem;
1086 1146
1087 if (mem_cgroup_subsys.disabled)
1088 return;
1089
1090 mm = get_task_mm(p); 1147 mm = get_task_mm(p);
1091 if (mm == NULL) 1148 if (mm == NULL)
1092 return; 1149 return;
@@ -1094,9 +1151,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1094 mem = mem_cgroup_from_cont(cont); 1151 mem = mem_cgroup_from_cont(cont);
1095 old_mem = mem_cgroup_from_cont(old_cont); 1152 old_mem = mem_cgroup_from_cont(old_cont);
1096 1153
1097 if (mem == old_mem)
1098 goto out;
1099
1100 /* 1154 /*
1101 * Only thread group leaders are allowed to migrate, the mm_struct is 1155 * Only thread group leaders are allowed to migrate, the mm_struct is
1102 * in effect owned by the leader 1156 * in effect owned by the leader
diff --git a/mm/memory.c b/mm/memory.c
index 2302d228fe04..164951c47305 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -51,6 +51,7 @@
51#include <linux/init.h> 51#include <linux/init.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/memcontrol.h> 53#include <linux/memcontrol.h>
54#include <linux/mmu_notifier.h>
54 55
55#include <asm/pgalloc.h> 56#include <asm/pgalloc.h>
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
@@ -61,6 +62,8 @@
61#include <linux/swapops.h> 62#include <linux/swapops.h>
62#include <linux/elf.h> 63#include <linux/elf.h>
63 64
65#include "internal.h"
66
64#ifndef CONFIG_NEED_MULTIPLE_NODES 67#ifndef CONFIG_NEED_MULTIPLE_NODES
65/* use the per-pgdat data instead for discontigmem - mbligh */ 68/* use the per-pgdat data instead for discontigmem - mbligh */
66unsigned long max_mapnr; 69unsigned long max_mapnr;
@@ -211,7 +214,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
211 * 214 *
212 * Must be called with pagetable lock held. 215 * Must be called with pagetable lock held.
213 */ 216 */
214void free_pgd_range(struct mmu_gather **tlb, 217void free_pgd_range(struct mmu_gather *tlb,
215 unsigned long addr, unsigned long end, 218 unsigned long addr, unsigned long end,
216 unsigned long floor, unsigned long ceiling) 219 unsigned long floor, unsigned long ceiling)
217{ 220{
@@ -262,16 +265,16 @@ void free_pgd_range(struct mmu_gather **tlb,
262 return; 265 return;
263 266
264 start = addr; 267 start = addr;
265 pgd = pgd_offset((*tlb)->mm, addr); 268 pgd = pgd_offset(tlb->mm, addr);
266 do { 269 do {
267 next = pgd_addr_end(addr, end); 270 next = pgd_addr_end(addr, end);
268 if (pgd_none_or_clear_bad(pgd)) 271 if (pgd_none_or_clear_bad(pgd))
269 continue; 272 continue;
270 free_pud_range(*tlb, pgd, addr, next, floor, ceiling); 273 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
271 } while (pgd++, addr = next, addr != end); 274 } while (pgd++, addr = next, addr != end);
272} 275}
273 276
274void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, 277void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
275 unsigned long floor, unsigned long ceiling) 278 unsigned long floor, unsigned long ceiling)
276{ 279{
277 while (vma) { 280 while (vma) {
@@ -372,7 +375,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
372 * 375 *
373 * The calling function must still handle the error. 376 * The calling function must still handle the error.
374 */ 377 */
375void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) 378static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
379 unsigned long vaddr)
376{ 380{
377 printk(KERN_ERR "Bad pte = %08llx, process = %s, " 381 printk(KERN_ERR "Bad pte = %08llx, process = %s, "
378 "vm_flags = %lx, vaddr = %lx\n", 382 "vm_flags = %lx, vaddr = %lx\n",
@@ -649,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
649 unsigned long next; 653 unsigned long next;
650 unsigned long addr = vma->vm_start; 654 unsigned long addr = vma->vm_start;
651 unsigned long end = vma->vm_end; 655 unsigned long end = vma->vm_end;
656 int ret;
652 657
653 /* 658 /*
654 * Don't copy ptes where a page fault will fill them correctly. 659 * Don't copy ptes where a page fault will fill them correctly.
@@ -664,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
664 if (is_vm_hugetlb_page(vma)) 669 if (is_vm_hugetlb_page(vma))
665 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 670 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
666 671
672 /*
673 * We need to invalidate the secondary MMU mappings only when
674 * there could be a permission downgrade on the ptes of the
675 * parent mm. And a permission downgrade will only happen if
676 * is_cow_mapping() returns true.
677 */
678 if (is_cow_mapping(vma->vm_flags))
679 mmu_notifier_invalidate_range_start(src_mm, addr, end);
680
681 ret = 0;
667 dst_pgd = pgd_offset(dst_mm, addr); 682 dst_pgd = pgd_offset(dst_mm, addr);
668 src_pgd = pgd_offset(src_mm, addr); 683 src_pgd = pgd_offset(src_mm, addr);
669 do { 684 do {
670 next = pgd_addr_end(addr, end); 685 next = pgd_addr_end(addr, end);
671 if (pgd_none_or_clear_bad(src_pgd)) 686 if (pgd_none_or_clear_bad(src_pgd))
672 continue; 687 continue;
673 if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, 688 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
674 vma, addr, next)) 689 vma, addr, next))) {
675 return -ENOMEM; 690 ret = -ENOMEM;
691 break;
692 }
676 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 693 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
677 return 0; 694
695 if (is_cow_mapping(vma->vm_flags))
696 mmu_notifier_invalidate_range_end(src_mm,
697 vma->vm_start, end);
698 return ret;
678} 699}
679 700
680static unsigned long zap_pte_range(struct mmu_gather *tlb, 701static unsigned long zap_pte_range(struct mmu_gather *tlb,
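The comment block added in this hunk states the rule precisely: secondary-MMU (mmu notifier) invalidation is only required while the parent's PTEs may be write-protected, which only happens for COW mappings. A hedged sketch of that bracketing pattern, using only the helpers visible in the hunk above; copy_body() is a hypothetical stand-in for the pgd/pud/pmd/pte copy walk, not a kernel function:

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

/* Hedged sketch, not the mainline copy_page_range(): pair the _start/_end
 * notifier calls around any step that may downgrade PTE permissions in
 * src_mm, and only when the mapping is COW. */
static int copy_range_sketch(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                             struct vm_area_struct *vma,
                             unsigned long start, unsigned long end)
{
        int cow = is_cow_mapping(vma->vm_flags);
        int ret;

        if (cow)        /* parent PTEs may be write-protected below */
                mmu_notifier_invalidate_range_start(src_mm, start, end);

        ret = copy_body(dst_mm, src_mm, vma, start, end); /* hypothetical walk */

        if (cow)        /* always paired with the _start call above */
                mmu_notifier_invalidate_range_end(src_mm, start, end);
        return ret;
}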
@@ -878,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
878 unsigned long start = start_addr; 899 unsigned long start = start_addr;
879 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; 900 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
880 int fullmm = (*tlbp)->fullmm; 901 int fullmm = (*tlbp)->fullmm;
902 struct mm_struct *mm = vma->vm_mm;
881 903
904 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
882 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { 905 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
883 unsigned long end; 906 unsigned long end;
884 907
@@ -899,9 +922,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
899 } 922 }
900 923
901 if (unlikely(is_vm_hugetlb_page(vma))) { 924 if (unlikely(is_vm_hugetlb_page(vma))) {
902 unmap_hugepage_range(vma, start, end); 925 /*
903 zap_work -= (end - start) / 926 * It is undesirable to test vma->vm_file as it
904 (HPAGE_SIZE / PAGE_SIZE); 927 * should be non-null for valid hugetlb area.
928 * However, vm_file will be NULL in the error
929 * cleanup path of do_mmap_pgoff. When
930 * hugetlbfs ->mmap method fails,
931 * do_mmap_pgoff() nullifies vma->vm_file
932 * before calling this function to clean up.
933 * Since no pte has actually been setup, it is
934 * safe to do nothing in this case.
935 */
936 if (vma->vm_file) {
937 unmap_hugepage_range(vma, start, end, NULL);
938 zap_work -= (end - start) /
939 pages_per_huge_page(hstate_vma(vma));
940 }
941
905 start = end; 942 start = end;
906 } else 943 } else
907 start = unmap_page_range(*tlbp, vma, 944 start = unmap_page_range(*tlbp, vma,
@@ -929,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
929 } 966 }
930 } 967 }
931out: 968out:
969 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
932 return start; /* which is now the end (or restart) address */ 970 return start; /* which is now the end (or restart) address */
933} 971}
934 972
@@ -956,6 +994,29 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
956 return end; 994 return end;
957} 995}
958 996
997/**
998 * zap_vma_ptes - remove ptes mapping the vma
999 * @vma: vm_area_struct holding ptes to be zapped
1000 * @address: starting address of pages to zap
1001 * @size: number of bytes to zap
1002 *
1003 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
1004 *
1005 * The entire address range must be fully contained within the vma.
1006 *
1007 * Returns 0 if successful.
1008 */
1009int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1010 unsigned long size)
1011{
1012 if (address < vma->vm_start || address + size > vma->vm_end ||
1013 !(vma->vm_flags & VM_PFNMAP))
1014 return -1;
1015 zap_page_range(vma, address, size, NULL);
1016 return 0;
1017}
1018EXPORT_SYMBOL_GPL(zap_vma_ptes);
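zap_vma_ptes() gives drivers that maintain VM_PFNMAP mappings a supported way to drop the PTEs they installed, for example before revoking access to device memory. A hedged usage sketch; drv_revoke_mapping() and the surrounding driver context are hypothetical, only the zap_vma_ptes() call and its return convention come from the export above:

#include <linux/mm.h>

/* Hypothetical driver helper: remove every PTE the driver previously set up
 * on this VM_PFNMAP vma (e.g. via remap_pfn_range()/vm_insert_pfn()). */
static int drv_revoke_mapping(struct vm_area_struct *vma)
{
        unsigned long len = vma->vm_end - vma->vm_start;

        /* The whole range must lie inside the vma; returns 0 on success. */
        return zap_vma_ptes(vma, vma->vm_start, len);
}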
1019
959/* 1020/*
960 * Do a quick page-table lookup for a single page. 1021 * Do a quick page-table lookup for a single page.
961 */ 1022 */
@@ -982,19 +1043,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
982 goto no_page_table; 1043 goto no_page_table;
983 1044
984 pud = pud_offset(pgd, address); 1045 pud = pud_offset(pgd, address);
985 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 1046 if (pud_none(*pud))
986 goto no_page_table; 1047 goto no_page_table;
987 1048 if (pud_huge(*pud)) {
1049 BUG_ON(flags & FOLL_GET);
1050 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1051 goto out;
1052 }
1053 if (unlikely(pud_bad(*pud)))
1054 goto no_page_table;
1055
988 pmd = pmd_offset(pud, address); 1056 pmd = pmd_offset(pud, address);
989 if (pmd_none(*pmd)) 1057 if (pmd_none(*pmd))
990 goto no_page_table; 1058 goto no_page_table;
991
992 if (pmd_huge(*pmd)) { 1059 if (pmd_huge(*pmd)) {
993 BUG_ON(flags & FOLL_GET); 1060 BUG_ON(flags & FOLL_GET);
994 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1061 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
995 goto out; 1062 goto out;
996 } 1063 }
997
998 if (unlikely(pmd_bad(*pmd))) 1064 if (unlikely(pmd_bad(*pmd)))
999 goto no_page_table; 1065 goto no_page_table;
1000 1066
@@ -1058,19 +1124,22 @@ static inline int use_zero_page(struct vm_area_struct *vma)
1058 if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) 1124 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1059 return 0; 1125 return 0;
1060 /* 1126 /*
1061 * And if we have a fault or a nopfn routine, it's not an 1127 * And if we have a fault routine, it's not an anonymous region.
1062 * anonymous region.
1063 */ 1128 */
1064 return !vma->vm_ops || 1129 return !vma->vm_ops || !vma->vm_ops->fault;
1065 (!vma->vm_ops->fault && !vma->vm_ops->nopfn);
1066} 1130}
1067 1131
1068int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1132
1069 unsigned long start, int len, int write, int force, 1133
1134int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1135 unsigned long start, int len, int flags,
1070 struct page **pages, struct vm_area_struct **vmas) 1136 struct page **pages, struct vm_area_struct **vmas)
1071{ 1137{
1072 int i; 1138 int i;
1073 unsigned int vm_flags; 1139 unsigned int vm_flags = 0;
1140 int write = !!(flags & GUP_FLAGS_WRITE);
1141 int force = !!(flags & GUP_FLAGS_FORCE);
1142 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1074 1143
1075 if (len <= 0) 1144 if (len <= 0)
1076 return 0; 1145 return 0;
@@ -1094,7 +1163,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1094 pud_t *pud; 1163 pud_t *pud;
1095 pmd_t *pmd; 1164 pmd_t *pmd;
1096 pte_t *pte; 1165 pte_t *pte;
1097 if (write) /* user gate pages are read-only */ 1166
1167 /* user gate pages are read-only */
1168 if (!ignore && write)
1098 return i ? : -EFAULT; 1169 return i ? : -EFAULT;
1099 if (pg > TASK_SIZE) 1170 if (pg > TASK_SIZE)
1100 pgd = pgd_offset_k(pg); 1171 pgd = pgd_offset_k(pg);
@@ -1126,8 +1197,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1126 continue; 1197 continue;
1127 } 1198 }
1128 1199
1129 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) 1200 if (!vma ||
1130 || !(vm_flags & vma->vm_flags)) 1201 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1202 (!ignore && !(vm_flags & vma->vm_flags)))
1131 return i ? : -EFAULT; 1203 return i ? : -EFAULT;
1132 1204
1133 if (is_vm_hugetlb_page(vma)) { 1205 if (is_vm_hugetlb_page(vma)) {
@@ -1202,6 +1274,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1202 } while (len); 1274 } while (len);
1203 return i; 1275 return i;
1204} 1276}
1277
1278int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1279 unsigned long start, int len, int write, int force,
1280 struct page **pages, struct vm_area_struct **vmas)
1281{
1282 int flags = 0;
1283
1284 if (write)
1285 flags |= GUP_FLAGS_WRITE;
1286 if (force)
1287 flags |= GUP_FLAGS_FORCE;
1288
1289 return __get_user_pages(tsk, mm,
1290 start, len, flags,
1291 pages, vmas);
1292}
1293
1205EXPORT_SYMBOL(get_user_pages); 1294EXPORT_SYMBOL(get_user_pages);
1206 1295
1207pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1296pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
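The write/force integer arguments of get_user_pages() are folded into a single flags word understood by __get_user_pages(), and the exported function becomes the thin wrapper shown above; the internal-only GUP_FLAGS_IGNORE_VMA_PERMISSIONS bit has no equivalent in the public signature. The same bool-to-flags refactoring pattern in a self-contained user-space C sketch (all names below are illustrative, not kernel API):

#include <stdio.h>

#define XFLAG_WRITE        0x1
#define XFLAG_FORCE        0x2
#define XFLAG_IGNORE_PERMS 0x4   /* internal-only bit, no bool equivalent */

/* Internal worker: one flags word instead of a growing list of int args. */
static int do_pin(unsigned long start, int len, int flags)
{
        int write = !!(flags & XFLAG_WRITE);
        int force = !!(flags & XFLAG_FORCE);

        printf("pin %#lx len=%d write=%d force=%d ignore=%d\n",
               start, len, write, force, !!(flags & XFLAG_IGNORE_PERMS));
        return len;     /* pretend every page was pinned */
}

/* Public wrapper keeps the historical bool-style signature. */
static int pin_pages(unsigned long start, int len, int write, int force)
{
        int flags = 0;

        if (write)
                flags |= XFLAG_WRITE;
        if (force)
                flags |= XFLAG_FORCE;
        return do_pin(start, len, flags);
}

int main(void)
{
        return pin_pages(0x1000, 4, 1, 0) == 4 ? 0 : 1;
}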
@@ -1232,18 +1321,14 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1232 pte_t *pte; 1321 pte_t *pte;
1233 spinlock_t *ptl; 1322 spinlock_t *ptl;
1234 1323
1235 retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
1236 if (retval)
1237 goto out;
1238
1239 retval = -EINVAL; 1324 retval = -EINVAL;
1240 if (PageAnon(page)) 1325 if (PageAnon(page))
1241 goto out_uncharge; 1326 goto out;
1242 retval = -ENOMEM; 1327 retval = -ENOMEM;
1243 flush_dcache_page(page); 1328 flush_dcache_page(page);
1244 pte = get_locked_pte(mm, addr, &ptl); 1329 pte = get_locked_pte(mm, addr, &ptl);
1245 if (!pte) 1330 if (!pte)
1246 goto out_uncharge; 1331 goto out;
1247 retval = -EBUSY; 1332 retval = -EBUSY;
1248 if (!pte_none(*pte)) 1333 if (!pte_none(*pte))
1249 goto out_unlock; 1334 goto out_unlock;
@@ -1259,8 +1344,6 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1259 return retval; 1344 return retval;
1260out_unlock: 1345out_unlock:
1261 pte_unmap_unlock(pte, ptl); 1346 pte_unmap_unlock(pte, ptl);
1262out_uncharge:
1263 mem_cgroup_uncharge_page(page);
1264out: 1347out:
1265 return retval; 1348 return retval;
1266} 1349}
@@ -1338,6 +1421,11 @@ out:
1338 * 1421 *
1339 * This function should only be called from a vm_ops->fault handler, and 1422 * This function should only be called from a vm_ops->fault handler, and
1340 * in that case the handler should return NULL. 1423 * in that case the handler should return NULL.
1424 *
1425 * vma cannot be a COW mapping.
1426 *
1427 * As this is called only for pages that do not currently exist, we
1428 * do not need to flush old virtual caches or the TLB.
1341 */ 1429 */
1342int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1430int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1343 unsigned long pfn) 1431 unsigned long pfn)
@@ -1548,6 +1636,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1548 unsigned long next; 1636 unsigned long next;
1549 int err; 1637 int err;
1550 1638
1639 BUG_ON(pud_huge(*pud));
1640
1551 pmd = pmd_alloc(mm, pud, addr); 1641 pmd = pmd_alloc(mm, pud, addr);
1552 if (!pmd) 1642 if (!pmd)
1553 return -ENOMEM; 1643 return -ENOMEM;
@@ -1589,10 +1679,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1589{ 1679{
1590 pgd_t *pgd; 1680 pgd_t *pgd;
1591 unsigned long next; 1681 unsigned long next;
1592 unsigned long end = addr + size; 1682 unsigned long start = addr, end = addr + size;
1593 int err; 1683 int err;
1594 1684
1595 BUG_ON(addr >= end); 1685 BUG_ON(addr >= end);
1686 mmu_notifier_invalidate_range_start(mm, start, end);
1596 pgd = pgd_offset(mm, addr); 1687 pgd = pgd_offset(mm, addr);
1597 do { 1688 do {
1598 next = pgd_addr_end(addr, end); 1689 next = pgd_addr_end(addr, end);
@@ -1600,6 +1691,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1600 if (err) 1691 if (err)
1601 break; 1692 break;
1602 } while (pgd++, addr = next, addr != end); 1693 } while (pgd++, addr = next, addr != end);
1694 mmu_notifier_invalidate_range_end(mm, start, end);
1603 return err; 1695 return err;
1604} 1696}
1605EXPORT_SYMBOL_GPL(apply_to_page_range); 1697EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1716,7 +1808,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1716 * not dirty accountable. 1808 * not dirty accountable.
1717 */ 1809 */
1718 if (PageAnon(old_page)) { 1810 if (PageAnon(old_page)) {
1719 if (!TestSetPageLocked(old_page)) { 1811 if (trylock_page(old_page)) {
1720 reuse = can_share_swap_page(old_page); 1812 reuse = can_share_swap_page(old_page);
1721 unlock_page(old_page); 1813 unlock_page(old_page);
1722 } 1814 }
@@ -1785,6 +1877,15 @@ gotten:
1785 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1877 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1786 if (!new_page) 1878 if (!new_page)
1787 goto oom; 1879 goto oom;
1880 /*
1881 * Don't let another task, with possibly unlocked vma,
1882 * keep the mlocked page.
1883 */
1884 if (vma->vm_flags & VM_LOCKED) {
1885 lock_page(old_page); /* for LRU manipulation */
1886 clear_page_mlock(old_page);
1887 unlock_page(old_page);
1888 }
1788 cow_user_page(new_page, old_page, address, vma); 1889 cow_user_page(new_page, old_page, address, vma);
1789 __SetPageUptodate(new_page); 1890 __SetPageUptodate(new_page);
1790 1891
@@ -1812,12 +1913,14 @@ gotten:
1812 * seen in the presence of one thread doing SMC and another 1913 * seen in the presence of one thread doing SMC and another
1813 * thread doing COW. 1914 * thread doing COW.
1814 */ 1915 */
1815 ptep_clear_flush(vma, address, page_table); 1916 ptep_clear_flush_notify(vma, address, page_table);
1816 set_pte_at(mm, address, page_table, entry); 1917 SetPageSwapBacked(new_page);
1817 update_mmu_cache(vma, address, entry); 1918 lru_cache_add_active_or_unevictable(new_page, vma);
1818 lru_cache_add_active(new_page);
1819 page_add_new_anon_rmap(new_page, vma, address); 1919 page_add_new_anon_rmap(new_page, vma, address);
1820 1920
1921//TODO: is this safe? do_anonymous_page() does it this way.
1922 set_pte_at(mm, address, page_table, entry);
1923 update_mmu_cache(vma, address, entry);
1821 if (old_page) { 1924 if (old_page) {
1822 /* 1925 /*
1823 * Only after switching the pte to the new page may 1926 * Only after switching the pte to the new page may
@@ -2215,16 +2318,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2215 count_vm_event(PGMAJFAULT); 2318 count_vm_event(PGMAJFAULT);
2216 } 2319 }
2217 2320
2321 mark_page_accessed(page);
2322
2323 lock_page(page);
2324 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2325
2218 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { 2326 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2219 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2220 ret = VM_FAULT_OOM; 2327 ret = VM_FAULT_OOM;
2328 unlock_page(page);
2221 goto out; 2329 goto out;
2222 } 2330 }
2223 2331
2224 mark_page_accessed(page);
2225 lock_page(page);
2226 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2227
2228 /* 2332 /*
2229 * Back out if somebody else already faulted in this pte. 2333 * Back out if somebody else already faulted in this pte.
2230 */ 2334 */
@@ -2251,7 +2355,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2251 page_add_anon_rmap(page, vma, address); 2355 page_add_anon_rmap(page, vma, address);
2252 2356
2253 swap_free(entry); 2357 swap_free(entry);
2254 if (vm_swap_full()) 2358 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2255 remove_exclusive_swap_page(page); 2359 remove_exclusive_swap_page(page);
2256 unlock_page(page); 2360 unlock_page(page);
2257 2361
@@ -2309,7 +2413,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2309 if (!pte_none(*page_table)) 2413 if (!pte_none(*page_table))
2310 goto release; 2414 goto release;
2311 inc_mm_counter(mm, anon_rss); 2415 inc_mm_counter(mm, anon_rss);
2312 lru_cache_add_active(page); 2416 SetPageSwapBacked(page);
2417 lru_cache_add_active_or_unevictable(page, vma);
2313 page_add_new_anon_rmap(page, vma, address); 2418 page_add_new_anon_rmap(page, vma, address);
2314 set_pte_at(mm, address, page_table, entry); 2419 set_pte_at(mm, address, page_table, entry);
2315 2420
@@ -2350,6 +2455,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2350 struct page *page; 2455 struct page *page;
2351 pte_t entry; 2456 pte_t entry;
2352 int anon = 0; 2457 int anon = 0;
2458 int charged = 0;
2353 struct page *dirty_page = NULL; 2459 struct page *dirty_page = NULL;
2354 struct vm_fault vmf; 2460 struct vm_fault vmf;
2355 int ret; 2461 int ret;
@@ -2390,6 +2496,18 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2390 ret = VM_FAULT_OOM; 2496 ret = VM_FAULT_OOM;
2391 goto out; 2497 goto out;
2392 } 2498 }
2499 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2500 ret = VM_FAULT_OOM;
2501 page_cache_release(page);
2502 goto out;
2503 }
2504 charged = 1;
2505 /*
2506 * Don't let another task, with possibly unlocked vma,
2507 * keep the mlocked page.
2508 */
2509 if (vma->vm_flags & VM_LOCKED)
2510 clear_page_mlock(vmf.page);
2393 copy_user_highpage(page, vmf.page, address, vma); 2511 copy_user_highpage(page, vmf.page, address, vma);
2394 __SetPageUptodate(page); 2512 __SetPageUptodate(page);
2395 } else { 2513 } else {
@@ -2424,11 +2542,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2424 2542
2425 } 2543 }
2426 2544
2427 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2428 ret = VM_FAULT_OOM;
2429 goto out;
2430 }
2431
2432 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2545 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2433 2546
2434 /* 2547 /*
@@ -2447,11 +2560,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2447 entry = mk_pte(page, vma->vm_page_prot); 2560 entry = mk_pte(page, vma->vm_page_prot);
2448 if (flags & FAULT_FLAG_WRITE) 2561 if (flags & FAULT_FLAG_WRITE)
2449 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2562 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2450 set_pte_at(mm, address, page_table, entry);
2451 if (anon) { 2563 if (anon) {
2452 inc_mm_counter(mm, anon_rss); 2564 inc_mm_counter(mm, anon_rss);
2453 lru_cache_add_active(page); 2565 SetPageSwapBacked(page);
2454 page_add_new_anon_rmap(page, vma, address); 2566 lru_cache_add_active_or_unevictable(page, vma);
2567 page_add_new_anon_rmap(page, vma, address);
2455 } else { 2568 } else {
2456 inc_mm_counter(mm, file_rss); 2569 inc_mm_counter(mm, file_rss);
2457 page_add_file_rmap(page); 2570 page_add_file_rmap(page);
@@ -2460,11 +2573,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2460 get_page(dirty_page); 2573 get_page(dirty_page);
2461 } 2574 }
2462 } 2575 }
2576//TODO: is this safe? do_anonymous_page() does it this way.
2577 set_pte_at(mm, address, page_table, entry);
2463 2578
2464 /* no need to invalidate: a not-present page won't be cached */ 2579 /* no need to invalidate: a not-present page won't be cached */
2465 update_mmu_cache(vma, address, entry); 2580 update_mmu_cache(vma, address, entry);
2466 } else { 2581 } else {
2467 mem_cgroup_uncharge_page(page); 2582 if (charged)
2583 mem_cgroup_uncharge_page(page);
2468 if (anon) 2584 if (anon)
2469 page_cache_release(page); 2585 page_cache_release(page);
2470 else 2586 else
@@ -2501,59 +2617,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2501 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 2617 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2502} 2618}
2503 2619
2504
2505/*
2506 * do_no_pfn() tries to create a new page mapping for a page without
2507 * a struct_page backing it
2508 *
2509 * As this is called only for pages that do not currently exist, we
2510 * do not need to flush old virtual caches or the TLB.
2511 *
2512 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2513 * but allow concurrent faults), and pte mapped but not yet locked.
2514 * We return with mmap_sem still held, but pte unmapped and unlocked.
2515 *
2516 * It is expected that the ->nopfn handler always returns the same pfn
2517 * for a given virtual mapping.
2518 *
2519 * Mark this `noinline' to prevent it from bloating the main pagefault code.
2520 */
2521static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
2522 unsigned long address, pte_t *page_table, pmd_t *pmd,
2523 int write_access)
2524{
2525 spinlock_t *ptl;
2526 pte_t entry;
2527 unsigned long pfn;
2528
2529 pte_unmap(page_table);
2530 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2531 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2532
2533 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
2534
2535 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2536
2537 if (unlikely(pfn == NOPFN_OOM))
2538 return VM_FAULT_OOM;
2539 else if (unlikely(pfn == NOPFN_SIGBUS))
2540 return VM_FAULT_SIGBUS;
2541 else if (unlikely(pfn == NOPFN_REFAULT))
2542 return 0;
2543
2544 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2545
2546 /* Only go through if we didn't race with anybody else... */
2547 if (pte_none(*page_table)) {
2548 entry = pfn_pte(pfn, vma->vm_page_prot);
2549 if (write_access)
2550 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2551 set_pte_at(mm, address, page_table, entry);
2552 }
2553 pte_unmap_unlock(page_table, ptl);
2554 return 0;
2555}
2556
2557/* 2620/*
2558 * Fault of a previously existing named mapping. Repopulate the pte 2621 * Fault of a previously existing named mapping. Repopulate the pte
2559 * from the encoded file_pte if possible. This enables swappable 2622 * from the encoded file_pte if possible. This enables swappable
@@ -2614,9 +2677,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2614 if (likely(vma->vm_ops->fault)) 2677 if (likely(vma->vm_ops->fault))
2615 return do_linear_fault(mm, vma, address, 2678 return do_linear_fault(mm, vma, address,
2616 pte, pmd, write_access, entry); 2679 pte, pmd, write_access, entry);
2617 if (unlikely(vma->vm_ops->nopfn))
2618 return do_no_pfn(mm, vma, address, pte,
2619 pmd, write_access);
2620 } 2680 }
2621 return do_anonymous_page(mm, vma, address, 2681 return do_anonymous_page(mm, vma, address,
2622 pte, pmd, write_access); 2682 pte, pmd, write_access);
@@ -2748,7 +2808,7 @@ int make_pages_present(unsigned long addr, unsigned long end)
2748 2808
2749 vma = find_vma(current->mm, addr); 2809 vma = find_vma(current->mm, addr);
2750 if (!vma) 2810 if (!vma)
2751 return -1; 2811 return -ENOMEM;
2752 write = (vma->vm_flags & VM_WRITE) != 0; 2812 write = (vma->vm_flags & VM_WRITE) != 0;
2753 BUG_ON(addr >= end); 2813 BUG_ON(addr >= end);
2754 BUG_ON(end > vma->vm_end); 2814 BUG_ON(end > vma->vm_end);
@@ -2757,7 +2817,7 @@ int make_pages_present(unsigned long addr, unsigned long end)
2757 len, write, 0, NULL, NULL); 2817 len, write, 0, NULL, NULL);
2758 if (ret < 0) 2818 if (ret < 0)
2759 return ret; 2819 return ret;
2760 return ret == len ? 0 : -1; 2820 return ret == len ? 0 : -EFAULT;
2761} 2821}
2762 2822
2763#if !defined(__HAVE_ARCH_GATE_AREA) 2823#if !defined(__HAVE_ARCH_GATE_AREA)
@@ -2804,6 +2864,86 @@ int in_gate_area_no_task(unsigned long addr)
2804 2864
2805#endif /* __HAVE_ARCH_GATE_AREA */ 2865#endif /* __HAVE_ARCH_GATE_AREA */
2806 2866
2867#ifdef CONFIG_HAVE_IOREMAP_PROT
2868static resource_size_t follow_phys(struct vm_area_struct *vma,
2869 unsigned long address, unsigned int flags,
2870 unsigned long *prot)
2871{
2872 pgd_t *pgd;
2873 pud_t *pud;
2874 pmd_t *pmd;
2875 pte_t *ptep, pte;
2876 spinlock_t *ptl;
2877 resource_size_t phys_addr = 0;
2878 struct mm_struct *mm = vma->vm_mm;
2879
2880 VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
2881
2882 pgd = pgd_offset(mm, address);
2883 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
2884 goto no_page_table;
2885
2886 pud = pud_offset(pgd, address);
2887 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
2888 goto no_page_table;
2889
2890 pmd = pmd_offset(pud, address);
2891 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
2892 goto no_page_table;
2893
2894 /* We cannot handle huge page PFN maps. Luckily they don't exist. */
2895 if (pmd_huge(*pmd))
2896 goto no_page_table;
2897
2898 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
2899 if (!ptep)
2900 goto out;
2901
2902 pte = *ptep;
2903 if (!pte_present(pte))
2904 goto unlock;
2905 if ((flags & FOLL_WRITE) && !pte_write(pte))
2906 goto unlock;
2907 phys_addr = pte_pfn(pte);
2908 phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
2909
2910 *prot = pgprot_val(pte_pgprot(pte));
2911
2912unlock:
2913 pte_unmap_unlock(ptep, ptl);
2914out:
2915 return phys_addr;
2916no_page_table:
2917 return 0;
2918}
2919
2920int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
2921 void *buf, int len, int write)
2922{
2923 resource_size_t phys_addr;
2924 unsigned long prot = 0;
2925 void *maddr;
2926 int offset = addr & (PAGE_SIZE-1);
2927
2928 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
2929 return -EINVAL;
2930
2931 phys_addr = follow_phys(vma, addr, write, &prot);
2932
2933 if (!phys_addr)
2934 return -EINVAL;
2935
2936 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
2937 if (write)
2938 memcpy_toio(maddr + offset, buf, len);
2939 else
2940 memcpy_fromio(buf, maddr + offset, len);
2941 iounmap(maddr);
2942
2943 return len;
2944}
2945#endif
2946
2807/* 2947/*
2808 * Access another process' address space. 2948 * Access another process' address space.
2809 * Source/target buffer must be kernel space, 2949 * Source/target buffer must be kernel space,
@@ -2813,7 +2953,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2813{ 2953{
2814 struct mm_struct *mm; 2954 struct mm_struct *mm;
2815 struct vm_area_struct *vma; 2955 struct vm_area_struct *vma;
2816 struct page *page;
2817 void *old_buf = buf; 2956 void *old_buf = buf;
2818 2957
2819 mm = get_task_mm(tsk); 2958 mm = get_task_mm(tsk);
@@ -2825,28 +2964,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2825 while (len) { 2964 while (len) {
2826 int bytes, ret, offset; 2965 int bytes, ret, offset;
2827 void *maddr; 2966 void *maddr;
2967 struct page *page = NULL;
2828 2968
2829 ret = get_user_pages(tsk, mm, addr, 1, 2969 ret = get_user_pages(tsk, mm, addr, 1,
2830 write, 1, &page, &vma); 2970 write, 1, &page, &vma);
2831 if (ret <= 0) 2971 if (ret <= 0) {
2832 break; 2972 /*
2833 2973 * Check if this is a VM_IO | VM_PFNMAP VMA, which
2834 bytes = len; 2974 * we can access using slightly different code.
2835 offset = addr & (PAGE_SIZE-1); 2975 */
2836 if (bytes > PAGE_SIZE-offset) 2976#ifdef CONFIG_HAVE_IOREMAP_PROT
2837 bytes = PAGE_SIZE-offset; 2977 vma = find_vma(mm, addr);
2838 2978 if (!vma)
2839 maddr = kmap(page); 2979 break;
2840 if (write) { 2980 if (vma->vm_ops && vma->vm_ops->access)
2841 copy_to_user_page(vma, page, addr, 2981 ret = vma->vm_ops->access(vma, addr, buf,
2842 maddr + offset, buf, bytes); 2982 len, write);
2843 set_page_dirty_lock(page); 2983 if (ret <= 0)
2984#endif
2985 break;
2986 bytes = ret;
2844 } else { 2987 } else {
2845 copy_from_user_page(vma, page, addr, 2988 bytes = len;
2846 buf, maddr + offset, bytes); 2989 offset = addr & (PAGE_SIZE-1);
2990 if (bytes > PAGE_SIZE-offset)
2991 bytes = PAGE_SIZE-offset;
2992
2993 maddr = kmap(page);
2994 if (write) {
2995 copy_to_user_page(vma, page, addr,
2996 maddr + offset, buf, bytes);
2997 set_page_dirty_lock(page);
2998 } else {
2999 copy_from_user_page(vma, page, addr,
3000 buf, maddr + offset, bytes);
3001 }
3002 kunmap(page);
3003 page_cache_release(page);
2847 } 3004 }
2848 kunmap(page);
2849 page_cache_release(page);
2850 len -= bytes; 3005 len -= bytes;
2851 buf += bytes; 3006 buf += bytes;
2852 addr += bytes; 3007 addr += bytes;
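With follow_phys() and generic_access_phys() in place, access_process_vm() can fall back to a vma's ->access method for VM_IO/VM_PFNMAP areas, so ptrace/gdb can read memory that was mapped with remap_pfn_range(). A hedged sketch of how a driver on a CONFIG_HAVE_IOREMAP_PROT architecture would opt in; drivers/char/mem.c is believed to be the typical in-tree user, and the drv_* names here are made up:

#include <linux/mm.h>

static const struct vm_operations_struct drv_phys_vm_ops = {
#ifdef CONFIG_HAVE_IOREMAP_PROT
        .access = generic_access_phys,  /* signature as added in this diff */
#endif
};

static int drv_mmap(struct file *file, struct vm_area_struct *vma)
{
        vma->vm_ops = &drv_phys_vm_ops;
        /* remap_pfn_range() of the device aperture would follow here. */
        return 0;
}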
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 833f854eabe5..6837a1014372 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,6 +26,7 @@
26#include <linux/delay.h> 26#include <linux/delay.h>
27#include <linux/migrate.h> 27#include <linux/migrate.h>
28#include <linux/page-isolation.h> 28#include <linux/page-isolation.h>
29#include <linux/pfn.h>
29 30
30#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
31 32
@@ -62,9 +63,9 @@ static void release_memory_resource(struct resource *res)
62 63
63#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 64#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
64#ifndef CONFIG_SPARSEMEM_VMEMMAP 65#ifndef CONFIG_SPARSEMEM_VMEMMAP
65static void get_page_bootmem(unsigned long info, struct page *page, int magic) 66static void get_page_bootmem(unsigned long info, struct page *page, int type)
66{ 67{
67 atomic_set(&page->_mapcount, magic); 68 atomic_set(&page->_mapcount, type);
68 SetPagePrivate(page); 69 SetPagePrivate(page);
69 set_page_private(page, info); 70 set_page_private(page, info);
70 atomic_inc(&page->_count); 71 atomic_inc(&page->_count);
@@ -72,10 +73,10 @@ static void get_page_bootmem(unsigned long info, struct page *page, int magic)
72 73
73void put_page_bootmem(struct page *page) 74void put_page_bootmem(struct page *page)
74{ 75{
75 int magic; 76 int type;
76 77
77 magic = atomic_read(&page->_mapcount); 78 type = atomic_read(&page->_mapcount);
78 BUG_ON(magic >= -1); 79 BUG_ON(type >= -1);
79 80
80 if (atomic_dec_return(&page->_count) == 1) { 81 if (atomic_dec_return(&page->_count) == 1) {
81 ClearPagePrivate(page); 82 ClearPagePrivate(page);
@@ -86,7 +87,7 @@ void put_page_bootmem(struct page *page)
86 87
87} 88}
88 89
89void register_page_bootmem_info_section(unsigned long start_pfn) 90static void register_page_bootmem_info_section(unsigned long start_pfn)
90{ 91{
91 unsigned long *usemap, mapsize, section_nr, i; 92 unsigned long *usemap, mapsize, section_nr, i;
92 struct mem_section *ms; 93 struct mem_section *ms;
@@ -119,7 +120,7 @@ void register_page_bootmem_info_section(unsigned long start_pfn)
119 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 120 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
120 121
121 for (i = 0; i < mapsize; i++, page++) 122 for (i = 0; i < mapsize; i++, page++)
122 get_page_bootmem(section_nr, page, MIX_INFO); 123 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
123 124
124} 125}
125 126
@@ -323,11 +324,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
323 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 324 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
324 BUG_ON(nr_pages % PAGES_PER_SECTION); 325 BUG_ON(nr_pages % PAGES_PER_SECTION);
325 326
326 release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
327
328 sections_to_remove = nr_pages / PAGES_PER_SECTION; 327 sections_to_remove = nr_pages / PAGES_PER_SECTION;
329 for (i = 0; i < sections_to_remove; i++) { 328 for (i = 0; i < sections_to_remove; i++) {
330 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 329 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
330 release_mem_region(pfn << PAGE_SHIFT,
331 PAGES_PER_SECTION << PAGE_SHIFT);
331 ret = __remove_section(zone, __pfn_to_section(pfn)); 332 ret = __remove_section(zone, __pfn_to_section(pfn));
332 if (ret) 333 if (ret)
333 break; 334 break;
@@ -429,7 +430,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
429 430
430 if (need_zonelists_rebuild) 431 if (need_zonelists_rebuild)
431 build_all_zonelists(); 432 build_all_zonelists();
432 vm_total_pages = nr_free_pagecache_pages(); 433 else
434 vm_total_pages = nr_free_pagecache_pages();
435
433 writeback_set_ratelimit(); 436 writeback_set_ratelimit();
434 437
435 if (onlined_pages) 438 if (onlined_pages)
@@ -455,7 +458,7 @@ static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
455 /* we can use NODE_DATA(nid) from here */ 458 /* we can use NODE_DATA(nid) from here */
456 459
457 /* init node's zones as empty zones, we don't have any present pages.*/ 460 /* init node's zones as empty zones, we don't have any present pages.*/
458 free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); 461 free_area_init_node(nid, zones_size, start_pfn, zholes_size);
459 462
460 return pgdat; 463 return pgdat;
461} 464}
@@ -521,6 +524,66 @@ EXPORT_SYMBOL_GPL(add_memory);
521 524
522#ifdef CONFIG_MEMORY_HOTREMOVE 525#ifdef CONFIG_MEMORY_HOTREMOVE
523/* 526/*
527 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
528 * set and the size of the free page is given by page_order(). Using this,
529 * the function determines if the pageblock contains only free pages.
 530 * Due to buddy constraints, a free page at least the size of a pageblock will
 531 * be located at the start of the pageblock.
532 */
533static inline int pageblock_free(struct page *page)
534{
535 return PageBuddy(page) && page_order(page) >= pageblock_order;
536}
537
538/* Return the start of the next active pageblock after a given page */
539static struct page *next_active_pageblock(struct page *page)
540{
541 int pageblocks_stride;
542
543 /* Ensure the starting page is pageblock-aligned */
544 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
545
546 /* Move forward by at least 1 * pageblock_nr_pages */
547 pageblocks_stride = 1;
548
549 /* If the entire pageblock is free, move to the end of free page */
550 if (pageblock_free(page))
551 pageblocks_stride += page_order(page) - pageblock_order;
552
553 return page + (pageblocks_stride * pageblock_nr_pages);
554}
555
556/* Checks if this range of memory is likely to be hot-removable. */
557int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
558{
559 int type;
560 struct page *page = pfn_to_page(start_pfn);
561 struct page *end_page = page + nr_pages;
562
563 /* Check the starting page of each pageblock within the range */
564 for (; page < end_page; page = next_active_pageblock(page)) {
565 type = get_pageblock_migratetype(page);
566
567 /*
568 * A pageblock containing MOVABLE or free pages is considered
569 * removable
570 */
571 if (type != MIGRATE_MOVABLE && !pageblock_free(page))
572 return 0;
573
574 /*
575 * A pageblock starting with a PageReserved page is not
576 * considered removable.
577 */
578 if (PageReserved(page))
579 return 0;
580 }
581
582 /* All pageblocks in the memory block are likely to be hot-removable */
583 return 1;
584}
585
586/*
 524 * Confirm all pages in a range [start, end) belong to the same zone. 587 * Confirm all pages in a range [start, end) belong to the same zone.
525 */ 588 */
526static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) 589static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
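is_mem_section_removable() walks a memory block pageblock by pageblock and reports whether everything in it is either free or MIGRATE_MOVABLE. Its usual consumer is the per-memory-block "removable" attribute in sysfs; the path in the sketch below is an assumption based on that interface, so treat it as illustrative rather than guaranteed:

#include <stdio.h>

/* Print whether one memory block looks hot-removable (1) or not (0).
 * Assumed path: /sys/devices/system/memory/memoryN/removable */
int main(void)
{
        char path[128];
        int block = 0;
        int removable = 0;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/memory/memory%d/removable", block);

        FILE *f = fopen(path, "r");
        if (!f) {
                perror(path);
                return 1;
        }
        if (fscanf(f, "%d", &removable) == 1)
                printf("memory%d removable=%d\n", block, removable);
        fclose(f);
        return 0;
}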
@@ -595,8 +658,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
595 * We can skip free pages. And we can only deal with pages on 658 * We can skip free pages. And we can only deal with pages on
596 * LRU. 659 * LRU.
597 */ 660 */
598 ret = isolate_lru_page(page, &source); 661 ret = isolate_lru_page(page);
599 if (!ret) { /* Success */ 662 if (!ret) { /* Success */
663 list_add_tail(&page->lru, &source);
600 move_pages--; 664 move_pages--;
601 } else { 665 } else {
 602 /* Because we don't have big zone->lock, we should 666 /* Because we don't have big zone->lock, we should
@@ -787,10 +851,19 @@ failed_removal:
787 851
788 return ret; 852 return ret;
789} 853}
854
855int remove_memory(u64 start, u64 size)
856{
857 unsigned long start_pfn, end_pfn;
858
859 start_pfn = PFN_DOWN(start);
860 end_pfn = start_pfn + PFN_DOWN(size);
861 return offline_pages(start_pfn, end_pfn, 120 * HZ);
862}
790#else 863#else
791int remove_memory(u64 start, u64 size) 864int remove_memory(u64 start, u64 size)
792{ 865{
793 return -EINVAL; 866 return -EINVAL;
794} 867}
795EXPORT_SYMBOL_GPL(remove_memory);
796#endif /* CONFIG_MEMORY_HOTREMOVE */ 868#endif /* CONFIG_MEMORY_HOTREMOVE */
869EXPORT_SYMBOL_GPL(remove_memory);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c94e58b192c3..36f42573a335 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,6 +93,8 @@
93#include <asm/tlbflush.h> 93#include <asm/tlbflush.h>
94#include <asm/uaccess.h> 94#include <asm/uaccess.h>
95 95
96#include "internal.h"
97
96/* Internal flags */ 98/* Internal flags */
97#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ 99#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
98#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 100#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
@@ -762,8 +764,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
762 /* 764 /*
763 * Avoid migrating a page that is shared with others. 765 * Avoid migrating a page that is shared with others.
764 */ 766 */
765 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) 767 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
766 isolate_lru_page(page, pagelist); 768 if (!isolate_lru_page(page)) {
769 list_add_tail(&page->lru, pagelist);
770 }
771 }
767} 772}
768 773
769static struct page *new_node_page(struct page *page, unsigned long node, int **x) 774static struct page *new_node_page(struct page *page, unsigned long node, int **x)
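isolate_lru_page() no longer takes a target list: it only detaches the page from its LRU (returning 0 on success), and the caller now chains the page onto its own private list, later handing any leftovers back with putback_lru_page()/putback_lru_pages(). The same two-step pattern appears in memory_hotplug.c above and migrate.c below; a hedged sketch of the new calling convention (collect_page() is a hypothetical helper, not kernel code):

#include <linux/list.h>
#include <linux/mm.h>

/* Hedged sketch of the new calling convention used throughout this merge:
 * isolate first, then queue on the caller's own list. */
static void collect_page(struct page *page, struct list_head *pagelist)
{
        /* isolate_lru_page() returns 0 on success, -EBUSY if the page is
         * not on an LRU; it no longer queues the page anywhere itself. */
        if (!isolate_lru_page(page))
                list_add_tail(&page->lru, pagelist);
}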
@@ -803,7 +808,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
803int do_migrate_pages(struct mm_struct *mm, 808int do_migrate_pages(struct mm_struct *mm,
804 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 809 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
805{ 810{
806 LIST_HEAD(pagelist);
807 int busy = 0; 811 int busy = 0;
808 int err = 0; 812 int err = 0;
809 nodemask_t tmp; 813 nodemask_t tmp;
@@ -1481,7 +1485,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1481 1485
1482 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { 1486 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1483 zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1487 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1484 HPAGE_SHIFT), gfp_flags); 1488 huge_page_shift(hstate_vma(vma))), gfp_flags);
1485 } else { 1489 } else {
1486 zl = policy_zonelist(gfp_flags, *mpol); 1490 zl = policy_zonelist(gfp_flags, *mpol);
1487 if ((*mpol)->mode == MPOL_BIND) 1491 if ((*mpol)->mode == MPOL_BIND)
@@ -2198,7 +2202,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty)
2198 if (PageSwapCache(page)) 2202 if (PageSwapCache(page))
2199 md->swapcache++; 2203 md->swapcache++;
2200 2204
2201 if (PageActive(page)) 2205 if (PageActive(page) || PageUnevictable(page))
2202 md->active++; 2206 md->active++;
2203 2207
2204 if (PageWriteback(page)) 2208 if (PageWriteback(page))
@@ -2220,9 +2224,12 @@ static void check_huge_range(struct vm_area_struct *vma,
2220{ 2224{
2221 unsigned long addr; 2225 unsigned long addr;
2222 struct page *page; 2226 struct page *page;
2227 struct hstate *h = hstate_vma(vma);
2228 unsigned long sz = huge_page_size(h);
2223 2229
2224 for (addr = start; addr < end; addr += HPAGE_SIZE) { 2230 for (addr = start; addr < end; addr += sz) {
2225 pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); 2231 pte_t *ptep = huge_pte_offset(vma->vm_mm,
2232 addr & huge_page_mask(h));
2226 pte_t pte; 2233 pte_t pte;
2227 2234
2228 if (!ptep) 2235 if (!ptep)
diff --git a/mm/migrate.c b/mm/migrate.c
index 55bd355d170d..6602941bfab0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -30,42 +30,13 @@
30#include <linux/vmalloc.h> 30#include <linux/vmalloc.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/syscalls.h>
33 34
34#include "internal.h" 35#include "internal.h"
35 36
36#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 37#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
37 38
38/* 39/*
39 * Isolate one page from the LRU lists. If successful put it onto
40 * the indicated list with elevated page count.
41 *
42 * Result:
43 * -EBUSY: page not on LRU list
44 * 0: page removed from LRU list and added to the specified list.
45 */
46int isolate_lru_page(struct page *page, struct list_head *pagelist)
47{
48 int ret = -EBUSY;
49
50 if (PageLRU(page)) {
51 struct zone *zone = page_zone(page);
52
53 spin_lock_irq(&zone->lru_lock);
54 if (PageLRU(page) && get_page_unless_zero(page)) {
55 ret = 0;
56 ClearPageLRU(page);
57 if (PageActive(page))
58 del_page_from_active_list(zone, page);
59 else
60 del_page_from_inactive_list(zone, page);
61 list_add_tail(&page->lru, pagelist);
62 }
63 spin_unlock_irq(&zone->lru_lock);
64 }
65 return ret;
66}
67
68/*
69 * migrate_prep() needs to be called before we start compiling a list of pages 40 * migrate_prep() needs to be called before we start compiling a list of pages
70 * to be migrated using isolate_lru_page(). 41 * to be migrated using isolate_lru_page().
71 */ 42 */
@@ -82,23 +53,9 @@ int migrate_prep(void)
82 return 0; 53 return 0;
83} 54}
84 55
85static inline void move_to_lru(struct page *page)
86{
87 if (PageActive(page)) {
88 /*
89 * lru_cache_add_active checks that
90 * the PG_active bit is off.
91 */
92 ClearPageActive(page);
93 lru_cache_add_active(page);
94 } else {
95 lru_cache_add(page);
96 }
97 put_page(page);
98}
99
100/* 56/*
101 * Add isolated pages on the list back to the LRU. 57 * Add isolated pages on the list back to the LRU under page lock
58 * to avoid leaking evictable pages back onto unevictable list.
102 * 59 *
103 * returns the number of pages put back. 60 * returns the number of pages put back.
104 */ 61 */
@@ -110,7 +67,7 @@ int putback_lru_pages(struct list_head *l)
110 67
111 list_for_each_entry_safe(page, page2, l, lru) { 68 list_for_each_entry_safe(page, page2, l, lru) {
112 list_del(&page->lru); 69 list_del(&page->lru);
113 move_to_lru(page); 70 putback_lru_page(page);
114 count++; 71 count++;
115 } 72 }
116 return count; 73 return count;
@@ -284,7 +241,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
284 241
285 page = migration_entry_to_page(entry); 242 page = migration_entry_to_page(entry);
286 243
287 get_page(page); 244 /*
245 * Once radix-tree replacement of page migration started, page_count
246 * *must* be zero. And, we don't want to call wait_on_page_locked()
247 * against a page without get_page().
 248 * So, we use get_page_unless_zero() here. Even if it fails, the page
 249 * fault will simply occur again.
250 */
251 if (!get_page_unless_zero(page))
252 goto out;
288 pte_unmap_unlock(ptep, ptl); 253 pte_unmap_unlock(ptep, ptl);
289 wait_on_page_locked(page); 254 wait_on_page_locked(page);
290 put_page(page); 255 put_page(page);
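The new comment explains why migration_entry_wait() must use get_page_unless_zero(): once migration has frozen the page count to zero, a plain get_page() would resurrect a page that is in the middle of being replaced. The primitive is essentially "increment only if still nonzero"; below is a self-contained user-space model of that semantic, not the kernel helper itself:

#include <stdatomic.h>
#include <stdio.h>

/* Model of get_page_unless_zero(): take a reference only if the count
 * has not already been frozen or dropped to zero. */
static int get_unless_zero(atomic_int *count)
{
        int old = atomic_load(count);

        while (old != 0) {
                if (atomic_compare_exchange_weak(count, &old, old + 1))
                        return 1;       /* reference taken */
        }
        return 0;                       /* too late: count was zero */
}

int main(void)
{
        atomic_int live = 2, frozen = 0;

        printf("live:   %d (count now %d)\n",
               get_unless_zero(&live), atomic_load(&live));
        printf("frozen: %d (count now %d)\n",
               get_unless_zero(&frozen), atomic_load(&frozen));
        return 0;
}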
@@ -304,6 +269,7 @@ out:
304static int migrate_page_move_mapping(struct address_space *mapping, 269static int migrate_page_move_mapping(struct address_space *mapping,
305 struct page *newpage, struct page *page) 270 struct page *newpage, struct page *page)
306{ 271{
272 int expected_count;
307 void **pslot; 273 void **pslot;
308 274
309 if (!mapping) { 275 if (!mapping) {
@@ -313,14 +279,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
313 return 0; 279 return 0;
314 } 280 }
315 281
316 write_lock_irq(&mapping->tree_lock); 282 spin_lock_irq(&mapping->tree_lock);
317 283
318 pslot = radix_tree_lookup_slot(&mapping->page_tree, 284 pslot = radix_tree_lookup_slot(&mapping->page_tree,
319 page_index(page)); 285 page_index(page));
320 286
321 if (page_count(page) != 2 + !!PagePrivate(page) || 287 expected_count = 2 + !!PagePrivate(page);
288 if (page_count(page) != expected_count ||
322 (struct page *)radix_tree_deref_slot(pslot) != page) { 289 (struct page *)radix_tree_deref_slot(pslot) != page) {
323 write_unlock_irq(&mapping->tree_lock); 290 spin_unlock_irq(&mapping->tree_lock);
291 return -EAGAIN;
292 }
293
294 if (!page_freeze_refs(page, expected_count)) {
295 spin_unlock_irq(&mapping->tree_lock);
324 return -EAGAIN; 296 return -EAGAIN;
325 } 297 }
326 298
@@ -337,6 +309,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
337 309
338 radix_tree_replace_slot(pslot, newpage); 310 radix_tree_replace_slot(pslot, newpage);
339 311
312 page_unfreeze_refs(page, expected_count);
340 /* 313 /*
341 * Drop cache reference from old page. 314 * Drop cache reference from old page.
342 * We know this isn't the last reference. 315 * We know this isn't the last reference.
@@ -356,7 +329,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
356 __dec_zone_page_state(page, NR_FILE_PAGES); 329 __dec_zone_page_state(page, NR_FILE_PAGES);
357 __inc_zone_page_state(newpage, NR_FILE_PAGES); 330 __inc_zone_page_state(newpage, NR_FILE_PAGES);
358 331
359 write_unlock_irq(&mapping->tree_lock); 332 spin_unlock_irq(&mapping->tree_lock);
360 333
361 return 0; 334 return 0;
362} 335}
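migrate_page_move_mapping() now freezes the page's reference count at the expected value before swapping the radix-tree slot and unfreezes it afterwards, so a concurrent get_page_unless_zero() loses the race cleanly instead of pinning a half-migrated page. page_freeze_refs()/page_unfreeze_refs() behave like the compare-and-swap model below, which is a user-space sketch of the idea rather than the kernel implementation:

#include <stdatomic.h>
#include <stdio.h>

/* freeze: atomically replace an exact expected count with 0. */
static int freeze_refs(atomic_int *count, int expected)
{
        return atomic_compare_exchange_strong(count, &expected, 0);
}

/* unfreeze: publish the count again once the replacement is done. */
static void unfreeze_refs(atomic_int *count, int expected)
{
        atomic_store(count, expected);
}

int main(void)
{
        atomic_int refs = 3;            /* e.g. 2 + !!PagePrivate */

        if (!freeze_refs(&refs, 2))     /* wrong expectation: extra reference */
                printf("busy (count=%d), would return -EAGAIN\n",
                       atomic_load(&refs));

        if (freeze_refs(&refs, 3)) {    /* matches: count is now 0 */
                /* ... replace the radix-tree slot while frozen ... */
                unfreeze_refs(&refs, 3);
                printf("replaced, count back to %d\n", atomic_load(&refs));
        }
        return 0;
}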
@@ -366,6 +339,8 @@ static int migrate_page_move_mapping(struct address_space *mapping,
366 */ 339 */
367static void migrate_page_copy(struct page *newpage, struct page *page) 340static void migrate_page_copy(struct page *newpage, struct page *page)
368{ 341{
342 int anon;
343
369 copy_highpage(newpage, page); 344 copy_highpage(newpage, page);
370 345
371 if (PageError(page)) 346 if (PageError(page))
@@ -374,8 +349,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
374 SetPageReferenced(newpage); 349 SetPageReferenced(newpage);
375 if (PageUptodate(page)) 350 if (PageUptodate(page))
376 SetPageUptodate(newpage); 351 SetPageUptodate(newpage);
377 if (PageActive(page)) 352 if (TestClearPageActive(page)) {
353 VM_BUG_ON(PageUnevictable(page));
378 SetPageActive(newpage); 354 SetPageActive(newpage);
355 } else
356 unevictable_migrate_page(newpage, page);
379 if (PageChecked(page)) 357 if (PageChecked(page))
380 SetPageChecked(newpage); 358 SetPageChecked(newpage);
381 if (PageMappedToDisk(page)) 359 if (PageMappedToDisk(page))
@@ -393,14 +371,20 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
393 __set_page_dirty_nobuffers(newpage); 371 __set_page_dirty_nobuffers(newpage);
394 } 372 }
395 373
374 mlock_migrate_page(newpage, page);
375
396#ifdef CONFIG_SWAP 376#ifdef CONFIG_SWAP
397 ClearPageSwapCache(page); 377 ClearPageSwapCache(page);
398#endif 378#endif
399 ClearPageActive(page);
400 ClearPagePrivate(page); 379 ClearPagePrivate(page);
401 set_page_private(page, 0); 380 set_page_private(page, 0);
381 /* page->mapping contains a flag for PageAnon() */
382 anon = PageAnon(page);
402 page->mapping = NULL; 383 page->mapping = NULL;
403 384
385 if (!anon) /* This page was removed from radix-tree. */
386 mem_cgroup_uncharge_cache_page(page);
387
404 /* 388 /*
405 * If any waiters have accumulated on the new page then 389 * If any waiters have accumulated on the new page then
406 * wake them up. 390 * wake them up.
@@ -575,6 +559,10 @@ static int fallback_migrate_page(struct address_space *mapping,
575 * 559 *
576 * The new page will have replaced the old page if this function 560 * The new page will have replaced the old page if this function
577 * is successful. 561 * is successful.
562 *
563 * Return value:
564 * < 0 - error code
565 * == 0 - success
578 */ 566 */
579static int move_to_new_page(struct page *newpage, struct page *page) 567static int move_to_new_page(struct page *newpage, struct page *page)
580{ 568{
@@ -586,12 +574,14 @@ static int move_to_new_page(struct page *newpage, struct page *page)
586 * establishing additional references. We are the only one 574 * establishing additional references. We are the only one
587 * holding a reference to the new page at this point. 575 * holding a reference to the new page at this point.
588 */ 576 */
589 if (TestSetPageLocked(newpage)) 577 if (!trylock_page(newpage))
590 BUG(); 578 BUG();
591 579
592 /* Prepare mapping for the new page.*/ 580 /* Prepare mapping for the new page.*/
593 newpage->index = page->index; 581 newpage->index = page->index;
594 newpage->mapping = page->mapping; 582 newpage->mapping = page->mapping;
583 if (PageSwapBacked(page))
584 SetPageSwapBacked(newpage);
595 585
596 mapping = page_mapping(page); 586 mapping = page_mapping(page);
597 if (!mapping) 587 if (!mapping)
@@ -610,7 +600,6 @@ static int move_to_new_page(struct page *newpage, struct page *page)
610 rc = fallback_migrate_page(mapping, newpage, page); 600 rc = fallback_migrate_page(mapping, newpage, page);
611 601
612 if (!rc) { 602 if (!rc) {
613 mem_cgroup_page_migration(page, newpage);
614 remove_migration_ptes(page, newpage); 603 remove_migration_ptes(page, newpage);
615 } else 604 } else
616 newpage->mapping = NULL; 605 newpage->mapping = NULL;
@@ -636,12 +625,21 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
636 if (!newpage) 625 if (!newpage)
637 return -ENOMEM; 626 return -ENOMEM;
638 627
639 if (page_count(page) == 1) 628 if (page_count(page) == 1) {
640 /* page was freed from under us. So we are done. */ 629 /* page was freed from under us. So we are done. */
641 goto move_newpage; 630 goto move_newpage;
631 }
632
633 charge = mem_cgroup_prepare_migration(page, newpage);
634 if (charge == -ENOMEM) {
635 rc = -ENOMEM;
636 goto move_newpage;
637 }
638 /* prepare cgroup just returns 0 or -ENOMEM */
639 BUG_ON(charge);
642 640
643 rc = -EAGAIN; 641 rc = -EAGAIN;
644 if (TestSetPageLocked(page)) { 642 if (!trylock_page(page)) {
645 if (!force) 643 if (!force)
646 goto move_newpage; 644 goto move_newpage;
647 lock_page(page); 645 lock_page(page);
@@ -691,25 +689,19 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
691 goto rcu_unlock; 689 goto rcu_unlock;
692 } 690 }
693 691
694 charge = mem_cgroup_prepare_migration(page);
695 /* Establish migration ptes or remove ptes */ 692 /* Establish migration ptes or remove ptes */
696 try_to_unmap(page, 1); 693 try_to_unmap(page, 1);
697 694
698 if (!page_mapped(page)) 695 if (!page_mapped(page))
699 rc = move_to_new_page(newpage, page); 696 rc = move_to_new_page(newpage, page);
700 697
701 if (rc) { 698 if (rc)
702 remove_migration_ptes(page, page); 699 remove_migration_ptes(page, page);
703 if (charge)
704 mem_cgroup_end_migration(page);
705 } else if (charge)
706 mem_cgroup_end_migration(newpage);
707rcu_unlock: 700rcu_unlock:
708 if (rcu_locked) 701 if (rcu_locked)
709 rcu_read_unlock(); 702 rcu_read_unlock();
710 703
711unlock: 704unlock:
712
713 unlock_page(page); 705 unlock_page(page);
714 706
715 if (rc != -EAGAIN) { 707 if (rc != -EAGAIN) {
@@ -720,15 +712,19 @@ unlock:
720 * restored. 712 * restored.
721 */ 713 */
722 list_del(&page->lru); 714 list_del(&page->lru);
723 move_to_lru(page); 715 putback_lru_page(page);
724 } 716 }
725 717
726move_newpage: 718move_newpage:
719 if (!charge)
720 mem_cgroup_end_migration(newpage);
721
727 /* 722 /*
728 * Move the new page to the LRU. If migration was not successful 723 * Move the new page to the LRU. If migration was not successful
729 * then this will free the page. 724 * then this will free the page.
730 */ 725 */
731 move_to_lru(newpage); 726 putback_lru_page(newpage);
727
732 if (result) { 728 if (result) {
733 if (rc) 729 if (rc)
734 *result = rc; 730 *result = rc;
@@ -835,9 +831,11 @@ static struct page *new_page_node(struct page *p, unsigned long private,
835 * Move a set of pages as indicated in the pm array. The addr 831 * Move a set of pages as indicated in the pm array. The addr
836 * field must be set to the virtual address of the page to be moved 832 * field must be set to the virtual address of the page to be moved
837 * and the node number must contain a valid target node. 833 * and the node number must contain a valid target node.
834 * The pm array ends with node = MAX_NUMNODES.
838 */ 835 */
839static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, 836static int do_move_page_to_node_array(struct mm_struct *mm,
840 int migrate_all) 837 struct page_to_node *pm,
838 int migrate_all)
841{ 839{
842 int err; 840 int err;
843 struct page_to_node *pp; 841 struct page_to_node *pp;
@@ -891,7 +889,9 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
891 !migrate_all) 889 !migrate_all)
892 goto put_and_set; 890 goto put_and_set;
893 891
894 err = isolate_lru_page(page, &pagelist); 892 err = isolate_lru_page(page);
893 if (!err)
894 list_add_tail(&page->lru, &pagelist);
895put_and_set: 895put_and_set:
896 /* 896 /*
897 * Either remove the duplicate refcount from 897 * Either remove the duplicate refcount from
@@ -903,36 +903,118 @@ set_status:
903 pp->status = err; 903 pp->status = err;
904 } 904 }
905 905
906 err = 0;
906 if (!list_empty(&pagelist)) 907 if (!list_empty(&pagelist))
907 err = migrate_pages(&pagelist, new_page_node, 908 err = migrate_pages(&pagelist, new_page_node,
908 (unsigned long)pm); 909 (unsigned long)pm);
909 else
910 err = -ENOENT;
911 910
912 up_read(&mm->mmap_sem); 911 up_read(&mm->mmap_sem);
913 return err; 912 return err;
914} 913}
915 914
916/* 915/*
917 * Determine the nodes of a list of pages. The addr in the pm array 916 * Migrate an array of page address onto an array of nodes and fill
918 * must have been set to the virtual address of which we want to determine 917 * the corresponding array of status.
919 * the node number.
920 */ 918 */
921static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) 919static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
920 unsigned long nr_pages,
921 const void __user * __user *pages,
922 const int __user *nodes,
923 int __user *status, int flags)
922{ 924{
925 struct page_to_node *pm = NULL;
926 nodemask_t task_nodes;
927 int err = 0;
928 int i;
929
930 task_nodes = cpuset_mems_allowed(task);
931
932 /* Limit nr_pages so that the multiplication may not overflow */
933 if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
934 err = -E2BIG;
935 goto out;
936 }
937
938 pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
939 if (!pm) {
940 err = -ENOMEM;
941 goto out;
942 }
943
944 /*
945 * Get parameters from user space and initialize the pm
946 * array. Return various errors if the user did something wrong.
947 */
948 for (i = 0; i < nr_pages; i++) {
949 const void __user *p;
950
951 err = -EFAULT;
952 if (get_user(p, pages + i))
953 goto out_pm;
954
955 pm[i].addr = (unsigned long)p;
956 if (nodes) {
957 int node;
958
959 if (get_user(node, nodes + i))
960 goto out_pm;
961
962 err = -ENODEV;
963 if (!node_state(node, N_HIGH_MEMORY))
964 goto out_pm;
965
966 err = -EACCES;
967 if (!node_isset(node, task_nodes))
968 goto out_pm;
969
970 pm[i].node = node;
971 } else
972 pm[i].node = 0; /* anything to not match MAX_NUMNODES */
973 }
974 /* End marker */
975 pm[nr_pages].node = MAX_NUMNODES;
976
977 err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL);
978 if (err >= 0)
979 /* Return status information */
980 for (i = 0; i < nr_pages; i++)
981 if (put_user(pm[i].status, status + i))
982 err = -EFAULT;
983
984out_pm:
985 vfree(pm);
986out:
987 return err;
988}
989
990/*
991 * Determine the nodes of an array of pages and store it in an array of status.
992 */
993static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
994 const void __user * __user *pages,
995 int __user *status)
996{
997 unsigned long i;
998 int err;
999
923 down_read(&mm->mmap_sem); 1000 down_read(&mm->mmap_sem);
924 1001
925 for ( ; pm->node != MAX_NUMNODES; pm++) { 1002 for (i = 0; i < nr_pages; i++) {
1003 const void __user *p;
1004 unsigned long addr;
926 struct vm_area_struct *vma; 1005 struct vm_area_struct *vma;
927 struct page *page; 1006 struct page *page;
928 int err;
929 1007
930 err = -EFAULT; 1008 err = -EFAULT;
931 vma = find_vma(mm, pm->addr); 1009 if (get_user(p, pages+i))
1010 goto out;
1011 addr = (unsigned long) p;
1012
1013 vma = find_vma(mm, addr);
932 if (!vma) 1014 if (!vma)
933 goto set_status; 1015 goto set_status;
934 1016
935 page = follow_page(vma, pm->addr, 0); 1017 page = follow_page(vma, addr, 0);
936 1018
937 err = PTR_ERR(page); 1019 err = PTR_ERR(page);
938 if (IS_ERR(page)) 1020 if (IS_ERR(page))
@@ -945,11 +1027,13 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
945 1027
946 err = page_to_nid(page); 1028 err = page_to_nid(page);
947set_status: 1029set_status:
948 pm->status = err; 1030 put_user(err, status+i);
949 } 1031 }
1032 err = 0;
950 1033
1034out:
951 up_read(&mm->mmap_sem); 1035 up_read(&mm->mmap_sem);
952 return 0; 1036 return err;
953} 1037}
954 1038
955/* 1039/*
@@ -961,12 +1045,9 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
961 const int __user *nodes, 1045 const int __user *nodes,
962 int __user *status, int flags) 1046 int __user *status, int flags)
963{ 1047{
964 int err = 0;
965 int i;
966 struct task_struct *task; 1048 struct task_struct *task;
967 nodemask_t task_nodes;
968 struct mm_struct *mm; 1049 struct mm_struct *mm;
969 struct page_to_node *pm = NULL; 1050 int err;
970 1051
971 /* Check flags */ 1052 /* Check flags */
972 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) 1053 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
@@ -998,79 +1079,24 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
998 (current->uid != task->suid) && (current->uid != task->uid) && 1079 (current->uid != task->suid) && (current->uid != task->uid) &&
999 !capable(CAP_SYS_NICE)) { 1080 !capable(CAP_SYS_NICE)) {
1000 err = -EPERM; 1081 err = -EPERM;
1001 goto out2; 1082 goto out;
1002 } 1083 }
1003 1084
1004 err = security_task_movememory(task); 1085 err = security_task_movememory(task);
1005 if (err) 1086 if (err)
1006 goto out2; 1087 goto out;
1007
1008
1009 task_nodes = cpuset_mems_allowed(task);
1010
1011 /* Limit nr_pages so that the multiplication may not overflow */
1012 if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
1013 err = -E2BIG;
1014 goto out2;
1015 }
1016
1017 pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
1018 if (!pm) {
1019 err = -ENOMEM;
1020 goto out2;
1021 }
1022
1023 /*
1024 * Get parameters from user space and initialize the pm
1025 * array. Return various errors if the user did something wrong.
1026 */
1027 for (i = 0; i < nr_pages; i++) {
1028 const void __user *p;
1029
1030 err = -EFAULT;
1031 if (get_user(p, pages + i))
1032 goto out;
1033
1034 pm[i].addr = (unsigned long)p;
1035 if (nodes) {
1036 int node;
1037
1038 if (get_user(node, nodes + i))
1039 goto out;
1040
1041 err = -ENODEV;
1042 if (!node_state(node, N_HIGH_MEMORY))
1043 goto out;
1044
1045 err = -EACCES;
1046 if (!node_isset(node, task_nodes))
1047 goto out;
1048 1088
1049 pm[i].node = node; 1089 if (nodes) {
1050 } else 1090 err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
1051 pm[i].node = 0; /* anything to not match MAX_NUMNODES */ 1091 flags);
1092 } else {
1093 err = do_pages_stat(mm, nr_pages, pages, status);
1052 } 1094 }
1053 /* End marker */
1054 pm[nr_pages].node = MAX_NUMNODES;
1055
1056 if (nodes)
1057 err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
1058 else
1059 err = do_pages_stat(mm, pm);
1060
1061 if (err >= 0)
1062 /* Return status information */
1063 for (i = 0; i < nr_pages; i++)
1064 if (put_user(pm[i].status, status + i))
1065 err = -EFAULT;
1066 1095
1067out: 1096out:
1068 vfree(pm);
1069out2:
1070 mmput(mm); 1097 mmput(mm);
1071 return err; 1098 return err;
1072} 1099}
1073#endif
1074 1100
1075/* 1101/*
1076 * Call migration functions in the vma_ops that may prepare 1102 * Call migration functions in the vma_ops that may prepare
@@ -1092,3 +1118,4 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1092 } 1118 }
1093 return err; 1119 return err;
1094} 1120}
1121#endif
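
The refactor above splits the old monolithic sys_move_pages() into do_pages_move() (a node array was supplied) and do_pages_stat() (nodes == NULL, status query only). For orientation, a minimal user-space sketch of the corresponding call through the libnuma move_pages() wrapper; the wrapper and MPOL_MF_MOVE come from <numaif.h> and are not part of this patch:

/* Hedged sketch: query, then migrate, a single page via move_pages(2).
 * Build with -lnuma; error handling kept minimal on purpose. */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	void *page = NULL;
	void *pages[1];
	int nodes[1] = { 0 };
	int status[1];

	if (posix_memalign(&page, 4096, 4096))
		return 1;
	((char *)page)[0] = 1;			/* fault the page in */
	pages[0] = page;

	/* nodes == NULL: status-only path (do_pages_stat() in the kernel) */
	if (move_pages(0, 1, pages, NULL, status, 0) == 0)
		printf("page currently on node %d\n", status[0]);

	/* nodes supplied: migration path (do_pages_move() in the kernel) */
	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) == 0)
		printf("after move: status %d (node or -errno)\n", status[0]);

	free(page);
	return 0;
}
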
diff --git a/mm/mlock.c b/mm/mlock.c
index 7b2656055d6a..008ea70b7afa 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -8,10 +8,18 @@
8#include <linux/capability.h> 8#include <linux/capability.h>
9#include <linux/mman.h> 9#include <linux/mman.h>
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/swap.h>
12#include <linux/swapops.h>
13#include <linux/pagemap.h>
11#include <linux/mempolicy.h> 14#include <linux/mempolicy.h>
12#include <linux/syscalls.h> 15#include <linux/syscalls.h>
13#include <linux/sched.h> 16#include <linux/sched.h>
14#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/rmap.h>
19#include <linux/mmzone.h>
20#include <linux/hugetlb.h>
21
22#include "internal.h"
15 23
16int can_do_mlock(void) 24int can_do_mlock(void)
17{ 25{
@@ -23,17 +31,381 @@ int can_do_mlock(void)
23} 31}
24EXPORT_SYMBOL(can_do_mlock); 32EXPORT_SYMBOL(can_do_mlock);
25 33
34#ifdef CONFIG_UNEVICTABLE_LRU
35/*
36 * Mlocked pages are marked with PageMlocked() flag for efficient testing
37 * in vmscan and, possibly, the fault path; and to support semi-accurate
38 * statistics.
39 *
40 * An mlocked page [PageMlocked(page)] is unevictable. As such, it will
41 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
42 * The unevictable list is an LRU sibling list to the [in]active lists.
43 * PageUnevictable is set to indicate the unevictable state.
44 *
 45 * When lazily mlocking via vmscan, it is important to ensure that the
46 * vma's VM_LOCKED status is not concurrently being modified, otherwise we
47 * may have mlocked a page that is being munlocked. So lazy mlock must take
48 * the mmap_sem for read, and verify that the vma really is locked
49 * (see mm/rmap.c).
50 */
51
52/*
53 * LRU accounting for clear_page_mlock()
54 */
55void __clear_page_mlock(struct page *page)
56{
57 VM_BUG_ON(!PageLocked(page));
58
59 if (!page->mapping) { /* truncated ? */
60 return;
61 }
62
63 dec_zone_page_state(page, NR_MLOCK);
64 count_vm_event(UNEVICTABLE_PGCLEARED);
65 if (!isolate_lru_page(page)) {
66 putback_lru_page(page);
67 } else {
68 /*
69 * Page not on the LRU yet. Flush all pagevecs and retry.
70 */
71 lru_add_drain_all();
72 if (!isolate_lru_page(page))
73 putback_lru_page(page);
74 else if (PageUnevictable(page))
75 count_vm_event(UNEVICTABLE_PGSTRANDED);
76
77 }
78}
79
80/*
81 * Mark page as mlocked if not already.
82 * If page on LRU, isolate and putback to move to unevictable list.
83 */
84void mlock_vma_page(struct page *page)
85{
86 BUG_ON(!PageLocked(page));
87
88 if (!TestSetPageMlocked(page)) {
89 inc_zone_page_state(page, NR_MLOCK);
90 count_vm_event(UNEVICTABLE_PGMLOCKED);
91 if (!isolate_lru_page(page))
92 putback_lru_page(page);
93 }
94}
95
96/*
97 * called from munlock()/munmap() path with page supposedly on the LRU.
98 *
99 * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked
100 * [in try_to_munlock()] and then attempt to isolate the page. We must
101 * isolate the page to keep others from messing with its unevictable
102 * and mlocked state while trying to munlock. However, we pre-clear the
103 * mlocked state anyway as we might lose the isolation race and we might
104 * not get another chance to clear PageMlocked. If we successfully
105 * isolate the page and try_to_munlock() detects other VM_LOCKED vmas
106 * mapping the page, it will restore the PageMlocked state, unless the page
107 * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(),
108 * perhaps redundantly.
109 * If we lose the isolation race, and the page is mapped by other VM_LOCKED
110 * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
111 * either of which will restore the PageMlocked state by calling
112 * mlock_vma_page() above, if it can grab the vma's mmap sem.
113 */
114static void munlock_vma_page(struct page *page)
115{
116 BUG_ON(!PageLocked(page));
117
118 if (TestClearPageMlocked(page)) {
119 dec_zone_page_state(page, NR_MLOCK);
120 if (!isolate_lru_page(page)) {
121 int ret = try_to_munlock(page);
122 /*
 123 * did try_to_munlock() succeed or punt?
124 */
125 if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN)
126 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
127
128 putback_lru_page(page);
129 } else {
130 /*
 131 * We lost the race. Let try_to_unmap() deal
 132 * with it. At least we get the page state and
 133 * mlock stats right. However, the page is still on
 134 * the unevictable list. We'll fix that up when
 135 * the page is eventually freed or we scan the
 136 * unevictable list.
137 */
138 if (PageUnevictable(page))
139 count_vm_event(UNEVICTABLE_PGSTRANDED);
140 else
141 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
142 }
143 }
144}
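
mlock_vma_page() and munlock_vma_page() above keep NR_MLOCK and the UNEVICTABLE_* event counters in sync with the page flags. A small user-space sketch for watching the effect; it assumes the companion /proc changes in this series export the counters as the "Unevictable:" and "Mlocked:" lines of /proc/meminfo:

/* Hedged sketch: print the unevictable/mlocked accounting lines, if present. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "Unevictable:", 12) ||
		    !strncmp(line, "Mlocked:", 8))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}
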
145
146/**
147 * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma.
148 * @vma: target vma
149 * @start: start address
150 * @end: end address
 151 * @mlock: 0 indicates munlock, otherwise mlock.
 152 *
 153 * If @mlock == 0, unlock an mlocked range;
 154 * else mlock the range of pages. This takes care of making the pages
 155 * present, too.
156 *
157 * return 0 on success, negative error code on error.
158 *
159 * vma->vm_mm->mmap_sem must be held for at least read.
160 */
161static long __mlock_vma_pages_range(struct vm_area_struct *vma,
162 unsigned long start, unsigned long end,
163 int mlock)
164{
165 struct mm_struct *mm = vma->vm_mm;
166 unsigned long addr = start;
167 struct page *pages[16]; /* 16 gives a reasonable batch */
168 int nr_pages = (end - start) / PAGE_SIZE;
169 int ret;
170 int gup_flags = 0;
171
172 VM_BUG_ON(start & ~PAGE_MASK);
173 VM_BUG_ON(end & ~PAGE_MASK);
174 VM_BUG_ON(start < vma->vm_start);
175 VM_BUG_ON(end > vma->vm_end);
176 VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) &&
177 (atomic_read(&mm->mm_users) != 0));
178
179 /*
 180 * mlock: don't populate pages if the vma has PROT_NONE permission.
 181 * munlock: always munlock the pages, even though they
 182 * have PROT_NONE permission.
183 */
184 if (!mlock)
185 gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS;
186
187 if (vma->vm_flags & VM_WRITE)
188 gup_flags |= GUP_FLAGS_WRITE;
189
190 lru_add_drain_all(); /* push cached pages to LRU */
191
192 while (nr_pages > 0) {
193 int i;
194
195 cond_resched();
196
197 /*
 198 * get_user_pages() makes pages present if we are
 199 * setting mlock, and the extra reference count
 200 * disables migration of the page. However, the page may
 201 * still be truncated out from under us.
202 */
203 ret = __get_user_pages(current, mm, addr,
204 min_t(int, nr_pages, ARRAY_SIZE(pages)),
205 gup_flags, pages, NULL);
206 /*
207 * This can happen for, e.g., VM_NONLINEAR regions before
208 * a page has been allocated and mapped at a given offset,
209 * or for addresses that map beyond end of a file.
 210 * We'll mlock the pages if/when they get faulted in.
211 */
212 if (ret < 0)
213 break;
214 if (ret == 0) {
215 /*
216 * We know the vma is there, so the only time
217 * we cannot get a single page should be an
218 * error (ret < 0) case.
219 */
220 WARN_ON(1);
221 break;
222 }
223
224 lru_add_drain(); /* push cached pages to LRU */
225
226 for (i = 0; i < ret; i++) {
227 struct page *page = pages[i];
228
229 lock_page(page);
230 /*
231 * Because we lock page here and migration is blocked
232 * by the elevated reference, we need only check for
233 * page truncation (file-cache only).
234 */
235 if (page->mapping) {
236 if (mlock)
237 mlock_vma_page(page);
238 else
239 munlock_vma_page(page);
240 }
241 unlock_page(page);
242 put_page(page); /* ref from get_user_pages() */
243
244 /*
245 * here we assume that get_user_pages() has given us
246 * a list of virtually contiguous pages.
247 */
248 addr += PAGE_SIZE; /* for next get_user_pages() */
249 nr_pages--;
250 }
251 ret = 0;
252 }
253
254 lru_add_drain_all(); /* to update stats */
255
256 return ret; /* count entire vma as locked_vm */
257}
258
259/*
260 * convert get_user_pages() return value to posix mlock() error
261 */
262static int __mlock_posix_error_return(long retval)
263{
264 if (retval == -EFAULT)
265 retval = -ENOMEM;
266 else if (retval == -ENOMEM)
267 retval = -EAGAIN;
268 return retval;
269}
270
271#else /* CONFIG_UNEVICTABLE_LRU */
272
273/*
274 * Just make pages present if VM_LOCKED. No-op if unlocking.
275 */
276static long __mlock_vma_pages_range(struct vm_area_struct *vma,
277 unsigned long start, unsigned long end,
278 int mlock)
279{
280 if (mlock && (vma->vm_flags & VM_LOCKED))
281 return make_pages_present(start, end);
282 return 0;
283}
284
285static inline int __mlock_posix_error_return(long retval)
286{
287 return 0;
288}
289
290#endif /* CONFIG_UNEVICTABLE_LRU */
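
__mlock_posix_error_return() exists because __get_user_pages() reports failures as -EFAULT or -ENOMEM, while POSIX expects mlock() to return ENOMEM for an unmapped range and EAGAIN when pages could not be locked. A user-space sketch of what a caller observes (the probe address is illustrative):

/* Hedged sketch: exercise the errno translation from user space. */
#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* Unmapped address: the -EFAULT from get_user_pages() is translated
	 * to ENOMEM before mlock() returns, per __mlock_posix_error_return(). */
	if (mlock((void *)0x1000, 4096) != 0)
		printf("mlock: %s\n",
		       errno == ENOMEM ? "ENOMEM (range not mapped)" :
		       errno == EAGAIN ? "EAGAIN (could not lock)" :
		       "other error");
	return 0;
}
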
291
292/**
293 * mlock_vma_pages_range() - mlock pages in specified vma range.
 294 * @vma - the vma containing the specified address range
295 * @start - starting address in @vma to mlock
296 * @end - end address [+1] in @vma to mlock
297 *
298 * For mmap()/mremap()/expansion of mlocked vma.
299 *
300 * return 0 on success for "normal" vmas.
301 *
302 * return number of pages [> 0] to be removed from locked_vm on success
303 * of "special" vmas.
304 *
 305 * return negative error if the vma spanning @start-@end disappears while
306 * mmap semaphore is dropped. Unlikely?
307 */
308long mlock_vma_pages_range(struct vm_area_struct *vma,
309 unsigned long start, unsigned long end)
310{
311 struct mm_struct *mm = vma->vm_mm;
312 int nr_pages = (end - start) / PAGE_SIZE;
313 BUG_ON(!(vma->vm_flags & VM_LOCKED));
314
315 /*
316 * filter unlockable vmas
317 */
318 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
319 goto no_mlock;
320
321 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
322 is_vm_hugetlb_page(vma) ||
323 vma == get_gate_vma(current))) {
324 long error;
325 downgrade_write(&mm->mmap_sem);
326
327 error = __mlock_vma_pages_range(vma, start, end, 1);
328
329 up_read(&mm->mmap_sem);
330 /* vma can change or disappear */
331 down_write(&mm->mmap_sem);
332 vma = find_vma(mm, start);
333 /* non-NULL vma must contain @start, but need to check @end */
334 if (!vma || end > vma->vm_end)
335 return -ENOMEM;
336
337 return 0; /* hide other errors from mmap(), et al */
338 }
339
340 /*
341 * User mapped kernel pages or huge pages:
342 * make these pages present to populate the ptes, but
 343 * fall through to reset VM_LOCKED--no need to unlock, and
 344 * return nr_pages so these don't get counted against the task's
 345 * locked limit. Huge pages are already counted against the
346 * locked vm limit.
347 */
348 make_pages_present(start, end);
349
350no_mlock:
351 vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */
352 return nr_pages; /* error or pages NOT mlocked */
353}
354
355
356/*
 357 * munlock_vma_pages_range() - munlock all pages in the vma range.
358 * @vma - vma containing range to be munlock()ed.
359 * @start - start address in @vma of the range
360 * @end - end of range in @vma.
361 *
362 * For mremap(), munmap() and exit().
363 *
364 * Called with @vma VM_LOCKED.
365 *
366 * Returns with VM_LOCKED cleared. Callers must be prepared to
367 * deal with this.
368 *
369 * We don't save and restore VM_LOCKED here because pages are
370 * still on lru. In unmap path, pages might be scanned by reclaim
371 * and re-mlocked by try_to_{munlock|unmap} before we unmap and
372 * free them. This will result in freeing mlocked pages.
373 */
374void munlock_vma_pages_range(struct vm_area_struct *vma,
375 unsigned long start, unsigned long end)
376{
377 vma->vm_flags &= ~VM_LOCKED;
378 __mlock_vma_pages_range(vma, start, end, 0);
379}
380
381/*
382 * mlock_fixup - handle mlock[all]/munlock[all] requests.
383 *
384 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
385 * munlock is a no-op. However, for some special vmas, we go ahead and
386 * populate the ptes via make_pages_present().
387 *
388 * For vmas that pass the filters, merge/split as appropriate.
389 */
26static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, 390static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
27 unsigned long start, unsigned long end, unsigned int newflags) 391 unsigned long start, unsigned long end, unsigned int newflags)
28{ 392{
29 struct mm_struct * mm = vma->vm_mm; 393 struct mm_struct *mm = vma->vm_mm;
30 pgoff_t pgoff; 394 pgoff_t pgoff;
31 int pages; 395 int nr_pages;
32 int ret = 0; 396 int ret = 0;
33 397 int lock = newflags & VM_LOCKED;
34 if (newflags == vma->vm_flags) { 398
35 *prev = vma; 399 if (newflags == vma->vm_flags ||
36 goto out; 400 (vma->vm_flags & (VM_IO | VM_PFNMAP)))
401 goto out; /* don't set VM_LOCKED, don't count */
402
403 if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
404 is_vm_hugetlb_page(vma) ||
405 vma == get_gate_vma(current)) {
406 if (lock)
407 make_pages_present(start, end);
408 goto out; /* don't set VM_LOCKED, don't count */
37 } 409 }
38 410
39 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 411 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -44,8 +416,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
44 goto success; 416 goto success;
45 } 417 }
46 418
47 *prev = vma;
48
49 if (start != vma->vm_start) { 419 if (start != vma->vm_start) {
50 ret = split_vma(mm, vma, start, 1); 420 ret = split_vma(mm, vma, start, 1);
51 if (ret) 421 if (ret)
@@ -60,26 +430,61 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
60 430
61success: 431success:
62 /* 432 /*
433 * Keep track of amount of locked VM.
434 */
435 nr_pages = (end - start) >> PAGE_SHIFT;
436 if (!lock)
437 nr_pages = -nr_pages;
438 mm->locked_vm += nr_pages;
439
440 /*
63 * vm_flags is protected by the mmap_sem held in write mode. 441 * vm_flags is protected by the mmap_sem held in write mode.
64 * It's okay if try_to_unmap_one unmaps a page just after we 442 * It's okay if try_to_unmap_one unmaps a page just after we
65 * set VM_LOCKED, make_pages_present below will bring it back. 443 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
66 */ 444 */
67 vma->vm_flags = newflags; 445 vma->vm_flags = newflags;
68 446
69 /* 447 if (lock) {
70 * Keep track of amount of locked VM. 448 /*
71 */ 449 * mmap_sem is currently held for write. Downgrade the write
72 pages = (end - start) >> PAGE_SHIFT; 450 * lock to a read lock so that other faults, mmap scans, ...
73 if (newflags & VM_LOCKED) { 451 * while we fault in all pages.
74 pages = -pages; 452 */
75 if (!(newflags & VM_IO)) 453 downgrade_write(&mm->mmap_sem);
76 ret = make_pages_present(start, end); 454
455 ret = __mlock_vma_pages_range(vma, start, end, 1);
456
457 /*
458 * Need to reacquire mmap sem in write mode, as our callers
459 * expect this. We have no support for atomically upgrading
 460 * a sem to write, so we must re-validate the range after
 461 * the sem is retaken.
462 */
463 up_read(&mm->mmap_sem);
464 /* vma can change or disappear */
465 down_write(&mm->mmap_sem);
466 *prev = find_vma(mm, start);
467 /* non-NULL *prev must contain @start, but need to check @end */
468 if (!(*prev) || end > (*prev)->vm_end)
469 ret = -ENOMEM;
470 else if (ret > 0) {
471 mm->locked_vm -= ret;
472 ret = 0;
473 } else
474 ret = __mlock_posix_error_return(ret); /* translate if needed */
475 } else {
476 /*
477 * TODO: for unlocking, pages will already be resident, so
478 * we don't need to wait for allocations/reclaim/pagein, ...
479 * However, unlocking a very large region can still take a
480 * while. Should we downgrade the semaphore for both lock
 481 * AND unlock?
482 */
483 __mlock_vma_pages_range(vma, start, end, 0);
77 } 484 }
78 485
79 mm->locked_vm -= pages;
80out: 486out:
81 if (ret == -ENOMEM) 487 *prev = vma;
82 ret = -EAGAIN;
83 return ret; 488 return ret;
84} 489}
85 490
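
Both mlock_vma_pages_range() and the locking branch of mlock_fixup() above use the same dance: downgrade mmap_sem to read while faulting pages in, then retake it for write and re-validate the vma, since there is no atomic read-to-write upgrade. A condensed sketch of that pattern, written as if it lived in mm/mlock.c; the helper name is illustrative and the locked_vm adjustment for a positive return is elided:

/* Sketch only: the downgrade/revalidate pattern used by the code above. */
static int example_fault_in_locked_range(struct mm_struct *mm,
					 struct vm_area_struct *vma,
					 unsigned long start,
					 unsigned long end)
{
	long ret;

	/* caller holds mmap_sem for write */
	downgrade_write(&mm->mmap_sem);		/* let faults and scans proceed */

	ret = __mlock_vma_pages_range(vma, start, end, 1);

	up_read(&mm->mmap_sem);			/* no atomic read->write upgrade */
	down_write(&mm->mmap_sem);

	vma = find_vma(mm, start);		/* vma may have changed or gone */
	if (!vma || end > vma->vm_end)
		return -ENOMEM;

	return ret < 0 ? __mlock_posix_error_return(ret) : 0;
}
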
diff --git a/mm/mm_init.c b/mm/mm_init.c
new file mode 100644
index 000000000000..4e0e26591dfa
--- /dev/null
+++ b/mm/mm_init.c
@@ -0,0 +1,152 @@
1/*
2 * mm_init.c - Memory initialisation verification and debugging
3 *
4 * Copyright 2008 IBM Corporation, 2008
5 * Author Mel Gorman <mel@csn.ul.ie>
6 *
7 */
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <linux/kobject.h>
11#include <linux/module.h>
12#include "internal.h"
13
14#ifdef CONFIG_DEBUG_MEMORY_INIT
15int mminit_loglevel;
16
17#ifndef SECTIONS_SHIFT
18#define SECTIONS_SHIFT 0
19#endif
20
21/* The zonelists are simply reported, validation is manual. */
22void mminit_verify_zonelist(void)
23{
24 int nid;
25
26 if (mminit_loglevel < MMINIT_VERIFY)
27 return;
28
29 for_each_online_node(nid) {
30 pg_data_t *pgdat = NODE_DATA(nid);
31 struct zone *zone;
32 struct zoneref *z;
33 struct zonelist *zonelist;
34 int i, listid, zoneid;
35
36 BUG_ON(MAX_ZONELISTS > 2);
37 for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
38
39 /* Identify the zone and nodelist */
40 zoneid = i % MAX_NR_ZONES;
41 listid = i / MAX_NR_ZONES;
42 zonelist = &pgdat->node_zonelists[listid];
43 zone = &pgdat->node_zones[zoneid];
44 if (!populated_zone(zone))
45 continue;
46
47 /* Print information about the zonelist */
48 printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
49 listid > 0 ? "thisnode" : "general", nid,
50 zone->name);
51
52 /* Iterate the zonelist */
53 for_each_zone_zonelist(zone, z, zonelist, zoneid) {
54#ifdef CONFIG_NUMA
55 printk(KERN_CONT "%d:%s ",
56 zone->node, zone->name);
57#else
58 printk(KERN_CONT "0:%s ", zone->name);
59#endif /* CONFIG_NUMA */
60 }
61 printk(KERN_CONT "\n");
62 }
63 }
64}
65
66void __init mminit_verify_pageflags_layout(void)
67{
68 int shift, width;
69 unsigned long or_mask, add_mask;
70
71 shift = 8 * sizeof(unsigned long);
72 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH;
73 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
74 "Section %d Node %d Zone %d Flags %d\n",
75 SECTIONS_WIDTH,
76 NODES_WIDTH,
77 ZONES_WIDTH,
78 NR_PAGEFLAGS);
79 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
80 "Section %d Node %d Zone %d\n",
81 SECTIONS_SHIFT,
82 NODES_SHIFT,
83 ZONES_SHIFT);
84 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets",
85 "Section %lu Node %lu Zone %lu\n",
86 (unsigned long)SECTIONS_PGSHIFT,
87 (unsigned long)NODES_PGSHIFT,
88 (unsigned long)ZONES_PGSHIFT);
89 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid",
90 "Zone ID: %lu -> %lu\n",
91 (unsigned long)ZONEID_PGOFF,
92 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT));
93 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
94 "location: %d -> %d unused %d -> %d flags %d -> %d\n",
95 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
96#ifdef NODE_NOT_IN_PAGE_FLAGS
97 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
98 "Node not in page flags");
99#endif
100
101 if (SECTIONS_WIDTH) {
102 shift -= SECTIONS_WIDTH;
103 BUG_ON(shift != SECTIONS_PGSHIFT);
104 }
105 if (NODES_WIDTH) {
106 shift -= NODES_WIDTH;
107 BUG_ON(shift != NODES_PGSHIFT);
108 }
109 if (ZONES_WIDTH) {
110 shift -= ZONES_WIDTH;
111 BUG_ON(shift != ZONES_PGSHIFT);
112 }
113
114 /* Check for bitmask overlaps */
115 or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
116 (NODES_MASK << NODES_PGSHIFT) |
117 (SECTIONS_MASK << SECTIONS_PGSHIFT);
118 add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
119 (NODES_MASK << NODES_PGSHIFT) +
120 (SECTIONS_MASK << SECTIONS_PGSHIFT);
121 BUG_ON(or_mask != add_mask);
122}
123
124void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone,
125 unsigned long nid, unsigned long pfn)
126{
127 BUG_ON(page_to_nid(page) != nid);
128 BUG_ON(page_zonenum(page) != zone);
129 BUG_ON(page_to_pfn(page) != pfn);
130}
131
132static __init int set_mminit_loglevel(char *str)
133{
134 get_option(&str, &mminit_loglevel);
135 return 0;
136}
137early_param("mminit_loglevel", set_mminit_loglevel);
138#endif /* CONFIG_DEBUG_MEMORY_INIT */
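
mminit_dprintk() output is gated by mminit_loglevel, which set_mminit_loglevel() wires up as the "mminit_loglevel=" boot parameter. A small sketch of how extra verification output could be emitted under CONFIG_DEBUG_MEMORY_INIT, assuming it sits in mm/ next to this file; the helper and the fields it prints are illustrative:

#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/nodemask.h>
#include "internal.h"

/* Sketch (assumed helper): report per-node spans at MMINIT_VERIFY level. */
static void __init example_report_node_spans(void)
{
	int nid;

	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);

		mminit_dprintk(MMINIT_VERIFY, "node_span",
			       "node %d: start pfn %lu, %lu spanned pages\n",
			       nid, pgdat->node_start_pfn,
			       pgdat->node_spanned_pages);
	}
}
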
139
140struct kobject *mm_kobj;
141EXPORT_SYMBOL_GPL(mm_kobj);
142
143static int __init mm_sysfs_init(void)
144{
145 mm_kobj = kobject_create_and_add("mm", kernel_kobj);
146 if (!mm_kobj)
147 return -ENOMEM;
148
149 return 0;
150}
151
152__initcall(mm_sysfs_init);
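
mm_sysfs_init() creates the /sys/kernel/mm directory and exports mm_kobj so other mm code can attach entries underneath it. A hypothetical module-style sketch of hanging a child kobject off it; where the extern declaration really lives in the headers is assumed:

#include <linux/errno.h>
#include <linux/kobject.h>
#include <linux/module.h>

extern struct kobject *mm_kobj;	/* exported above; declaration location assumed */

static struct kobject *example_kobj;

static int __init example_init(void)
{
	/* appears as /sys/kernel/mm/example */
	example_kobj = kobject_create_and_add("example", mm_kobj);
	return example_kobj ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
	kobject_put(example_kobj);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
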
diff --git a/mm/mmap.c b/mm/mmap.c
index 1d102b956fd8..74f4d158022e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -26,12 +26,15 @@
26#include <linux/mount.h> 26#include <linux/mount.h>
27#include <linux/mempolicy.h> 27#include <linux/mempolicy.h>
28#include <linux/rmap.h> 28#include <linux/rmap.h>
29#include <linux/mmu_notifier.h>
29 30
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/cacheflush.h> 32#include <asm/cacheflush.h>
32#include <asm/tlb.h> 33#include <asm/tlb.h>
33#include <asm/mmu_context.h> 34#include <asm/mmu_context.h>
34 35
36#include "internal.h"
37
35#ifndef arch_mmap_check 38#ifndef arch_mmap_check
36#define arch_mmap_check(addr, len, flags) (0) 39#define arch_mmap_check(addr, len, flags) (0)
37#endif 40#endif
@@ -367,7 +370,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
367 if (vma_tmp->vm_end > addr) { 370 if (vma_tmp->vm_end > addr) {
368 vma = vma_tmp; 371 vma = vma_tmp;
369 if (vma_tmp->vm_start <= addr) 372 if (vma_tmp->vm_start <= addr)
370 return vma; 373 break;
371 __rb_link = &__rb_parent->rb_left; 374 __rb_link = &__rb_parent->rb_left;
372 } else { 375 } else {
373 rb_prev = __rb_parent; 376 rb_prev = __rb_parent;
@@ -407,7 +410,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
407 rb_insert_color(&vma->vm_rb, &mm->mm_rb); 410 rb_insert_color(&vma->vm_rb, &mm->mm_rb);
408} 411}
409 412
410static inline void __vma_link_file(struct vm_area_struct *vma) 413static void __vma_link_file(struct vm_area_struct *vma)
411{ 414{
412 struct file * file; 415 struct file * file;
413 416
@@ -659,8 +662,6 @@ again: remove_next = 1 + (end > next->vm_end);
659 * If the vma has a ->close operation then the driver probably needs to release 662 * If the vma has a ->close operation then the driver probably needs to release
660 * per-vma resources, so we don't attempt to merge those. 663 * per-vma resources, so we don't attempt to merge those.
661 */ 664 */
662#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
663
664static inline int is_mergeable_vma(struct vm_area_struct *vma, 665static inline int is_mergeable_vma(struct vm_area_struct *vma,
665 struct file *file, unsigned long vm_flags) 666 struct file *file, unsigned long vm_flags)
666{ 667{
@@ -969,6 +970,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
969 return -EPERM; 970 return -EPERM;
970 vm_flags |= VM_LOCKED; 971 vm_flags |= VM_LOCKED;
971 } 972 }
973
972 /* mlock MCL_FUTURE? */ 974 /* mlock MCL_FUTURE? */
973 if (vm_flags & VM_LOCKED) { 975 if (vm_flags & VM_LOCKED) {
974 unsigned long locked, lock_limit; 976 unsigned long locked, lock_limit;
@@ -1027,6 +1029,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
1027 } else { 1029 } else {
1028 switch (flags & MAP_TYPE) { 1030 switch (flags & MAP_TYPE) {
1029 case MAP_SHARED: 1031 case MAP_SHARED:
1032 /*
1033 * Ignore pgoff.
1034 */
1035 pgoff = 0;
1030 vm_flags |= VM_SHARED | VM_MAYSHARE; 1036 vm_flags |= VM_SHARED | VM_MAYSHARE;
1031 break; 1037 break;
1032 case MAP_PRIVATE: 1038 case MAP_PRIVATE:
@@ -1108,6 +1114,9 @@ munmap_back:
1108 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) 1114 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1109 return -ENOMEM; 1115 return -ENOMEM;
1110 1116
1117 if (flags & MAP_NORESERVE)
1118 vm_flags |= VM_NORESERVE;
1119
1111 if (accountable && (!(flags & MAP_NORESERVE) || 1120 if (accountable && (!(flags & MAP_NORESERVE) ||
1112 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { 1121 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
1113 if (vm_flags & VM_SHARED) { 1122 if (vm_flags & VM_SHARED) {
@@ -1129,10 +1138,12 @@ munmap_back:
1129 * The VM_SHARED test is necessary because shmem_zero_setup 1138 * The VM_SHARED test is necessary because shmem_zero_setup
1130 * will create the file object for a shared anonymous map below. 1139 * will create the file object for a shared anonymous map below.
1131 */ 1140 */
1132 if (!file && !(vm_flags & VM_SHARED) && 1141 if (!file && !(vm_flags & VM_SHARED)) {
1133 vma_merge(mm, prev, addr, addr + len, vm_flags, 1142 vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
1134 NULL, NULL, pgoff, NULL)) 1143 NULL, NULL, pgoff, NULL);
1135 goto out; 1144 if (vma)
1145 goto out;
1146 }
1136 1147
1137 /* 1148 /*
1138 * Determine the object being mapped and call the appropriate 1149 * Determine the object being mapped and call the appropriate
@@ -1214,10 +1225,14 @@ out:
1214 mm->total_vm += len >> PAGE_SHIFT; 1225 mm->total_vm += len >> PAGE_SHIFT;
1215 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1226 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1216 if (vm_flags & VM_LOCKED) { 1227 if (vm_flags & VM_LOCKED) {
1217 mm->locked_vm += len >> PAGE_SHIFT; 1228 /*
1218 make_pages_present(addr, addr + len); 1229 * makes pages present; downgrades, drops, reacquires mmap_sem
1219 } 1230 */
1220 if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1231 long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
1232 if (nr_pages < 0)
1233 return nr_pages; /* vma gone! */
1234 mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
1235 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
1221 make_pages_present(addr, addr + len); 1236 make_pages_present(addr, addr + len);
1222 return addr; 1237 return addr;
1223 1238
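
Seen from user space, the hunk above means a MAP_LOCKED mapping is populated through mlock_vma_pages_range() and charged to locked_vm at mmap() time, while MAP_POPULATE (without MAP_NONBLOCK) is merely pre-faulted. A minimal sketch of the two requests:

/* Sketch: the two population requests handled in the hunk above. */
#include <stddef.h>
#include <sys/mman.h>

void *map_locked(size_t len)
{
	/* VM_LOCKED path: populated via mlock_vma_pages_range(), counted in locked_vm */
	return mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);
}

void *map_populated(size_t len)
{
	/* MAP_POPULATE path: pre-faulted via make_pages_present(), not locked */
	return mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
}
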
@@ -1576,7 +1591,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
1576 * vma is the last one with address > vma->vm_end. Have to extend vma. 1591 * vma is the last one with address > vma->vm_end. Have to extend vma.
1577 */ 1592 */
1578#ifndef CONFIG_IA64 1593#ifndef CONFIG_IA64
1579static inline 1594static
1580#endif 1595#endif
1581int expand_upwards(struct vm_area_struct *vma, unsigned long address) 1596int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1582{ 1597{
@@ -1626,7 +1641,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1626/* 1641/*
1627 * vma is the first one with address < vma->vm_start. Have to extend vma. 1642 * vma is the first one with address < vma->vm_start. Have to extend vma.
1628 */ 1643 */
1629static inline int expand_downwards(struct vm_area_struct *vma, 1644static int expand_downwards(struct vm_area_struct *vma,
1630 unsigned long address) 1645 unsigned long address)
1631{ 1646{
1632 int error; 1647 int error;
@@ -1688,10 +1703,12 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
1688 vma = find_vma_prev(mm, addr, &prev); 1703 vma = find_vma_prev(mm, addr, &prev);
1689 if (vma && (vma->vm_start <= addr)) 1704 if (vma && (vma->vm_start <= addr))
1690 return vma; 1705 return vma;
1691 if (!prev || expand_stack(prev, addr)) 1706 if (expand_stack(prev, addr))
1692 return NULL; 1707 return NULL;
1693 if (prev->vm_flags & VM_LOCKED) 1708 if (prev->vm_flags & VM_LOCKED) {
1694 make_pages_present(addr, prev->vm_end); 1709 if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0)
1710 return NULL; /* vma gone! */
1711 }
1695 return prev; 1712 return prev;
1696} 1713}
1697#else 1714#else
@@ -1717,8 +1734,10 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
1717 start = vma->vm_start; 1734 start = vma->vm_start;
1718 if (expand_stack(vma, addr)) 1735 if (expand_stack(vma, addr))
1719 return NULL; 1736 return NULL;
1720 if (vma->vm_flags & VM_LOCKED) 1737 if (vma->vm_flags & VM_LOCKED) {
1721 make_pages_present(addr, start); 1738 if (mlock_vma_pages_range(vma, addr, start) < 0)
1739 return NULL; /* vma gone! */
1740 }
1722 return vma; 1741 return vma;
1723} 1742}
1724#endif 1743#endif
@@ -1737,8 +1756,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1737 long nrpages = vma_pages(vma); 1756 long nrpages = vma_pages(vma);
1738 1757
1739 mm->total_vm -= nrpages; 1758 mm->total_vm -= nrpages;
1740 if (vma->vm_flags & VM_LOCKED)
1741 mm->locked_vm -= nrpages;
1742 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 1759 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1743 vma = remove_vma(vma); 1760 vma = remove_vma(vma);
1744 } while (vma); 1761 } while (vma);
@@ -1763,7 +1780,7 @@ static void unmap_region(struct mm_struct *mm,
1763 update_hiwater_rss(mm); 1780 update_hiwater_rss(mm);
1764 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); 1781 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1765 vm_unacct_memory(nr_accounted); 1782 vm_unacct_memory(nr_accounted);
1766 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, 1783 free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
1767 next? next->vm_start: 0); 1784 next? next->vm_start: 0);
1768 tlb_finish_mmu(tlb, start, end); 1785 tlb_finish_mmu(tlb, start, end);
1769} 1786}
@@ -1807,7 +1824,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1807 struct mempolicy *pol; 1824 struct mempolicy *pol;
1808 struct vm_area_struct *new; 1825 struct vm_area_struct *new;
1809 1826
1810 if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) 1827 if (is_vm_hugetlb_page(vma) && (addr &
1828 ~(huge_page_mask(hstate_vma(vma)))))
1811 return -EINVAL; 1829 return -EINVAL;
1812 1830
1813 if (mm->map_count >= sysctl_max_map_count) 1831 if (mm->map_count >= sysctl_max_map_count)
@@ -1903,6 +1921,20 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1903 vma = prev? prev->vm_next: mm->mmap; 1921 vma = prev? prev->vm_next: mm->mmap;
1904 1922
1905 /* 1923 /*
1924 * unlock any mlock()ed ranges before detaching vmas
1925 */
1926 if (mm->locked_vm) {
1927 struct vm_area_struct *tmp = vma;
1928 while (tmp && tmp->vm_start < end) {
1929 if (tmp->vm_flags & VM_LOCKED) {
1930 mm->locked_vm -= vma_pages(tmp);
1931 munlock_vma_pages_all(tmp);
1932 }
1933 tmp = tmp->vm_next;
1934 }
1935 }
1936
1937 /*
1906 * Remove the vma's, and unmap the actual pages 1938 * Remove the vma's, and unmap the actual pages
1907 */ 1939 */
1908 detach_vmas_to_be_unmapped(mm, vma, prev, end); 1940 detach_vmas_to_be_unmapped(mm, vma, prev, end);
@@ -2014,8 +2046,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2014 return -ENOMEM; 2046 return -ENOMEM;
2015 2047
2016 /* Can we just expand an old private anonymous mapping? */ 2048 /* Can we just expand an old private anonymous mapping? */
2017 if (vma_merge(mm, prev, addr, addr + len, flags, 2049 vma = vma_merge(mm, prev, addr, addr + len, flags,
2018 NULL, NULL, pgoff, NULL)) 2050 NULL, NULL, pgoff, NULL);
2051 if (vma)
2019 goto out; 2052 goto out;
2020 2053
2021 /* 2054 /*
@@ -2037,8 +2070,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2037out: 2070out:
2038 mm->total_vm += len >> PAGE_SHIFT; 2071 mm->total_vm += len >> PAGE_SHIFT;
2039 if (flags & VM_LOCKED) { 2072 if (flags & VM_LOCKED) {
2040 mm->locked_vm += len >> PAGE_SHIFT; 2073 if (!mlock_vma_pages_range(vma, addr, addr + len))
2041 make_pages_present(addr, addr + len); 2074 mm->locked_vm += (len >> PAGE_SHIFT);
2042 } 2075 }
2043 return addr; 2076 return addr;
2044} 2077}
@@ -2049,13 +2082,23 @@ EXPORT_SYMBOL(do_brk);
2049void exit_mmap(struct mm_struct *mm) 2082void exit_mmap(struct mm_struct *mm)
2050{ 2083{
2051 struct mmu_gather *tlb; 2084 struct mmu_gather *tlb;
2052 struct vm_area_struct *vma = mm->mmap; 2085 struct vm_area_struct *vma;
2053 unsigned long nr_accounted = 0; 2086 unsigned long nr_accounted = 0;
2054 unsigned long end; 2087 unsigned long end;
2055 2088
2056 /* mm's last user has gone, and its about to be pulled down */ 2089 /* mm's last user has gone, and its about to be pulled down */
2057 arch_exit_mmap(mm); 2090 arch_exit_mmap(mm);
2058 2091 mmu_notifier_release(mm);
2092
2093 if (mm->locked_vm) {
2094 vma = mm->mmap;
2095 while (vma) {
2096 if (vma->vm_flags & VM_LOCKED)
2097 munlock_vma_pages_all(vma);
2098 vma = vma->vm_next;
2099 }
2100 }
2101 vma = mm->mmap;
2059 lru_add_drain(); 2102 lru_add_drain();
2060 flush_cache_mm(mm); 2103 flush_cache_mm(mm);
2061 tlb = tlb_gather_mmu(mm, 1); 2104 tlb = tlb_gather_mmu(mm, 1);
@@ -2063,7 +2106,7 @@ void exit_mmap(struct mm_struct *mm)
2063 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2106 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2064 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2107 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2065 vm_unacct_memory(nr_accounted); 2108 vm_unacct_memory(nr_accounted);
2066 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 2109 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
2067 tlb_finish_mmu(tlb, 0, end); 2110 tlb_finish_mmu(tlb, 0, end);
2068 2111
2069 /* 2112 /*
@@ -2262,3 +2305,167 @@ int install_special_mapping(struct mm_struct *mm,
2262 2305
2263 return 0; 2306 return 0;
2264} 2307}
2308
2309static DEFINE_MUTEX(mm_all_locks_mutex);
2310
2311static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2312{
2313 if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
2314 /*
2315 * The LSB of head.next can't change from under us
2316 * because we hold the mm_all_locks_mutex.
2317 */
2318 spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
2319 /*
2320 * We can safely modify head.next after taking the
2321 * anon_vma->lock. If some other vma in this mm shares
2322 * the same anon_vma we won't take it again.
2323 *
2324 * No need of atomic instructions here, head.next
2325 * can't change from under us thanks to the
2326 * anon_vma->lock.
2327 */
2328 if (__test_and_set_bit(0, (unsigned long *)
2329 &anon_vma->head.next))
2330 BUG();
2331 }
2332}
2333
2334static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2335{
2336 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2337 /*
2338 * AS_MM_ALL_LOCKS can't change from under us because
2339 * we hold the mm_all_locks_mutex.
2340 *
2341 * Operations on ->flags have to be atomic because
2342 * even if AS_MM_ALL_LOCKS is stable thanks to the
2343 * mm_all_locks_mutex, there may be other cpus
2344 * changing other bitflags in parallel to us.
2345 */
2346 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
2347 BUG();
2348 spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
2349 }
2350}
2351
2352/*
2353 * This operation locks against the VM for all pte/vma/mm related
2354 * operations that could ever happen on a certain mm. This includes
2355 * vmtruncate, try_to_unmap, and all page faults.
2356 *
2357 * The caller must take the mmap_sem in write mode before calling
2358 * mm_take_all_locks(). The caller isn't allowed to release the
2359 * mmap_sem until mm_drop_all_locks() returns.
2360 *
2361 * mmap_sem in write mode is required in order to block all operations
2362 * that could modify pagetables and free pages without need of
2363 * altering the vma layout (for example populate_range() with
 2364 * nonlinear vmas). It's also needed in write mode to prevent new
 2365 * anon_vmas from being associated with existing vmas.
2366 *
2367 * A single task can't take more than one mm_take_all_locks() in a row
2368 * or it would deadlock.
2369 *
2370 * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
 2371 * mapping->flags avoid taking the same lock twice, if more than one
2372 * vma in this mm is backed by the same anon_vma or address_space.
2373 *
2374 * We can take all the locks in random order because the VM code
2375 * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
2376 * takes more than one of them in a row. Secondly we're protected
2377 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
2378 *
2379 * mm_take_all_locks() and mm_drop_all_locks are expensive operations
 2380 * that may have to take thousands of locks.
2381 *
2382 * mm_take_all_locks() can fail if it's interrupted by signals.
2383 */
2384int mm_take_all_locks(struct mm_struct *mm)
2385{
2386 struct vm_area_struct *vma;
2387 int ret = -EINTR;
2388
2389 BUG_ON(down_read_trylock(&mm->mmap_sem));
2390
2391 mutex_lock(&mm_all_locks_mutex);
2392
2393 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2394 if (signal_pending(current))
2395 goto out_unlock;
2396 if (vma->vm_file && vma->vm_file->f_mapping)
2397 vm_lock_mapping(mm, vma->vm_file->f_mapping);
2398 }
2399
2400 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2401 if (signal_pending(current))
2402 goto out_unlock;
2403 if (vma->anon_vma)
2404 vm_lock_anon_vma(mm, vma->anon_vma);
2405 }
2406
2407 ret = 0;
2408
2409out_unlock:
2410 if (ret)
2411 mm_drop_all_locks(mm);
2412
2413 return ret;
2414}
2415
2416static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2417{
2418 if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
2419 /*
2420 * The LSB of head.next can't change to 0 from under
2421 * us because we hold the mm_all_locks_mutex.
2422 *
2423 * We must however clear the bitflag before unlocking
2424 * the vma so the users using the anon_vma->head will
2425 * never see our bitflag.
2426 *
2427 * No need of atomic instructions here, head.next
2428 * can't change from under us until we release the
2429 * anon_vma->lock.
2430 */
2431 if (!__test_and_clear_bit(0, (unsigned long *)
2432 &anon_vma->head.next))
2433 BUG();
2434 spin_unlock(&anon_vma->lock);
2435 }
2436}
2437
2438static void vm_unlock_mapping(struct address_space *mapping)
2439{
2440 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2441 /*
2442 * AS_MM_ALL_LOCKS can't change to 0 from under us
2443 * because we hold the mm_all_locks_mutex.
2444 */
2445 spin_unlock(&mapping->i_mmap_lock);
2446 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
2447 &mapping->flags))
2448 BUG();
2449 }
2450}
2451
2452/*
2453 * The mmap_sem cannot be released by the caller until
2454 * mm_drop_all_locks() returns.
2455 */
2456void mm_drop_all_locks(struct mm_struct *mm)
2457{
2458 struct vm_area_struct *vma;
2459
2460 BUG_ON(down_read_trylock(&mm->mmap_sem));
2461 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
2462
2463 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2464 if (vma->anon_vma)
2465 vm_unlock_anon_vma(vma->anon_vma);
2466 if (vma->vm_file && vma->vm_file->f_mapping)
2467 vm_unlock_mapping(vma->vm_file->f_mapping);
2468 }
2469
2470 mutex_unlock(&mm_all_locks_mutex);
2471}
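
The contract spelled out in the comment above boils down to: take mmap_sem for write, call mm_take_all_locks() (which may fail with -EINTR), do the work, then mm_drop_all_locks() before releasing mmap_sem. A condensed sketch of a caller; do_mmu_notifier_register() in the next file follows exactly this shape:

/* Sketch: expected calling pattern for mm_take_all_locks(). */
#include <linux/mm.h>
#include <linux/sched.h>

static int example_with_all_vm_locks(struct mm_struct *mm,
				     void (*work)(struct mm_struct *))
{
	int ret;

	down_write(&mm->mmap_sem);
	ret = mm_take_all_locks(mm);	/* may fail with -EINTR on a signal */
	if (ret)
		goto out;

	work(mm);			/* no pte/vma/mm operation can race with us */

	mm_drop_all_locks(mm);
out:
	up_write(&mm->mmap_sem);
	return ret;
}
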
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
index 000000000000..5f4ef0250bee
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,277 @@
1/*
2 * linux/mm/mmu_notifier.c
3 *
4 * Copyright (C) 2008 Qumranet, Inc.
5 * Copyright (C) 2008 SGI
6 * Christoph Lameter <clameter@sgi.com>
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2. See
9 * the COPYING file in the top-level directory.
10 */
11
12#include <linux/rculist.h>
13#include <linux/mmu_notifier.h>
14#include <linux/module.h>
15#include <linux/mm.h>
16#include <linux/err.h>
17#include <linux/rcupdate.h>
18#include <linux/sched.h>
19
20/*
21 * This function can't run concurrently against mmu_notifier_register
22 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
23 * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
24 * in parallel despite there being no task using this mm any more,
25 * through the vmas outside of the exit_mmap context, such as with
26 * vmtruncate. This serializes against mmu_notifier_unregister with
27 * the mmu_notifier_mm->lock in addition to RCU and it serializes
28 * against the other mmu notifiers with RCU. struct mmu_notifier_mm
29 * can't go away from under us as exit_mmap holds an mm_count pin
30 * itself.
31 */
32void __mmu_notifier_release(struct mm_struct *mm)
33{
34 struct mmu_notifier *mn;
35
36 spin_lock(&mm->mmu_notifier_mm->lock);
37 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
38 mn = hlist_entry(mm->mmu_notifier_mm->list.first,
39 struct mmu_notifier,
40 hlist);
41 /*
42 * We arrived before mmu_notifier_unregister so
43 * mmu_notifier_unregister will do nothing other than
44 * to wait ->release to finish and
45 * mmu_notifier_unregister to return.
46 */
47 hlist_del_init_rcu(&mn->hlist);
48 /*
49 * RCU here will block mmu_notifier_unregister until
50 * ->release returns.
51 */
52 rcu_read_lock();
53 spin_unlock(&mm->mmu_notifier_mm->lock);
54 /*
55 * if ->release runs before mmu_notifier_unregister it
56 * must be handled as it's the only way for the driver
57 * to flush all existing sptes and stop the driver
58 * from establishing any more sptes before all the
59 * pages in the mm are freed.
60 */
61 if (mn->ops->release)
62 mn->ops->release(mn, mm);
63 rcu_read_unlock();
64 spin_lock(&mm->mmu_notifier_mm->lock);
65 }
66 spin_unlock(&mm->mmu_notifier_mm->lock);
67
68 /*
 69 * synchronize_rcu here prevents mmu_notifier_release from
 70 * returning to exit_mmap (which would proceed to free all pages
71 * in the mm) until the ->release method returns, if it was
72 * invoked by mmu_notifier_unregister.
73 *
74 * The mmu_notifier_mm can't go away from under us because one
 75 * mm_count is held by exit_mmap.
76 */
77 synchronize_rcu();
78}
79
80/*
81 * If no young bitflag is supported by the hardware, ->clear_flush_young can
 82 * unmap the address and return 1 or 0 depending on whether the mapping
 83 * previously existed or not.
84 */
85int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
86 unsigned long address)
87{
88 struct mmu_notifier *mn;
89 struct hlist_node *n;
90 int young = 0;
91
92 rcu_read_lock();
93 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
94 if (mn->ops->clear_flush_young)
95 young |= mn->ops->clear_flush_young(mn, mm, address);
96 }
97 rcu_read_unlock();
98
99 return young;
100}
101
102void __mmu_notifier_invalidate_page(struct mm_struct *mm,
103 unsigned long address)
104{
105 struct mmu_notifier *mn;
106 struct hlist_node *n;
107
108 rcu_read_lock();
109 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
110 if (mn->ops->invalidate_page)
111 mn->ops->invalidate_page(mn, mm, address);
112 }
113 rcu_read_unlock();
114}
115
116void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
117 unsigned long start, unsigned long end)
118{
119 struct mmu_notifier *mn;
120 struct hlist_node *n;
121
122 rcu_read_lock();
123 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
124 if (mn->ops->invalidate_range_start)
125 mn->ops->invalidate_range_start(mn, mm, start, end);
126 }
127 rcu_read_unlock();
128}
129
130void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
131 unsigned long start, unsigned long end)
132{
133 struct mmu_notifier *mn;
134 struct hlist_node *n;
135
136 rcu_read_lock();
137 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
138 if (mn->ops->invalidate_range_end)
139 mn->ops->invalidate_range_end(mn, mm, start, end);
140 }
141 rcu_read_unlock();
142}
143
144static int do_mmu_notifier_register(struct mmu_notifier *mn,
145 struct mm_struct *mm,
146 int take_mmap_sem)
147{
148 struct mmu_notifier_mm *mmu_notifier_mm;
149 int ret;
150
151 BUG_ON(atomic_read(&mm->mm_users) <= 0);
152
153 ret = -ENOMEM;
154 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
155 if (unlikely(!mmu_notifier_mm))
156 goto out;
157
158 if (take_mmap_sem)
159 down_write(&mm->mmap_sem);
160 ret = mm_take_all_locks(mm);
161 if (unlikely(ret))
162 goto out_cleanup;
163
164 if (!mm_has_notifiers(mm)) {
165 INIT_HLIST_HEAD(&mmu_notifier_mm->list);
166 spin_lock_init(&mmu_notifier_mm->lock);
167 mm->mmu_notifier_mm = mmu_notifier_mm;
168 mmu_notifier_mm = NULL;
169 }
170 atomic_inc(&mm->mm_count);
171
172 /*
173 * Serialize the update against mmu_notifier_unregister. A
174 * side note: mmu_notifier_release can't run concurrently with
175 * us because we hold the mm_users pin (either implicitly as
176 * current->mm or explicitly with get_task_mm() or similar).
177 * We can't race against any other mmu notifier method either
178 * thanks to mm_take_all_locks().
179 */
180 spin_lock(&mm->mmu_notifier_mm->lock);
181 hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
182 spin_unlock(&mm->mmu_notifier_mm->lock);
183
184 mm_drop_all_locks(mm);
185out_cleanup:
186 if (take_mmap_sem)
187 up_write(&mm->mmap_sem);
188 /* kfree() does nothing if mmu_notifier_mm is NULL */
189 kfree(mmu_notifier_mm);
190out:
191 BUG_ON(atomic_read(&mm->mm_users) <= 0);
192 return ret;
193}
194
195/*
196 * Must not hold mmap_sem nor any other VM related lock when calling
197 * this registration function. Must also ensure mm_users can't go down
198 * to zero while this runs to avoid races with mmu_notifier_release,
199 * so mm has to be current->mm or the mm should be pinned safely such
200 * as with get_task_mm(). If the mm is not current->mm, the mm_users
201 * pin should be released by calling mmput after mmu_notifier_register
 202 * returns. mmu_notifier_unregister must always be called to
203 * unregister the notifier. mm_count is automatically pinned to allow
204 * mmu_notifier_unregister to safely run at any time later, before or
205 * after exit_mmap. ->release will always be called before exit_mmap
206 * frees the pages.
207 */
208int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
209{
210 return do_mmu_notifier_register(mn, mm, 1);
211}
212EXPORT_SYMBOL_GPL(mmu_notifier_register);
213
214/*
215 * Same as mmu_notifier_register but here the caller must hold the
216 * mmap_sem in write mode.
217 */
218int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
219{
220 return do_mmu_notifier_register(mn, mm, 0);
221}
222EXPORT_SYMBOL_GPL(__mmu_notifier_register);
223
224/* this is called after the last mmu_notifier_unregister() returned */
225void __mmu_notifier_mm_destroy(struct mm_struct *mm)
226{
227 BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list));
228 kfree(mm->mmu_notifier_mm);
229 mm->mmu_notifier_mm = LIST_POISON1; /* debug */
230}
231
232/*
233 * This releases the mm_count pin automatically and frees the mm
234 * structure if it was the last user of it. It serializes against
235 * running mmu notifiers with RCU and against mmu_notifier_unregister
236 * with the unregister lock + RCU. All sptes must be dropped before
237 * calling mmu_notifier_unregister. ->release or any other notifier
238 * method may be invoked concurrently with mmu_notifier_unregister,
239 * and only after mmu_notifier_unregister returned we're guaranteed
240 * that ->release or any other method can't run anymore.
241 */
242void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
243{
244 BUG_ON(atomic_read(&mm->mm_count) <= 0);
245
246 spin_lock(&mm->mmu_notifier_mm->lock);
247 if (!hlist_unhashed(&mn->hlist)) {
248 hlist_del_rcu(&mn->hlist);
249
250 /*
 251 * RCU here will force exit_mmap to wait for ->release to finish
252 * before freeing the pages.
253 */
254 rcu_read_lock();
255 spin_unlock(&mm->mmu_notifier_mm->lock);
256 /*
257 * exit_mmap will block in mmu_notifier_release to
258 * guarantee ->release is called before freeing the
259 * pages.
260 */
261 if (mn->ops->release)
262 mn->ops->release(mn, mm);
263 rcu_read_unlock();
264 } else
265 spin_unlock(&mm->mmu_notifier_mm->lock);
266
267 /*
 268 * Wait for any running method to finish, of course including
 269 * ->release if it was run by mmu_notifier_release instead of us.
270 */
271 synchronize_rcu();
272
273 BUG_ON(atomic_read(&mm->mm_count) <= 0);
274
275 mmdrop(mm);
276}
277EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
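
Putting the registration API above together, a driver supplies a struct mmu_notifier whose ops are invoked by the __mmu_notifier_*() hooks in this file. A minimal, hedged sketch of such a user; the struct layout and callback signatures are inferred from the calls above, and the callback bodies are placeholders:

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

static void example_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* last chance to tear down all secondary mappings (sptes) for this mm */
}

static void example_invalidate_range_start(struct mmu_notifier *mn,
					   struct mm_struct *mm,
					   unsigned long start,
					   unsigned long end)
{
	/* stop using [start, end) in the secondary MMU */
}

static void example_invalidate_range_end(struct mmu_notifier *mn,
					 struct mm_struct *mm,
					 unsigned long start,
					 unsigned long end)
{
	/* primary ptes are stable again; secondary faults may repopulate */
}

static const struct mmu_notifier_ops example_ops = {
	.release		= example_release,
	.invalidate_range_start	= example_invalidate_range_start,
	.invalidate_range_end	= example_invalidate_range_end,
};

static struct mmu_notifier example_mn = {
	.ops = &example_ops,
};

/* caller must hold an mm_users reference, e.g. on current->mm */
static int example_attach(struct mm_struct *mm)
{
	return mmu_notifier_register(&example_mn, mm);
}
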
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 486ed595ee6f..16ce8b955dcf 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -69,6 +69,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
69 (z->zone && !zref_in_nodemask(z, nodes))) 69 (z->zone && !zref_in_nodemask(z, nodes)))
70 z++; 70 z++;
71 71
72 *zone = zonelist_zone(z++); 72 *zone = zonelist_zone(z);
73 return z; 73 return z;
74} 74}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 360d9cc8b38c..fded06f923f4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -21,6 +21,7 @@
21#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/swap.h> 22#include <linux/swap.h>
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h>
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25#include <asm/pgtable.h> 26#include <asm/pgtable.h>
26#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
@@ -153,12 +154,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
153 * If we make a private mapping writable we increase our commit; 154 * If we make a private mapping writable we increase our commit;
154 * but (without finer accounting) cannot reduce our commit if we 155 * but (without finer accounting) cannot reduce our commit if we
155 * make it unwritable again. 156 * make it unwritable again.
156 *
157 * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
158 * a MAP_NORESERVE private mapping to writable will now reserve.
159 */ 157 */
160 if (newflags & VM_WRITE) { 158 if (newflags & VM_WRITE) {
161 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { 159 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|
160 VM_SHARED|VM_NORESERVE))) {
162 charged = nrpages; 161 charged = nrpages;
163 if (security_vm_enough_memory(charged)) 162 if (security_vm_enough_memory(charged))
164 return -ENOMEM; 163 return -ENOMEM;
@@ -205,10 +204,12 @@ success:
205 dirty_accountable = 1; 204 dirty_accountable = 1;
206 } 205 }
207 206
207 mmu_notifier_invalidate_range_start(mm, start, end);
208 if (is_vm_hugetlb_page(vma)) 208 if (is_vm_hugetlb_page(vma))
209 hugetlb_change_protection(vma, start, end, vma->vm_page_prot); 209 hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
210 else 210 else
211 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); 211 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
212 mmu_notifier_invalidate_range_end(mm, start, end);
212 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 213 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
213 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 214 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
214 return 0; 215 return 0;
diff --git a/mm/mremap.c b/mm/mremap.c
index 08e3c7f2bd15..58a2908f42f5 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -18,11 +18,14 @@
18#include <linux/highmem.h> 18#include <linux/highmem.h>
19#include <linux/security.h> 19#include <linux/security.h>
20#include <linux/syscalls.h> 20#include <linux/syscalls.h>
21#include <linux/mmu_notifier.h>
21 22
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23#include <asm/cacheflush.h> 24#include <asm/cacheflush.h>
24#include <asm/tlbflush.h> 25#include <asm/tlbflush.h>
25 26
27#include "internal.h"
28
26static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) 29static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
27{ 30{
28 pgd_t *pgd; 31 pgd_t *pgd;
@@ -74,7 +77,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
74 struct mm_struct *mm = vma->vm_mm; 77 struct mm_struct *mm = vma->vm_mm;
75 pte_t *old_pte, *new_pte, pte; 78 pte_t *old_pte, *new_pte, pte;
76 spinlock_t *old_ptl, *new_ptl; 79 spinlock_t *old_ptl, *new_ptl;
80 unsigned long old_start;
77 81
82 old_start = old_addr;
83 mmu_notifier_invalidate_range_start(vma->vm_mm,
84 old_start, old_end);
78 if (vma->vm_file) { 85 if (vma->vm_file) {
79 /* 86 /*
80 * Subtle point from Rajesh Venkatasubramanian: before 87 * Subtle point from Rajesh Venkatasubramanian: before
@@ -116,6 +123,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
116 pte_unmap_unlock(old_pte - 1, old_ptl); 123 pte_unmap_unlock(old_pte - 1, old_ptl);
117 if (mapping) 124 if (mapping)
118 spin_unlock(&mapping->i_mmap_lock); 125 spin_unlock(&mapping->i_mmap_lock);
126 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
119} 127}
120 128
121#define LATENCY_LIMIT (64 * PAGE_SIZE) 129#define LATENCY_LIMIT (64 * PAGE_SIZE)
@@ -232,8 +240,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
232 if (vm_flags & VM_LOCKED) { 240 if (vm_flags & VM_LOCKED) {
233 mm->locked_vm += new_len >> PAGE_SHIFT; 241 mm->locked_vm += new_len >> PAGE_SHIFT;
234 if (new_len > old_len) 242 if (new_len > old_len)
235 make_pages_present(new_addr + old_len, 243 mlock_vma_pages_range(new_vma, new_addr + old_len,
236 new_addr + new_len); 244 new_addr + new_len);
237 } 245 }
238 246
239 return new_addr; 247 return new_addr;
@@ -373,7 +381,7 @@ unsigned long do_mremap(unsigned long addr,
373 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 381 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
374 if (vma->vm_flags & VM_LOCKED) { 382 if (vma->vm_flags & VM_LOCKED) {
375 mm->locked_vm += pages; 383 mm->locked_vm += pages;
376 make_pages_present(addr + old_len, 384 mlock_vma_pages_range(vma, addr + old_len,
377 addr + new_len); 385 addr + new_len);
378 } 386 }
379 ret = addr; 387 ret = addr;
diff --git a/mm/nommu.c b/mm/nommu.c
index 4462b6a3fcb9..2696b24f2bb3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,7 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/ptrace.h> 25#include <linux/tracehook.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
@@ -34,6 +34,8 @@
34#include <asm/tlb.h> 34#include <asm/tlb.h>
35#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
36 36
37#include "internal.h"
38
37void *high_memory; 39void *high_memory;
38struct page *mem_map; 40struct page *mem_map;
39unsigned long max_mapnr; 41unsigned long max_mapnr;
@@ -128,20 +130,16 @@ unsigned int kobjsize(const void *objp)
128 return PAGE_SIZE << compound_order(page); 130 return PAGE_SIZE << compound_order(page);
129} 131}
130 132
131/* 133int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
132 * get a list of pages in an address range belonging to the specified process 134 unsigned long start, int len, int flags,
133 * and indicate the VMA that covers each page 135 struct page **pages, struct vm_area_struct **vmas)
134 * - this is potentially dodgy as we may end incrementing the page count of a
135 * slab page or a secondary page from a compound page
136 * - don't permit access to VMAs that don't support it, such as I/O mappings
137 */
138int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
139 unsigned long start, int len, int write, int force,
140 struct page **pages, struct vm_area_struct **vmas)
141{ 136{
142 struct vm_area_struct *vma; 137 struct vm_area_struct *vma;
143 unsigned long vm_flags; 138 unsigned long vm_flags;
144 int i; 139 int i;
140 int write = !!(flags & GUP_FLAGS_WRITE);
141 int force = !!(flags & GUP_FLAGS_FORCE);
142 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
145 143
146 /* calculate required read or write permissions. 144 /* calculate required read or write permissions.
147 * - if 'force' is set, we only require the "MAY" flags. 145 * - if 'force' is set, we only require the "MAY" flags.
@@ -156,7 +154,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
156 154
157 /* protect what we can, including chardevs */ 155 /* protect what we can, including chardevs */
158 if (vma->vm_flags & (VM_IO | VM_PFNMAP) || 156 if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
159 !(vm_flags & vma->vm_flags)) 157 (!ignore && !(vm_flags & vma->vm_flags)))
160 goto finish_or_fault; 158 goto finish_or_fault;
161 159
162 if (pages) { 160 if (pages) {
@@ -174,6 +172,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
174finish_or_fault: 172finish_or_fault:
175 return i ? : -EFAULT; 173 return i ? : -EFAULT;
176} 174}
175
176
177/*
178 * get a list of pages in an address range belonging to the specified process
179 * and indicate the VMA that covers each page
180 * - this is potentially dodgy as we may end incrementing the page count of a
181 * slab page or a secondary page from a compound page
182 * - don't permit access to VMAs that don't support it, such as I/O mappings
183 */
184int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
185 unsigned long start, int len, int write, int force,
186 struct page **pages, struct vm_area_struct **vmas)
187{
188 int flags = 0;
189
190 if (write)
191 flags |= GUP_FLAGS_WRITE;
192 if (force)
193 flags |= GUP_FLAGS_FORCE;
194
195 return __get_user_pages(tsk, mm,
196 start, len, flags,
197 pages, vmas);
198}
177EXPORT_SYMBOL(get_user_pages); 199EXPORT_SYMBOL(get_user_pages);
178 200
179DEFINE_RWLOCK(vmlist_lock); 201DEFINE_RWLOCK(vmlist_lock);
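
The nommu get_user_pages() above is now a thin wrapper that folds the old write/force booleans into a flags word before calling __get_user_pages(). As a stand-alone illustration of that translation (the flag values below are made up for the sketch; the real GUP_FLAGS_* constants are kernel-internal, in mm/internal.h):

#include <stdio.h>

/* Illustrative values only; the real GUP_FLAGS_* constants live in mm/internal.h. */
#define GUP_FLAGS_WRITE 0x1
#define GUP_FLAGS_FORCE 0x2

static int gup_flags(int write, int force)
{
        int flags = 0;

        if (write)
                flags |= GUP_FLAGS_WRITE;
        if (force)
                flags |= GUP_FLAGS_FORCE;
        return flags;
}

int main(void)
{
        printf("write, no force -> 0x%x\n", gup_flags(1, 0));
        printf("write + force   -> 0x%x\n", gup_flags(1, 1));
        return 0;
}
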
@@ -266,6 +288,27 @@ void *vmalloc_node(unsigned long size, int node)
266} 288}
267EXPORT_SYMBOL(vmalloc_node); 289EXPORT_SYMBOL(vmalloc_node);
268 290
291#ifndef PAGE_KERNEL_EXEC
292# define PAGE_KERNEL_EXEC PAGE_KERNEL
293#endif
294
295/**
296 * vmalloc_exec - allocate virtually contiguous, executable memory
297 * @size: allocation size
298 *
299 * Kernel-internal function to allocate enough pages to cover @size
300 * the page level allocator and map them into contiguous and
301 * executable kernel virtual space.
302 *
303 * For tight control over page level allocator and protection flags
304 * use __vmalloc() instead.
305 */
306
307void *vmalloc_exec(unsigned long size)
308{
309 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
310}
311
269/** 312/**
270 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 313 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
271 * @size: allocation size 314 * @size: allocation size
@@ -745,7 +788,7 @@ static unsigned long determine_vm_flags(struct file *file,
745 * it's being traced - otherwise breakpoints set in it may interfere 788 * it's being traced - otherwise breakpoints set in it may interfere
746 * with another untraced process 789 * with another untraced process
747 */ 790 */
748 if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) 791 if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
749 vm_flags &= ~VM_MAYSHARE; 792 vm_flags &= ~VM_MAYSHARE;
750 793
751 return vm_flags; 794 return vm_flags;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8a5467ee6265..64e5b4bcd964 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,6 +26,7 @@
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/notifier.h> 27#include <linux/notifier.h>
28#include <linux/memcontrol.h> 28#include <linux/memcontrol.h>
29#include <linux/security.h>
29 30
30int sysctl_panic_on_oom; 31int sysctl_panic_on_oom;
31int sysctl_oom_kill_allocating_task; 32int sysctl_oom_kill_allocating_task;
@@ -128,7 +129,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
128 * Superuser processes are usually more important, so we make it 129 * Superuser processes are usually more important, so we make it
129 * less likely that we kill those. 130 * less likely that we kill those.
130 */ 131 */
131 if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE)) 132 if (has_capability(p, CAP_SYS_ADMIN) ||
133 has_capability(p, CAP_SYS_RESOURCE))
132 points /= 4; 134 points /= 4;
133 135
134 /* 136 /*
@@ -137,7 +139,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
137 * tend to only have this flag set on applications they think 139 * tend to only have this flag set on applications they think
138 * of as important. 140 * of as important.
139 */ 141 */
140 if (__capable(p, CAP_SYS_RAWIO)) 142 if (has_capability(p, CAP_SYS_RAWIO))
141 points /= 4; 143 points /= 4;
142 144
143 /* 145 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 94c6d8988ab3..2970e35fd03f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -7,7 +7,7 @@
7 * Contains functions related to writing back dirty pages at the 7 * Contains functions related to writing back dirty pages at the
8 * address_space level. 8 * address_space level.
9 * 9 *
10 * 10Apr2002 akpm@zip.com.au 10 * 10Apr2002 Andrew Morton
11 * Initial version 11 * Initial version
12 */ 12 */
13 13
@@ -329,9 +329,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
329 struct zone *z = 329 struct zone *z =
330 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; 330 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
331 331
332 x += zone_page_state(z, NR_FREE_PAGES) 332 x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
333 + zone_page_state(z, NR_INACTIVE)
334 + zone_page_state(z, NR_ACTIVE);
335 } 333 }
336 /* 334 /*
337 * Make sure that the number of highmem pages is never larger 335 * Make sure that the number of highmem pages is never larger
@@ -355,9 +353,7 @@ unsigned long determine_dirtyable_memory(void)
355{ 353{
356 unsigned long x; 354 unsigned long x;
357 355
358 x = global_page_state(NR_FREE_PAGES) 356 x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
359 + global_page_state(NR_INACTIVE)
360 + global_page_state(NR_ACTIVE);
361 357
362 if (!vm_highmem_is_dirtyable) 358 if (!vm_highmem_is_dirtyable)
363 x -= highmem_dirtyable_memory(x); 359 x -= highmem_dirtyable_memory(x);
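
With the split LRU, both helpers above now count dirtyable memory as free pages plus everything on the LRU lists (zone_lru_pages()/global_lru_pages()) rather than summing the old single active/inactive counters. A toy calculation with made-up page counts, leaving out the clamp the real highmem helper applies:

#include <stdio.h>

int main(void)
{
        /* Made-up page counts for a lowmem/highmem split. */
        unsigned long lowmem_free = 30000, lowmem_lru = 120000;
        unsigned long highmem_free = 80000, highmem_lru = 400000;
        int highmem_is_dirtyable = 0;   /* mirrors vm_highmem_is_dirtyable == 0 */

        unsigned long x = (lowmem_free + highmem_free) + (lowmem_lru + highmem_lru);

        if (!highmem_is_dirtyable)
                x -= highmem_free + highmem_lru;        /* like highmem_dirtyable_memory() */

        printf("dirtyable memory: %lu pages\n", x);
        return 0;
}
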
@@ -876,6 +872,7 @@ int write_cache_pages(struct address_space *mapping,
876 pgoff_t end; /* Inclusive */ 872 pgoff_t end; /* Inclusive */
877 int scanned = 0; 873 int scanned = 0;
878 int range_whole = 0; 874 int range_whole = 0;
875 long nr_to_write = wbc->nr_to_write;
879 876
880 if (wbc->nonblocking && bdi_write_congested(bdi)) { 877 if (wbc->nonblocking && bdi_write_congested(bdi)) {
881 wbc->encountered_congestion = 1; 878 wbc->encountered_congestion = 1;
@@ -939,7 +936,7 @@ retry:
939 unlock_page(page); 936 unlock_page(page);
940 ret = 0; 937 ret = 0;
941 } 938 }
942 if (ret || (--(wbc->nr_to_write) <= 0)) 939 if (ret || (--nr_to_write <= 0))
943 done = 1; 940 done = 1;
944 if (wbc->nonblocking && bdi_write_congested(bdi)) { 941 if (wbc->nonblocking && bdi_write_congested(bdi)) {
945 wbc->encountered_congestion = 1; 942 wbc->encountered_congestion = 1;
@@ -958,11 +955,12 @@ retry:
958 index = 0; 955 index = 0;
959 goto retry; 956 goto retry;
960 } 957 }
961 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 958 if (!wbc->no_nrwrite_index_update) {
962 mapping->writeback_index = index; 959 if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
960 mapping->writeback_index = index;
961 wbc->nr_to_write = nr_to_write;
962 }
963 963
964 if (wbc->range_cont)
965 wbc->range_start = index << PAGE_CACHE_SHIFT;
966 return ret; 964 return ret;
967} 965}
968EXPORT_SYMBOL(write_cache_pages); 966EXPORT_SYMBOL(write_cache_pages);
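
write_cache_pages() now decrements a local copy of nr_to_write and only publishes the remaining budget (and the cyclic writeback index) back into the writeback_control when no_nrwrite_index_update is clear. A pared-down sketch of that pattern, with a hypothetical struct standing in for struct writeback_control:

#include <stdio.h>

/* Hypothetical, pared-down writeback control; field names mirror the patch. */
struct wbc {
        long nr_to_write;
        int  no_nrwrite_index_update;
};

/* Pretend to write up to wbc->nr_to_write pages, touching the caller's
 * counter only if it asked for the update. Returns pages written. */
static long write_some(struct wbc *wbc, long dirty_pages)
{
        long nr_to_write = wbc->nr_to_write;    /* work on a local copy */
        long written = 0;

        while (dirty_pages-- > 0 && nr_to_write-- > 0)
                written++;

        if (!wbc->no_nrwrite_index_update)
                wbc->nr_to_write = nr_to_write; /* publish the remaining budget */
        return written;
}

int main(void)
{
        struct wbc wbc = { .nr_to_write = 16, .no_nrwrite_index_update = 0 };
        long written = write_some(&wbc, 5);

        printf("wrote %ld, budget left %ld\n", written, wbc.nr_to_write);
        return 0;
}
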
@@ -1088,7 +1086,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1088 if (!mapping) 1086 if (!mapping)
1089 return 1; 1087 return 1;
1090 1088
1091 write_lock_irq(&mapping->tree_lock); 1089 spin_lock_irq(&mapping->tree_lock);
1092 mapping2 = page_mapping(page); 1090 mapping2 = page_mapping(page);
1093 if (mapping2) { /* Race with truncate? */ 1091 if (mapping2) { /* Race with truncate? */
1094 BUG_ON(mapping2 != mapping); 1092 BUG_ON(mapping2 != mapping);
@@ -1102,7 +1100,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1102 radix_tree_tag_set(&mapping->page_tree, 1100 radix_tree_tag_set(&mapping->page_tree,
1103 page_index(page), PAGECACHE_TAG_DIRTY); 1101 page_index(page), PAGECACHE_TAG_DIRTY);
1104 } 1102 }
1105 write_unlock_irq(&mapping->tree_lock); 1103 spin_unlock_irq(&mapping->tree_lock);
1106 if (mapping->host) { 1104 if (mapping->host) {
1107 /* !PageAnon && !swapper_space */ 1105 /* !PageAnon && !swapper_space */
1108 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1106 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1258,7 +1256,7 @@ int test_clear_page_writeback(struct page *page)
1258 struct backing_dev_info *bdi = mapping->backing_dev_info; 1256 struct backing_dev_info *bdi = mapping->backing_dev_info;
1259 unsigned long flags; 1257 unsigned long flags;
1260 1258
1261 write_lock_irqsave(&mapping->tree_lock, flags); 1259 spin_lock_irqsave(&mapping->tree_lock, flags);
1262 ret = TestClearPageWriteback(page); 1260 ret = TestClearPageWriteback(page);
1263 if (ret) { 1261 if (ret) {
1264 radix_tree_tag_clear(&mapping->page_tree, 1262 radix_tree_tag_clear(&mapping->page_tree,
@@ -1269,7 +1267,7 @@ int test_clear_page_writeback(struct page *page)
1269 __bdi_writeout_inc(bdi); 1267 __bdi_writeout_inc(bdi);
1270 } 1268 }
1271 } 1269 }
1272 write_unlock_irqrestore(&mapping->tree_lock, flags); 1270 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1273 } else { 1271 } else {
1274 ret = TestClearPageWriteback(page); 1272 ret = TestClearPageWriteback(page);
1275 } 1273 }
@@ -1287,7 +1285,7 @@ int test_set_page_writeback(struct page *page)
1287 struct backing_dev_info *bdi = mapping->backing_dev_info; 1285 struct backing_dev_info *bdi = mapping->backing_dev_info;
1288 unsigned long flags; 1286 unsigned long flags;
1289 1287
1290 write_lock_irqsave(&mapping->tree_lock, flags); 1288 spin_lock_irqsave(&mapping->tree_lock, flags);
1291 ret = TestSetPageWriteback(page); 1289 ret = TestSetPageWriteback(page);
1292 if (!ret) { 1290 if (!ret) {
1293 radix_tree_tag_set(&mapping->page_tree, 1291 radix_tree_tag_set(&mapping->page_tree,
@@ -1300,7 +1298,7 @@ int test_set_page_writeback(struct page *page)
1300 radix_tree_tag_clear(&mapping->page_tree, 1298 radix_tree_tag_clear(&mapping->page_tree,
1301 page_index(page), 1299 page_index(page),
1302 PAGECACHE_TAG_DIRTY); 1300 PAGECACHE_TAG_DIRTY);
1303 write_unlock_irqrestore(&mapping->tree_lock, flags); 1301 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1304 } else { 1302 } else {
1305 ret = TestSetPageWriteback(page); 1303 ret = TestSetPageWriteback(page);
1306 } 1304 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 79ac4afc908c..d0a240fbb8bf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -44,7 +44,7 @@
44#include <linux/backing-dev.h> 44#include <linux/backing-dev.h>
45#include <linux/fault-inject.h> 45#include <linux/fault-inject.h>
46#include <linux/page-isolation.h> 46#include <linux/page-isolation.h>
47#include <linux/memcontrol.h> 47#include <linux/page_cgroup.h>
48#include <linux/debugobjects.h> 48#include <linux/debugobjects.h>
49 49
50#include <asm/tlbflush.h> 50#include <asm/tlbflush.h>
@@ -153,9 +153,9 @@ static unsigned long __meminitdata dma_reserve;
153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; 153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; 154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
156 unsigned long __initdata required_kernelcore; 156 static unsigned long __initdata required_kernelcore;
157 static unsigned long __initdata required_movablecore; 157 static unsigned long __initdata required_movablecore;
158 unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 158 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
159 159
160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
161 int movable_zone; 161 int movable_zone;
@@ -223,17 +223,12 @@ static inline int bad_range(struct zone *zone, struct page *page)
223 223
224static void bad_page(struct page *page) 224static void bad_page(struct page *page)
225{ 225{
226 void *pc = page_get_page_cgroup(page);
227
228 printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG 226 printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
229 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 227 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
230 current->comm, page, (int)(2*sizeof(unsigned long)), 228 current->comm, page, (int)(2*sizeof(unsigned long)),
231 (unsigned long)page->flags, page->mapping, 229 (unsigned long)page->flags, page->mapping,
232 page_mapcount(page), page_count(page)); 230 page_mapcount(page), page_count(page));
233 if (pc) { 231
234 printk(KERN_EMERG "cgroup:%p\n", pc);
235 page_reset_bad_cgroup(page);
236 }
237 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" 232 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
238 KERN_EMERG "Backtrace:\n"); 233 KERN_EMERG "Backtrace:\n");
239 dump_stack(); 234 dump_stack();
@@ -264,17 +259,18 @@ static void free_compound_page(struct page *page)
264 __free_pages_ok(page, compound_order(page)); 259 __free_pages_ok(page, compound_order(page));
265} 260}
266 261
267static void prep_compound_page(struct page *page, unsigned long order) 262void prep_compound_page(struct page *page, unsigned long order)
268{ 263{
269 int i; 264 int i;
270 int nr_pages = 1 << order; 265 int nr_pages = 1 << order;
266 struct page *p = page + 1;
271 267
272 set_compound_page_dtor(page, free_compound_page); 268 set_compound_page_dtor(page, free_compound_page);
273 set_compound_order(page, order); 269 set_compound_order(page, order);
274 __SetPageHead(page); 270 __SetPageHead(page);
275 for (i = 1; i < nr_pages; i++) { 271 for (i = 1; i < nr_pages; i++, p++) {
276 struct page *p = page + i; 272 if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
277 273 p = pfn_to_page(page_to_pfn(page) + i);
278 __SetPageTail(p); 274 __SetPageTail(p);
279 p->first_page = page; 275 p->first_page = page;
280 } 276 }
@@ -284,6 +280,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
284{ 280{
285 int i; 281 int i;
286 int nr_pages = 1 << order; 282 int nr_pages = 1 << order;
283 struct page *p = page + 1;
287 284
288 if (unlikely(compound_order(page) != order)) 285 if (unlikely(compound_order(page) != order))
289 bad_page(page); 286 bad_page(page);
@@ -291,8 +288,9 @@ static void destroy_compound_page(struct page *page, unsigned long order)
291 if (unlikely(!PageHead(page))) 288 if (unlikely(!PageHead(page)))
292 bad_page(page); 289 bad_page(page);
293 __ClearPageHead(page); 290 __ClearPageHead(page);
294 for (i = 1; i < nr_pages; i++) { 291 for (i = 1; i < nr_pages; i++, p++) {
295 struct page *p = page + i; 292 if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
293 p = pfn_to_page(page_to_pfn(page) + i);
296 294
297 if (unlikely(!PageTail(p) | 295 if (unlikely(!PageTail(p) |
298 (p->first_page != page))) 296 (p->first_page != page)))
@@ -432,8 +430,9 @@ static inline void __free_one_page(struct page *page,
432 430
433 buddy = __page_find_buddy(page, page_idx, order); 431 buddy = __page_find_buddy(page, page_idx, order);
434 if (!page_is_buddy(page, buddy, order)) 432 if (!page_is_buddy(page, buddy, order))
435 break; /* Move the buddy up one level. */ 433 break;
436 434
435 /* Our buddy is free, merge with it and move up one order. */
437 list_del(&buddy->lru); 436 list_del(&buddy->lru);
438 zone->free_area[order].nr_free--; 437 zone->free_area[order].nr_free--;
439 rmv_page_order(buddy); 438 rmv_page_order(buddy);
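
The reworded comment above describes the merge step of the buddy allocator: when the buddy block is free it is unlinked and the pair coalesces one order up. The underlying index arithmetic is just bit manipulation, as in this small sketch (same XOR/mask math as the kernel's __page_find_buddy() and __find_combined_index(), struct page handling omitted):

#include <stdio.h>

/* Buddy index within a max-order block: flip the bit for this order. */
static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);
}

/* Index of the merged (combined) block one order up. */
static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
        return page_idx & ~(1UL << order);
}

int main(void)
{
        unsigned long idx = 12;         /* block of 4 pages starting at page 12 */
        unsigned int order = 2;

        printf("buddy of %lu at order %u: %lu\n", idx, order, buddy_index(idx, order));
        printf("merged block starts at:   %lu (order %u)\n",
               combined_index(idx, order), order + 1);
        return 0;
}
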
@@ -450,14 +449,16 @@ static inline void __free_one_page(struct page *page,
450 449
451static inline int free_pages_check(struct page *page) 450static inline int free_pages_check(struct page *page)
452{ 451{
452 free_page_mlock(page);
453 if (unlikely(page_mapcount(page) | 453 if (unlikely(page_mapcount(page) |
454 (page->mapping != NULL) | 454 (page->mapping != NULL) |
455 (page_get_page_cgroup(page) != NULL) |
456 (page_count(page) != 0) | 455 (page_count(page) != 0) |
457 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) 456 (page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
458 bad_page(page); 457 bad_page(page);
459 if (PageDirty(page)) 458 if (PageDirty(page))
460 __ClearPageDirty(page); 459 __ClearPageDirty(page);
460 if (PageSwapBacked(page))
461 __ClearPageSwapBacked(page);
461 /* 462 /*
462 * For now, we report if PG_reserved was found set, but do not 463 * For now, we report if PG_reserved was found set, but do not
463 * clear it, and do not free the page. But we shall soon need 464 * clear it, and do not free the page. But we shall soon need
@@ -532,7 +533,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
532/* 533/*
533 * permit the bootmem allocator to evade page validation on high-order frees 534 * permit the bootmem allocator to evade page validation on high-order frees
534 */ 535 */
535void __free_pages_bootmem(struct page *page, unsigned int order) 536void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
536{ 537{
537 if (order == 0) { 538 if (order == 0) {
538 __ClearPageReserved(page); 539 __ClearPageReserved(page);
@@ -596,7 +597,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
596{ 597{
597 if (unlikely(page_mapcount(page) | 598 if (unlikely(page_mapcount(page) |
598 (page->mapping != NULL) | 599 (page->mapping != NULL) |
599 (page_get_page_cgroup(page) != NULL) |
600 (page_count(page) != 0) | 600 (page_count(page) != 0) |
601 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) 601 (page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
602 bad_page(page); 602 bad_page(page);
@@ -610,7 +610,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
610 610
611 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | 611 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
612 1 << PG_referenced | 1 << PG_arch_1 | 612 1 << PG_referenced | 1 << PG_arch_1 |
613 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); 613 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
614#ifdef CONFIG_UNEVICTABLE_LRU
615 | 1 << PG_mlocked
616#endif
617 );
614 set_page_private(page, 0); 618 set_page_private(page, 0);
615 set_page_refcounted(page); 619 set_page_refcounted(page);
616 620
@@ -673,9 +677,9 @@ static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
673 * Note that start_page and end_pages are not aligned on a pageblock 677 * Note that start_page and end_pages are not aligned on a pageblock
674 * boundary. If alignment is required, use move_freepages_block() 678 * boundary. If alignment is required, use move_freepages_block()
675 */ 679 */
676int move_freepages(struct zone *zone, 680static int move_freepages(struct zone *zone,
677 struct page *start_page, struct page *end_page, 681 struct page *start_page, struct page *end_page,
678 int migratetype) 682 int migratetype)
679{ 683{
680 struct page *page; 684 struct page *page;
681 unsigned long order; 685 unsigned long order;
@@ -693,6 +697,9 @@ int move_freepages(struct zone *zone,
693#endif 697#endif
694 698
695 for (page = start_page; page <= end_page;) { 699 for (page = start_page; page <= end_page;) {
700 /* Make sure we are not inadvertently changing nodes */
701 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
702
696 if (!pfn_valid_within(page_to_pfn(page))) { 703 if (!pfn_valid_within(page_to_pfn(page))) {
697 page++; 704 page++;
698 continue; 705 continue;
@@ -714,7 +721,8 @@ int move_freepages(struct zone *zone,
714 return pages_moved; 721 return pages_moved;
715} 722}
716 723
717int move_freepages_block(struct zone *zone, struct page *page, int migratetype) 724static int move_freepages_block(struct zone *zone, struct page *page,
725 int migratetype)
718{ 726{
719 unsigned long start_pfn, end_pfn; 727 unsigned long start_pfn, end_pfn;
720 struct page *start_page, *end_page; 728 struct page *start_page, *end_page;
@@ -1429,7 +1437,7 @@ try_next_zone:
1429/* 1437/*
1430 * This is the 'heart' of the zoned buddy allocator. 1438 * This is the 'heart' of the zoned buddy allocator.
1431 */ 1439 */
1432static struct page * 1440struct page *
1433__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1441__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
1434 struct zonelist *zonelist, nodemask_t *nodemask) 1442 struct zonelist *zonelist, nodemask_t *nodemask)
1435{ 1443{
@@ -1632,22 +1640,7 @@ nopage:
1632got_pg: 1640got_pg:
1633 return page; 1641 return page;
1634} 1642}
1635 1643EXPORT_SYMBOL(__alloc_pages_internal);
1636struct page *
1637__alloc_pages(gfp_t gfp_mask, unsigned int order,
1638 struct zonelist *zonelist)
1639{
1640 return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
1641}
1642
1643struct page *
1644__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1645 struct zonelist *zonelist, nodemask_t *nodemask)
1646{
1647 return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
1648}
1649
1650EXPORT_SYMBOL(__alloc_pages);
1651 1644
1652/* 1645/*
1653 * Common helper functions. 1646 * Common helper functions.
@@ -1711,6 +1704,59 @@ void free_pages(unsigned long addr, unsigned int order)
1711 1704
1712EXPORT_SYMBOL(free_pages); 1705EXPORT_SYMBOL(free_pages);
1713 1706
1707/**
1708 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
1709 * @size: the number of bytes to allocate
1710 * @gfp_mask: GFP flags for the allocation
1711 *
1712 * This function is similar to alloc_pages(), except that it allocates the
1713 * minimum number of pages to satisfy the request. alloc_pages() can only
1714 * allocate memory in power-of-two pages.
1715 *
1716 * This function is also limited by MAX_ORDER.
1717 *
1718 * Memory allocated by this function must be released by free_pages_exact().
1719 */
1720void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
1721{
1722 unsigned int order = get_order(size);
1723 unsigned long addr;
1724
1725 addr = __get_free_pages(gfp_mask, order);
1726 if (addr) {
1727 unsigned long alloc_end = addr + (PAGE_SIZE << order);
1728 unsigned long used = addr + PAGE_ALIGN(size);
1729
1730 split_page(virt_to_page(addr), order);
1731 while (used < alloc_end) {
1732 free_page(used);
1733 used += PAGE_SIZE;
1734 }
1735 }
1736
1737 return (void *)addr;
1738}
1739EXPORT_SYMBOL(alloc_pages_exact);
1740
1741/**
1742 * free_pages_exact - release memory allocated via alloc_pages_exact()
1743 * @virt: the value returned by alloc_pages_exact.
1744 * @size: size of allocation, same value as passed to alloc_pages_exact().
1745 *
1746 * Release the memory allocated by a previous call to alloc_pages_exact.
1747 */
1748void free_pages_exact(void *virt, size_t size)
1749{
1750 unsigned long addr = (unsigned long)virt;
1751 unsigned long end = addr + PAGE_ALIGN(size);
1752
1753 while (addr < end) {
1754 free_page(addr);
1755 addr += PAGE_SIZE;
1756 }
1757}
1758EXPORT_SYMBOL(free_pages_exact);
1759
1714static unsigned int nr_free_zone_pages(int offset) 1760static unsigned int nr_free_zone_pages(int offset)
1715{ 1761{
1716 struct zoneref *z; 1762 struct zoneref *z;
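
alloc_pages_exact(), added above, rounds the request up to a power-of-two order, splits the high-order block, and hands back the tail pages beyond PAGE_ALIGN(size). The page accounting can be reproduced in user space; PAGE_SIZE and get_order() below are local stand-ins for the kernel definitions:

#include <stdio.h>

#define PAGE_SIZE       4096UL
#define PAGE_ALIGN(x)   (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* Smallest order such that (PAGE_SIZE << order) >= size, like get_order(). */
static unsigned int get_order(unsigned long size)
{
        unsigned int order = 0;

        while ((PAGE_SIZE << order) < size)
                order++;
        return order;
}

int main(void)
{
        unsigned long size = 5 * PAGE_SIZE + 100;       /* needs 6 pages */
        unsigned int order = get_order(size);
        unsigned long alloc_pages = 1UL << order;
        unsigned long used_pages  = PAGE_ALIGN(size) / PAGE_SIZE;

        printf("order %u allocation: %lu pages, kept %lu, freed back %lu\n",
               order, alloc_pages, used_pages, alloc_pages - used_pages);
        return 0;
}

For the five-pages-and-a-bit request in the example, an order-3 (eight page) block is split and two pages go straight back to the allocator.
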
@@ -1816,10 +1862,21 @@ void show_free_areas(void)
1816 } 1862 }
1817 } 1863 }
1818 1864
1819 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" 1865 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
1866 " inactive_file:%lu"
1867//TODO: check/adjust line lengths
1868#ifdef CONFIG_UNEVICTABLE_LRU
1869 " unevictable:%lu"
1870#endif
1871 " dirty:%lu writeback:%lu unstable:%lu\n"
1820 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 1872 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1821 global_page_state(NR_ACTIVE), 1873 global_page_state(NR_ACTIVE_ANON),
1822 global_page_state(NR_INACTIVE), 1874 global_page_state(NR_ACTIVE_FILE),
1875 global_page_state(NR_INACTIVE_ANON),
1876 global_page_state(NR_INACTIVE_FILE),
1877#ifdef CONFIG_UNEVICTABLE_LRU
1878 global_page_state(NR_UNEVICTABLE),
1879#endif
1823 global_page_state(NR_FILE_DIRTY), 1880 global_page_state(NR_FILE_DIRTY),
1824 global_page_state(NR_WRITEBACK), 1881 global_page_state(NR_WRITEBACK),
1825 global_page_state(NR_UNSTABLE_NFS), 1882 global_page_state(NR_UNSTABLE_NFS),
@@ -1842,8 +1899,13 @@ void show_free_areas(void)
1842 " min:%lukB" 1899 " min:%lukB"
1843 " low:%lukB" 1900 " low:%lukB"
1844 " high:%lukB" 1901 " high:%lukB"
1845 " active:%lukB" 1902 " active_anon:%lukB"
1846 " inactive:%lukB" 1903 " inactive_anon:%lukB"
1904 " active_file:%lukB"
1905 " inactive_file:%lukB"
1906#ifdef CONFIG_UNEVICTABLE_LRU
1907 " unevictable:%lukB"
1908#endif
1847 " present:%lukB" 1909 " present:%lukB"
1848 " pages_scanned:%lu" 1910 " pages_scanned:%lu"
1849 " all_unreclaimable? %s" 1911 " all_unreclaimable? %s"
@@ -1853,8 +1915,13 @@ void show_free_areas(void)
1853 K(zone->pages_min), 1915 K(zone->pages_min),
1854 K(zone->pages_low), 1916 K(zone->pages_low),
1855 K(zone->pages_high), 1917 K(zone->pages_high),
1856 K(zone_page_state(zone, NR_ACTIVE)), 1918 K(zone_page_state(zone, NR_ACTIVE_ANON)),
1857 K(zone_page_state(zone, NR_INACTIVE)), 1919 K(zone_page_state(zone, NR_INACTIVE_ANON)),
1920 K(zone_page_state(zone, NR_ACTIVE_FILE)),
1921 K(zone_page_state(zone, NR_INACTIVE_FILE)),
1922#ifdef CONFIG_UNEVICTABLE_LRU
1923 K(zone_page_state(zone, NR_UNEVICTABLE)),
1924#endif
1858 K(zone->present_pages), 1925 K(zone->present_pages),
1859 zone->pages_scanned, 1926 zone->pages_scanned,
1860 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 1927 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
@@ -2332,7 +2399,7 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2332 2399
2333#endif /* CONFIG_NUMA */ 2400#endif /* CONFIG_NUMA */
2334 2401
2335/* return values int ....just for stop_machine_run() */ 2402/* return values int ....just for stop_machine() */
2336static int __build_all_zonelists(void *dummy) 2403static int __build_all_zonelists(void *dummy)
2337{ 2404{
2338 int nid; 2405 int nid;
@@ -2352,11 +2419,12 @@ void build_all_zonelists(void)
2352 2419
2353 if (system_state == SYSTEM_BOOTING) { 2420 if (system_state == SYSTEM_BOOTING) {
2354 __build_all_zonelists(NULL); 2421 __build_all_zonelists(NULL);
2422 mminit_verify_zonelist();
2355 cpuset_init_current_mems_allowed(); 2423 cpuset_init_current_mems_allowed();
2356 } else { 2424 } else {
2357 /* we have to stop all cpus to guarantee there is no user 2425 /* we have to stop all cpus to guarantee there is no user
2358 of zonelist */ 2426 of zonelist */
2359 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); 2427 stop_machine(__build_all_zonelists, NULL, NULL);
2360 /* cpuset refresh routine should be here */ 2428 /* cpuset refresh routine should be here */
2361 } 2429 }
2362 vm_total_pages = nr_free_pagecache_pages(); 2430 vm_total_pages = nr_free_pagecache_pages();
@@ -2475,6 +2543,10 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2475 continue; 2543 continue;
2476 page = pfn_to_page(pfn); 2544 page = pfn_to_page(pfn);
2477 2545
2546 /* Watch out for overlapping nodes */
2547 if (page_to_nid(page) != zone_to_nid(zone))
2548 continue;
2549
2478 /* Blocks with reserved pages will never free, skip them. */ 2550 /* Blocks with reserved pages will never free, skip them. */
2479 if (PageReserved(page)) 2551 if (PageReserved(page))
2480 continue; 2552 continue;
@@ -2534,6 +2606,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2534 } 2606 }
2535 page = pfn_to_page(pfn); 2607 page = pfn_to_page(pfn);
2536 set_page_links(page, zone, nid, pfn); 2608 set_page_links(page, zone, nid, pfn);
2609 mminit_verify_page_links(page, zone, nid, pfn);
2537 init_page_count(page); 2610 init_page_count(page);
2538 reset_page_mapcount(page); 2611 reset_page_mapcount(page);
2539 SetPageReserved(page); 2612 SetPageReserved(page);
@@ -2611,7 +2684,7 @@ static int zone_batchsize(struct zone *zone)
2611 return batch; 2684 return batch;
2612} 2685}
2613 2686
2614inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 2687static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2615{ 2688{
2616 struct per_cpu_pages *pcp; 2689 struct per_cpu_pages *pcp;
2617 2690
@@ -2836,6 +2909,12 @@ __meminit int init_currently_empty_zone(struct zone *zone,
2836 2909
2837 zone->zone_start_pfn = zone_start_pfn; 2910 zone->zone_start_pfn = zone_start_pfn;
2838 2911
2912 mminit_dprintk(MMINIT_TRACE, "memmap_init",
2913 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
2914 pgdat->node_id,
2915 (unsigned long)zone_idx(zone),
2916 zone_start_pfn, (zone_start_pfn + size));
2917
2839 zone_init_free_lists(zone); 2918 zone_init_free_lists(zone);
2840 2919
2841 return 0; 2920 return 0;
@@ -2975,7 +3054,8 @@ void __init sparse_memory_present_with_active_regions(int nid)
2975void __init push_node_boundaries(unsigned int nid, 3054void __init push_node_boundaries(unsigned int nid,
2976 unsigned long start_pfn, unsigned long end_pfn) 3055 unsigned long start_pfn, unsigned long end_pfn)
2977{ 3056{
2978 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", 3057 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3058 "Entering push_node_boundaries(%u, %lu, %lu)\n",
2979 nid, start_pfn, end_pfn); 3059 nid, start_pfn, end_pfn);
2980 3060
2981 /* Initialise the boundary for this node if necessary */ 3061 /* Initialise the boundary for this node if necessary */
@@ -2993,7 +3073,8 @@ void __init push_node_boundaries(unsigned int nid,
2993static void __meminit account_node_boundary(unsigned int nid, 3073static void __meminit account_node_boundary(unsigned int nid,
2994 unsigned long *start_pfn, unsigned long *end_pfn) 3074 unsigned long *start_pfn, unsigned long *end_pfn)
2995{ 3075{
2996 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", 3076 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3077 "Entering account_node_boundary(%u, %lu, %lu)\n",
2997 nid, *start_pfn, *end_pfn); 3078 nid, *start_pfn, *end_pfn);
2998 3079
2999 /* Return if boundary information has not been provided */ 3080 /* Return if boundary information has not been provided */
@@ -3050,7 +3131,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
3050 * assumption is made that zones within a node are ordered in monotonic 3131 * assumption is made that zones within a node are ordered in monotonic
3051 * increasing memory addresses so that the "highest" populated zone is used 3132 * increasing memory addresses so that the "highest" populated zone is used
3052 */ 3133 */
3053void __init find_usable_zone_for_movable(void) 3134static void __init find_usable_zone_for_movable(void)
3054{ 3135{
3055 int zone_index; 3136 int zone_index;
3056 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 3137 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
@@ -3076,7 +3157,7 @@ void __init find_usable_zone_for_movable(void)
3076 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 3157 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
3077 * zones within a node are in order of monotonic increases memory addresses 3158 * zones within a node are in order of monotonic increases memory addresses
3078 */ 3159 */
3079void __meminit adjust_zone_range_for_zone_movable(int nid, 3160static void __meminit adjust_zone_range_for_zone_movable(int nid,
3080 unsigned long zone_type, 3161 unsigned long zone_type,
3081 unsigned long node_start_pfn, 3162 unsigned long node_start_pfn,
3082 unsigned long node_end_pfn, 3163 unsigned long node_end_pfn,
@@ -3137,7 +3218,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3137 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3218 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3138 * then all holes in the requested range will be accounted for. 3219 * then all holes in the requested range will be accounted for.
3139 */ 3220 */
3140unsigned long __meminit __absent_pages_in_range(int nid, 3221static unsigned long __meminit __absent_pages_in_range(int nid,
3141 unsigned long range_start_pfn, 3222 unsigned long range_start_pfn,
3142 unsigned long range_end_pfn) 3223 unsigned long range_end_pfn)
3143{ 3224{
@@ -3350,10 +3431,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3350 pgdat->nr_zones = 0; 3431 pgdat->nr_zones = 0;
3351 init_waitqueue_head(&pgdat->kswapd_wait); 3432 init_waitqueue_head(&pgdat->kswapd_wait);
3352 pgdat->kswapd_max_order = 0; 3433 pgdat->kswapd_max_order = 0;
3434 pgdat_page_cgroup_init(pgdat);
3353 3435
3354 for (j = 0; j < MAX_NR_ZONES; j++) { 3436 for (j = 0; j < MAX_NR_ZONES; j++) {
3355 struct zone *zone = pgdat->node_zones + j; 3437 struct zone *zone = pgdat->node_zones + j;
3356 unsigned long size, realsize, memmap_pages; 3438 unsigned long size, realsize, memmap_pages;
3439 enum lru_list l;
3357 3440
3358 size = zone_spanned_pages_in_node(nid, j, zones_size); 3441 size = zone_spanned_pages_in_node(nid, j, zones_size);
3359 realsize = size - zone_absent_pages_in_node(nid, j, 3442 realsize = size - zone_absent_pages_in_node(nid, j,
@@ -3404,10 +3487,14 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3404 zone->prev_priority = DEF_PRIORITY; 3487 zone->prev_priority = DEF_PRIORITY;
3405 3488
3406 zone_pcp_init(zone); 3489 zone_pcp_init(zone);
3407 INIT_LIST_HEAD(&zone->active_list); 3490 for_each_lru(l) {
3408 INIT_LIST_HEAD(&zone->inactive_list); 3491 INIT_LIST_HEAD(&zone->lru[l].list);
3409 zone->nr_scan_active = 0; 3492 zone->lru[l].nr_scan = 0;
3410 zone->nr_scan_inactive = 0; 3493 }
3494 zone->recent_rotated[0] = 0;
3495 zone->recent_rotated[1] = 0;
3496 zone->recent_scanned[0] = 0;
3497 zone->recent_scanned[1] = 0;
3411 zap_zone_vm_stats(zone); 3498 zap_zone_vm_stats(zone);
3412 zone->flags = 0; 3499 zone->flags = 0;
3413 if (!size) 3500 if (!size)
@@ -3464,10 +3551,11 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
3464#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 3551#endif /* CONFIG_FLAT_NODE_MEM_MAP */
3465} 3552}
3466 3553
3467void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, 3554void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
3468 unsigned long *zones_size, unsigned long node_start_pfn, 3555 unsigned long node_start_pfn, unsigned long *zholes_size)
3469 unsigned long *zholes_size)
3470{ 3556{
3557 pg_data_t *pgdat = NODE_DATA(nid);
3558
3471 pgdat->node_id = nid; 3559 pgdat->node_id = nid;
3472 pgdat->node_start_pfn = node_start_pfn; 3560 pgdat->node_start_pfn = node_start_pfn;
3473 calculate_node_totalpages(pgdat, zones_size, zholes_size); 3561 calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -3520,10 +3608,13 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3520{ 3608{
3521 int i; 3609 int i;
3522 3610
3523 printk(KERN_DEBUG "Entering add_active_range(%d, %#lx, %#lx) " 3611 mminit_dprintk(MMINIT_TRACE, "memory_register",
3524 "%d entries of %d used\n", 3612 "Entering add_active_range(%d, %#lx, %#lx) "
3525 nid, start_pfn, end_pfn, 3613 "%d entries of %d used\n",
3526 nr_nodemap_entries, MAX_ACTIVE_REGIONS); 3614 nid, start_pfn, end_pfn,
3615 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
3616
3617 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
3527 3618
3528 /* Merge with existing active regions if possible */ 3619 /* Merge with existing active regions if possible */
3529 for (i = 0; i < nr_nodemap_entries; i++) { 3620 for (i = 0; i < nr_nodemap_entries; i++) {
@@ -3669,7 +3760,7 @@ static void __init sort_node_map(void)
3669} 3760}
3670 3761
3671/* Find the lowest pfn for a node */ 3762/* Find the lowest pfn for a node */
3672unsigned long __init find_min_pfn_for_node(int nid) 3763static unsigned long __init find_min_pfn_for_node(int nid)
3673{ 3764{
3674 int i; 3765 int i;
3675 unsigned long min_pfn = ULONG_MAX; 3766 unsigned long min_pfn = ULONG_MAX;
@@ -3698,23 +3789,6 @@ unsigned long __init find_min_pfn_with_active_regions(void)
3698 return find_min_pfn_for_node(MAX_NUMNODES); 3789 return find_min_pfn_for_node(MAX_NUMNODES);
3699} 3790}
3700 3791
3701/**
3702 * find_max_pfn_with_active_regions - Find the maximum PFN registered
3703 *
3704 * It returns the maximum PFN based on information provided via
3705 * add_active_range().
3706 */
3707unsigned long __init find_max_pfn_with_active_regions(void)
3708{
3709 int i;
3710 unsigned long max_pfn = 0;
3711
3712 for (i = 0; i < nr_nodemap_entries; i++)
3713 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
3714
3715 return max_pfn;
3716}
3717
3718/* 3792/*
3719 * early_calculate_totalpages() 3793 * early_calculate_totalpages()
3720 * Sum pages in active regions for movable zone. 3794 * Sum pages in active regions for movable zone.
@@ -3741,7 +3815,7 @@ static unsigned long __init early_calculate_totalpages(void)
3741 * memory. When they don't, some nodes will have more kernelcore than 3815 * memory. When they don't, some nodes will have more kernelcore than
3742 * others 3816 * others
3743 */ 3817 */
3744void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 3818static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3745{ 3819{
3746 int i, nid; 3820 int i, nid;
3747 unsigned long usable_startpfn; 3821 unsigned long usable_startpfn;
@@ -3904,7 +3978,7 @@ static void check_for_regular_memory(pg_data_t *pgdat)
3904void __init free_area_init_nodes(unsigned long *max_zone_pfn) 3978void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3905{ 3979{
3906 unsigned long nid; 3980 unsigned long nid;
3907 enum zone_type i; 3981 int i;
3908 3982
3909 /* Sort early_node_map as initialisation assumes it is sorted */ 3983 /* Sort early_node_map as initialisation assumes it is sorted */
3910 sort_node_map(); 3984 sort_node_map();
@@ -3957,10 +4031,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3957 early_node_map[i].end_pfn); 4031 early_node_map[i].end_pfn);
3958 4032
3959 /* Initialise every node */ 4033 /* Initialise every node */
4034 mminit_verify_pageflags_layout();
3960 setup_nr_node_ids(); 4035 setup_nr_node_ids();
3961 for_each_online_node(nid) { 4036 for_each_online_node(nid) {
3962 pg_data_t *pgdat = NODE_DATA(nid); 4037 pg_data_t *pgdat = NODE_DATA(nid);
3963 free_area_init_node(nid, pgdat, NULL, 4038 free_area_init_node(nid, NULL,
3964 find_min_pfn_for_node(nid), NULL); 4039 find_min_pfn_for_node(nid), NULL);
3965 4040
3966 /* Any memory on that node */ 4041 /* Any memory on that node */
@@ -4025,15 +4100,13 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4025} 4100}
4026 4101
4027#ifndef CONFIG_NEED_MULTIPLE_NODES 4102#ifndef CONFIG_NEED_MULTIPLE_NODES
4028static bootmem_data_t contig_bootmem_data; 4103struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] };
4029struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
4030
4031EXPORT_SYMBOL(contig_page_data); 4104EXPORT_SYMBOL(contig_page_data);
4032#endif 4105#endif
4033 4106
4034void __init free_area_init(unsigned long *zones_size) 4107void __init free_area_init(unsigned long *zones_size)
4035{ 4108{
4036 free_area_init_node(0, NODE_DATA(0), zones_size, 4109 free_area_init_node(0, zones_size,
4037 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 4110 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
4038} 4111}
4039 4112
@@ -4163,7 +4236,7 @@ void setup_per_zone_pages_min(void)
4163 for_each_zone(zone) { 4236 for_each_zone(zone) {
4164 u64 tmp; 4237 u64 tmp;
4165 4238
4166 spin_lock_irqsave(&zone->lru_lock, flags); 4239 spin_lock_irqsave(&zone->lock, flags);
4167 tmp = (u64)pages_min * zone->present_pages; 4240 tmp = (u64)pages_min * zone->present_pages;
4168 do_div(tmp, lowmem_pages); 4241 do_div(tmp, lowmem_pages);
4169 if (is_highmem(zone)) { 4242 if (is_highmem(zone)) {
@@ -4195,13 +4268,53 @@ void setup_per_zone_pages_min(void)
4195 zone->pages_low = zone->pages_min + (tmp >> 2); 4268 zone->pages_low = zone->pages_min + (tmp >> 2);
4196 zone->pages_high = zone->pages_min + (tmp >> 1); 4269 zone->pages_high = zone->pages_min + (tmp >> 1);
4197 setup_zone_migrate_reserve(zone); 4270 setup_zone_migrate_reserve(zone);
4198 spin_unlock_irqrestore(&zone->lru_lock, flags); 4271 spin_unlock_irqrestore(&zone->lock, flags);
4199 } 4272 }
4200 4273
4201 /* update totalreserve_pages */ 4274 /* update totalreserve_pages */
4202 calculate_totalreserve_pages(); 4275 calculate_totalreserve_pages();
4203} 4276}
4204 4277
4278/**
4279 * setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
4280 *
4281 * The inactive anon list should be small enough that the VM never has to
4282 * do too much work, but large enough that each inactive page has a chance
4283 * to be referenced again before it is swapped out.
4284 *
4285 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
4286 * INACTIVE_ANON pages on this zone's LRU, maintained by the
4287 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
4288 * the anonymous pages are kept on the inactive list.
4289 *
4290 * total target max
4291 * memory ratio inactive anon
4292 * -------------------------------------
4293 * 10MB 1 5MB
4294 * 100MB 1 50MB
4295 * 1GB 3 250MB
4296 * 10GB 10 0.9GB
4297 * 100GB 31 3GB
4298 * 1TB 101 10GB
4299 * 10TB 320 32GB
4300 */
4301void setup_per_zone_inactive_ratio(void)
4302{
4303 struct zone *zone;
4304
4305 for_each_zone(zone) {
4306 unsigned int gb, ratio;
4307
4308 /* Zone size in gigabytes */
4309 gb = zone->present_pages >> (30 - PAGE_SHIFT);
4310 ratio = int_sqrt(10 * gb);
4311 if (!ratio)
4312 ratio = 1;
4313
4314 zone->inactive_ratio = ratio;
4315 }
4316}
4317
4205/* 4318/*
4206 * Initialise min_free_kbytes. 4319 * Initialise min_free_kbytes.
4207 * 4320 *
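
setup_per_zone_inactive_ratio() above derives the target ratio as int_sqrt(10 * zone size in GB), clamped to at least 1, which is where the table in the comment comes from. The short program below recomputes those table entries (int_sqrt approximated with a naive loop):

#include <stdio.h>

/* Integer square root by simple search; good enough to mirror int_sqrt() here. */
static unsigned int isqrt(unsigned long long x)
{
        unsigned long long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return (unsigned int)r;
}

int main(void)
{
        /* Zone sizes from the table in the comment, in gigabytes. */
        unsigned long long sizes_gb[] = { 1, 10, 100, 1024, 10240 };
        unsigned int i;

        for (i = 0; i < sizeof(sizes_gb) / sizeof(sizes_gb[0]); i++) {
                unsigned int ratio = isqrt(10 * sizes_gb[i]);

                if (!ratio)
                        ratio = 1;
                printf("%6lluGB -> inactive_ratio %u\n", sizes_gb[i], ratio);
        }
        return 0;
}

The sub-gigabyte rows of the table fall out of the clamp: a zone size of 0 GB gives a square root of 0, which is raised to 1.
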
@@ -4239,6 +4352,7 @@ static int __init init_per_zone_pages_min(void)
4239 min_free_kbytes = 65536; 4352 min_free_kbytes = 65536;
4240 setup_per_zone_pages_min(); 4353 setup_per_zone_pages_min();
4241 setup_per_zone_lowmem_reserve(); 4354 setup_per_zone_lowmem_reserve();
4355 setup_per_zone_inactive_ratio();
4242 return 0; 4356 return 0;
4243} 4357}
4244module_init(init_per_zone_pages_min) 4358module_init(init_per_zone_pages_min)
@@ -4400,7 +4514,7 @@ void *__init alloc_large_system_hash(const char *tablename,
4400 do { 4514 do {
4401 size = bucketsize << log2qty; 4515 size = bucketsize << log2qty;
4402 if (flags & HASH_EARLY) 4516 if (flags & HASH_EARLY)
4403 table = alloc_bootmem(size); 4517 table = alloc_bootmem_nopanic(size);
4404 else if (hashdist) 4518 else if (hashdist)
4405 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4519 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4406 else { 4520 else {
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
new file mode 100644
index 000000000000..f59d797dc5a9
--- /dev/null
+++ b/mm/page_cgroup.c
@@ -0,0 +1,256 @@
1#include <linux/mm.h>
2#include <linux/mmzone.h>
3#include <linux/bootmem.h>
4#include <linux/bit_spinlock.h>
5#include <linux/page_cgroup.h>
6#include <linux/hash.h>
7#include <linux/slab.h>
8#include <linux/memory.h>
9#include <linux/vmalloc.h>
10#include <linux/cgroup.h>
11
12static void __meminit
13__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
14{
15 pc->flags = 0;
16 pc->mem_cgroup = NULL;
17 pc->page = pfn_to_page(pfn);
18}
19static unsigned long total_usage;
20
21#if !defined(CONFIG_SPARSEMEM)
22
23
24void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
25{
26 pgdat->node_page_cgroup = NULL;
27}
28
29struct page_cgroup *lookup_page_cgroup(struct page *page)
30{
31 unsigned long pfn = page_to_pfn(page);
32 unsigned long offset;
33 struct page_cgroup *base;
34
35 base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
36 if (unlikely(!base))
37 return NULL;
38
39 offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
40 return base + offset;
41}
42
43static int __init alloc_node_page_cgroup(int nid)
44{
45 struct page_cgroup *base, *pc;
46 unsigned long table_size;
47 unsigned long start_pfn, nr_pages, index;
48
49 start_pfn = NODE_DATA(nid)->node_start_pfn;
50 nr_pages = NODE_DATA(nid)->node_spanned_pages;
51
52 table_size = sizeof(struct page_cgroup) * nr_pages;
53
54 base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
55 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
56 if (!base)
57 return -ENOMEM;
58 for (index = 0; index < nr_pages; index++) {
59 pc = base + index;
60 __init_page_cgroup(pc, start_pfn + index);
61 }
62 NODE_DATA(nid)->node_page_cgroup = base;
63 total_usage += table_size;
64 return 0;
65}
66
67void __init page_cgroup_init(void)
68{
69
70 int nid, fail;
71
72 if (mem_cgroup_subsys.disabled)
73 return;
74
75 for_each_online_node(nid) {
76 fail = alloc_node_page_cgroup(nid);
77 if (fail)
78 goto fail;
79 }
80 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
81 printk(KERN_INFO "please try cgroup_disable=memory option if you"
82 " don't want\n");
83 return;
84fail:
85 printk(KERN_CRIT "allocation of page_cgroup was failed.\n");
86 printk(KERN_CRIT "please try cgroup_disable=memory boot option\n");
87 panic("Out of memory");
88}
89
90#else /* CONFIG_FLAT_NODE_MEM_MAP */
91
92struct page_cgroup *lookup_page_cgroup(struct page *page)
93{
94 unsigned long pfn = page_to_pfn(page);
95 struct mem_section *section = __pfn_to_section(pfn);
96
97 return section->page_cgroup + pfn;
98}
99
100int __meminit init_section_page_cgroup(unsigned long pfn)
101{
102 struct mem_section *section;
103 struct page_cgroup *base, *pc;
104 unsigned long table_size;
105 int nid, index;
106
107 section = __pfn_to_section(pfn);
108
109 if (section->page_cgroup)
110 return 0;
111
112 nid = page_to_nid(pfn_to_page(pfn));
113
114 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
115 if (slab_is_available()) {
116 base = kmalloc_node(table_size, GFP_KERNEL, nid);
117 if (!base)
118 base = vmalloc_node(table_size, nid);
119 } else {
120 base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size,
121 PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
122 }
123
124 if (!base) {
125 printk(KERN_ERR "page cgroup allocation failure\n");
126 return -ENOMEM;
127 }
128
129 for (index = 0; index < PAGES_PER_SECTION; index++) {
130 pc = base + index;
131 __init_page_cgroup(pc, pfn + index);
132 }
133
134 section = __pfn_to_section(pfn);
135 section->page_cgroup = base - pfn;
136 total_usage += table_size;
137 return 0;
138}
139#ifdef CONFIG_MEMORY_HOTPLUG
140void __free_page_cgroup(unsigned long pfn)
141{
142 struct mem_section *ms;
143 struct page_cgroup *base;
144
145 ms = __pfn_to_section(pfn);
146 if (!ms || !ms->page_cgroup)
147 return;
148 base = ms->page_cgroup + pfn;
149 if (is_vmalloc_addr(base)) {
150 vfree(base);
151 ms->page_cgroup = NULL;
152 } else {
153 struct page *page = virt_to_page(base);
154 if (!PageReserved(page)) { /* Is bootmem ? */
155 kfree(base);
156 ms->page_cgroup = NULL;
157 }
158 }
159}
160
161int online_page_cgroup(unsigned long start_pfn,
162 unsigned long nr_pages,
163 int nid)
164{
165 unsigned long start, end, pfn;
166 int fail = 0;
167
168 start = start_pfn & ~(PAGES_PER_SECTION - 1);

169 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
170
171 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
172 if (!pfn_present(pfn))
173 continue;
174 fail = init_section_page_cgroup(pfn);
175 }
176 if (!fail)
177 return 0;
178
179 /* rollback */
180 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
181 __free_page_cgroup(pfn);
182
183 return -ENOMEM;
184}
185
186int offline_page_cgroup(unsigned long start_pfn,
187 unsigned long nr_pages, int nid)
188{
189 unsigned long start, end, pfn;
190
191 start = start_pfn & ~(PAGES_PER_SECTION - 1);
192 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
193
194 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
195 __free_page_cgroup(pfn);
196 return 0;
197
198}
199
200static int page_cgroup_callback(struct notifier_block *self,
201 unsigned long action, void *arg)
202{
203 struct memory_notify *mn = arg;
204 int ret = 0;
205 switch (action) {
206 case MEM_GOING_ONLINE:
207 ret = online_page_cgroup(mn->start_pfn,
208 mn->nr_pages, mn->status_change_nid);
209 break;
210 case MEM_CANCEL_ONLINE:
211 case MEM_OFFLINE:
212 offline_page_cgroup(mn->start_pfn,
213 mn->nr_pages, mn->status_change_nid);
214 break;
215 case MEM_GOING_OFFLINE:
216 break;
217 case MEM_ONLINE:
218 case MEM_CANCEL_OFFLINE:
219 break;
220 }
221 ret = notifier_from_errno(ret);
222 return ret;
223}
224
225#endif
226
227void __init page_cgroup_init(void)
228{
229 unsigned long pfn;
230 int fail = 0;
231
232 if (mem_cgroup_subsys.disabled)
233 return;
234
235 for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
236 if (!pfn_present(pfn))
237 continue;
238 fail = init_section_page_cgroup(pfn);
239 }
240 if (fail) {
241 printk(KERN_CRIT "try cgroup_disable=memory boot option\n");
242 panic("Out of memory");
243 } else {
244 hotplug_memory_notifier(page_cgroup_callback, 0);
245 }
246 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
247 printk(KERN_INFO "please try cgroup_disable=memory option if you don't"
248 " want\n");
249}
250
251void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
252{
253 return;
254}
255
256#endif
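
One detail of the new sparsemem lookup path worth spelling out: init_section_page_cgroup() stores base - pfn in the section, so lookup_page_cgroup() can index with the absolute pfn and skip a subtraction on every lookup. A minimal sketch of that biased-pointer trick (strictly speaking the out-of-range pointer arithmetic is undefined in ISO C, but it mirrors what the kernel relies on):

#include <stdio.h>
#include <stdlib.h>

struct page_cgroup {
        unsigned long pfn;      /* stand-in for the real fields */
};

int main(void)
{
        unsigned long section_start_pfn = 0x10000;      /* first pfn covered by this section */
        unsigned long pages_per_section = 8;            /* tiny, for the demo */
        struct page_cgroup *base, *biased;
        unsigned long i;

        base = calloc(pages_per_section, sizeof(*base));
        if (!base)
                return 1;
        for (i = 0; i < pages_per_section; i++)
                base[i].pfn = section_start_pfn + i;

        /* Store the biased pointer, as section->page_cgroup = base - pfn does. */
        biased = base - section_start_pfn;

        /* Any pfn in the section now indexes the array without subtracting the start. */
        printf("pfn 0x%lx -> entry with pfn 0x%lx\n",
               section_start_pfn + 3, biased[section_start_pfn + 3].pfn);

        free(base);
        return 0;
}
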
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 3444b58033c8..b70a7fec1ff6 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -2,7 +2,6 @@
2 * linux/mm/page_isolation.c 2 * linux/mm/page_isolation.c
3 */ 3 */
4 4
5#include <stddef.h>
6#include <linux/mm.h> 5#include <linux/mm.h>
7#include <linux/page-isolation.h> 6#include <linux/page-isolation.h>
8#include <linux/pageblock-flags.h> 7#include <linux/pageblock-flags.h>
@@ -115,8 +114,10 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
115 114
116int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 115int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
117{ 116{
118 unsigned long pfn; 117 unsigned long pfn, flags;
119 struct page *page; 118 struct page *page;
119 struct zone *zone;
120 int ret;
120 121
121 pfn = start_pfn; 122 pfn = start_pfn;
122 /* 123 /*
@@ -132,7 +133,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
132 if (pfn < end_pfn) 133 if (pfn < end_pfn)
133 return -EBUSY; 134 return -EBUSY;
134 /* Check all pages are free or Marked as ISOLATED */ 135 /* Check all pages are free or Marked as ISOLATED */
135 if (__test_page_isolated_in_pageblock(start_pfn, end_pfn)) 136 zone = page_zone(pfn_to_page(pfn));
136 return 0; 137 spin_lock_irqsave(&zone->lock, flags);
137 return -EBUSY; 138 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
139 spin_unlock_irqrestore(&zone->lock, flags);
140 return ret ? 0 : -EBUSY;
138} 141}
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 9d834aa4b979..a0a14c4d5072 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds. 4 * Copyright (C) 2002, Linus Torvalds.
5 * 5 *
6 * 09Apr2002 akpm@zip.com.au 6 * 09Apr2002 Andrew Morton
7 * Initial version 7 * Initial version
8 * 29Feb2004 kaos@sgi.com 8 * 29Feb2004 kaos@sgi.com
9 * Move worker thread creation to kthread to avoid chewing 9 * Move worker thread creation to kthread to avoid chewing
@@ -130,7 +130,7 @@ static int __pdflush(struct pdflush_work *my_work)
130 * Thread creation: For how long have there been zero 130 * Thread creation: For how long have there been zero
131 * available threads? 131 * available threads?
132 */ 132 */
133 if (jiffies - last_empty_jifs > 1 * HZ) { 133 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
134 /* unlocked list_empty() test is OK here */ 134 /* unlocked list_empty() test is OK here */
135 if (list_empty(&pdflush_list)) { 135 if (list_empty(&pdflush_list)) {
136 /* unlocked test is OK here */ 136 /* unlocked test is OK here */
@@ -151,7 +151,7 @@ static int __pdflush(struct pdflush_work *my_work)
151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) 151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
152 continue; 152 continue;
153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); 153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
154 if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { 154 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
155 /* Limit exit rate */ 155 /* Limit exit rate */
156 pdf->when_i_went_to_sleep = jiffies; 156 pdf->when_i_went_to_sleep = jiffies;
157 break; /* exeunt */ 157 break; /* exeunt */
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 3f703f7cb398..8dbb6805ef35 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -26,7 +26,10 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
26static unsigned long max_pages(unsigned long min_pages) 26static unsigned long max_pages(unsigned long min_pages)
27{ 27{
28 unsigned long node_free_pages, max; 28 unsigned long node_free_pages, max;
29 struct zone *zones = NODE_DATA(numa_node_id())->node_zones; 29 int node = numa_node_id();
30 struct zone *zones = NODE_DATA(node)->node_zones;
31 int num_cpus_on_node;
32 node_to_cpumask_ptr(cpumask_on_node, node);
30 33
31 node_free_pages = 34 node_free_pages =
32#ifdef CONFIG_ZONE_DMA 35#ifdef CONFIG_ZONE_DMA
@@ -38,6 +41,10 @@ static unsigned long max_pages(unsigned long min_pages)
38 zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); 41 zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
39 42
40 max = node_free_pages / FRACTION_OF_NODE_MEM; 43 max = node_free_pages / FRACTION_OF_NODE_MEM;
44
45 num_cpus_on_node = cpus_weight_nr(*cpumask_on_node);
46 max /= num_cpus_on_node;
47
41 return max(max, min_pages); 48 return max(max, min_pages);
42} 49}
43 50
diff --git a/mm/readahead.c b/mm/readahead.c
index d8723a5f6496..bec83c15a78f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds 4 * Copyright (C) 2002, Linus Torvalds
5 * 5 *
6 * 09Apr2002 akpm@zip.com.au 6 * 09Apr2002 Andrew Morton
7 * Initial version. 7 * Initial version.
8 */ 8 */
9 9
@@ -229,7 +229,7 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
229 */ 229 */
230unsigned long max_sane_readahead(unsigned long nr) 230unsigned long max_sane_readahead(unsigned long nr)
231{ 231{
232 return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE) 232 return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
233 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); 233 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
234} 234}
235 235
@@ -382,9 +382,9 @@ ondemand_readahead(struct address_space *mapping,
382 if (hit_readahead_marker) { 382 if (hit_readahead_marker) {
383 pgoff_t start; 383 pgoff_t start;
384 384
385 read_lock_irq(&mapping->tree_lock); 385 rcu_read_lock();
386 start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); 386 start = radix_tree_next_hole(&mapping->page_tree, offset,max+1);
387 read_unlock_irq(&mapping->tree_lock); 387 rcu_read_unlock();
388 388
389 if (!start || start - offset > max) 389 if (!start || start - offset > max)
390 return 0; 390 return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index bf0a5b7cfb8e..10993942d6c9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,12 +49,51 @@
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kallsyms.h> 50#include <linux/kallsyms.h>
51#include <linux/memcontrol.h> 51#include <linux/memcontrol.h>
52#include <linux/mmu_notifier.h>
52 53
53#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
54 55
55struct kmem_cache *anon_vma_cachep; 56#include "internal.h"
56 57
57/* This must be called under the mmap_sem. */ 58static struct kmem_cache *anon_vma_cachep;
59
60static inline struct anon_vma *anon_vma_alloc(void)
61{
62 return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
63}
64
65static inline void anon_vma_free(struct anon_vma *anon_vma)
66{
67 kmem_cache_free(anon_vma_cachep, anon_vma);
68}
69
70/**
71 * anon_vma_prepare - attach an anon_vma to a memory region
72 * @vma: the memory region in question
73 *
74 * This makes sure the memory mapping described by 'vma' has
75 * an 'anon_vma' attached to it, so that we can associate the
76 * anonymous pages mapped into it with that anon_vma.
77 *
78 * The common case will be that we already have one, but if
79 * not we either need to find an adjacent mapping that we
80 * can re-use the anon_vma from (very common when the only
81 * reason for splitting a vma has been mprotect()), or we
82 * allocate a new one.
83 *
84 * Anon-vma allocations are very subtle, because we may have
85 * optimistically looked up an anon_vma in page_lock_anon_vma()
86 * and that may actually touch the spinlock even in the newly
87 * allocated vma (it depends on RCU to make sure that the
88 * anon_vma isn't actually destroyed).
89 *
90 * As a result, we need to do proper anon_vma locking even
91 * for the new allocation. At the same time, we do not want
92 * to do any locking for the common case of already having
93 * an anon_vma.
94 *
95 * This must be called with the mmap_sem held for reading.
96 */
58int anon_vma_prepare(struct vm_area_struct *vma) 97int anon_vma_prepare(struct vm_area_struct *vma)
59{ 98{
60 struct anon_vma *anon_vma = vma->anon_vma; 99 struct anon_vma *anon_vma = vma->anon_vma;
@@ -62,20 +101,17 @@ int anon_vma_prepare(struct vm_area_struct *vma)
62 might_sleep(); 101 might_sleep();
63 if (unlikely(!anon_vma)) { 102 if (unlikely(!anon_vma)) {
64 struct mm_struct *mm = vma->vm_mm; 103 struct mm_struct *mm = vma->vm_mm;
65 struct anon_vma *allocated, *locked; 104 struct anon_vma *allocated;
66 105
67 anon_vma = find_mergeable_anon_vma(vma); 106 anon_vma = find_mergeable_anon_vma(vma);
68 if (anon_vma) { 107 allocated = NULL;
69 allocated = NULL; 108 if (!anon_vma) {
70 locked = anon_vma;
71 spin_lock(&locked->lock);
72 } else {
73 anon_vma = anon_vma_alloc(); 109 anon_vma = anon_vma_alloc();
74 if (unlikely(!anon_vma)) 110 if (unlikely(!anon_vma))
75 return -ENOMEM; 111 return -ENOMEM;
76 allocated = anon_vma; 112 allocated = anon_vma;
77 locked = NULL;
78 } 113 }
114 spin_lock(&anon_vma->lock);
79 115
80 /* page_table_lock to protect against threads */ 116 /* page_table_lock to protect against threads */
81 spin_lock(&mm->page_table_lock); 117 spin_lock(&mm->page_table_lock);
@@ -86,8 +122,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
86 } 122 }
87 spin_unlock(&mm->page_table_lock); 123 spin_unlock(&mm->page_table_lock);
88 124
89 if (locked) 125 spin_unlock(&anon_vma->lock);
90 spin_unlock(&locked->lock);
91 if (unlikely(allocated)) 126 if (unlikely(allocated))
92 anon_vma_free(allocated); 127 anon_vma_free(allocated);
93 } 128 }
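
The rewritten anon_vma_prepare() above drops the separate "locked" pointer: it reuses or optimistically allocates an anon_vma, always takes its lock, publishes it under page_table_lock, and frees the spare copy if another thread won the race. A rough userspace analogue of that allocate-then-publish pattern, using pthread mutexes in place of the kernel spinlocks; all names here are invented for illustration:

#include <pthread.h>
#include <stdlib.h>

struct anon_ctx {
    pthread_mutex_t lock;                   /* plays the role of anon_vma->lock */
};

static pthread_mutex_t publish_lock = PTHREAD_MUTEX_INITIALIZER;
static struct anon_ctx *shared_ctx;         /* plays the role of vma->anon_vma */

static struct anon_ctx *ctx_prepare(void)
{
    struct anon_ctx *ctx = shared_ctx;
    struct anon_ctx *allocated = NULL;

    if (!ctx) {
        allocated = calloc(1, sizeof(*allocated));
        if (!allocated)
            return NULL;                    /* -ENOMEM in the kernel version */
        pthread_mutex_init(&allocated->lock, NULL);
        ctx = allocated;
    }

    pthread_mutex_lock(&publish_lock);      /* stands in for page_table_lock */
    if (!shared_ctx) {
        shared_ctx = ctx;
        allocated = NULL;                   /* our copy got installed */
    }
    pthread_mutex_unlock(&publish_lock);

    free(allocated);                        /* lost the race: discard the spare */
    return shared_ctx;
}

int main(void)
{
    return ctx_prepare() ? 0 : 1;
}
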
@@ -138,7 +173,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
138 anon_vma_free(anon_vma); 173 anon_vma_free(anon_vma);
139} 174}
140 175
141static void anon_vma_ctor(struct kmem_cache *cachep, void *data) 176static void anon_vma_ctor(void *data)
142{ 177{
143 struct anon_vma *anon_vma = data; 178 struct anon_vma *anon_vma = data;
144 179
@@ -156,7 +191,7 @@ void __init anon_vma_init(void)
156 * Getting a lock on a stable anon_vma from a page off the LRU is 191 * Getting a lock on a stable anon_vma from a page off the LRU is
157 * tricky: page_lock_anon_vma relies on RCU to guard against the races. 192 * tricky: page_lock_anon_vma relies on RCU to guard against the races.
158 */ 193 */
159static struct anon_vma *page_lock_anon_vma(struct page *page) 194struct anon_vma *page_lock_anon_vma(struct page *page)
160{ 195{
161 struct anon_vma *anon_vma; 196 struct anon_vma *anon_vma;
162 unsigned long anon_mapping; 197 unsigned long anon_mapping;
@@ -176,7 +211,7 @@ out:
176 return NULL; 211 return NULL;
177} 212}
178 213
179static void page_unlock_anon_vma(struct anon_vma *anon_vma) 214void page_unlock_anon_vma(struct anon_vma *anon_vma)
180{ 215{
181 spin_unlock(&anon_vma->lock); 216 spin_unlock(&anon_vma->lock);
182 rcu_read_unlock(); 217 rcu_read_unlock();
@@ -223,10 +258,14 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
223/* 258/*
224 * Check that @page is mapped at @address into @mm. 259 * Check that @page is mapped at @address into @mm.
225 * 260 *
261 * If @sync is false, page_check_address may perform a racy check to avoid
262 * the page table lock when the pte is not present (helpful when reclaiming
263 * highly shared pages).
264 *
226 * On success returns with pte mapped and locked. 265 * On success returns with pte mapped and locked.
227 */ 266 */
228pte_t *page_check_address(struct page *page, struct mm_struct *mm, 267pte_t *page_check_address(struct page *page, struct mm_struct *mm,
229 unsigned long address, spinlock_t **ptlp) 268 unsigned long address, spinlock_t **ptlp, int sync)
230{ 269{
231 pgd_t *pgd; 270 pgd_t *pgd;
232 pud_t *pud; 271 pud_t *pud;
@@ -248,7 +287,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
248 287
249 pte = pte_offset_map(pmd, address); 288 pte = pte_offset_map(pmd, address);
250 /* Make a quick check before getting the lock */ 289 /* Make a quick check before getting the lock */
251 if (!pte_present(*pte)) { 290 if (!sync && !pte_present(*pte)) {
252 pte_unmap(pte); 291 pte_unmap(pte);
253 return NULL; 292 return NULL;
254 } 293 }
@@ -263,6 +302,32 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
263 return NULL; 302 return NULL;
264} 303}
265 304
305/**
306 * page_mapped_in_vma - check whether a page is really mapped in a VMA
307 * @page: the page to test
308 * @vma: the VMA to test
309 *
310 * Returns 1 if the page is mapped into the page tables of the VMA, 0
311 * if the page is not mapped into the page tables of this VMA. Only
312 * valid for normal file or anonymous VMAs.
313 */
314static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
315{
316 unsigned long address;
317 pte_t *pte;
318 spinlock_t *ptl;
319
320 address = vma_address(page, vma);
321 if (address == -EFAULT) /* out of vma range */
322 return 0;
323 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
324 if (!pte) /* the page is not in this mm */
325 return 0;
326 pte_unmap_unlock(pte, ptl);
327
328 return 1;
329}
330
266/* 331/*
267 * Subfunctions of page_referenced: page_referenced_one called 332 * Subfunctions of page_referenced: page_referenced_one called
268 * repeatedly from either page_referenced_anon or page_referenced_file. 333 * repeatedly from either page_referenced_anon or page_referenced_file.
@@ -280,14 +345,21 @@ static int page_referenced_one(struct page *page,
280 if (address == -EFAULT) 345 if (address == -EFAULT)
281 goto out; 346 goto out;
282 347
283 pte = page_check_address(page, mm, address, &ptl); 348 pte = page_check_address(page, mm, address, &ptl, 0);
284 if (!pte) 349 if (!pte)
285 goto out; 350 goto out;
286 351
352 /*
353 * Don't want to elevate referenced for mlocked page that gets this far,
354 * in order that it progresses to try_to_unmap and is moved to the
355 * unevictable list.
356 */
287 if (vma->vm_flags & VM_LOCKED) { 357 if (vma->vm_flags & VM_LOCKED) {
288 referenced++;
289 *mapcount = 1; /* break early from loop */ 358 *mapcount = 1; /* break early from loop */
290 } else if (ptep_clear_flush_young(vma, address, pte)) 359 goto out_unmap;
360 }
361
362 if (ptep_clear_flush_young_notify(vma, address, pte))
291 referenced++; 363 referenced++;
292 364
293 /* Pretend the page is referenced if the task has the 365 /* Pretend the page is referenced if the task has the
@@ -296,6 +368,7 @@ static int page_referenced_one(struct page *page,
296 rwsem_is_locked(&mm->mmap_sem)) 368 rwsem_is_locked(&mm->mmap_sem))
297 referenced++; 369 referenced++;
298 370
371out_unmap:
299 (*mapcount)--; 372 (*mapcount)--;
300 pte_unmap_unlock(pte, ptl); 373 pte_unmap_unlock(pte, ptl);
301out: 374out:
@@ -385,11 +458,6 @@ static int page_referenced_file(struct page *page,
385 */ 458 */
386 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 459 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
387 continue; 460 continue;
388 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
389 == (VM_LOCKED|VM_MAYSHARE)) {
390 referenced++;
391 break;
392 }
393 referenced += page_referenced_one(page, vma, &mapcount); 461 referenced += page_referenced_one(page, vma, &mapcount);
394 if (!mapcount) 462 if (!mapcount)
395 break; 463 break;
@@ -421,7 +489,7 @@ int page_referenced(struct page *page, int is_locked,
421 referenced += page_referenced_anon(page, mem_cont); 489 referenced += page_referenced_anon(page, mem_cont);
422 else if (is_locked) 490 else if (is_locked)
423 referenced += page_referenced_file(page, mem_cont); 491 referenced += page_referenced_file(page, mem_cont);
424 else if (TestSetPageLocked(page)) 492 else if (!trylock_page(page))
425 referenced++; 493 referenced++;
426 else { 494 else {
427 if (page->mapping) 495 if (page->mapping)
@@ -449,7 +517,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
449 if (address == -EFAULT) 517 if (address == -EFAULT)
450 goto out; 518 goto out;
451 519
452 pte = page_check_address(page, mm, address, &ptl); 520 pte = page_check_address(page, mm, address, &ptl, 1);
453 if (!pte) 521 if (!pte)
454 goto out; 522 goto out;
455 523
@@ -457,7 +525,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
457 pte_t entry; 525 pte_t entry;
458 526
459 flush_cache_page(vma, address, pte_pfn(*pte)); 527 flush_cache_page(vma, address, pte_pfn(*pte));
460 entry = ptep_clear_flush(vma, address, pte); 528 entry = ptep_clear_flush_notify(vma, address, pte);
461 entry = pte_wrprotect(entry); 529 entry = pte_wrprotect(entry);
462 entry = pte_mkclean(entry); 530 entry = pte_mkclean(entry);
463 set_pte_at(mm, address, pte, entry); 531 set_pte_at(mm, address, pte, entry);
@@ -576,14 +644,8 @@ void page_add_anon_rmap(struct page *page,
576 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 644 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
577 if (atomic_inc_and_test(&page->_mapcount)) 645 if (atomic_inc_and_test(&page->_mapcount))
578 __page_set_anon_rmap(page, vma, address); 646 __page_set_anon_rmap(page, vma, address);
579 else { 647 else
580 __page_check_anon_rmap(page, vma, address); 648 __page_check_anon_rmap(page, vma, address);
581 /*
582 * We unconditionally charged during prepare, we uncharge here
583 * This takes care of balancing the reference counts
584 */
585 mem_cgroup_uncharge_page(page);
586 }
587} 649}
588 650
589/** 651/**
@@ -614,12 +676,6 @@ void page_add_file_rmap(struct page *page)
614{ 676{
615 if (atomic_inc_and_test(&page->_mapcount)) 677 if (atomic_inc_and_test(&page->_mapcount))
616 __inc_zone_page_state(page, NR_FILE_MAPPED); 678 __inc_zone_page_state(page, NR_FILE_MAPPED);
617 else
618 /*
619 * We unconditionally charged during prepare, we uncharge here
620 * This takes care of balancing the reference counts
621 */
622 mem_cgroup_uncharge_page(page);
623} 679}
624 680
625#ifdef CONFIG_DEBUG_VM 681#ifdef CONFIG_DEBUG_VM
@@ -670,6 +726,22 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
670 } 726 }
671 727
672 /* 728 /*
729 * Now that the last pte has gone, s390 must transfer dirty
730 * flag from storage key to struct page. We can usually skip
731 * this if the page is anon, so about to be freed; but perhaps
732 * not if it's in swapcache - there might be another pte slot
733 * containing the swap entry, but page not yet written to swap.
734 */
735 if ((!PageAnon(page) || PageSwapCache(page)) &&
736 page_test_dirty(page)) {
737 page_clear_dirty(page);
738 set_page_dirty(page);
739 }
740 if (PageAnon(page))
741 mem_cgroup_uncharge_page(page);
742 __dec_zone_page_state(page,
743 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
744 /*
673 * It would be tidy to reset the PageAnon mapping here, 745 * It would be tidy to reset the PageAnon mapping here,
674 * but that might overwrite a racing page_add_anon_rmap 746 * but that might overwrite a racing page_add_anon_rmap
675 * which increments mapcount after us but sets mapping 747 * which increments mapcount after us but sets mapping
@@ -678,14 +750,6 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
678 * Leaving it set also helps swapoff to reinstate ptes 750 * Leaving it set also helps swapoff to reinstate ptes
679 * faster for those pages still in swapcache. 751 * faster for those pages still in swapcache.
680 */ 752 */
681 if (page_test_dirty(page)) {
682 page_clear_dirty(page);
683 set_page_dirty(page);
684 }
685 mem_cgroup_uncharge_page(page);
686
687 __dec_zone_page_state(page,
688 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
689 } 753 }
690} 754}
691 755
@@ -707,7 +771,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
707 if (address == -EFAULT) 771 if (address == -EFAULT)
708 goto out; 772 goto out;
709 773
710 pte = page_check_address(page, mm, address, &ptl); 774 pte = page_check_address(page, mm, address, &ptl, 0);
711 if (!pte) 775 if (!pte)
712 goto out; 776 goto out;
713 777
@@ -716,15 +780,20 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
716 * If it's recently referenced (perhaps page_referenced 780 * If it's recently referenced (perhaps page_referenced
717 * skipped over this mm) then we should reactivate it. 781 * skipped over this mm) then we should reactivate it.
718 */ 782 */
719 if (!migration && ((vma->vm_flags & VM_LOCKED) || 783 if (!migration) {
720 (ptep_clear_flush_young(vma, address, pte)))) { 784 if (vma->vm_flags & VM_LOCKED) {
721 ret = SWAP_FAIL; 785 ret = SWAP_MLOCK;
722 goto out_unmap; 786 goto out_unmap;
723 } 787 }
788 if (ptep_clear_flush_young_notify(vma, address, pte)) {
789 ret = SWAP_FAIL;
790 goto out_unmap;
791 }
792 }
724 793
725 /* Nuke the page table entry. */ 794 /* Nuke the page table entry. */
726 flush_cache_page(vma, address, page_to_pfn(page)); 795 flush_cache_page(vma, address, page_to_pfn(page));
727 pteval = ptep_clear_flush(vma, address, pte); 796 pteval = ptep_clear_flush_notify(vma, address, pte);
728 797
729 /* Move the dirty bit to the physical page now the pte is gone. */ 798 /* Move the dirty bit to the physical page now the pte is gone. */
730 if (pte_dirty(pteval)) 799 if (pte_dirty(pteval))
@@ -801,12 +870,17 @@ out:
801 * For very sparsely populated VMAs this is a little inefficient - chances are 870 * For very sparsely populated VMAs this is a little inefficient - chances are
802 * there won't be many ptes located within the scan cluster. In this case 871 * there won't be many ptes located within the scan cluster. In this case
803 * maybe we could scan further - to the end of the pte page, perhaps. 872 * maybe we could scan further - to the end of the pte page, perhaps.
873 *
874 * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
875 * acquire it without blocking. If vma locked, mlock the pages in the cluster,
876 * rather than unmapping them. If we encounter the "check_page" that vmscan is
877 * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
804 */ 878 */
805#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) 879#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
806#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) 880#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
807 881
808static void try_to_unmap_cluster(unsigned long cursor, 882static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
809 unsigned int *mapcount, struct vm_area_struct *vma) 883 struct vm_area_struct *vma, struct page *check_page)
810{ 884{
811 struct mm_struct *mm = vma->vm_mm; 885 struct mm_struct *mm = vma->vm_mm;
812 pgd_t *pgd; 886 pgd_t *pgd;
@@ -818,6 +892,8 @@ static void try_to_unmap_cluster(unsigned long cursor,
818 struct page *page; 892 struct page *page;
819 unsigned long address; 893 unsigned long address;
820 unsigned long end; 894 unsigned long end;
895 int ret = SWAP_AGAIN;
896 int locked_vma = 0;
821 897
822 address = (vma->vm_start + cursor) & CLUSTER_MASK; 898 address = (vma->vm_start + cursor) & CLUSTER_MASK;
823 end = address + CLUSTER_SIZE; 899 end = address + CLUSTER_SIZE;
@@ -828,15 +904,26 @@ static void try_to_unmap_cluster(unsigned long cursor,
828 904
829 pgd = pgd_offset(mm, address); 905 pgd = pgd_offset(mm, address);
830 if (!pgd_present(*pgd)) 906 if (!pgd_present(*pgd))
831 return; 907 return ret;
832 908
833 pud = pud_offset(pgd, address); 909 pud = pud_offset(pgd, address);
834 if (!pud_present(*pud)) 910 if (!pud_present(*pud))
835 return; 911 return ret;
836 912
837 pmd = pmd_offset(pud, address); 913 pmd = pmd_offset(pud, address);
838 if (!pmd_present(*pmd)) 914 if (!pmd_present(*pmd))
839 return; 915 return ret;
916
917 /*
918 * MLOCK_PAGES => feature is configured.
919 * if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
920 * keep the sem while scanning the cluster for mlocking pages.
921 */
922 if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) {
923 locked_vma = (vma->vm_flags & VM_LOCKED);
924 if (!locked_vma)
925 up_read(&vma->vm_mm->mmap_sem); /* don't need it */
926 }
840 927
841 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 928 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
842 929
@@ -849,12 +936,19 @@ static void try_to_unmap_cluster(unsigned long cursor,
849 page = vm_normal_page(vma, address, *pte); 936 page = vm_normal_page(vma, address, *pte);
850 BUG_ON(!page || PageAnon(page)); 937 BUG_ON(!page || PageAnon(page));
851 938
852 if (ptep_clear_flush_young(vma, address, pte)) 939 if (locked_vma) {
940 mlock_vma_page(page); /* no-op if already mlocked */
941 if (page == check_page)
942 ret = SWAP_MLOCK;
943 continue; /* don't unmap */
944 }
945
946 if (ptep_clear_flush_young_notify(vma, address, pte))
853 continue; 947 continue;
854 948
855 /* Nuke the page table entry. */ 949 /* Nuke the page table entry. */
856 flush_cache_page(vma, address, pte_pfn(*pte)); 950 flush_cache_page(vma, address, pte_pfn(*pte));
857 pteval = ptep_clear_flush(vma, address, pte); 951 pteval = ptep_clear_flush_notify(vma, address, pte);
858 952
859 /* If nonlinear, store the file page offset in the pte. */ 953 /* If nonlinear, store the file page offset in the pte. */
860 if (page->index != linear_page_index(vma, address)) 954 if (page->index != linear_page_index(vma, address))
@@ -870,39 +964,104 @@ static void try_to_unmap_cluster(unsigned long cursor,
870 (*mapcount)--; 964 (*mapcount)--;
871 } 965 }
872 pte_unmap_unlock(pte - 1, ptl); 966 pte_unmap_unlock(pte - 1, ptl);
967 if (locked_vma)
968 up_read(&vma->vm_mm->mmap_sem);
969 return ret;
873} 970}
874 971
875static int try_to_unmap_anon(struct page *page, int migration) 972/*
973 * common handling for pages mapped in VM_LOCKED vmas
974 */
975static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
976{
977 int mlocked = 0;
978
979 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
980 if (vma->vm_flags & VM_LOCKED) {
981 mlock_vma_page(page);
982 mlocked++; /* really mlocked the page */
983 }
984 up_read(&vma->vm_mm->mmap_sem);
985 }
986 return mlocked;
987}
988
989/**
990 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
991 * rmap method
992 * @page: the page to unmap/unlock
993 * @unlock: request for unlock rather than unmap [unlikely]
994 * @migration: unmapping for migration - ignored if @unlock
995 *
996 * Find all the mappings of a page using the mapping pointer and the vma chains
997 * contained in the anon_vma struct it points to.
998 *
999 * This function is only called from try_to_unmap/try_to_munlock for
1000 * anonymous pages.
1001 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1002 * where the page was found will be held for write. So, we won't recheck
1003 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1004 * VM_LOCKED.
1005 */
1006static int try_to_unmap_anon(struct page *page, int unlock, int migration)
876{ 1007{
877 struct anon_vma *anon_vma; 1008 struct anon_vma *anon_vma;
878 struct vm_area_struct *vma; 1009 struct vm_area_struct *vma;
1010 unsigned int mlocked = 0;
879 int ret = SWAP_AGAIN; 1011 int ret = SWAP_AGAIN;
880 1012
1013 if (MLOCK_PAGES && unlikely(unlock))
1014 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
1015
881 anon_vma = page_lock_anon_vma(page); 1016 anon_vma = page_lock_anon_vma(page);
882 if (!anon_vma) 1017 if (!anon_vma)
883 return ret; 1018 return ret;
884 1019
885 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1020 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
886 ret = try_to_unmap_one(page, vma, migration); 1021 if (MLOCK_PAGES && unlikely(unlock)) {
887 if (ret == SWAP_FAIL || !page_mapped(page)) 1022 if (!((vma->vm_flags & VM_LOCKED) &&
888 break; 1023 page_mapped_in_vma(page, vma)))
1024 continue; /* must visit all unlocked vmas */
1025 ret = SWAP_MLOCK; /* saw at least one mlocked vma */
1026 } else {
1027 ret = try_to_unmap_one(page, vma, migration);
1028 if (ret == SWAP_FAIL || !page_mapped(page))
1029 break;
1030 }
1031 if (ret == SWAP_MLOCK) {
1032 mlocked = try_to_mlock_page(page, vma);
1033 if (mlocked)
1034 break; /* stop if actually mlocked page */
1035 }
889 } 1036 }
890 1037
891 page_unlock_anon_vma(anon_vma); 1038 page_unlock_anon_vma(anon_vma);
1039
1040 if (mlocked)
1041 ret = SWAP_MLOCK; /* actually mlocked the page */
1042 else if (ret == SWAP_MLOCK)
1043 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
1044
892 return ret; 1045 return ret;
893} 1046}
894 1047
895/** 1048/**
896 * try_to_unmap_file - unmap file page using the object-based rmap method 1049 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
897 * @page: the page to unmap 1050 * @page: the page to unmap/unlock
898 * @migration: migration flag 1051 * @unlock: request for unlock rather than unmap [unlikely]
1052 * @migration: unmapping for migration - ignored if @unlock
899 * 1053 *
900 * Find all the mappings of a page using the mapping pointer and the vma chains 1054 * Find all the mappings of a page using the mapping pointer and the vma chains
901 * contained in the address_space struct it points to. 1055 * contained in the address_space struct it points to.
902 * 1056 *
903 * This function is only called from try_to_unmap for object-based pages. 1057 * This function is only called from try_to_unmap/try_to_munlock for
1058 * object-based pages.
1059 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1060 * where the page was found will be held for write. So, we won't recheck
1061 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1062 * VM_LOCKED.
904 */ 1063 */
905static int try_to_unmap_file(struct page *page, int migration) 1064static int try_to_unmap_file(struct page *page, int unlock, int migration)
906{ 1065{
907 struct address_space *mapping = page->mapping; 1066 struct address_space *mapping = page->mapping;
908 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1067 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -913,20 +1072,44 @@ static int try_to_unmap_file(struct page *page, int migration)
913 unsigned long max_nl_cursor = 0; 1072 unsigned long max_nl_cursor = 0;
914 unsigned long max_nl_size = 0; 1073 unsigned long max_nl_size = 0;
915 unsigned int mapcount; 1074 unsigned int mapcount;
1075 unsigned int mlocked = 0;
1076
1077 if (MLOCK_PAGES && unlikely(unlock))
1078 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
916 1079
917 spin_lock(&mapping->i_mmap_lock); 1080 spin_lock(&mapping->i_mmap_lock);
918 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1081 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
919 ret = try_to_unmap_one(page, vma, migration); 1082 if (MLOCK_PAGES && unlikely(unlock)) {
920 if (ret == SWAP_FAIL || !page_mapped(page)) 1083 if (!(vma->vm_flags & VM_LOCKED))
921 goto out; 1084 continue; /* must visit all vmas */
1085 ret = SWAP_MLOCK;
1086 } else {
1087 ret = try_to_unmap_one(page, vma, migration);
1088 if (ret == SWAP_FAIL || !page_mapped(page))
1089 goto out;
1090 }
1091 if (ret == SWAP_MLOCK) {
1092 mlocked = try_to_mlock_page(page, vma);
1093 if (mlocked)
1094 break; /* stop if actually mlocked page */
1095 }
922 } 1096 }
923 1097
1098 if (mlocked)
1099 goto out;
1100
924 if (list_empty(&mapping->i_mmap_nonlinear)) 1101 if (list_empty(&mapping->i_mmap_nonlinear))
925 goto out; 1102 goto out;
926 1103
927 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1104 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
928 shared.vm_set.list) { 1105 shared.vm_set.list) {
929 if ((vma->vm_flags & VM_LOCKED) && !migration) 1106 if (MLOCK_PAGES && unlikely(unlock)) {
1107 if (!(vma->vm_flags & VM_LOCKED))
1108 continue; /* must visit all vmas */
1109 ret = SWAP_MLOCK; /* leave mlocked == 0 */
1110 goto out; /* no need to look further */
1111 }
1112 if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
930 continue; 1113 continue;
931 cursor = (unsigned long) vma->vm_private_data; 1114 cursor = (unsigned long) vma->vm_private_data;
932 if (cursor > max_nl_cursor) 1115 if (cursor > max_nl_cursor)
@@ -936,7 +1119,7 @@ static int try_to_unmap_file(struct page *page, int migration)
936 max_nl_size = cursor; 1119 max_nl_size = cursor;
937 } 1120 }
938 1121
939 if (max_nl_size == 0) { /* any nonlinears locked or reserved */ 1122 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
940 ret = SWAP_FAIL; 1123 ret = SWAP_FAIL;
941 goto out; 1124 goto out;
942 } 1125 }
@@ -960,12 +1143,16 @@ static int try_to_unmap_file(struct page *page, int migration)
960 do { 1143 do {
961 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1144 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
962 shared.vm_set.list) { 1145 shared.vm_set.list) {
963 if ((vma->vm_flags & VM_LOCKED) && !migration) 1146 if (!MLOCK_PAGES && !migration &&
1147 (vma->vm_flags & VM_LOCKED))
964 continue; 1148 continue;
965 cursor = (unsigned long) vma->vm_private_data; 1149 cursor = (unsigned long) vma->vm_private_data;
966 while ( cursor < max_nl_cursor && 1150 while ( cursor < max_nl_cursor &&
967 cursor < vma->vm_end - vma->vm_start) { 1151 cursor < vma->vm_end - vma->vm_start) {
968 try_to_unmap_cluster(cursor, &mapcount, vma); 1152 ret = try_to_unmap_cluster(cursor, &mapcount,
1153 vma, page);
1154 if (ret == SWAP_MLOCK)
1155 mlocked = 2; /* to return below */
969 cursor += CLUSTER_SIZE; 1156 cursor += CLUSTER_SIZE;
970 vma->vm_private_data = (void *) cursor; 1157 vma->vm_private_data = (void *) cursor;
971 if ((int)mapcount <= 0) 1158 if ((int)mapcount <= 0)
@@ -986,6 +1173,10 @@ static int try_to_unmap_file(struct page *page, int migration)
986 vma->vm_private_data = NULL; 1173 vma->vm_private_data = NULL;
987out: 1174out:
988 spin_unlock(&mapping->i_mmap_lock); 1175 spin_unlock(&mapping->i_mmap_lock);
1176 if (mlocked)
1177 ret = SWAP_MLOCK; /* actually mlocked the page */
1178 else if (ret == SWAP_MLOCK)
1179 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
989 return ret; 1180 return ret;
990} 1181}
991 1182
@@ -1001,6 +1192,7 @@ out:
1001 * SWAP_SUCCESS - we succeeded in removing all mappings 1192 * SWAP_SUCCESS - we succeeded in removing all mappings
1002 * SWAP_AGAIN - we missed a mapping, try again later 1193 * SWAP_AGAIN - we missed a mapping, try again later
1003 * SWAP_FAIL - the page is unswappable 1194 * SWAP_FAIL - the page is unswappable
1195 * SWAP_MLOCK - page is mlocked.
1004 */ 1196 */
1005int try_to_unmap(struct page *page, int migration) 1197int try_to_unmap(struct page *page, int migration)
1006{ 1198{
@@ -1009,12 +1201,36 @@ int try_to_unmap(struct page *page, int migration)
1009 BUG_ON(!PageLocked(page)); 1201 BUG_ON(!PageLocked(page));
1010 1202
1011 if (PageAnon(page)) 1203 if (PageAnon(page))
1012 ret = try_to_unmap_anon(page, migration); 1204 ret = try_to_unmap_anon(page, 0, migration);
1013 else 1205 else
1014 ret = try_to_unmap_file(page, migration); 1206 ret = try_to_unmap_file(page, 0, migration);
1015 1207 if (ret != SWAP_MLOCK && !page_mapped(page))
1016 if (!page_mapped(page))
1017 ret = SWAP_SUCCESS; 1208 ret = SWAP_SUCCESS;
1018 return ret; 1209 return ret;
1019} 1210}
1020 1211
1212#ifdef CONFIG_UNEVICTABLE_LRU
1213/**
1214 * try_to_munlock - try to munlock a page
1215 * @page: the page to be munlocked
1216 *
1217 * Called from munlock code. Checks all of the VMAs mapping the page
1218 * to make sure nobody else has this page mlocked. The page will be
1219 * returned with PG_mlocked cleared if no other vmas have it mlocked.
1220 *
1221 * Return values are:
1222 *
1223 * SWAP_SUCCESS - no vma's holding page mlocked.
1224 * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
1225 * SWAP_MLOCK - page is now mlocked.
1226 */
1227int try_to_munlock(struct page *page)
1228{
1229 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1230
1231 if (PageAnon(page))
1232 return try_to_unmap_anon(page, 1, 0);
1233 else
1234 return try_to_unmap_file(page, 1, 0);
1235}
1236#endif
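
The rmap changes above teach try_to_unmap()/try_to_munlock() to report SWAP_MLOCK and to mlock pages found in VM_LOCKED vmas rather than reclaim them. From userspace, the state they track is driven by plain mlock()/munlock(); a small illustrative program, not part of the patch:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 4096 * 16;
    void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (buf == MAP_FAILED)
        return 1;

    memset(buf, 0, len);              /* fault the pages in */
    if (mlock(buf, len))              /* pin: try_to_unmap() now sees VM_LOCKED */
        perror("mlock");
    /* ... use the pinned buffer ... */
    munlock(buf, len);                /* pages become evictable again */
    munmap(buf, len);
    return 0;
}
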
diff --git a/mm/shmem.c b/mm/shmem.c
index e2a6ae1a44e9..d38d7e61fcd0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -50,14 +50,12 @@
50#include <linux/migrate.h> 50#include <linux/migrate.h>
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/seq_file.h> 52#include <linux/seq_file.h>
53#include <linux/magic.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/div64.h> 56#include <asm/div64.h>
56#include <asm/pgtable.h> 57#include <asm/pgtable.h>
57 58
58/* This magic number is used in glibc for posix shared memory */
59#define TMPFS_MAGIC 0x01021994
60
61#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) 59#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
62#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) 60#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
63#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) 61#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
@@ -201,7 +199,7 @@ static struct vm_operations_struct shmem_vm_ops;
201 199
202static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 200static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
203 .ra_pages = 0, /* No readahead */ 201 .ra_pages = 0, /* No readahead */
204 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 202 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
205 .unplug_io_fn = default_unplug_io_fn, 203 .unplug_io_fn = default_unplug_io_fn,
206}; 204};
207 205
@@ -922,20 +920,26 @@ found:
922 error = 1; 920 error = 1;
923 if (!inode) 921 if (!inode)
924 goto out; 922 goto out;
925 /* Precharge page while we can wait, compensate afterwards */ 923 /* Precharge page using GFP_KERNEL while we can wait */
926 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 924 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
927 if (error) 925 if (error)
928 goto out; 926 goto out;
929 error = radix_tree_preload(GFP_KERNEL); 927 error = radix_tree_preload(GFP_KERNEL);
930 if (error) 928 if (error) {
931 goto uncharge; 929 mem_cgroup_uncharge_cache_page(page);
930 goto out;
931 }
932 error = 1; 932 error = 1;
933 933
934 spin_lock(&info->lock); 934 spin_lock(&info->lock);
935 ptr = shmem_swp_entry(info, idx, NULL); 935 ptr = shmem_swp_entry(info, idx, NULL);
936 if (ptr && ptr->val == entry.val) 936 if (ptr && ptr->val == entry.val) {
937 error = add_to_page_cache(page, inode->i_mapping, 937 error = add_to_page_cache_locked(page, inode->i_mapping,
938 idx, GFP_NOWAIT); 938 idx, GFP_NOWAIT);
939 /* does mem_cgroup_uncharge_cache_page on error */
940 } else /* we must compensate for our precharge above */
941 mem_cgroup_uncharge_cache_page(page);
942
939 if (error == -EEXIST) { 943 if (error == -EEXIST) {
940 struct page *filepage = find_get_page(inode->i_mapping, idx); 944 struct page *filepage = find_get_page(inode->i_mapping, idx);
941 error = 1; 945 error = 1;
@@ -961,8 +965,6 @@ found:
961 shmem_swp_unmap(ptr); 965 shmem_swp_unmap(ptr);
962 spin_unlock(&info->lock); 966 spin_unlock(&info->lock);
963 radix_tree_preload_end(); 967 radix_tree_preload_end();
964uncharge:
965 mem_cgroup_uncharge_page(page);
966out: 968out:
967 unlock_page(page); 969 unlock_page(page);
968 page_cache_release(page); 970 page_cache_release(page);
@@ -1261,7 +1263,7 @@ repeat:
1261 } 1263 }
1262 1264
1263 /* We have to do this with page locked to prevent races */ 1265 /* We have to do this with page locked to prevent races */
1264 if (TestSetPageLocked(swappage)) { 1266 if (!trylock_page(swappage)) {
1265 shmem_swp_unmap(entry); 1267 shmem_swp_unmap(entry);
1266 spin_unlock(&info->lock); 1268 spin_unlock(&info->lock);
1267 wait_on_page_locked(swappage); 1269 wait_on_page_locked(swappage);
@@ -1297,8 +1299,8 @@ repeat:
1297 SetPageUptodate(filepage); 1299 SetPageUptodate(filepage);
1298 set_page_dirty(filepage); 1300 set_page_dirty(filepage);
1299 swap_free(swap); 1301 swap_free(swap);
1300 } else if (!(error = add_to_page_cache( 1302 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1301 swappage, mapping, idx, GFP_NOWAIT))) { 1303 idx, GFP_NOWAIT))) {
1302 info->flags |= SHMEM_PAGEIN; 1304 info->flags |= SHMEM_PAGEIN;
1303 shmem_swp_set(info, entry, 0); 1305 shmem_swp_set(info, entry, 0);
1304 shmem_swp_unmap(entry); 1306 shmem_swp_unmap(entry);
@@ -1311,24 +1313,21 @@ repeat:
1311 shmem_swp_unmap(entry); 1313 shmem_swp_unmap(entry);
1312 spin_unlock(&info->lock); 1314 spin_unlock(&info->lock);
1313 unlock_page(swappage); 1315 unlock_page(swappage);
1316 page_cache_release(swappage);
1314 if (error == -ENOMEM) { 1317 if (error == -ENOMEM) {
1315 /* allow reclaim from this memory cgroup */ 1318 /* allow reclaim from this memory cgroup */
1316 error = mem_cgroup_cache_charge(swappage, 1319 error = mem_cgroup_shrink_usage(current->mm,
1317 current->mm, gfp & ~__GFP_HIGHMEM); 1320 gfp);
1318 if (error) { 1321 if (error)
1319 page_cache_release(swappage);
1320 goto failed; 1322 goto failed;
1321 }
1322 mem_cgroup_uncharge_page(swappage);
1323 } 1323 }
1324 page_cache_release(swappage);
1325 goto repeat; 1324 goto repeat;
1326 } 1325 }
1327 } else if (sgp == SGP_READ && !filepage) { 1326 } else if (sgp == SGP_READ && !filepage) {
1328 shmem_swp_unmap(entry); 1327 shmem_swp_unmap(entry);
1329 filepage = find_get_page(mapping, idx); 1328 filepage = find_get_page(mapping, idx);
1330 if (filepage && 1329 if (filepage &&
1331 (!PageUptodate(filepage) || TestSetPageLocked(filepage))) { 1330 (!PageUptodate(filepage) || !trylock_page(filepage))) {
1332 spin_unlock(&info->lock); 1331 spin_unlock(&info->lock);
1333 wait_on_page_locked(filepage); 1332 wait_on_page_locked(filepage);
1334 page_cache_release(filepage); 1333 page_cache_release(filepage);
@@ -1358,6 +1357,8 @@ repeat:
1358 } 1357 }
1359 1358
1360 if (!filepage) { 1359 if (!filepage) {
1360 int ret;
1361
1361 spin_unlock(&info->lock); 1362 spin_unlock(&info->lock);
1362 filepage = shmem_alloc_page(gfp, info, idx); 1363 filepage = shmem_alloc_page(gfp, info, idx);
1363 if (!filepage) { 1364 if (!filepage) {
@@ -1366,6 +1367,7 @@ repeat:
1366 error = -ENOMEM; 1367 error = -ENOMEM;
1367 goto failed; 1368 goto failed;
1368 } 1369 }
1370 SetPageSwapBacked(filepage);
1369 1371
1370 /* Precharge page while we can wait, compensate after */ 1372 /* Precharge page while we can wait, compensate after */
1371 error = mem_cgroup_cache_charge(filepage, current->mm, 1373 error = mem_cgroup_cache_charge(filepage, current->mm,
@@ -1386,10 +1388,18 @@ repeat:
1386 swap = *entry; 1388 swap = *entry;
1387 shmem_swp_unmap(entry); 1389 shmem_swp_unmap(entry);
1388 } 1390 }
1389 if (error || swap.val || 0 != add_to_page_cache_lru( 1391 ret = error || swap.val;
1390 filepage, mapping, idx, GFP_NOWAIT)) { 1392 if (ret)
1393 mem_cgroup_uncharge_cache_page(filepage);
1394 else
1395 ret = add_to_page_cache_lru(filepage, mapping,
1396 idx, GFP_NOWAIT);
1397 /*
1398 * At add_to_page_cache_lru() failure, uncharge will
1399 * be done automatically.
1400 */
1401 if (ret) {
1391 spin_unlock(&info->lock); 1402 spin_unlock(&info->lock);
1392 mem_cgroup_uncharge_page(filepage);
1393 page_cache_release(filepage); 1403 page_cache_release(filepage);
1394 shmem_unacct_blocks(info->flags, 1); 1404 shmem_unacct_blocks(info->flags, 1);
1395 shmem_free_blocks(inode, 1); 1405 shmem_free_blocks(inode, 1);
@@ -1398,7 +1408,6 @@ repeat:
1398 goto failed; 1408 goto failed;
1399 goto repeat; 1409 goto repeat;
1400 } 1410 }
1401 mem_cgroup_uncharge_page(filepage);
1402 info->flags |= SHMEM_PAGEIN; 1411 info->flags |= SHMEM_PAGEIN;
1403 } 1412 }
1404 1413
@@ -1468,12 +1477,16 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
1468 if (!user_shm_lock(inode->i_size, user)) 1477 if (!user_shm_lock(inode->i_size, user))
1469 goto out_nomem; 1478 goto out_nomem;
1470 info->flags |= VM_LOCKED; 1479 info->flags |= VM_LOCKED;
1480 mapping_set_unevictable(file->f_mapping);
1471 } 1481 }
1472 if (!lock && (info->flags & VM_LOCKED) && user) { 1482 if (!lock && (info->flags & VM_LOCKED) && user) {
1473 user_shm_unlock(inode->i_size, user); 1483 user_shm_unlock(inode->i_size, user);
1474 info->flags &= ~VM_LOCKED; 1484 info->flags &= ~VM_LOCKED;
1485 mapping_clear_unevictable(file->f_mapping);
1486 scan_mapping_unevictable_pages(file->f_mapping);
1475 } 1487 }
1476 retval = 0; 1488 retval = 0;
1489
1477out_nomem: 1490out_nomem:
1478 spin_unlock(&info->lock); 1491 spin_unlock(&info->lock);
1479 return retval; 1492 return retval;
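
shmem_lock() above now also flags the mapping unevictable and, on unlock, rescans it so previously locked pages can return to the normal LRUs. The user-visible trigger is SysV shared memory locking; a minimal illustration (needs CAP_IPC_LOCK or a sufficient RLIMIT_MEMLOCK):

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
    int id = shmget(IPC_PRIVATE, 1 << 20, IPC_CREAT | 0600);

    if (id < 0)
        return 1;
    if (shmctl(id, SHM_LOCK, NULL))      /* reaches shmem_lock(file, 1, user) */
        perror("SHM_LOCK");
    if (shmctl(id, SHM_UNLOCK, NULL))    /* reaches shmem_lock(file, 0, user) */
        perror("SHM_UNLOCK");
    shmctl(id, IPC_RMID, NULL);
    return 0;
}
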
@@ -1503,7 +1516,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1503 inode->i_uid = current->fsuid; 1516 inode->i_uid = current->fsuid;
1504 inode->i_gid = current->fsgid; 1517 inode->i_gid = current->fsgid;
1505 inode->i_blocks = 0; 1518 inode->i_blocks = 0;
1506 inode->i_mapping->a_ops = &shmem_aops;
1507 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1519 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1508 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1520 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1509 inode->i_generation = get_seconds(); 1521 inode->i_generation = get_seconds();
@@ -1518,6 +1530,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1518 init_special_inode(inode, mode, dev); 1530 init_special_inode(inode, mode, dev);
1519 break; 1531 break;
1520 case S_IFREG: 1532 case S_IFREG:
1533 inode->i_mapping->a_ops = &shmem_aops;
1521 inode->i_op = &shmem_inode_operations; 1534 inode->i_op = &shmem_inode_operations;
1522 inode->i_fop = &shmem_file_operations; 1535 inode->i_fop = &shmem_file_operations;
1523 mpol_shared_policy_init(&info->policy, 1536 mpol_shared_policy_init(&info->policy,
@@ -1690,26 +1703,38 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1690 file_accessed(filp); 1703 file_accessed(filp);
1691} 1704}
1692 1705
1693static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) 1706static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1707 const struct iovec *iov, unsigned long nr_segs, loff_t pos)
1694{ 1708{
1695 read_descriptor_t desc; 1709 struct file *filp = iocb->ki_filp;
1710 ssize_t retval;
1711 unsigned long seg;
1712 size_t count;
1713 loff_t *ppos = &iocb->ki_pos;
1696 1714
1697 if ((ssize_t) count < 0) 1715 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1698 return -EINVAL; 1716 if (retval)
1699 if (!access_ok(VERIFY_WRITE, buf, count)) 1717 return retval;
1700 return -EFAULT;
1701 if (!count)
1702 return 0;
1703 1718
1704 desc.written = 0; 1719 for (seg = 0; seg < nr_segs; seg++) {
1705 desc.count = count; 1720 read_descriptor_t desc;
1706 desc.arg.buf = buf;
1707 desc.error = 0;
1708 1721
1709 do_shmem_file_read(filp, ppos, &desc, file_read_actor); 1722 desc.written = 0;
1710 if (desc.written) 1723 desc.arg.buf = iov[seg].iov_base;
1711 return desc.written; 1724 desc.count = iov[seg].iov_len;
1712 return desc.error; 1725 if (desc.count == 0)
1726 continue;
1727 desc.error = 0;
1728 do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1729 retval += desc.written;
1730 if (desc.error) {
1731 retval = retval ?: desc.error;
1732 break;
1733 }
1734 if (desc.count > 0)
1735 break;
1736 }
1737 return retval;
1713} 1738}
1714 1739
1715static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1740static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
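
Replacing shmem_file_read() with do_sync_read plus shmem_file_aio_read() routes tmpfs reads through the generic sync/aio iovec path, so vectored reads no longer need a special case. An illustrative userspace use of a vectored read against a tmpfs-backed file; the /dev/shm path is an assumption about where tmpfs is mounted:

#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
    char a[8] = {0}, b[8] = {0};
    struct iovec iov[2] = {
        { .iov_base = a, .iov_len = sizeof(a) - 1 },
        { .iov_base = b, .iov_len = sizeof(b) - 1 },
    };
    int fd = open("/dev/shm/readv-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

    if (fd < 0)
        return 1;
    write(fd, "hello, tmpfs!", 13);
    lseek(fd, 0, SEEK_SET);
    if (readv(fd, iov, 2) < 0)           /* one call fills both buffers in order */
        perror("readv");
    printf("'%s' + '%s'\n", a, b);
    close(fd);
    unlink("/dev/shm/readv-demo");
    return 0;
}
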
@@ -1907,6 +1932,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1907 return error; 1932 return error;
1908 } 1933 }
1909 unlock_page(page); 1934 unlock_page(page);
1935 inode->i_mapping->a_ops = &shmem_aops;
1910 inode->i_op = &shmem_symlink_inode_operations; 1936 inode->i_op = &shmem_symlink_inode_operations;
1911 kaddr = kmap_atomic(page, KM_USER0); 1937 kaddr = kmap_atomic(page, KM_USER0);
1912 memcpy(kaddr, symname, len); 1938 memcpy(kaddr, symname, len);
@@ -2330,7 +2356,7 @@ static void shmem_destroy_inode(struct inode *inode)
2330 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2356 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2331} 2357}
2332 2358
2333static void init_once(struct kmem_cache *cachep, void *foo) 2359static void init_once(void *foo)
2334{ 2360{
2335 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2361 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2336 2362
@@ -2369,8 +2395,9 @@ static const struct file_operations shmem_file_operations = {
2369 .mmap = shmem_mmap, 2395 .mmap = shmem_mmap,
2370#ifdef CONFIG_TMPFS 2396#ifdef CONFIG_TMPFS
2371 .llseek = generic_file_llseek, 2397 .llseek = generic_file_llseek,
2372 .read = shmem_file_read, 2398 .read = do_sync_read,
2373 .write = do_sync_write, 2399 .write = do_sync_write,
2400 .aio_read = shmem_file_aio_read,
2374 .aio_write = generic_file_aio_write, 2401 .aio_write = generic_file_aio_write,
2375 .fsync = simple_sync_file, 2402 .fsync = simple_sync_file,
2376 .splice_read = generic_file_splice_read, 2403 .splice_read = generic_file_splice_read,
@@ -2558,6 +2585,7 @@ put_memory:
2558 shmem_unacct_size(flags, size); 2585 shmem_unacct_size(flags, size);
2559 return ERR_PTR(error); 2586 return ERR_PTR(error);
2560} 2587}
2588EXPORT_SYMBOL_GPL(shmem_file_setup);
2561 2589
2562/** 2590/**
2563 * shmem_zero_setup - setup a shared anonymous mapping 2591 * shmem_zero_setup - setup a shared anonymous mapping
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index f5664c5b9eb1..8e5aadd7dcd6 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -191,7 +191,7 @@ shmem_check_acl(struct inode *inode, int mask)
191 * shmem_permission - permission() inode operation 191 * shmem_permission - permission() inode operation
192 */ 192 */
193int 193int
194shmem_permission(struct inode *inode, int mask, struct nameidata *nd) 194shmem_permission(struct inode *inode, int mask)
195{ 195{
196 return generic_permission(inode, mask, shmem_check_acl); 196 return generic_permission(inode, mask, shmem_check_acl);
197} 197}
diff --git a/mm/slab.c b/mm/slab.c
index 052e7d64537e..09187517f9dc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -95,6 +95,7 @@
95#include <linux/init.h> 95#include <linux/init.h>
96#include <linux/compiler.h> 96#include <linux/compiler.h>
97#include <linux/cpuset.h> 97#include <linux/cpuset.h>
98#include <linux/proc_fs.h>
98#include <linux/seq_file.h> 99#include <linux/seq_file.h>
99#include <linux/notifier.h> 100#include <linux/notifier.h>
100#include <linux/kallsyms.h> 101#include <linux/kallsyms.h>
@@ -406,7 +407,7 @@ struct kmem_cache {
406 unsigned int dflags; /* dynamic flags */ 407 unsigned int dflags; /* dynamic flags */
407 408
408 /* constructor func */ 409 /* constructor func */
409 void (*ctor)(struct kmem_cache *, void *); 410 void (*ctor)(void *obj);
410 411
411/* 5) cache creation/removal */ 412/* 5) cache creation/removal */
412 const char *name; 413 const char *name;
@@ -2137,8 +2138,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2137 */ 2138 */
2138struct kmem_cache * 2139struct kmem_cache *
2139kmem_cache_create (const char *name, size_t size, size_t align, 2140kmem_cache_create (const char *name, size_t size, size_t align,
2140 unsigned long flags, 2141 unsigned long flags, void (*ctor)(void *))
2141 void (*ctor)(struct kmem_cache *, void *))
2142{ 2142{
2143 size_t left_over, slab_size, ralign; 2143 size_t left_over, slab_size, ralign;
2144 struct kmem_cache *cachep = NULL, *pc; 2144 struct kmem_cache *cachep = NULL, *pc;
@@ -2653,7 +2653,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2653 * They must also be threaded. 2653 * They must also be threaded.
2654 */ 2654 */
2655 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2655 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2656 cachep->ctor(cachep, objp + obj_offset(cachep)); 2656 cachep->ctor(objp + obj_offset(cachep));
2657 2657
2658 if (cachep->flags & SLAB_RED_ZONE) { 2658 if (cachep->flags & SLAB_RED_ZONE) {
2659 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2659 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2669,7 +2669,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2669 cachep->buffer_size / PAGE_SIZE, 0); 2669 cachep->buffer_size / PAGE_SIZE, 0);
2670#else 2670#else
2671 if (cachep->ctor) 2671 if (cachep->ctor)
2672 cachep->ctor(cachep, objp); 2672 cachep->ctor(objp);
2673#endif 2673#endif
2674 slab_bufctl(slabp)[i] = i + 1; 2674 slab_bufctl(slabp)[i] = i + 1;
2675 } 2675 }
@@ -3093,7 +3093,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3093#endif 3093#endif
3094 objp += obj_offset(cachep); 3094 objp += obj_offset(cachep);
3095 if (cachep->ctor && cachep->flags & SLAB_POISON) 3095 if (cachep->ctor && cachep->flags & SLAB_POISON)
3096 cachep->ctor(cachep, objp); 3096 cachep->ctor(objp);
3097#if ARCH_SLAB_MINALIGN 3097#if ARCH_SLAB_MINALIGN
3098 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3098 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3099 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3099 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
@@ -4259,7 +4259,7 @@ static int s_show(struct seq_file *m, void *p)
4259 * + further values on SMP and with statistics enabled 4259 * + further values on SMP and with statistics enabled
4260 */ 4260 */
4261 4261
4262const struct seq_operations slabinfo_op = { 4262static const struct seq_operations slabinfo_op = {
4263 .start = s_start, 4263 .start = s_start,
4264 .next = s_next, 4264 .next = s_next,
4265 .stop = s_stop, 4265 .stop = s_stop,
@@ -4316,6 +4316,19 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4316 return res; 4316 return res;
4317} 4317}
4318 4318
4319static int slabinfo_open(struct inode *inode, struct file *file)
4320{
4321 return seq_open(file, &slabinfo_op);
4322}
4323
4324static const struct file_operations proc_slabinfo_operations = {
4325 .open = slabinfo_open,
4326 .read = seq_read,
4327 .write = slabinfo_write,
4328 .llseek = seq_lseek,
4329 .release = seq_release,
4330};
4331
4319#ifdef CONFIG_DEBUG_SLAB_LEAK 4332#ifdef CONFIG_DEBUG_SLAB_LEAK
4320 4333
4321static void *leaks_start(struct seq_file *m, loff_t *pos) 4334static void *leaks_start(struct seq_file *m, loff_t *pos)
@@ -4444,13 +4457,47 @@ static int leaks_show(struct seq_file *m, void *p)
4444 return 0; 4457 return 0;
4445} 4458}
4446 4459
4447const struct seq_operations slabstats_op = { 4460static const struct seq_operations slabstats_op = {
4448 .start = leaks_start, 4461 .start = leaks_start,
4449 .next = s_next, 4462 .next = s_next,
4450 .stop = s_stop, 4463 .stop = s_stop,
4451 .show = leaks_show, 4464 .show = leaks_show,
4452}; 4465};
4466
4467static int slabstats_open(struct inode *inode, struct file *file)
4468{
4469 unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
4470 int ret = -ENOMEM;
4471 if (n) {
4472 ret = seq_open(file, &slabstats_op);
4473 if (!ret) {
4474 struct seq_file *m = file->private_data;
4475 *n = PAGE_SIZE / (2 * sizeof(unsigned long));
4476 m->private = n;
4477 n = NULL;
4478 }
4479 kfree(n);
4480 }
4481 return ret;
4482}
4483
4484static const struct file_operations proc_slabstats_operations = {
4485 .open = slabstats_open,
4486 .read = seq_read,
4487 .llseek = seq_lseek,
4488 .release = seq_release_private,
4489};
4490#endif
4491
4492static int __init slab_proc_init(void)
4493{
4494 proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
4495#ifdef CONFIG_DEBUG_SLAB_LEAK
4496 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4453#endif 4497#endif
4498 return 0;
4499}
4500module_init(slab_proc_init);
4454#endif 4501#endif
4455 4502
4456/** 4503/**
@@ -4473,4 +4520,3 @@ size_t ksize(const void *objp)
4473 4520
4474 return obj_size(virt_to_cache(objp)); 4521 return obj_size(virt_to_cache(objp));
4475} 4522}
4476EXPORT_SYMBOL(ksize);
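
The slab.c hunks above stop exporting slabinfo_op/slabstats_op and instead register /proc/slabinfo and /proc/slab_allocators from within slab.c via proc_create(). The files read the same as before; for example (usually requires root):

#include <stdio.h>

int main(void)
{
    char line[512];
    FILE *f = fopen("/proc/slabinfo", "r");

    if (!f) {
        perror("/proc/slabinfo");
        return 1;
    }
    for (int i = 0; i < 5 && fgets(line, sizeof(line), f); i++)
        fputs(line, stdout);             /* header plus a few cache lines */
    fclose(f);
    return 0;
}
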
diff --git a/mm/slob.c b/mm/slob.c
index a3ad6671adf1..cb675d126791 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -130,17 +130,17 @@ static LIST_HEAD(free_slob_large);
130 */ 130 */
131static inline int slob_page(struct slob_page *sp) 131static inline int slob_page(struct slob_page *sp)
132{ 132{
133 return test_bit(PG_active, &sp->flags); 133 return PageSlobPage((struct page *)sp);
134} 134}
135 135
136static inline void set_slob_page(struct slob_page *sp) 136static inline void set_slob_page(struct slob_page *sp)
137{ 137{
138 __set_bit(PG_active, &sp->flags); 138 __SetPageSlobPage((struct page *)sp);
139} 139}
140 140
141static inline void clear_slob_page(struct slob_page *sp) 141static inline void clear_slob_page(struct slob_page *sp)
142{ 142{
143 __clear_bit(PG_active, &sp->flags); 143 __ClearPageSlobPage((struct page *)sp);
144} 144}
145 145
146/* 146/*
@@ -148,19 +148,19 @@ static inline void clear_slob_page(struct slob_page *sp)
148 */ 148 */
149static inline int slob_page_free(struct slob_page *sp) 149static inline int slob_page_free(struct slob_page *sp)
150{ 150{
151 return test_bit(PG_private, &sp->flags); 151 return PageSlobFree((struct page *)sp);
152} 152}
153 153
154static void set_slob_page_free(struct slob_page *sp, struct list_head *list) 154static void set_slob_page_free(struct slob_page *sp, struct list_head *list)
155{ 155{
156 list_add(&sp->list, list); 156 list_add(&sp->list, list);
157 __set_bit(PG_private, &sp->flags); 157 __SetPageSlobFree((struct page *)sp);
158} 158}
159 159
160static inline void clear_slob_page_free(struct slob_page *sp) 160static inline void clear_slob_page_free(struct slob_page *sp)
161{ 161{
162 list_del(&sp->list); 162 list_del(&sp->list);
163 __clear_bit(PG_private, &sp->flags); 163 __ClearPageSlobFree((struct page *)sp);
164} 164}
165 165
166#define SLOB_UNIT sizeof(slob_t) 166#define SLOB_UNIT sizeof(slob_t)
@@ -514,23 +514,23 @@ size_t ksize(const void *block)
514 return 0; 514 return 0;
515 515
516 sp = (struct slob_page *)virt_to_page(block); 516 sp = (struct slob_page *)virt_to_page(block);
517 if (slob_page(sp)) 517 if (slob_page(sp)) {
518 return ((slob_t *)block - 1)->units + SLOB_UNIT; 518 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
519 else 519 unsigned int *m = (unsigned int *)(block - align);
520 return SLOB_UNITS(*m) * SLOB_UNIT;
521 } else
520 return sp->page.private; 522 return sp->page.private;
521} 523}
522EXPORT_SYMBOL(ksize);
523 524
524struct kmem_cache { 525struct kmem_cache {
525 unsigned int size, align; 526 unsigned int size, align;
526 unsigned long flags; 527 unsigned long flags;
527 const char *name; 528 const char *name;
528 void (*ctor)(struct kmem_cache *, void *); 529 void (*ctor)(void *);
529}; 530};
530 531
531struct kmem_cache *kmem_cache_create(const char *name, size_t size, 532struct kmem_cache *kmem_cache_create(const char *name, size_t size,
532 size_t align, unsigned long flags, 533 size_t align, unsigned long flags, void (*ctor)(void *))
533 void (*ctor)(struct kmem_cache *, void *))
534{ 534{
535 struct kmem_cache *c; 535 struct kmem_cache *c;
536 536
@@ -575,7 +575,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
575 b = slob_new_page(flags, get_order(c->size), node); 575 b = slob_new_page(flags, get_order(c->size), node);
576 576
577 if (c->ctor) 577 if (c->ctor)
578 c->ctor(c, b); 578 c->ctor(b);
579 579
580 return b; 580 return b;
581} 581}
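
Across slab, slob and slub the constructor callback loses its struct kmem_cache argument and becomes a plain void (*ctor)(void *). A toy userspace cache showing the new single-argument convention; the cache structure below is invented for illustration and is not the kernel API:

#include <stdio.h>
#include <stdlib.h>

struct obj_cache {
    size_t size;
    void (*ctor)(void *);        /* new-style: constructor gets only the object */
};

struct foo { int refcount; };

static void foo_ctor(void *p)
{
    ((struct foo *)p)->refcount = 1;
}

static void *cache_alloc(struct obj_cache *c)
{
    void *p = malloc(c->size);

    if (p && c->ctor)
        c->ctor(p);              /* no cache pointer passed any more */
    return p;
}

int main(void)
{
    struct obj_cache cache = { sizeof(struct foo), foo_ctor };
    struct foo *f = cache_alloc(&cache);

    printf("%d\n", f ? f->refcount : -1);
    free(f);
    return 0;
}
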
diff --git a/mm/slub.c b/mm/slub.c
index 35ab38a94b46..7ad489af9561 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -14,6 +14,7 @@
14#include <linux/interrupt.h> 14#include <linux/interrupt.h>
15#include <linux/bitops.h> 15#include <linux/bitops.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h>
17#include <linux/seq_file.h> 18#include <linux/seq_file.h>
18#include <linux/cpu.h> 19#include <linux/cpu.h>
19#include <linux/cpuset.h> 20#include <linux/cpuset.h>
@@ -102,44 +103,12 @@
102 * the fast path and disables lockless freelists. 103 * the fast path and disables lockless freelists.
103 */ 104 */
104 105
105#define FROZEN (1 << PG_active)
106
107#ifdef CONFIG_SLUB_DEBUG 106#ifdef CONFIG_SLUB_DEBUG
108#define SLABDEBUG (1 << PG_error) 107#define SLABDEBUG 1
109#else 108#else
110#define SLABDEBUG 0 109#define SLABDEBUG 0
111#endif 110#endif
112 111
113static inline int SlabFrozen(struct page *page)
114{
115 return page->flags & FROZEN;
116}
117
118static inline void SetSlabFrozen(struct page *page)
119{
120 page->flags |= FROZEN;
121}
122
123static inline void ClearSlabFrozen(struct page *page)
124{
125 page->flags &= ~FROZEN;
126}
127
128static inline int SlabDebug(struct page *page)
129{
130 return page->flags & SLABDEBUG;
131}
132
133static inline void SetSlabDebug(struct page *page)
134{
135 page->flags |= SLABDEBUG;
136}
137
138static inline void ClearSlabDebug(struct page *page)
139{
140 page->flags &= ~SLABDEBUG;
141}
142
143/* 112/*
144 * Issues still to be resolved: 113 * Issues still to be resolved:
145 * 114 *
@@ -492,7 +461,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
492 if (p > addr + 16) 461 if (p > addr + 16)
493 print_section("Bytes b4", p - 16, 16); 462 print_section("Bytes b4", p - 16, 16);
494 463
495 print_section("Object", p, min(s->objsize, 128)); 464 print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE));
496 465
497 if (s->flags & SLAB_RED_ZONE) 466 if (s->flags & SLAB_RED_ZONE)
498 print_section("Redzone", p + s->objsize, 467 print_section("Redzone", p + s->objsize,
@@ -971,7 +940,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
971 } 940 }
972 941
973 /* Special debug activities for freeing objects */ 942 /* Special debug activities for freeing objects */
974 if (!SlabFrozen(page) && !page->freelist) 943 if (!PageSlubFrozen(page) && !page->freelist)
975 remove_full(s, page); 944 remove_full(s, page);
976 if (s->flags & SLAB_STORE_USER) 945 if (s->flags & SLAB_STORE_USER)
977 set_track(s, object, TRACK_FREE, addr); 946 set_track(s, object, TRACK_FREE, addr);
@@ -1044,7 +1013,7 @@ __setup("slub_debug", setup_slub_debug);
1044 1013
1045static unsigned long kmem_cache_flags(unsigned long objsize, 1014static unsigned long kmem_cache_flags(unsigned long objsize,
1046 unsigned long flags, const char *name, 1015 unsigned long flags, const char *name,
1047 void (*ctor)(struct kmem_cache *, void *)) 1016 void (*ctor)(void *))
1048{ 1017{
1049 /* 1018 /*
1050 * Enable debugging if selected on the kernel commandline. 1019 * Enable debugging if selected on the kernel commandline.
@@ -1072,7 +1041,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1072static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1041static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1073static inline unsigned long kmem_cache_flags(unsigned long objsize, 1042static inline unsigned long kmem_cache_flags(unsigned long objsize,
1074 unsigned long flags, const char *name, 1043 unsigned long flags, const char *name,
1075 void (*ctor)(struct kmem_cache *, void *)) 1044 void (*ctor)(void *))
1076{ 1045{
1077 return flags; 1046 return flags;
1078} 1047}
@@ -1135,7 +1104,7 @@ static void setup_object(struct kmem_cache *s, struct page *page,
1135{ 1104{
1136 setup_object_debug(s, page, object); 1105 setup_object_debug(s, page, object);
1137 if (unlikely(s->ctor)) 1106 if (unlikely(s->ctor))
1138 s->ctor(s, object); 1107 s->ctor(object);
1139} 1108}
1140 1109
1141static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1110static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1157,7 +1126,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1157 page->flags |= 1 << PG_slab; 1126 page->flags |= 1 << PG_slab;
1158 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 1127 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
1159 SLAB_STORE_USER | SLAB_TRACE)) 1128 SLAB_STORE_USER | SLAB_TRACE))
1160 SetSlabDebug(page); 1129 __SetPageSlubDebug(page);
1161 1130
1162 start = page_address(page); 1131 start = page_address(page);
1163 1132
@@ -1184,14 +1153,14 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1184 int order = compound_order(page); 1153 int order = compound_order(page);
1185 int pages = 1 << order; 1154 int pages = 1 << order;
1186 1155
1187 if (unlikely(SlabDebug(page))) { 1156 if (unlikely(SLABDEBUG && PageSlubDebug(page))) {
1188 void *p; 1157 void *p;
1189 1158
1190 slab_pad_check(s, page); 1159 slab_pad_check(s, page);
1191 for_each_object(p, s, page_address(page), 1160 for_each_object(p, s, page_address(page),
1192 page->objects) 1161 page->objects)
1193 check_object(s, page, p, 0); 1162 check_object(s, page, p, 0);
1194 ClearSlabDebug(page); 1163 __ClearPageSlubDebug(page);
1195 } 1164 }
1196 1165
1197 mod_zone_page_state(page_zone(page), 1166 mod_zone_page_state(page_zone(page),
@@ -1288,7 +1257,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
1288 if (slab_trylock(page)) { 1257 if (slab_trylock(page)) {
1289 list_del(&page->lru); 1258 list_del(&page->lru);
1290 n->nr_partial--; 1259 n->nr_partial--;
1291 SetSlabFrozen(page); 1260 __SetPageSlubFrozen(page);
1292 return 1; 1261 return 1;
1293 } 1262 }
1294 return 0; 1263 return 0;
@@ -1361,7 +1330,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1361 n = get_node(s, zone_to_nid(zone)); 1330 n = get_node(s, zone_to_nid(zone));
1362 1331
1363 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1332 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1364 n->nr_partial > MIN_PARTIAL) { 1333 n->nr_partial > n->min_partial) {
1365 page = get_partial_node(n); 1334 page = get_partial_node(n);
1366 if (page) 1335 if (page)
1367 return page; 1336 return page;
@@ -1398,7 +1367,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1398 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1367 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1399 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); 1368 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
1400 1369
1401 ClearSlabFrozen(page); 1370 __ClearPageSlubFrozen(page);
1402 if (page->inuse) { 1371 if (page->inuse) {
1403 1372
1404 if (page->freelist) { 1373 if (page->freelist) {
@@ -1406,13 +1375,14 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1406 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1375 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1407 } else { 1376 } else {
1408 stat(c, DEACTIVATE_FULL); 1377 stat(c, DEACTIVATE_FULL);
1409 if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) 1378 if (SLABDEBUG && PageSlubDebug(page) &&
1379 (s->flags & SLAB_STORE_USER))
1410 add_full(n, page); 1380 add_full(n, page);
1411 } 1381 }
1412 slab_unlock(page); 1382 slab_unlock(page);
1413 } else { 1383 } else {
1414 stat(c, DEACTIVATE_EMPTY); 1384 stat(c, DEACTIVATE_EMPTY);
1415 if (n->nr_partial < MIN_PARTIAL) { 1385 if (n->nr_partial < n->min_partial) {
1416 /* 1386 /*
1417 * Adding an empty slab to the partial slabs in order 1387 * Adding an empty slab to the partial slabs in order
1418 * to avoid page allocator overhead. This slab needs 1388 * to avoid page allocator overhead. This slab needs
@@ -1495,15 +1465,7 @@ static void flush_cpu_slab(void *d)
1495 1465
1496static void flush_all(struct kmem_cache *s) 1466static void flush_all(struct kmem_cache *s)
1497{ 1467{
1498#ifdef CONFIG_SMP
1499 on_each_cpu(flush_cpu_slab, s, 1); 1468 on_each_cpu(flush_cpu_slab, s, 1);
1500#else
1501 unsigned long flags;
1502
1503 local_irq_save(flags);
1504 flush_cpu_slab(s);
1505 local_irq_restore(flags);
1506#endif
1507} 1469}
1508 1470
1509/* 1471/*
@@ -1559,7 +1521,7 @@ load_freelist:
1559 object = c->page->freelist; 1521 object = c->page->freelist;
1560 if (unlikely(!object)) 1522 if (unlikely(!object))
1561 goto another_slab; 1523 goto another_slab;
1562 if (unlikely(SlabDebug(c->page))) 1524 if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
1563 goto debug; 1525 goto debug;
1564 1526
1565 c->freelist = object[c->offset]; 1527 c->freelist = object[c->offset];
@@ -1596,7 +1558,7 @@ new_slab:
1596 if (c->page) 1558 if (c->page)
1597 flush_slab(s, c); 1559 flush_slab(s, c);
1598 slab_lock(new); 1560 slab_lock(new);
1599 SetSlabFrozen(new); 1561 __SetPageSlubFrozen(new);
1600 c->page = new; 1562 c->page = new;
1601 goto load_freelist; 1563 goto load_freelist;
1602 } 1564 }
@@ -1682,7 +1644,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1682 stat(c, FREE_SLOWPATH); 1644 stat(c, FREE_SLOWPATH);
1683 slab_lock(page); 1645 slab_lock(page);
1684 1646
1685 if (unlikely(SlabDebug(page))) 1647 if (unlikely(SLABDEBUG && PageSlubDebug(page)))
1686 goto debug; 1648 goto debug;
1687 1649
1688checks_ok: 1650checks_ok:
@@ -1690,7 +1652,7 @@ checks_ok:
1690 page->freelist = object; 1652 page->freelist = object;
1691 page->inuse--; 1653 page->inuse--;
1692 1654
1693 if (unlikely(SlabFrozen(page))) { 1655 if (unlikely(PageSlubFrozen(page))) {
1694 stat(c, FREE_FROZEN); 1656 stat(c, FREE_FROZEN);
1695 goto out_unlock; 1657 goto out_unlock;
1696 } 1658 }
@@ -1952,13 +1914,26 @@ static void init_kmem_cache_cpu(struct kmem_cache *s,
1952#endif 1914#endif
1953} 1915}
1954 1916
1955static void init_kmem_cache_node(struct kmem_cache_node *n) 1917static void
1918init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
1956{ 1919{
1957 n->nr_partial = 0; 1920 n->nr_partial = 0;
1921
1922 /*
1923 * The larger the object size is, the more pages we want on the partial
1924 * list to avoid pounding the page allocator excessively.
1925 */
1926 n->min_partial = ilog2(s->size);
1927 if (n->min_partial < MIN_PARTIAL)
1928 n->min_partial = MIN_PARTIAL;
1929 else if (n->min_partial > MAX_PARTIAL)
1930 n->min_partial = MAX_PARTIAL;
1931
1958 spin_lock_init(&n->list_lock); 1932 spin_lock_init(&n->list_lock);
1959 INIT_LIST_HEAD(&n->partial); 1933 INIT_LIST_HEAD(&n->partial);
1960#ifdef CONFIG_SLUB_DEBUG 1934#ifdef CONFIG_SLUB_DEBUG
1961 atomic_long_set(&n->nr_slabs, 0); 1935 atomic_long_set(&n->nr_slabs, 0);
1936 atomic_long_set(&n->total_objects, 0);
1962 INIT_LIST_HEAD(&n->full); 1937 INIT_LIST_HEAD(&n->full);
1963#endif 1938#endif
1964} 1939}
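
Editor's note: the new per-node min_partial floor scales with ilog2(s->size), clamped between MIN_PARTIAL and MAX_PARTIAL. A minimal standalone sketch of that clamp follows (the MIN_PARTIAL/MAX_PARTIAL values of 5 and 10 are assumptions taken from slub.c of this era, and the shift loop stands in for ilog2()):

#include <stdio.h>

#define MIN_PARTIAL 5			/* assumed to match slub.c */
#define MAX_PARTIAL 10

static unsigned long min_partial_for(unsigned long size)
{
	unsigned long n = 0;

	while (size >>= 1)		/* open-coded ilog2() for the sketch */
		n++;
	if (n < MIN_PARTIAL)
		n = MIN_PARTIAL;
	else if (n > MAX_PARTIAL)
		n = MAX_PARTIAL;
	return n;
}

int main(void)
{
	/* 64-byte objects keep at least 6 partial slabs; 4096-byte ones cap at 10 */
	printf("%lu %lu\n", min_partial_for(64), min_partial_for(4096));
	return 0;
}
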
@@ -2126,7 +2101,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
2126 init_object(kmalloc_caches, n, 1); 2101 init_object(kmalloc_caches, n, 1);
2127 init_tracking(kmalloc_caches, n); 2102 init_tracking(kmalloc_caches, n);
2128#endif 2103#endif
2129 init_kmem_cache_node(n); 2104 init_kmem_cache_node(n, kmalloc_caches);
2130 inc_slabs_node(kmalloc_caches, node, page->objects); 2105 inc_slabs_node(kmalloc_caches, node, page->objects);
2131 2106
2132 /* 2107 /*
@@ -2183,7 +2158,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2183 2158
2184 } 2159 }
2185 s->node[node] = n; 2160 s->node[node] = n;
2186 init_kmem_cache_node(n); 2161 init_kmem_cache_node(n, s);
2187 } 2162 }
2188 return 1; 2163 return 1;
2189} 2164}
@@ -2194,7 +2169,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
2194 2169
2195static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 2170static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2196{ 2171{
2197 init_kmem_cache_node(&s->local_node); 2172 init_kmem_cache_node(&s->local_node, s);
2198 return 1; 2173 return 1;
2199} 2174}
2200#endif 2175#endif
@@ -2325,7 +2300,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2325static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2300static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2326 const char *name, size_t size, 2301 const char *name, size_t size,
2327 size_t align, unsigned long flags, 2302 size_t align, unsigned long flags,
2328 void (*ctor)(struct kmem_cache *, void *)) 2303 void (*ctor)(void *))
2329{ 2304{
2330 memset(s, 0, kmem_size); 2305 memset(s, 0, kmem_size);
2331 s->name = name; 2306 s->name = name;
@@ -2339,7 +2314,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2339 2314
2340 s->refcount = 1; 2315 s->refcount = 1;
2341#ifdef CONFIG_NUMA 2316#ifdef CONFIG_NUMA
2342 s->remote_node_defrag_ratio = 100; 2317 s->remote_node_defrag_ratio = 1000;
2343#endif 2318#endif
2344 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2319 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
2345 goto error; 2320 goto error;
@@ -2754,7 +2729,6 @@ size_t ksize(const void *object)
2754 */ 2729 */
2755 return s->size; 2730 return s->size;
2756} 2731}
2757EXPORT_SYMBOL(ksize);
2758 2732
2759void kfree(const void *x) 2733void kfree(const void *x)
2760{ 2734{
@@ -2929,7 +2903,7 @@ static int slab_mem_going_online_callback(void *arg)
2929 ret = -ENOMEM; 2903 ret = -ENOMEM;
2930 goto out; 2904 goto out;
2931 } 2905 }
2932 init_kmem_cache_node(n); 2906 init_kmem_cache_node(n, s);
2933 s->node[nid] = n; 2907 s->node[nid] = n;
2934 } 2908 }
2935out: 2909out:
@@ -3081,7 +3055,7 @@ static int slab_unmergeable(struct kmem_cache *s)
3081 3055
3082static struct kmem_cache *find_mergeable(size_t size, 3056static struct kmem_cache *find_mergeable(size_t size,
3083 size_t align, unsigned long flags, const char *name, 3057 size_t align, unsigned long flags, const char *name,
3084 void (*ctor)(struct kmem_cache *, void *)) 3058 void (*ctor)(void *))
3085{ 3059{
3086 struct kmem_cache *s; 3060 struct kmem_cache *s;
3087 3061
@@ -3121,8 +3095,7 @@ static struct kmem_cache *find_mergeable(size_t size,
3121} 3095}
3122 3096
3123struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3097struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3124 size_t align, unsigned long flags, 3098 size_t align, unsigned long flags, void (*ctor)(void *))
3125 void (*ctor)(struct kmem_cache *, void *))
3126{ 3099{
3127 struct kmem_cache *s; 3100 struct kmem_cache *s;
3128 3101
@@ -3325,12 +3298,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3325 s->name, page); 3298 s->name, page);
3326 3299
3327 if (s->flags & DEBUG_DEFAULT_FLAGS) { 3300 if (s->flags & DEBUG_DEFAULT_FLAGS) {
3328 if (!SlabDebug(page)) 3301 if (!PageSlubDebug(page))
3329 printk(KERN_ERR "SLUB %s: SlabDebug not set " 3302 printk(KERN_ERR "SLUB %s: SlubDebug not set "
3330 "on slab 0x%p\n", s->name, page); 3303 "on slab 0x%p\n", s->name, page);
3331 } else { 3304 } else {
3332 if (SlabDebug(page)) 3305 if (PageSlubDebug(page))
3333 printk(KERN_ERR "SLUB %s: SlabDebug set on " 3306 printk(KERN_ERR "SLUB %s: SlubDebug set on "
3334 "slab 0x%p\n", s->name, page); 3307 "slab 0x%p\n", s->name, page);
3335 } 3308 }
3336} 3309}
@@ -4087,7 +4060,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
4087 if (err) 4060 if (err)
4088 return err; 4061 return err;
4089 4062
4090 if (ratio < 100) 4063 if (ratio <= 100)
4091 s->remote_node_defrag_ratio = ratio * 10; 4064 s->remote_node_defrag_ratio = ratio * 10;
4092 4065
4093 return length; 4066 return length;
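
Editor's note: with the default remote_node_defrag_ratio raised to 1000 in kmem_cache_open() and the sysfs store now accepting values up to and including 100, the file takes a percentage while the cache keeps tenths of a percent internally. A hedged standalone sketch of that conversion (struct cache_sketch and store_defrag_ratio are made-up stand-ins):

struct cache_sketch {			/* stand-in for struct kmem_cache */
	unsigned int remote_node_defrag_ratio;
};

/* Accept 0..100 percent from sysfs and keep tenths internally, so the
 * new default of 1000 corresponds to 100%. Mirrors the store hunk above. */
static void store_defrag_ratio(struct cache_sketch *s, unsigned long ratio)
{
	if (ratio <= 100)
		s->remote_node_defrag_ratio = ratio * 10;
}
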
@@ -4445,14 +4418,6 @@ __initcall(slab_sysfs_init);
4445 * The /proc/slabinfo ABI 4418 * The /proc/slabinfo ABI
4446 */ 4419 */
4447#ifdef CONFIG_SLABINFO 4420#ifdef CONFIG_SLABINFO
4448
4449ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4450 size_t count, loff_t *ppos)
4451{
4452 return -EINVAL;
4453}
4454
4455
4456static void print_slabinfo_header(struct seq_file *m) 4421static void print_slabinfo_header(struct seq_file *m)
4457{ 4422{
4458 seq_puts(m, "slabinfo - version: 2.1\n"); 4423 seq_puts(m, "slabinfo - version: 2.1\n");
@@ -4520,11 +4485,29 @@ static int s_show(struct seq_file *m, void *p)
4520 return 0; 4485 return 0;
4521} 4486}
4522 4487
4523const struct seq_operations slabinfo_op = { 4488static const struct seq_operations slabinfo_op = {
4524 .start = s_start, 4489 .start = s_start,
4525 .next = s_next, 4490 .next = s_next,
4526 .stop = s_stop, 4491 .stop = s_stop,
4527 .show = s_show, 4492 .show = s_show,
4528}; 4493};
4529 4494
4495static int slabinfo_open(struct inode *inode, struct file *file)
4496{
4497 return seq_open(file, &slabinfo_op);
4498}
4499
4500static const struct file_operations proc_slabinfo_operations = {
4501 .open = slabinfo_open,
4502 .read = seq_read,
4503 .llseek = seq_lseek,
4504 .release = seq_release,
4505};
4506
4507static int __init slab_proc_init(void)
4508{
4509 proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
4510 return 0;
4511}
4512module_init(slab_proc_init);
4530#endif /* CONFIG_SLABINFO */ 4513#endif /* CONFIG_SLABINFO */
diff --git a/mm/sparse.c b/mm/sparse.c
index 36511c7b5e2c..39db301b920d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -147,22 +147,41 @@ static inline int sparse_early_nid(struct mem_section *section)
147 return (section->section_mem_map >> SECTION_NID_SHIFT); 147 return (section->section_mem_map >> SECTION_NID_SHIFT);
148} 148}
149 149
150/* Record a memory area against a node. */ 150/* Validate the physical addressing limitations of the model */
151void __init memory_present(int nid, unsigned long start, unsigned long end) 151void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
152 unsigned long *end_pfn)
152{ 153{
153 unsigned long max_arch_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); 154 unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
154 unsigned long pfn;
155 155
156 /* 156 /*
157 * Sanity checks - do not allow an architecture to pass 157 * Sanity checks - do not allow an architecture to pass
158 * in larger pfns than the maximum scope of sparsemem: 158 * in larger pfns than the maximum scope of sparsemem:
159 */ 159 */
160 if (start >= max_arch_pfn) 160 if (*start_pfn > max_sparsemem_pfn) {
161 return; 161 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
162 if (end >= max_arch_pfn) 162 "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
163 end = max_arch_pfn; 163 *start_pfn, *end_pfn, max_sparsemem_pfn);
164 WARN_ON_ONCE(1);
165 *start_pfn = max_sparsemem_pfn;
166 *end_pfn = max_sparsemem_pfn;
167 }
168
169 if (*end_pfn > max_sparsemem_pfn) {
170 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
171 "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
172 *start_pfn, *end_pfn, max_sparsemem_pfn);
173 WARN_ON_ONCE(1);
174 *end_pfn = max_sparsemem_pfn;
175 }
176}
177
178/* Record a memory area against a node. */
179void __init memory_present(int nid, unsigned long start, unsigned long end)
180{
181 unsigned long pfn;
164 182
165 start &= PAGE_SECTION_MASK; 183 start &= PAGE_SECTION_MASK;
184 mminit_validate_memmodel_limits(&start, &end);
166 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { 185 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
167 unsigned long section = pfn_to_section_nr(pfn); 186 unsigned long section = pfn_to_section_nr(pfn);
168 struct mem_section *ms; 187 struct mem_section *ms;
@@ -187,6 +206,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
187 unsigned long pfn; 206 unsigned long pfn;
188 unsigned long nr_pages = 0; 207 unsigned long nr_pages = 0;
189 208
209 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
190 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 210 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
191 if (nid != early_pfn_to_nid(pfn)) 211 if (nid != early_pfn_to_nid(pfn))
192 continue; 212 continue;
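
Editor's note: mminit_validate_memmodel_limits() clamps whatever pfn range an architecture passes in to the maximum SPARSEMEM can address. A standalone sketch of the clamping behaviour (MAX_PHYSMEM_BITS of 44 and PAGE_SHIFT of 12 are example values only; assumes a 64-bit unsigned long):

#include <stdio.h>

#define EX_MAX_PHYSMEM_BITS 44		/* arch-specific in reality */
#define EX_PAGE_SHIFT       12

static void validate_limits(unsigned long *start_pfn, unsigned long *end_pfn)
{
	unsigned long max_pfn = 1UL << (EX_MAX_PHYSMEM_BITS - EX_PAGE_SHIFT);

	if (*start_pfn > max_pfn)
		*start_pfn = *end_pfn = max_pfn;	/* range entirely out of scope */
	if (*end_pfn > max_pfn)
		*end_pfn = max_pfn;			/* truncate the tail */
}

int main(void)
{
	unsigned long s = 1UL << 40, e = 1UL << 41;	/* both past the 2^32-pfn limit */

	validate_limits(&s, &e);
	printf("%lx-%lx\n", s, e);	/* both clamped to 100000000 */
	return 0;
}
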
@@ -248,16 +268,92 @@ static unsigned long *__kmalloc_section_usemap(void)
248} 268}
249#endif /* CONFIG_MEMORY_HOTPLUG */ 269#endif /* CONFIG_MEMORY_HOTPLUG */
250 270
271#ifdef CONFIG_MEMORY_HOTREMOVE
272static unsigned long * __init
273sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
274{
275 unsigned long section_nr;
276
277 /*
278 * A page may contain usemaps for other sections preventing the
279 * page being freed and making a section unremovable while
280 * other sections referencing the usemap remain active. Similarly,

281 * a pgdat can prevent a section being removed. If section A
282 * contains a pgdat and section B contains the usemap, both
283 * sections become inter-dependent. This allocates usemaps
284 * from the same section as the pgdat where possible to avoid
285 * this problem.
286 */
287 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
288 return alloc_bootmem_section(usemap_size(), section_nr);
289}
290
291static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
292{
293 unsigned long usemap_snr, pgdat_snr;
294 static unsigned long old_usemap_snr = NR_MEM_SECTIONS;
295 static unsigned long old_pgdat_snr = NR_MEM_SECTIONS;
296 struct pglist_data *pgdat = NODE_DATA(nid);
297 int usemap_nid;
298
299 usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
300 pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
301 if (usemap_snr == pgdat_snr)
302 return;
303
304 if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
305 /* skip redundant message */
306 return;
307
308 old_usemap_snr = usemap_snr;
309 old_pgdat_snr = pgdat_snr;
310
311 usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
312 if (usemap_nid != nid) {
313 printk(KERN_INFO
314 "node %d must be removed before remove section %ld\n",
315 nid, usemap_snr);
316 return;
317 }
318 /*
319 * There is a circular dependency.
320 * Some platforms allow un-removable sections because they will just
321 * gather other removable sections for dynamic partitioning.
322 * Just report the un-removable section's number here.
323 */
324 printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr,
325 pgdat_snr, nid);
326 printk(KERN_CONT
327 " have a circular dependency on usemap and pgdat allocations\n");
328}
329#else
330static unsigned long * __init
331sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
332{
333 return NULL;
334}
335
336static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
337{
338}
339#endif /* CONFIG_MEMORY_HOTREMOVE */
340
251static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) 341static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
252{ 342{
253 unsigned long *usemap; 343 unsigned long *usemap;
254 struct mem_section *ms = __nr_to_section(pnum); 344 struct mem_section *ms = __nr_to_section(pnum);
255 int nid = sparse_early_nid(ms); 345 int nid = sparse_early_nid(ms);
256 346
257 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); 347 usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
258 if (usemap) 348 if (usemap)
259 return usemap; 349 return usemap;
260 350
351 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
352 if (usemap) {
353 check_usemap_section_nr(nid, usemap);
354 return usemap;
355 }
356
261 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ 357 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
262 nid = 0; 358 nid = 0;
263 359
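
Editor's note: the hot-remove check added above reduces to asking whether the usemap and the node's pgdat land in the same SPARSEMEM section, i.e. whether their physical addresses share a section number. A sketch of that mapping (SECTION_SIZE_BITS of 27 and PAGE_SHIFT of 12 are example values only):

#define EX_SECTION_SIZE_BITS 27		/* arch-specific in reality */
#define EX_PAGE_SHIFT        12

static unsigned long paddr_to_section(unsigned long paddr)
{
	unsigned long pfn = paddr >> EX_PAGE_SHIFT;

	return pfn >> (EX_SECTION_SIZE_BITS - EX_PAGE_SHIFT);	/* == paddr >> 27 */
}

/* paddr_to_section(__pa(usemap)) == paddr_to_section(__pa(pgdat)) means the
 * usemap pins no section other than the pgdat's own, so hot-remove sees no
 * extra cross-section dependency and check_usemap_section_nr() stays quiet. */
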
@@ -280,7 +376,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
280} 376}
281#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 377#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
282 378
283struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 379static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
284{ 380{
285 struct page *map; 381 struct page *map;
286 struct mem_section *ms = __nr_to_section(pnum); 382 struct mem_section *ms = __nr_to_section(pnum);
diff --git a/mm/swap.c b/mm/swap.c
index 45c9f25a8a3b..2152e48a7b8f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,12 +31,13 @@
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33 33
34#include "internal.h"
35
34/* How many pages do we try to swap or page in/out together? */ 36/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 37int page_cluster;
36 38
37static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; 39static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
38static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; 40static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
39static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };
40 41
41/* 42/*
42 * This path almost never happens for VM activity - pages are normally 43 * This path almost never happens for VM activity - pages are normally
@@ -116,8 +117,9 @@ static void pagevec_move_tail(struct pagevec *pvec)
116 zone = pagezone; 117 zone = pagezone;
117 spin_lock(&zone->lru_lock); 118 spin_lock(&zone->lru_lock);
118 } 119 }
119 if (PageLRU(page) && !PageActive(page)) { 120 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
120 list_move_tail(&page->lru, &zone->inactive_list); 121 int lru = page_is_file_cache(page);
122 list_move_tail(&page->lru, &zone->lru[lru].list);
121 pgmoved++; 123 pgmoved++;
122 } 124 }
123 } 125 }
@@ -136,7 +138,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
136void rotate_reclaimable_page(struct page *page) 138void rotate_reclaimable_page(struct page *page)
137{ 139{
138 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && 140 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
139 PageLRU(page)) { 141 !PageUnevictable(page) && PageLRU(page)) {
140 struct pagevec *pvec; 142 struct pagevec *pvec;
141 unsigned long flags; 143 unsigned long flags;
142 144
@@ -157,12 +159,19 @@ void activate_page(struct page *page)
157 struct zone *zone = page_zone(page); 159 struct zone *zone = page_zone(page);
158 160
159 spin_lock_irq(&zone->lru_lock); 161 spin_lock_irq(&zone->lru_lock);
160 if (PageLRU(page) && !PageActive(page)) { 162 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
161 del_page_from_inactive_list(zone, page); 163 int file = page_is_file_cache(page);
164 int lru = LRU_BASE + file;
165 del_page_from_lru_list(zone, page, lru);
166
162 SetPageActive(page); 167 SetPageActive(page);
163 add_page_to_active_list(zone, page); 168 lru += LRU_ACTIVE;
169 add_page_to_lru_list(zone, page, lru);
164 __count_vm_event(PGACTIVATE); 170 __count_vm_event(PGACTIVATE);
165 mem_cgroup_move_lists(page, true); 171 mem_cgroup_move_lists(page, lru);
172
173 zone->recent_rotated[!!file]++;
174 zone->recent_scanned[!!file]++;
166 } 175 }
167 spin_unlock_irq(&zone->lru_lock); 176 spin_unlock_irq(&zone->lru_lock);
168} 177}
@@ -176,7 +185,8 @@ void activate_page(struct page *page)
176 */ 185 */
177void mark_page_accessed(struct page *page) 186void mark_page_accessed(struct page *page)
178{ 187{
179 if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { 188 if (!PageActive(page) && !PageUnevictable(page) &&
189 PageReferenced(page) && PageLRU(page)) {
180 activate_page(page); 190 activate_page(page);
181 ClearPageReferenced(page); 191 ClearPageReferenced(page);
182 } else if (!PageReferenced(page)) { 192 } else if (!PageReferenced(page)) {
@@ -186,28 +196,73 @@ void mark_page_accessed(struct page *page)
186 196
187EXPORT_SYMBOL(mark_page_accessed); 197EXPORT_SYMBOL(mark_page_accessed);
188 198
189/** 199void __lru_cache_add(struct page *page, enum lru_list lru)
190 * lru_cache_add: add a page to the page lists
191 * @page: the page to add
192 */
193void lru_cache_add(struct page *page)
194{ 200{
195 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); 201 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
196 202
197 page_cache_get(page); 203 page_cache_get(page);
198 if (!pagevec_add(pvec, page)) 204 if (!pagevec_add(pvec, page))
199 __pagevec_lru_add(pvec); 205 ____pagevec_lru_add(pvec, lru);
200 put_cpu_var(lru_add_pvecs); 206 put_cpu_var(lru_add_pvecs);
201} 207}
202 208
203void lru_cache_add_active(struct page *page) 209/**
210 * lru_cache_add_lru - add a page to a page list
211 * @page: the page to be added to the LRU.
212 * @lru: the LRU list to which the page is added.
213 */
214void lru_cache_add_lru(struct page *page, enum lru_list lru)
204{ 215{
205 struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); 216 if (PageActive(page)) {
217 VM_BUG_ON(PageUnevictable(page));
218 ClearPageActive(page);
219 } else if (PageUnevictable(page)) {
220 VM_BUG_ON(PageActive(page));
221 ClearPageUnevictable(page);
222 }
206 223
207 page_cache_get(page); 224 VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
208 if (!pagevec_add(pvec, page)) 225 __lru_cache_add(page, lru);
209 __pagevec_lru_add_active(pvec); 226}
210 put_cpu_var(lru_add_active_pvecs); 227
228/**
229 * add_page_to_unevictable_list - add a page to the unevictable list
230 * @page: the page to be added to the unevictable list
231 *
232 * Add page directly to its zone's unevictable list. To avoid races with
233 * tasks that might be making the page evictable, through eg. munlock,
234 * munmap or exit, while it's not on the lru, we want to add the page
235 * while it's locked or otherwise "invisible" to other tasks. This is
236 * difficult to do when using the pagevec cache, so bypass that.
237 */
238void add_page_to_unevictable_list(struct page *page)
239{
240 struct zone *zone = page_zone(page);
241
242 spin_lock_irq(&zone->lru_lock);
243 SetPageUnevictable(page);
244 SetPageLRU(page);
245 add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
246 spin_unlock_irq(&zone->lru_lock);
247}
248
249/**
250 * lru_cache_add_active_or_unevictable
251 * @page: the page to be added to LRU
252 * @vma: vma in which page is mapped for determining reclaimability
253 *
254 * place @page on active or unevictable LRU list, depending on
255 * page_evictable(). Note that if the page is not evictable,
256 * it goes directly back onto its zone's unevictable list. It does
257 * NOT use a per cpu pagevec.
258 */
259void lru_cache_add_active_or_unevictable(struct page *page,
260 struct vm_area_struct *vma)
261{
262 if (page_evictable(page, vma))
263 lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
264 else
265 add_page_to_unevictable_list(page);
211} 266}
212 267
213/* 268/*
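
Editor's note: taken together, lru_cache_add_lru() and add_page_to_unevictable_list() mean a newly faulted page is routed by two bits: is it evictable, and is it file backed. A small standalone sketch of that decision (the enum ordering mirrors the new lru_list layout but is an assumption here):

enum lru_sketch { INACTIVE_ANON, ACTIVE_ANON, INACTIVE_FILE, ACTIVE_FILE, UNEVICTABLE };

static enum lru_sketch pick_lru(int evictable, int file_backed)
{
	if (!evictable)
		return UNEVICTABLE;	/* added to the zone list directly, bypassing the pagevecs */
	return file_backed ? ACTIVE_FILE : ACTIVE_ANON;
}
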
@@ -217,15 +272,15 @@ void lru_cache_add_active(struct page *page)
217 */ 272 */
218static void drain_cpu_pagevecs(int cpu) 273static void drain_cpu_pagevecs(int cpu)
219{ 274{
275 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
220 struct pagevec *pvec; 276 struct pagevec *pvec;
277 int lru;
221 278
222 pvec = &per_cpu(lru_add_pvecs, cpu); 279 for_each_lru(lru) {
223 if (pagevec_count(pvec)) 280 pvec = &pvecs[lru - LRU_BASE];
224 __pagevec_lru_add(pvec); 281 if (pagevec_count(pvec))
225 282 ____pagevec_lru_add(pvec, lru);
226 pvec = &per_cpu(lru_add_active_pvecs, cpu); 283 }
227 if (pagevec_count(pvec))
228 __pagevec_lru_add_active(pvec);
229 284
230 pvec = &per_cpu(lru_rotate_pvecs, cpu); 285 pvec = &per_cpu(lru_rotate_pvecs, cpu);
231 if (pagevec_count(pvec)) { 286 if (pagevec_count(pvec)) {
@@ -244,7 +299,7 @@ void lru_add_drain(void)
244 put_cpu(); 299 put_cpu();
245} 300}
246 301
247#ifdef CONFIG_NUMA 302#if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU)
248static void lru_add_drain_per_cpu(struct work_struct *dummy) 303static void lru_add_drain_per_cpu(struct work_struct *dummy)
249{ 304{
250 lru_add_drain(); 305 lru_add_drain();
@@ -278,9 +333,10 @@ int lru_add_drain_all(void)
278 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it 333 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
279 * for the remainder of the operation. 334 * for the remainder of the operation.
280 * 335 *
281 * The locking in this function is against shrink_cache(): we recheck the 336 * The locking in this function is against shrink_inactive_list(): we recheck
282 * page count inside the lock to see whether shrink_cache grabbed the page 337 * the page count inside the lock to see whether shrink_inactive_list()
283 * via the LRU. If it did, give up: shrink_cache will free it. 338 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
339 * will free it.
284 */ 340 */
285void release_pages(struct page **pages, int nr, int cold) 341void release_pages(struct page **pages, int nr, int cold)
286{ 342{
@@ -307,6 +363,7 @@ void release_pages(struct page **pages, int nr, int cold)
307 363
308 if (PageLRU(page)) { 364 if (PageLRU(page)) {
309 struct zone *pagezone = page_zone(page); 365 struct zone *pagezone = page_zone(page);
366
310 if (pagezone != zone) { 367 if (pagezone != zone) {
311 if (zone) 368 if (zone)
312 spin_unlock_irqrestore(&zone->lru_lock, 369 spin_unlock_irqrestore(&zone->lru_lock,
@@ -379,10 +436,11 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
379 * Add the passed pages to the LRU, then drop the caller's refcount 436 * Add the passed pages to the LRU, then drop the caller's refcount
380 * on them. Reinitialises the caller's pagevec. 437 * on them. Reinitialises the caller's pagevec.
381 */ 438 */
382void __pagevec_lru_add(struct pagevec *pvec) 439void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
383{ 440{
384 int i; 441 int i;
385 struct zone *zone = NULL; 442 struct zone *zone = NULL;
443 VM_BUG_ON(is_unevictable_lru(lru));
386 444
387 for (i = 0; i < pagevec_count(pvec); i++) { 445 for (i = 0; i < pagevec_count(pvec); i++) {
388 struct page *page = pvec->pages[i]; 446 struct page *page = pvec->pages[i];
@@ -394,9 +452,13 @@ void __pagevec_lru_add(struct pagevec *pvec)
394 zone = pagezone; 452 zone = pagezone;
395 spin_lock_irq(&zone->lru_lock); 453 spin_lock_irq(&zone->lru_lock);
396 } 454 }
455 VM_BUG_ON(PageActive(page));
456 VM_BUG_ON(PageUnevictable(page));
397 VM_BUG_ON(PageLRU(page)); 457 VM_BUG_ON(PageLRU(page));
398 SetPageLRU(page); 458 SetPageLRU(page);
399 add_page_to_inactive_list(zone, page); 459 if (is_active_lru(lru))
460 SetPageActive(page);
461 add_page_to_lru_list(zone, page, lru);
400 } 462 }
401 if (zone) 463 if (zone)
402 spin_unlock_irq(&zone->lru_lock); 464 spin_unlock_irq(&zone->lru_lock);
@@ -404,48 +466,45 @@ void __pagevec_lru_add(struct pagevec *pvec)
404 pagevec_reinit(pvec); 466 pagevec_reinit(pvec);
405} 467}
406 468
407EXPORT_SYMBOL(__pagevec_lru_add); 469EXPORT_SYMBOL(____pagevec_lru_add);
408 470
409void __pagevec_lru_add_active(struct pagevec *pvec) 471/*
472 * Try to drop buffers from the pages in a pagevec
473 */
474void pagevec_strip(struct pagevec *pvec)
410{ 475{
411 int i; 476 int i;
412 struct zone *zone = NULL;
413 477
414 for (i = 0; i < pagevec_count(pvec); i++) { 478 for (i = 0; i < pagevec_count(pvec); i++) {
415 struct page *page = pvec->pages[i]; 479 struct page *page = pvec->pages[i];
416 struct zone *pagezone = page_zone(page);
417 480
418 if (pagezone != zone) { 481 if (PagePrivate(page) && trylock_page(page)) {
419 if (zone) 482 if (PagePrivate(page))
420 spin_unlock_irq(&zone->lru_lock); 483 try_to_release_page(page, 0);
421 zone = pagezone; 484 unlock_page(page);
422 spin_lock_irq(&zone->lru_lock);
423 } 485 }
424 VM_BUG_ON(PageLRU(page));
425 SetPageLRU(page);
426 VM_BUG_ON(PageActive(page));
427 SetPageActive(page);
428 add_page_to_active_list(zone, page);
429 } 486 }
430 if (zone)
431 spin_unlock_irq(&zone->lru_lock);
432 release_pages(pvec->pages, pvec->nr, pvec->cold);
433 pagevec_reinit(pvec);
434} 487}
435 488
436/* 489/**
437 * Try to drop buffers from the pages in a pagevec 490 * pagevec_swap_free - try to free swap space from the pages in a pagevec
491 * @pvec: pagevec with swapcache pages to free the swap space of
492 *
493 * The caller needs to hold an extra reference to each page and
494 * not hold the page lock on the pages. This function uses a
495 * trylock on the page lock so it may not always free the swap
496 * space associated with a page.
438 */ 497 */
439void pagevec_strip(struct pagevec *pvec) 498void pagevec_swap_free(struct pagevec *pvec)
440{ 499{
441 int i; 500 int i;
442 501
443 for (i = 0; i < pagevec_count(pvec); i++) { 502 for (i = 0; i < pagevec_count(pvec); i++) {
444 struct page *page = pvec->pages[i]; 503 struct page *page = pvec->pages[i];
445 504
446 if (PagePrivate(page) && !TestSetPageLocked(page)) { 505 if (PageSwapCache(page) && trylock_page(page)) {
447 if (PagePrivate(page)) 506 if (PageSwapCache(page))
448 try_to_release_page(page, 0); 507 remove_exclusive_swap_page_ref(page);
449 unlock_page(page); 508 unlock_page(page);
450 } 509 }
451 } 510 }
@@ -493,7 +552,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
493 */ 552 */
494#define ACCT_THRESHOLD max(16, NR_CPUS * 2) 553#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
495 554
496static DEFINE_PER_CPU(long, committed_space) = 0; 555static DEFINE_PER_CPU(long, committed_space);
497 556
498void vm_acct_memory(long pages) 557void vm_acct_memory(long pages)
499{ 558{
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d8aadaf2a0ba..3353c9029cef 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -33,13 +33,13 @@ static const struct address_space_operations swap_aops = {
33}; 33};
34 34
35static struct backing_dev_info swap_backing_dev_info = { 35static struct backing_dev_info swap_backing_dev_info = {
36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
37 .unplug_io_fn = swap_unplug_io_fn, 37 .unplug_io_fn = swap_unplug_io_fn,
38}; 38};
39 39
40struct address_space swapper_space = { 40struct address_space swapper_space = {
41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
42 .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), 42 .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
43 .a_ops = &swap_aops, 43 .a_ops = &swap_aops,
44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), 44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
45 .backing_dev_info = &swap_backing_dev_info, 45 .backing_dev_info = &swap_backing_dev_info,
@@ -56,15 +56,16 @@ static struct {
56 56
57void show_swap_cache_info(void) 57void show_swap_cache_info(void)
58{ 58{
59 printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", 59 printk("%lu pages in swap cache\n", total_swapcache_pages);
60 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
60 swap_cache_info.add_total, swap_cache_info.del_total, 61 swap_cache_info.add_total, swap_cache_info.del_total,
61 swap_cache_info.find_success, swap_cache_info.find_total); 62 swap_cache_info.find_success, swap_cache_info.find_total);
62 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 63 printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10));
63 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); 64 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
64} 65}
65 66
66/* 67/*
67 * add_to_swap_cache resembles add_to_page_cache on swapper_space, 68 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
68 * but sets SwapCache flag and private instead of mapping and index. 69 * but sets SwapCache flag and private instead of mapping and index.
69 */ 70 */
70int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) 71int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
@@ -74,21 +75,29 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
74 BUG_ON(!PageLocked(page)); 75 BUG_ON(!PageLocked(page));
75 BUG_ON(PageSwapCache(page)); 76 BUG_ON(PageSwapCache(page));
76 BUG_ON(PagePrivate(page)); 77 BUG_ON(PagePrivate(page));
78 BUG_ON(!PageSwapBacked(page));
77 error = radix_tree_preload(gfp_mask); 79 error = radix_tree_preload(gfp_mask);
78 if (!error) { 80 if (!error) {
79 write_lock_irq(&swapper_space.tree_lock); 81 page_cache_get(page);
82 SetPageSwapCache(page);
83 set_page_private(page, entry.val);
84
85 spin_lock_irq(&swapper_space.tree_lock);
80 error = radix_tree_insert(&swapper_space.page_tree, 86 error = radix_tree_insert(&swapper_space.page_tree,
81 entry.val, page); 87 entry.val, page);
82 if (!error) { 88 if (likely(!error)) {
83 page_cache_get(page);
84 SetPageSwapCache(page);
85 set_page_private(page, entry.val);
86 total_swapcache_pages++; 89 total_swapcache_pages++;
87 __inc_zone_page_state(page, NR_FILE_PAGES); 90 __inc_zone_page_state(page, NR_FILE_PAGES);
88 INC_CACHE_INFO(add_total); 91 INC_CACHE_INFO(add_total);
89 } 92 }
90 write_unlock_irq(&swapper_space.tree_lock); 93 spin_unlock_irq(&swapper_space.tree_lock);
91 radix_tree_preload_end(); 94 radix_tree_preload_end();
95
96 if (unlikely(error)) {
97 set_page_private(page, 0UL);
98 ClearPageSwapCache(page);
99 page_cache_release(page);
100 }
92 } 101 }
93 return error; 102 return error;
94} 103}
@@ -175,9 +184,9 @@ void delete_from_swap_cache(struct page *page)
175 184
176 entry.val = page_private(page); 185 entry.val = page_private(page);
177 186
178 write_lock_irq(&swapper_space.tree_lock); 187 spin_lock_irq(&swapper_space.tree_lock);
179 __delete_from_swap_cache(page); 188 __delete_from_swap_cache(page);
180 write_unlock_irq(&swapper_space.tree_lock); 189 spin_unlock_irq(&swapper_space.tree_lock);
181 190
182 swap_free(entry); 191 swap_free(entry);
183 page_cache_release(page); 192 page_cache_release(page);
@@ -193,7 +202,7 @@ void delete_from_swap_cache(struct page *page)
193 */ 202 */
194static inline void free_swap_cache(struct page *page) 203static inline void free_swap_cache(struct page *page)
195{ 204{
196 if (PageSwapCache(page) && !TestSetPageLocked(page)) { 205 if (PageSwapCache(page) && trylock_page(page)) {
197 remove_exclusive_swap_page(page); 206 remove_exclusive_swap_page(page);
198 unlock_page(page); 207 unlock_page(page);
199 } 208 }
@@ -294,17 +303,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
294 * re-using the just freed swap entry for an existing page. 303 * re-using the just freed swap entry for an existing page.
295 * May fail (-ENOMEM) if radix-tree node allocation failed. 304 * May fail (-ENOMEM) if radix-tree node allocation failed.
296 */ 305 */
297 SetPageLocked(new_page); 306 __set_page_locked(new_page);
307 SetPageSwapBacked(new_page);
298 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); 308 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
299 if (!err) { 309 if (likely(!err)) {
300 /* 310 /*
301 * Initiate read into locked page and return. 311 * Initiate read into locked page and return.
302 */ 312 */
303 lru_cache_add_active(new_page); 313 lru_cache_add_anon(new_page);
304 swap_readpage(NULL, new_page); 314 swap_readpage(NULL, new_page);
305 return new_page; 315 return new_page;
306 } 316 }
307 ClearPageLocked(new_page); 317 ClearPageSwapBacked(new_page);
318 __clear_page_locked(new_page);
308 swap_free(entry); 319 swap_free(entry);
309 } while (err != -ENOMEM); 320 } while (err != -ENOMEM);
310 321
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bd1bb5920306..90cb67a5417c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,17 +33,18 @@
33#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
34#include <linux/swapops.h> 34#include <linux/swapops.h>
35 35
36DEFINE_SPINLOCK(swap_lock); 36static DEFINE_SPINLOCK(swap_lock);
37unsigned int nr_swapfiles; 37static unsigned int nr_swapfiles;
38long total_swap_pages; 38long total_swap_pages;
39static int swap_overflow; 39static int swap_overflow;
40static int least_priority;
40 41
41static const char Bad_file[] = "Bad swap file entry "; 42static const char Bad_file[] = "Bad swap file entry ";
42static const char Unused_file[] = "Unused swap file entry "; 43static const char Unused_file[] = "Unused swap file entry ";
43static const char Bad_offset[] = "Bad swap offset entry "; 44static const char Bad_offset[] = "Bad swap offset entry ";
44static const char Unused_offset[] = "Unused swap offset entry "; 45static const char Unused_offset[] = "Unused swap offset entry ";
45 46
46struct swap_list_t swap_list = {-1, -1}; 47static struct swap_list_t swap_list = {-1, -1};
47 48
48static struct swap_info_struct swap_info[MAX_SWAPFILES]; 49static struct swap_info_struct swap_info[MAX_SWAPFILES];
49 50
@@ -343,7 +344,7 @@ int can_share_swap_page(struct page *page)
343 * Work out if there are any other processes sharing this 344 * Work out if there are any other processes sharing this
344 * swap cache page. Free it if you can. Return success. 345 * swap cache page. Free it if you can. Return success.
345 */ 346 */
346int remove_exclusive_swap_page(struct page *page) 347static int remove_exclusive_swap_page_count(struct page *page, int count)
347{ 348{
348 int retval; 349 int retval;
349 struct swap_info_struct * p; 350 struct swap_info_struct * p;
@@ -356,7 +357,7 @@ int remove_exclusive_swap_page(struct page *page)
356 return 0; 357 return 0;
357 if (PageWriteback(page)) 358 if (PageWriteback(page))
358 return 0; 359 return 0;
359 if (page_count(page) != 2) /* 2: us + cache */ 360 if (page_count(page) != count) /* us + cache + ptes */
360 return 0; 361 return 0;
361 362
362 entry.val = page_private(page); 363 entry.val = page_private(page);
@@ -368,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page)
368 retval = 0; 369 retval = 0;
369 if (p->swap_map[swp_offset(entry)] == 1) { 370 if (p->swap_map[swp_offset(entry)] == 1) {
370 /* Recheck the page count with the swapcache lock held.. */ 371 /* Recheck the page count with the swapcache lock held.. */
371 write_lock_irq(&swapper_space.tree_lock); 372 spin_lock_irq(&swapper_space.tree_lock);
372 if ((page_count(page) == 2) && !PageWriteback(page)) { 373 if ((page_count(page) == count) && !PageWriteback(page)) {
373 __delete_from_swap_cache(page); 374 __delete_from_swap_cache(page);
374 SetPageDirty(page); 375 SetPageDirty(page);
375 retval = 1; 376 retval = 1;
376 } 377 }
377 write_unlock_irq(&swapper_space.tree_lock); 378 spin_unlock_irq(&swapper_space.tree_lock);
378 } 379 }
379 spin_unlock(&swap_lock); 380 spin_unlock(&swap_lock);
380 381
@@ -387,6 +388,25 @@ int remove_exclusive_swap_page(struct page *page)
387} 388}
388 389
389/* 390/*
391 * Most of the time the page should have two references: one for the
392 * process and one for the swap cache.
393 */
394int remove_exclusive_swap_page(struct page *page)
395{
396 return remove_exclusive_swap_page_count(page, 2);
397}
398
399/*
400 * The pageout code holds an extra reference to the page. That raises
401 * the reference count to test for to 2 for a page that is only in the
402 * swap cache plus 1 for each process that maps the page.
403 */
404int remove_exclusive_swap_page_ref(struct page *page)
405{
406 return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page));
407}
408
409/*
390 * Free the swap entry like above, but also try to 410 * Free the swap entry like above, but also try to
391 * free the page cache entry if it is the last user. 411 * free the page cache entry if it is the last user.
392 */ 412 */
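
Editor's note: the two wrappers encode a small piece of reference-count arithmetic. A page living only in the swap cache is held by the cache (1), by each process mapping it, and by whatever the caller holds. A sketch of the count each wrapper tests against (page_count_expected is a made-up helper name):

/* remove_exclusive_swap_page():     caller + swap cache        -> 2
 * remove_exclusive_swap_page_ref(): caller + swap cache + ptes -> 2 + mapcount
 * e.g. a swapcache page mapped by two tasks, seen from pagevec_swap_free()
 * (which holds its own reference), should show page_count == 4 before it can
 * be pulled out of the swap cache. */
static int page_count_expected(int mapcount)
{
	return 2 + mapcount;
}
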
@@ -402,7 +422,7 @@ void free_swap_and_cache(swp_entry_t entry)
402 if (p) { 422 if (p) {
403 if (swap_entry_free(p, swp_offset(entry)) == 1) { 423 if (swap_entry_free(p, swp_offset(entry)) == 1) {
404 page = find_get_page(&swapper_space, entry.val); 424 page = find_get_page(&swapper_space, entry.val);
405 if (page && unlikely(TestSetPageLocked(page))) { 425 if (page && !trylock_page(page)) {
406 page_cache_release(page); 426 page_cache_release(page);
407 page = NULL; 427 page = NULL;
408 } 428 }
@@ -655,8 +675,8 @@ static int unuse_mm(struct mm_struct *mm,
655 675
656 if (!down_read_trylock(&mm->mmap_sem)) { 676 if (!down_read_trylock(&mm->mmap_sem)) {
657 /* 677 /*
658 * Activate page so shrink_cache is unlikely to unmap its 678 * Activate page so shrink_inactive_list is unlikely to unmap
659 * ptes while lock is dropped, so swapoff can make progress. 679 * its ptes while lock is dropped, so swapoff can make progress.
660 */ 680 */
661 activate_page(page); 681 activate_page(page);
662 unlock_page(page); 682 unlock_page(page);
@@ -1260,6 +1280,11 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1260 /* just pick something that's safe... */ 1280 /* just pick something that's safe... */
1261 swap_list.next = swap_list.head; 1281 swap_list.next = swap_list.head;
1262 } 1282 }
1283 if (p->prio < 0) {
1284 for (i = p->next; i >= 0; i = swap_info[i].next)
1285 swap_info[i].prio = p->prio--;
1286 least_priority++;
1287 }
1263 nr_swap_pages -= p->pages; 1288 nr_swap_pages -= p->pages;
1264 total_swap_pages -= p->pages; 1289 total_swap_pages -= p->pages;
1265 p->flags &= ~SWP_WRITEOK; 1290 p->flags &= ~SWP_WRITEOK;
@@ -1272,9 +1297,14 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1272 if (err) { 1297 if (err) {
1273 /* re-insert swap space back into swap_list */ 1298 /* re-insert swap space back into swap_list */
1274 spin_lock(&swap_lock); 1299 spin_lock(&swap_lock);
1275 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) 1300 if (p->prio < 0)
1301 p->prio = --least_priority;
1302 prev = -1;
1303 for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
1276 if (p->prio >= swap_info[i].prio) 1304 if (p->prio >= swap_info[i].prio)
1277 break; 1305 break;
1306 prev = i;
1307 }
1278 p->next = i; 1308 p->next = i;
1279 if (prev < 0) 1309 if (prev < 0)
1280 swap_list.head = swap_list.next = p - swap_info; 1310 swap_list.head = swap_list.next = p - swap_info;
@@ -1447,7 +1477,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1447 unsigned int type; 1477 unsigned int type;
1448 int i, prev; 1478 int i, prev;
1449 int error; 1479 int error;
1450 static int least_priority;
1451 union swap_header *swap_header = NULL; 1480 union swap_header *swap_header = NULL;
1452 int swap_header_version; 1481 int swap_header_version;
1453 unsigned int nr_good_pages = 0; 1482 unsigned int nr_good_pages = 0;
@@ -1455,7 +1484,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1455 sector_t span; 1484 sector_t span;
1456 unsigned long maxpages = 1; 1485 unsigned long maxpages = 1;
1457 int swapfilesize; 1486 int swapfilesize;
1458 unsigned short *swap_map; 1487 unsigned short *swap_map = NULL;
1459 struct page *page = NULL; 1488 struct page *page = NULL;
1460 struct inode *inode = NULL; 1489 struct inode *inode = NULL;
1461 int did_down = 0; 1490 int did_down = 0;
@@ -1474,22 +1503,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1474 } 1503 }
1475 if (type >= nr_swapfiles) 1504 if (type >= nr_swapfiles)
1476 nr_swapfiles = type+1; 1505 nr_swapfiles = type+1;
1506 memset(p, 0, sizeof(*p));
1477 INIT_LIST_HEAD(&p->extent_list); 1507 INIT_LIST_HEAD(&p->extent_list);
1478 p->flags = SWP_USED; 1508 p->flags = SWP_USED;
1479 p->swap_file = NULL;
1480 p->old_block_size = 0;
1481 p->swap_map = NULL;
1482 p->lowest_bit = 0;
1483 p->highest_bit = 0;
1484 p->cluster_nr = 0;
1485 p->inuse_pages = 0;
1486 p->next = -1; 1509 p->next = -1;
1487 if (swap_flags & SWAP_FLAG_PREFER) {
1488 p->prio =
1489 (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
1490 } else {
1491 p->prio = --least_priority;
1492 }
1493 spin_unlock(&swap_lock); 1510 spin_unlock(&swap_lock);
1494 name = getname(specialfile); 1511 name = getname(specialfile);
1495 error = PTR_ERR(name); 1512 error = PTR_ERR(name);
@@ -1632,19 +1649,20 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1632 goto bad_swap; 1649 goto bad_swap;
1633 1650
1634 /* OK, set up the swap map and apply the bad block list */ 1651 /* OK, set up the swap map and apply the bad block list */
1635 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { 1652 swap_map = vmalloc(maxpages * sizeof(short));
1653 if (!swap_map) {
1636 error = -ENOMEM; 1654 error = -ENOMEM;
1637 goto bad_swap; 1655 goto bad_swap;
1638 } 1656 }
1639 1657
1640 error = 0; 1658 error = 0;
1641 memset(p->swap_map, 0, maxpages * sizeof(short)); 1659 memset(swap_map, 0, maxpages * sizeof(short));
1642 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1660 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1643 int page_nr = swap_header->info.badpages[i]; 1661 int page_nr = swap_header->info.badpages[i];
1644 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) 1662 if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
1645 error = -EINVAL; 1663 error = -EINVAL;
1646 else 1664 else
1647 p->swap_map[page_nr] = SWAP_MAP_BAD; 1665 swap_map[page_nr] = SWAP_MAP_BAD;
1648 } 1666 }
1649 nr_good_pages = swap_header->info.last_page - 1667 nr_good_pages = swap_header->info.last_page -
1650 swap_header->info.nr_badpages - 1668 swap_header->info.nr_badpages -
@@ -1654,7 +1672,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1654 } 1672 }
1655 1673
1656 if (nr_good_pages) { 1674 if (nr_good_pages) {
1657 p->swap_map[0] = SWAP_MAP_BAD; 1675 swap_map[0] = SWAP_MAP_BAD;
1658 p->max = maxpages; 1676 p->max = maxpages;
1659 p->pages = nr_good_pages; 1677 p->pages = nr_good_pages;
1660 nr_extents = setup_swap_extents(p, &span); 1678 nr_extents = setup_swap_extents(p, &span);
@@ -1672,6 +1690,12 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1672 1690
1673 mutex_lock(&swapon_mutex); 1691 mutex_lock(&swapon_mutex);
1674 spin_lock(&swap_lock); 1692 spin_lock(&swap_lock);
1693 if (swap_flags & SWAP_FLAG_PREFER)
1694 p->prio =
1695 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
1696 else
1697 p->prio = --least_priority;
1698 p->swap_map = swap_map;
1675 p->flags = SWP_ACTIVE; 1699 p->flags = SWP_ACTIVE;
1676 nr_swap_pages += nr_good_pages; 1700 nr_swap_pages += nr_good_pages;
1677 total_swap_pages += nr_good_pages; 1701 total_swap_pages += nr_good_pages;
@@ -1707,12 +1731,8 @@ bad_swap:
1707 destroy_swap_extents(p); 1731 destroy_swap_extents(p);
1708bad_swap_2: 1732bad_swap_2:
1709 spin_lock(&swap_lock); 1733 spin_lock(&swap_lock);
1710 swap_map = p->swap_map;
1711 p->swap_file = NULL; 1734 p->swap_file = NULL;
1712 p->swap_map = NULL;
1713 p->flags = 0; 1735 p->flags = 0;
1714 if (!(swap_flags & SWAP_FLAG_PREFER))
1715 ++least_priority;
1716 spin_unlock(&swap_lock); 1736 spin_unlock(&swap_lock);
1717 vfree(swap_map); 1737 vfree(swap_map);
1718 if (swap_file) 1738 if (swap_file)
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index ae532f501943..3e67d575ee6e 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -65,36 +65,37 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
65 if (!dentry) 65 if (!dentry)
66 goto put_memory; 66 goto put_memory;
67 67
68 error = -ENFILE;
69 file = get_empty_filp();
70 if (!file)
71 goto put_dentry;
72
68 error = -ENOSPC; 73 error = -ENOSPC;
69 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); 74 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
70 if (!inode) 75 if (!inode)
71 goto put_dentry; 76 goto close_file;
72 77
73 d_instantiate(dentry, inode); 78 d_instantiate(dentry, inode);
74 error = -ENFILE; 79 inode->i_size = size;
75 file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
76 &ramfs_file_operations);
77 if (!file)
78 goto put_dentry;
79
80 inode->i_nlink = 0; /* It is unlinked */ 80 inode->i_nlink = 0; /* It is unlinked */
81 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
82 &ramfs_file_operations);
81 83
82 /* notify everyone as to the change of file size */ 84#ifndef CONFIG_MMU
83 error = do_truncate(dentry, size, 0, file); 85 error = ramfs_nommu_expand_for_mapping(inode, size);
84 if (error < 0) 86 if (error)
85 goto close_file; 87 goto close_file;
86 88#endif
87 return file; 89 return file;
88 90
89close_file: 91close_file:
90 put_filp(file); 92 put_filp(file);
91 return ERR_PTR(error);
92
93put_dentry: 93put_dentry:
94 dput(dentry); 94 dput(dentry);
95put_memory: 95put_memory:
96 return ERR_PTR(error); 96 return ERR_PTR(error);
97} 97}
98EXPORT_SYMBOL_GPL(shmem_file_setup);
98 99
99/** 100/**
100 * shmem_zero_setup - setup a shared anonymous mapping 101 * shmem_zero_setup - setup a shared anonymous mapping
diff --git a/mm/truncate.c b/mm/truncate.c
index b8961cb63414..1229211104f8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds 4 * Copyright (C) 2002, Linus Torvalds
5 * 5 *
6 * 10Sep2002 akpm@zip.com.au 6 * 10Sep2002 Andrew Morton
7 * Initial version. 7 * Initial version.
8 */ 8 */
9 9
@@ -18,6 +18,7 @@
18#include <linux/task_io_accounting_ops.h> 18#include <linux/task_io_accounting_ops.h>
19#include <linux/buffer_head.h> /* grr. try_to_release_page, 19#include <linux/buffer_head.h> /* grr. try_to_release_page,
20 do_invalidatepage */ 20 do_invalidatepage */
21#include "internal.h"
21 22
22 23
23/** 24/**
@@ -103,8 +104,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
103 104
104 cancel_dirty_page(page, PAGE_CACHE_SIZE); 105 cancel_dirty_page(page, PAGE_CACHE_SIZE);
105 106
107 clear_page_mlock(page);
106 remove_from_page_cache(page); 108 remove_from_page_cache(page);
107 ClearPageUptodate(page);
108 ClearPageMappedToDisk(page); 109 ClearPageMappedToDisk(page);
109 page_cache_release(page); /* pagecache ref */ 110 page_cache_release(page); /* pagecache ref */
110} 111}
@@ -128,6 +129,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
128 if (PagePrivate(page) && !try_to_release_page(page, 0)) 129 if (PagePrivate(page) && !try_to_release_page(page, 0))
129 return 0; 130 return 0;
130 131
132 clear_page_mlock(page);
131 ret = remove_mapping(mapping, page); 133 ret = remove_mapping(mapping, page);
132 134
133 return ret; 135 return ret;
@@ -188,7 +190,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
188 if (page_index > next) 190 if (page_index > next)
189 next = page_index; 191 next = page_index;
190 next++; 192 next++;
191 if (TestSetPageLocked(page)) 193 if (!trylock_page(page))
192 continue; 194 continue;
193 if (PageWriteback(page)) { 195 if (PageWriteback(page)) {
194 unlock_page(page); 196 unlock_page(page);
@@ -281,7 +283,7 @@ unsigned long __invalidate_mapping_pages(struct address_space *mapping,
281 pgoff_t index; 283 pgoff_t index;
282 int lock_failed; 284 int lock_failed;
283 285
284 lock_failed = TestSetPageLocked(page); 286 lock_failed = !trylock_page(page);
285 287
286 /* 288 /*
287 * We really shouldn't be looking at the ->index of an 289 * We really shouldn't be looking at the ->index of an
@@ -349,18 +351,18 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
349 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 351 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
350 return 0; 352 return 0;
351 353
352 write_lock_irq(&mapping->tree_lock); 354 spin_lock_irq(&mapping->tree_lock);
353 if (PageDirty(page)) 355 if (PageDirty(page))
354 goto failed; 356 goto failed;
355 357
358 clear_page_mlock(page);
356 BUG_ON(PagePrivate(page)); 359 BUG_ON(PagePrivate(page));
357 __remove_from_page_cache(page); 360 __remove_from_page_cache(page);
358 write_unlock_irq(&mapping->tree_lock); 361 spin_unlock_irq(&mapping->tree_lock);
359 ClearPageUptodate(page);
360 page_cache_release(page); /* pagecache ref */ 362 page_cache_release(page); /* pagecache ref */
361 return 1; 363 return 1;
362failed: 364failed:
363 write_unlock_irq(&mapping->tree_lock); 365 spin_unlock_irq(&mapping->tree_lock);
364 return 0; 366 return 0;
365} 367}
366 368
@@ -382,7 +384,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
382 * Any pages which are found to be mapped into pagetables are unmapped prior to 384 * Any pages which are found to be mapped into pagetables are unmapped prior to
383 * invalidation. 385 * invalidation.
384 * 386 *
385 * Returns -EIO if any pages could not be invalidated. 387 * Returns -EBUSY if any pages could not be invalidated.
386 */ 388 */
387int invalidate_inode_pages2_range(struct address_space *mapping, 389int invalidate_inode_pages2_range(struct address_space *mapping,
388 pgoff_t start, pgoff_t end) 390 pgoff_t start, pgoff_t end)
@@ -442,7 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
442 ret2 = do_launder_page(mapping, page); 444 ret2 = do_launder_page(mapping, page);
443 if (ret2 == 0) { 445 if (ret2 == 0) {
444 if (!invalidate_complete_page2(mapping, page)) 446 if (!invalidate_complete_page2(mapping, page))
445 ret2 = -EIO; 447 ret2 = -EBUSY;
446 } 448 }
447 if (ret2 < 0) 449 if (ret2 < 0)
448 ret = ret2; 450 ret = ret2;
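/*
 * A minimal userspace model (sketch only; plain bools, no atomicity and no
 * lock waitqueue) of the locking-predicate change applied throughout this
 * series: the old TestSetPageLocked() returned the previous lock bit, so a
 * nonzero result meant the page was already locked, while the new
 * trylock_page() returns nonzero only when the lock was taken. Hence
 * "if (TestSetPageLocked(page))" becomes "if (!trylock_page(page))".
 */
#include <assert.h>
#include <stdbool.h>

struct fake_page { bool locked; };

static bool test_set_locked(struct fake_page *p)	/* old style: returns previous bit */
{
	bool was = p->locked;
	p->locked = true;
	return was;
}

static bool trylock(struct fake_page *p)		/* new style: returns true on success */
{
	if (p->locked)
		return false;
	p->locked = true;
	return true;
}

int main(void)
{
	struct fake_page free_page = { .locked = false };
	struct fake_page busy_page = { .locked = true };

	/* old predicate: TestSetPageLocked() != 0 means the lock attempt failed */
	assert(test_set_locked(&busy_page));
	/* new predicate: !trylock_page() means the same thing */
	assert(!trylock(&busy_page));

	/* and both report success (predicate false) on an unlocked page */
	assert(!test_set_locked(&free_page));
	free_page.locked = false;
	assert(trylock(&free_page));
	return 0;
}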
diff --git a/mm/util.c b/mm/util.c
index 8f18683825bc..cb00b748ce47 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,7 +1,9 @@
1#include <linux/mm.h>
1#include <linux/slab.h> 2#include <linux/slab.h>
2#include <linux/string.h> 3#include <linux/string.h>
3#include <linux/module.h> 4#include <linux/module.h>
4#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h>
5#include <asm/uaccess.h> 7#include <asm/uaccess.h>
6 8
7/** 9/**
@@ -68,25 +70,22 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
68EXPORT_SYMBOL(kmemdup); 70EXPORT_SYMBOL(kmemdup);
69 71
70/** 72/**
71 * krealloc - reallocate memory. The contents will remain unchanged. 73 * __krealloc - like krealloc() but don't free @p.
72 * @p: object to reallocate memory for. 74 * @p: object to reallocate memory for.
73 * @new_size: how many bytes of memory are required. 75 * @new_size: how many bytes of memory are required.
74 * @flags: the type of memory to allocate. 76 * @flags: the type of memory to allocate.
75 * 77 *
76 * The contents of the object pointed to are preserved up to the 78 * This function is like krealloc() except it never frees the originally
77 * lesser of the new and old sizes. If @p is %NULL, krealloc() 79 * allocated buffer. Use this if you don't want to free the buffer immediately
78 * behaves exactly like kmalloc(). If @size is 0 and @p is not a 80 * like, for example, with RCU.
79 * %NULL pointer, the object pointed to is freed.
80 */ 81 */
81void *krealloc(const void *p, size_t new_size, gfp_t flags) 82void *__krealloc(const void *p, size_t new_size, gfp_t flags)
82{ 83{
83 void *ret; 84 void *ret;
84 size_t ks = 0; 85 size_t ks = 0;
85 86
86 if (unlikely(!new_size)) { 87 if (unlikely(!new_size))
87 kfree(p);
88 return ZERO_SIZE_PTR; 88 return ZERO_SIZE_PTR;
89 }
90 89
91 if (p) 90 if (p)
92 ks = ksize(p); 91 ks = ksize(p);
@@ -95,10 +94,37 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
95 return (void *)p; 94 return (void *)p;
96 95
97 ret = kmalloc_track_caller(new_size, flags); 96 ret = kmalloc_track_caller(new_size, flags);
98 if (ret && p) { 97 if (ret && p)
99 memcpy(ret, p, ks); 98 memcpy(ret, p, ks);
99
100 return ret;
101}
102EXPORT_SYMBOL(__krealloc);
103
104/**
105 * krealloc - reallocate memory. The contents will remain unchanged.
106 * @p: object to reallocate memory for.
107 * @new_size: how many bytes of memory are required.
108 * @flags: the type of memory to allocate.
109 *
110 * The contents of the object pointed to are preserved up to the
111 * lesser of the new and old sizes. If @p is %NULL, krealloc()
112 * behaves exactly like kmalloc(). If @size is 0 and @p is not a
113 * %NULL pointer, the object pointed to is freed.
114 */
115void *krealloc(const void *p, size_t new_size, gfp_t flags)
116{
117 void *ret;
118
119 if (unlikely(!new_size)) {
100 kfree(p); 120 kfree(p);
121 return ZERO_SIZE_PTR;
101 } 122 }
123
124 ret = __krealloc(p, new_size, flags);
125 if (ret && p != ret)
126 kfree(p);
127
102 return ret; 128 return ret;
103} 129}
104EXPORT_SYMBOL(krealloc); 130EXPORT_SYMBOL(krealloc);
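/*
 * Minimal userspace model (sketch only; standard C library instead of the
 * slab allocator, names grow_no_free()/grow() are invented) of the
 * __krealloc()/krealloc() split introduced above: __krealloc() never frees
 * the old buffer, which is what an RCU user wants, and krealloc() is layered
 * on top of it, freeing the old buffer only when a different one was handed
 * back. The kernel version can also return the old pointer unchanged when
 * the existing slab object is already big enough, which is why krealloc()
 * checks ret != p before freeing.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* grow-only reallocation: never frees old_buf (caller may still need it) */
static void *grow_no_free(const void *old_buf, size_t old_size, size_t new_size)
{
	void *ret;

	if (new_size == 0)
		return NULL;			/* the kernel returns ZERO_SIZE_PTR here */

	ret = malloc(new_size);
	if (ret && old_buf)
		memcpy(ret, old_buf, old_size < new_size ? old_size : new_size);
	return ret;
}

/* conventional realloc-like wrapper, mirroring the new krealloc() structure */
static void *grow(void *old_buf, size_t old_size, size_t new_size)
{
	void *ret;

	if (new_size == 0) {
		free(old_buf);
		return NULL;
	}
	ret = grow_no_free(old_buf, old_size, new_size);
	if (ret && ret != old_buf)
		free(old_buf);
	return ret;
}

int main(void)
{
	char *p = grow(NULL, 0, 8);

	snprintf(p, 8, "abc");
	p = grow(p, 8, 32);			/* old buffer freed for us */
	printf("%s\n", p);
	free(p);
	return 0;
}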
@@ -136,3 +162,27 @@ char *strndup_user(const char __user *s, long n)
136 return p; 162 return p;
137} 163}
138EXPORT_SYMBOL(strndup_user); 164EXPORT_SYMBOL(strndup_user);
165
166#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
167void arch_pick_mmap_layout(struct mm_struct *mm)
168{
169 mm->mmap_base = TASK_UNMAPPED_BASE;
170 mm->get_unmapped_area = arch_get_unmapped_area;
171 mm->unmap_area = arch_unmap_area;
172}
173#endif
174
175int __attribute__((weak)) get_user_pages_fast(unsigned long start,
176 int nr_pages, int write, struct page **pages)
177{
178 struct mm_struct *mm = current->mm;
179 int ret;
180
181 down_read(&mm->mmap_sem);
182 ret = get_user_pages(current, mm, start, nr_pages,
183 write, 0, pages, NULL);
184 up_read(&mm->mmap_sem);
185
186 return ret;
187}
188EXPORT_SYMBOL_GPL(get_user_pages_fast);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6e45b0f3d125..036536945dd9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -8,26 +8,28 @@
8 * Numa awareness, Christoph Lameter, SGI, June 2005 8 * Numa awareness, Christoph Lameter, SGI, June 2005
9 */ 9 */
10 10
11#include <linux/vmalloc.h>
11#include <linux/mm.h> 12#include <linux/mm.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/highmem.h> 14#include <linux/highmem.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/spinlock.h> 16#include <linux/spinlock.h>
16#include <linux/interrupt.h> 17#include <linux/interrupt.h>
18#include <linux/proc_fs.h>
17#include <linux/seq_file.h> 19#include <linux/seq_file.h>
18#include <linux/debugobjects.h> 20#include <linux/debugobjects.h>
19#include <linux/vmalloc.h>
20#include <linux/kallsyms.h> 21#include <linux/kallsyms.h>
22#include <linux/list.h>
23#include <linux/rbtree.h>
24#include <linux/radix-tree.h>
25#include <linux/rcupdate.h>
21 26
27#include <asm/atomic.h>
22#include <asm/uaccess.h> 28#include <asm/uaccess.h>
23#include <asm/tlbflush.h> 29#include <asm/tlbflush.h>
24 30
25 31
26DEFINE_RWLOCK(vmlist_lock); 32/*** Page table manipulation functions ***/
27struct vm_struct *vmlist;
28
29static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
30 int node, void *caller);
31 33
32static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) 34static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
33{ 35{
@@ -40,8 +42,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
40 } while (pte++, addr += PAGE_SIZE, addr != end); 42 } while (pte++, addr += PAGE_SIZE, addr != end);
41} 43}
42 44
43static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, 45static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
44 unsigned long end)
45{ 46{
46 pmd_t *pmd; 47 pmd_t *pmd;
47 unsigned long next; 48 unsigned long next;
@@ -55,8 +56,7 @@ static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
55 } while (pmd++, addr = next, addr != end); 56 } while (pmd++, addr = next, addr != end);
56} 57}
57 58
58static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, 59static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
59 unsigned long end)
60{ 60{
61 pud_t *pud; 61 pud_t *pud;
62 unsigned long next; 62 unsigned long next;
@@ -70,12 +70,10 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
70 } while (pud++, addr = next, addr != end); 70 } while (pud++, addr = next, addr != end);
71} 71}
72 72
73void unmap_kernel_range(unsigned long addr, unsigned long size) 73static void vunmap_page_range(unsigned long addr, unsigned long end)
74{ 74{
75 pgd_t *pgd; 75 pgd_t *pgd;
76 unsigned long next; 76 unsigned long next;
77 unsigned long start = addr;
78 unsigned long end = addr + size;
79 77
80 BUG_ON(addr >= end); 78 BUG_ON(addr >= end);
81 pgd = pgd_offset_k(addr); 79 pgd = pgd_offset_k(addr);
@@ -86,35 +84,36 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
86 continue; 84 continue;
87 vunmap_pud_range(pgd, addr, next); 85 vunmap_pud_range(pgd, addr, next);
88 } while (pgd++, addr = next, addr != end); 86 } while (pgd++, addr = next, addr != end);
89 flush_tlb_kernel_range(start, end);
90}
91
92static void unmap_vm_area(struct vm_struct *area)
93{
94 unmap_kernel_range((unsigned long)area->addr, area->size);
95} 87}
96 88
97static int vmap_pte_range(pmd_t *pmd, unsigned long addr, 89static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
98 unsigned long end, pgprot_t prot, struct page ***pages) 90 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
99{ 91{
100 pte_t *pte; 92 pte_t *pte;
101 93
94 /*
95 * nr is a running index into the array which helps higher level
96 * callers keep track of where we're up to.
97 */
98
102 pte = pte_alloc_kernel(pmd, addr); 99 pte = pte_alloc_kernel(pmd, addr);
103 if (!pte) 100 if (!pte)
104 return -ENOMEM; 101 return -ENOMEM;
105 do { 102 do {
106 struct page *page = **pages; 103 struct page *page = pages[*nr];
107 WARN_ON(!pte_none(*pte)); 104
108 if (!page) 105 if (WARN_ON(!pte_none(*pte)))
106 return -EBUSY;
107 if (WARN_ON(!page))
109 return -ENOMEM; 108 return -ENOMEM;
110 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); 109 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
111 (*pages)++; 110 (*nr)++;
112 } while (pte++, addr += PAGE_SIZE, addr != end); 111 } while (pte++, addr += PAGE_SIZE, addr != end);
113 return 0; 112 return 0;
114} 113}
115 114
116static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, 115static int vmap_pmd_range(pud_t *pud, unsigned long addr,
117 unsigned long end, pgprot_t prot, struct page ***pages) 116 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
118{ 117{
119 pmd_t *pmd; 118 pmd_t *pmd;
120 unsigned long next; 119 unsigned long next;
@@ -124,14 +123,14 @@ static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
124 return -ENOMEM; 123 return -ENOMEM;
125 do { 124 do {
126 next = pmd_addr_end(addr, end); 125 next = pmd_addr_end(addr, end);
127 if (vmap_pte_range(pmd, addr, next, prot, pages)) 126 if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
128 return -ENOMEM; 127 return -ENOMEM;
129 } while (pmd++, addr = next, addr != end); 128 } while (pmd++, addr = next, addr != end);
130 return 0; 129 return 0;
131} 130}
132 131
133static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, 132static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
134 unsigned long end, pgprot_t prot, struct page ***pages) 133 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
135{ 134{
136 pud_t *pud; 135 pud_t *pud;
137 unsigned long next; 136 unsigned long next;
@@ -141,50 +140,78 @@ static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
141 return -ENOMEM; 140 return -ENOMEM;
142 do { 141 do {
143 next = pud_addr_end(addr, end); 142 next = pud_addr_end(addr, end);
144 if (vmap_pmd_range(pud, addr, next, prot, pages)) 143 if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
145 return -ENOMEM; 144 return -ENOMEM;
146 } while (pud++, addr = next, addr != end); 145 } while (pud++, addr = next, addr != end);
147 return 0; 146 return 0;
148} 147}
149 148
150int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) 149/*
150 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
151 * will have pfns corresponding to the "pages" array.
152 *
153 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
154 */
155static int vmap_page_range(unsigned long addr, unsigned long end,
156 pgprot_t prot, struct page **pages)
151{ 157{
152 pgd_t *pgd; 158 pgd_t *pgd;
153 unsigned long next; 159 unsigned long next;
154 unsigned long addr = (unsigned long) area->addr; 160 int err = 0;
155 unsigned long end = addr + area->size - PAGE_SIZE; 161 int nr = 0;
156 int err;
157 162
158 BUG_ON(addr >= end); 163 BUG_ON(addr >= end);
159 pgd = pgd_offset_k(addr); 164 pgd = pgd_offset_k(addr);
160 do { 165 do {
161 next = pgd_addr_end(addr, end); 166 next = pgd_addr_end(addr, end);
162 err = vmap_pud_range(pgd, addr, next, prot, pages); 167 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
163 if (err) 168 if (err)
164 break; 169 break;
165 } while (pgd++, addr = next, addr != end); 170 } while (pgd++, addr = next, addr != end);
166 flush_cache_vmap((unsigned long) area->addr, end); 171 flush_cache_vmap(addr, end);
167 return err; 172
173 if (unlikely(err))
174 return err;
175 return nr;
176}
177
178static inline int is_vmalloc_or_module_addr(const void *x)
179{
180 /*
181 * x86-64 and sparc64 put modules in a special place,
182 * and fall back on vmalloc() if that fails. Others
183 * just put it in the vmalloc space.
184 */
185#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
186 unsigned long addr = (unsigned long)x;
187 if (addr >= MODULES_VADDR && addr < MODULES_END)
188 return 1;
189#endif
190 return is_vmalloc_addr(x);
168} 191}
169EXPORT_SYMBOL_GPL(map_vm_area);
170 192
171/* 193/*
172 * Map a vmalloc()-space virtual address to the physical page. 194 * Walk a vmap address to the struct page it maps.
173 */ 195 */
174struct page *vmalloc_to_page(const void *vmalloc_addr) 196struct page *vmalloc_to_page(const void *vmalloc_addr)
175{ 197{
176 unsigned long addr = (unsigned long) vmalloc_addr; 198 unsigned long addr = (unsigned long) vmalloc_addr;
177 struct page *page = NULL; 199 struct page *page = NULL;
178 pgd_t *pgd = pgd_offset_k(addr); 200 pgd_t *pgd = pgd_offset_k(addr);
179 pud_t *pud; 201
180 pmd_t *pmd; 202 /*
181 pte_t *ptep, pte; 203 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
204 * architectures that do not vmalloc module space
205 */
206 VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
182 207
183 if (!pgd_none(*pgd)) { 208 if (!pgd_none(*pgd)) {
184 pud = pud_offset(pgd, addr); 209 pud_t *pud = pud_offset(pgd, addr);
185 if (!pud_none(*pud)) { 210 if (!pud_none(*pud)) {
186 pmd = pmd_offset(pud, addr); 211 pmd_t *pmd = pmd_offset(pud, addr);
187 if (!pmd_none(*pmd)) { 212 if (!pmd_none(*pmd)) {
213 pte_t *ptep, pte;
214
188 ptep = pte_offset_map(pmd, addr); 215 ptep = pte_offset_map(pmd, addr);
189 pte = *ptep; 216 pte = *ptep;
190 if (pte_present(pte)) 217 if (pte_present(pte))
@@ -206,13 +233,751 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
206} 233}
207EXPORT_SYMBOL(vmalloc_to_pfn); 234EXPORT_SYMBOL(vmalloc_to_pfn);
208 235
209static struct vm_struct * 236
210__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, 237/*** Global kva allocator ***/
211 unsigned long end, int node, gfp_t gfp_mask, void *caller) 238
239#define VM_LAZY_FREE 0x01
240#define VM_LAZY_FREEING 0x02
241#define VM_VM_AREA 0x04
242
243struct vmap_area {
244 unsigned long va_start;
245 unsigned long va_end;
246 unsigned long flags;
247 struct rb_node rb_node; /* address sorted rbtree */
248 struct list_head list; /* address sorted list */
249 struct list_head purge_list; /* "lazy purge" list */
250 void *private;
251 struct rcu_head rcu_head;
252};
253
254static DEFINE_SPINLOCK(vmap_area_lock);
255static struct rb_root vmap_area_root = RB_ROOT;
256static LIST_HEAD(vmap_area_list);
257
258static struct vmap_area *__find_vmap_area(unsigned long addr)
212{ 259{
213 struct vm_struct **p, *tmp, *area; 260 struct rb_node *n = vmap_area_root.rb_node;
214 unsigned long align = 1; 261
262 while (n) {
263 struct vmap_area *va;
264
265 va = rb_entry(n, struct vmap_area, rb_node);
266 if (addr < va->va_start)
267 n = n->rb_left;
268 else if (addr > va->va_start)
269 n = n->rb_right;
270 else
271 return va;
272 }
273
274 return NULL;
275}
276
277static void __insert_vmap_area(struct vmap_area *va)
278{
279 struct rb_node **p = &vmap_area_root.rb_node;
280 struct rb_node *parent = NULL;
281 struct rb_node *tmp;
282
283 while (*p) {
284 struct vmap_area *tmp;
285
286 parent = *p;
287 tmp = rb_entry(parent, struct vmap_area, rb_node);
288 if (va->va_start < tmp->va_end)
289 p = &(*p)->rb_left;
290 else if (va->va_end > tmp->va_start)
291 p = &(*p)->rb_right;
292 else
293 BUG();
294 }
295
296 rb_link_node(&va->rb_node, parent, p);
297 rb_insert_color(&va->rb_node, &vmap_area_root);
298
299 /* address-sort this list so it is usable like the vmlist */
300 tmp = rb_prev(&va->rb_node);
301 if (tmp) {
302 struct vmap_area *prev;
303 prev = rb_entry(tmp, struct vmap_area, rb_node);
304 list_add_rcu(&va->list, &prev->list);
305 } else
306 list_add_rcu(&va->list, &vmap_area_list);
307}
308
309static void purge_vmap_area_lazy(void);
310
311/*
312 * Allocate a region of KVA of the specified size and alignment, within the
313 * vstart and vend.
314 */
315static struct vmap_area *alloc_vmap_area(unsigned long size,
316 unsigned long align,
317 unsigned long vstart, unsigned long vend,
318 int node, gfp_t gfp_mask)
319{
320 struct vmap_area *va;
321 struct rb_node *n;
322 unsigned long addr;
323 int purged = 0;
324
325 BUG_ON(size & ~PAGE_MASK);
326
327 addr = ALIGN(vstart, align);
328
329 va = kmalloc_node(sizeof(struct vmap_area),
330 gfp_mask & GFP_RECLAIM_MASK, node);
331 if (unlikely(!va))
332 return ERR_PTR(-ENOMEM);
333
334retry:
335 spin_lock(&vmap_area_lock);
336 /* XXX: could have a last_hole cache */
337 n = vmap_area_root.rb_node;
338 if (n) {
339 struct vmap_area *first = NULL;
340
341 do {
342 struct vmap_area *tmp;
343 tmp = rb_entry(n, struct vmap_area, rb_node);
344 if (tmp->va_end >= addr) {
345 if (!first && tmp->va_start < addr + size)
346 first = tmp;
347 n = n->rb_left;
348 } else {
349 first = tmp;
350 n = n->rb_right;
351 }
352 } while (n);
353
354 if (!first)
355 goto found;
356
357 if (first->va_end < addr) {
358 n = rb_next(&first->rb_node);
359 if (n)
360 first = rb_entry(n, struct vmap_area, rb_node);
361 else
362 goto found;
363 }
364
365 while (addr + size >= first->va_start && addr + size <= vend) {
366 addr = ALIGN(first->va_end + PAGE_SIZE, align);
367
368 n = rb_next(&first->rb_node);
369 if (n)
370 first = rb_entry(n, struct vmap_area, rb_node);
371 else
372 goto found;
373 }
374 }
375found:
376 if (addr + size > vend) {
377 spin_unlock(&vmap_area_lock);
378 if (!purged) {
379 purge_vmap_area_lazy();
380 purged = 1;
381 goto retry;
382 }
383 if (printk_ratelimit())
384 printk(KERN_WARNING "vmap allocation failed: "
385 "use vmalloc=<size> to increase size.\n");
386 return ERR_PTR(-EBUSY);
387 }
388
389 BUG_ON(addr & (align-1));
390
391 va->va_start = addr;
392 va->va_end = addr + size;
393 va->flags = 0;
394 __insert_vmap_area(va);
395 spin_unlock(&vmap_area_lock);
396
397 return va;
398}
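/*
 * Sketch of the first-fit search that alloc_vmap_area() performs over the
 * address-sorted tree. This is a simplified userspace model: a sorted array
 * of busy [start, end) ranges instead of an rbtree, no locking, no guard
 * page, and no lazy purge-and-retry path.
 */
#include <stdio.h>

struct range { unsigned long start, end; };	/* existing allocations, sorted by start */

static unsigned long align_up(unsigned long x, unsigned long a)
{
	return (x + a - 1) & ~(a - 1);
}

/* returns an address in [vstart, vend) that overlaps no busy range, or 0 */
static unsigned long first_fit(const struct range *busy, int n,
			       unsigned long size, unsigned long align,
			       unsigned long vstart, unsigned long vend)
{
	unsigned long addr = align_up(vstart, align);

	for (int i = 0; i < n; i++) {
		if (addr + size <= busy[i].start)
			break;					/* hole before busy[i] is big enough */
		if (addr < busy[i].end)
			addr = align_up(busy[i].end, align);	/* skip past this allocation */
	}
	if (addr + size > vend)
		return 0;					/* the kernel purges lazy areas and retries */
	return addr;
}

int main(void)
{
	struct range busy[] = { { 0x1000, 0x3000 }, { 0x5000, 0x6000 } };

	printf("%#lx\n", first_fit(busy, 2, 0x2000, 0x1000, 0x1000, 0x10000)); /* 0x3000 */
	printf("%#lx\n", first_fit(busy, 2, 0x1000, 0x1000, 0x1000, 0x10000)); /* 0x3000 */
	return 0;
}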
399
400static void rcu_free_va(struct rcu_head *head)
401{
402 struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
403
404 kfree(va);
405}
406
407static void __free_vmap_area(struct vmap_area *va)
408{
409 BUG_ON(RB_EMPTY_NODE(&va->rb_node));
410 rb_erase(&va->rb_node, &vmap_area_root);
411 RB_CLEAR_NODE(&va->rb_node);
412 list_del_rcu(&va->list);
413
414 call_rcu(&va->rcu_head, rcu_free_va);
415}
416
417/*
418 * Free a region of KVA allocated by alloc_vmap_area
419 */
420static void free_vmap_area(struct vmap_area *va)
421{
422 spin_lock(&vmap_area_lock);
423 __free_vmap_area(va);
424 spin_unlock(&vmap_area_lock);
425}
426
427/*
428 * Clear the pagetable entries of a given vmap_area
429 */
430static void unmap_vmap_area(struct vmap_area *va)
431{
432 vunmap_page_range(va->va_start, va->va_end);
433}
434
435/*
436 * lazy_max_pages is the maximum amount of virtual address space we gather up
437 * before attempting to purge with a TLB flush.
438 *
439 * There is a tradeoff here: a larger number will cover more kernel page tables
440 * and take slightly longer to purge, but it will linearly reduce the number of
441 * global TLB flushes that must be performed. It would seem natural to scale
442 * this number up linearly with the number of CPUs (because vmapping activity
443 * could also scale linearly with the number of CPUs), however it is likely
444 * that in practice, workloads might be constrained in other ways that mean
445 * vmap activity will not scale linearly with CPUs. Also, I want to be
446 * conservative and not introduce a big latency on huge systems, so go with
447 * a less aggressive log scale. It will still be an improvement over the old
448 * code, and it will be simple to change the scale factor if we find that it
449 * becomes a problem on bigger systems.
450 */
451static unsigned long lazy_max_pages(void)
452{
453 unsigned int log;
454
455 log = fls(num_online_cpus());
456
457 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
458}
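/*
 * Sketch of the lazy-purge threshold above as a userspace model, assuming
 * 4K pages (PAGE_SIZE is per-arch): the amount of stale KVA allowed to
 * accumulate before a global TLB flush grows only logarithmically with the
 * CPU count, 32MB worth of pages per log2 step.
 */
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096UL			/* assumption, arch dependent */

static unsigned int fls_model(unsigned int x)	/* position of highest set bit, 1-based */
{
	unsigned int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

static unsigned long lazy_max_pages_model(unsigned int online_cpus)
{
	return fls_model(online_cpus) * (32UL * 1024 * 1024 / MODEL_PAGE_SIZE);
}

int main(void)
{
	for (unsigned int cpus = 1; cpus <= 64; cpus *= 4)
		printf("%2u cpus -> purge after %lu lazily freed pages (%lu MB)\n",
		       cpus, lazy_max_pages_model(cpus),
		       lazy_max_pages_model(cpus) * MODEL_PAGE_SIZE >> 20);
	return 0;
}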
459
460static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
461
462/*
463 * Purges all lazily-freed vmap areas.
464 *
465 * If sync is 0 then don't purge if there is already a purge in progress.
466 * If force_flush is 1, then flush kernel TLBs between *start and *end even
467 * if we found no lazy vmap areas to unmap (callers can use this to optimise
468 * their own TLB flushing).
469 * Returns with *start = min(*start, lowest purged address)
470 * *end = max(*end, highest purged address)
471 */
472static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
473 int sync, int force_flush)
474{
475 static DEFINE_SPINLOCK(purge_lock);
476 LIST_HEAD(valist);
477 struct vmap_area *va;
478 int nr = 0;
479
480 /*
481 * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
482 * should not expect such behaviour. This just simplifies locking for
483 * the case that isn't actually used at the moment anyway.
484 */
485 if (!sync && !force_flush) {
486 if (!spin_trylock(&purge_lock))
487 return;
488 } else
489 spin_lock(&purge_lock);
490
491 rcu_read_lock();
492 list_for_each_entry_rcu(va, &vmap_area_list, list) {
493 if (va->flags & VM_LAZY_FREE) {
494 if (va->va_start < *start)
495 *start = va->va_start;
496 if (va->va_end > *end)
497 *end = va->va_end;
498 nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
499 unmap_vmap_area(va);
500 list_add_tail(&va->purge_list, &valist);
501 va->flags |= VM_LAZY_FREEING;
502 va->flags &= ~VM_LAZY_FREE;
503 }
504 }
505 rcu_read_unlock();
506
507 if (nr) {
508 BUG_ON(nr > atomic_read(&vmap_lazy_nr));
509 atomic_sub(nr, &vmap_lazy_nr);
510 }
511
512 if (nr || force_flush)
513 flush_tlb_kernel_range(*start, *end);
514
515 if (nr) {
516 spin_lock(&vmap_area_lock);
517 list_for_each_entry(va, &valist, purge_list)
518 __free_vmap_area(va);
519 spin_unlock(&vmap_area_lock);
520 }
521 spin_unlock(&purge_lock);
522}
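/*
 * Sketch of the range accumulation done by __purge_vmap_area_lazy() above
 * (userspace model, no RCU or locking): rather than flushing per area, widen
 * a single [start, end) window over every lazily freed region and issue one
 * TLB flush covering all of them.
 */
#include <limits.h>
#include <stdio.h>

struct lazy_area { unsigned long start, end; int lazy_free; };

static void purge(struct lazy_area *areas, int n)
{
	unsigned long start = ULONG_MAX, end = 0;
	int nr = 0;

	for (int i = 0; i < n; i++) {
		if (!areas[i].lazy_free)
			continue;
		if (areas[i].start < start)
			start = areas[i].start;
		if (areas[i].end > end)
			end = areas[i].end;
		areas[i].lazy_free = 0;
		nr++;
	}
	if (nr)		/* one flush covers every purged area */
		printf("flush_tlb_kernel_range(%#lx, %#lx) for %d areas\n",
		       start, end, nr);
}

int main(void)
{
	struct lazy_area areas[] = {
		{ 0xa000, 0xc000, 1 },
		{ 0x4000, 0x5000, 0 },
		{ 0x1000, 0x2000, 1 },
	};

	purge(areas, 3);	/* flushes [0x1000, 0xc000) once */
	return 0;
}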
523
524/*
525 * Kick off a purge of the outstanding lazy areas.
526 */
527static void purge_vmap_area_lazy(void)
528{
529 unsigned long start = ULONG_MAX, end = 0;
530
531 __purge_vmap_area_lazy(&start, &end, 0, 0);
532}
533
534/*
535 * Free and unmap a vmap area
536 */
537static void free_unmap_vmap_area(struct vmap_area *va)
538{
539 va->flags |= VM_LAZY_FREE;
540 atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
541 if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
542 purge_vmap_area_lazy();
543}
544
545static struct vmap_area *find_vmap_area(unsigned long addr)
546{
547 struct vmap_area *va;
548
549 spin_lock(&vmap_area_lock);
550 va = __find_vmap_area(addr);
551 spin_unlock(&vmap_area_lock);
552
553 return va;
554}
555
556static void free_unmap_vmap_area_addr(unsigned long addr)
557{
558 struct vmap_area *va;
559
560 va = find_vmap_area(addr);
561 BUG_ON(!va);
562 free_unmap_vmap_area(va);
563}
564
565
566/*** Per cpu kva allocator ***/
567
568/*
569 * vmap space is limited especially on 32 bit architectures. Ensure there is
570 * room for at least 16 percpu vmap blocks per CPU.
571 */
572/*
573 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
574 * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess
575 * instead (we just need a rough idea)
576 */
577#if BITS_PER_LONG == 32
578#define VMALLOC_SPACE (128UL*1024*1024)
579#else
580#define VMALLOC_SPACE (128UL*1024*1024*1024)
581#endif
582
583#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE)
584#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */
585#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */
586#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
587#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
588#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
589#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
590 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
591 VMALLOC_PAGES / NR_CPUS / 16))
592
593#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
594
595struct vmap_block_queue {
596 spinlock_t lock;
597 struct list_head free;
598 struct list_head dirty;
599 unsigned int nr_dirty;
600};
601
602struct vmap_block {
603 spinlock_t lock;
604 struct vmap_area *va;
605 struct vmap_block_queue *vbq;
606 unsigned long free, dirty;
607 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
608 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
609 union {
610 struct {
611 struct list_head free_list;
612 struct list_head dirty_list;
613 };
614 struct rcu_head rcu_head;
615 };
616};
617
618/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
619static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
620
621/*
622 * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
623 * in the free path. Could get rid of this if we change the API to return a
624 * "cookie" from alloc, to be passed to free. But no big deal yet.
625 */
626static DEFINE_SPINLOCK(vmap_block_tree_lock);
627static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
628
629/*
630 * We should probably have a fallback mechanism to allocate virtual memory
631 * out of partially filled vmap blocks. However vmap block sizing should be
632 * fairly reasonable according to the vmalloc size, so it shouldn't be a
633 * big problem.
634 */
635
636static unsigned long addr_to_vb_idx(unsigned long addr)
637{
638 addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
639 addr /= VMAP_BLOCK_SIZE;
640 return addr;
641}
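/*
 * Sketch of the per-cpu vmap block sizing above, as userspace arithmetic.
 * The 4K page size, the 128MB 32-bit VMALLOC_SPACE guess, VMALLOC_START and
 * NR_CPUS values are assumptions just to make the numbers concrete: the
 * bitmap per block is clamped between 2*VMAP_MAX_ALLOC and 1024 bits while
 * aiming for at least 16 blocks per CPU, and a vmap address maps to a block
 * index by subtracting an aligned base and dividing by the block size, as
 * addr_to_vb_idx() does.
 */
#include <stdio.h>

#define MODEL_PAGE_SIZE		4096UL
#define MODEL_VMALLOC_SPACE	(128UL * 1024 * 1024)	/* 32-bit guess from the code above */
#define MODEL_VMALLOC_START	0xf0000000UL		/* assumption, arch dependent */
#define MODEL_NR_CPUS		4
#define MODEL_MAX_ALLOC		32			/* BITS_PER_LONG on 32-bit */

#define MODEL_MIN(x, y)		((x) < (y) ? (x) : (y))
#define MODEL_MAX(x, y)		((x) > (y) ? (x) : (y))

int main(void)
{
	unsigned long vmalloc_pages = MODEL_VMALLOC_SPACE / MODEL_PAGE_SIZE;
	unsigned long bbmap_bits = MODEL_MIN(1024UL,
				   MODEL_MAX(2UL * MODEL_MAX_ALLOC,
					     vmalloc_pages / MODEL_NR_CPUS / 16));
	unsigned long block_size = bbmap_bits * MODEL_PAGE_SIZE;
	unsigned long addr = MODEL_VMALLOC_START + 5 * block_size + 3 * MODEL_PAGE_SIZE;

	/* same computation as addr_to_vb_idx(): offset from an aligned base, then divide */
	unsigned long idx = (addr - (MODEL_VMALLOC_START & ~(block_size - 1))) / block_size;

	printf("bitmap bits per block: %lu, block size: %lu KB, idx(addr) = %lu\n",
	       bbmap_bits, block_size >> 10, idx);
	return 0;
}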
642
643static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
644{
645 struct vmap_block_queue *vbq;
646 struct vmap_block *vb;
647 struct vmap_area *va;
648 unsigned long vb_idx;
649 int node, err;
650
651 node = numa_node_id();
652
653 vb = kmalloc_node(sizeof(struct vmap_block),
654 gfp_mask & GFP_RECLAIM_MASK, node);
655 if (unlikely(!vb))
656 return ERR_PTR(-ENOMEM);
657
658 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
659 VMALLOC_START, VMALLOC_END,
660 node, gfp_mask);
661 if (unlikely(IS_ERR(va))) {
662 kfree(vb);
663 return ERR_PTR(PTR_ERR(va));
664 }
665
666 err = radix_tree_preload(gfp_mask);
667 if (unlikely(err)) {
668 kfree(vb);
669 free_vmap_area(va);
670 return ERR_PTR(err);
671 }
672
673 spin_lock_init(&vb->lock);
674 vb->va = va;
675 vb->free = VMAP_BBMAP_BITS;
676 vb->dirty = 0;
677 bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
678 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
679 INIT_LIST_HEAD(&vb->free_list);
680 INIT_LIST_HEAD(&vb->dirty_list);
681
682 vb_idx = addr_to_vb_idx(va->va_start);
683 spin_lock(&vmap_block_tree_lock);
684 err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
685 spin_unlock(&vmap_block_tree_lock);
686 BUG_ON(err);
687 radix_tree_preload_end();
688
689 vbq = &get_cpu_var(vmap_block_queue);
690 vb->vbq = vbq;
691 spin_lock(&vbq->lock);
692 list_add(&vb->free_list, &vbq->free);
693 spin_unlock(&vbq->lock);
694 put_cpu_var(vmap_cpu_blocks);
695
696 return vb;
697}
698
699static void rcu_free_vb(struct rcu_head *head)
700{
701 struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
702
703 kfree(vb);
704}
705
706static void free_vmap_block(struct vmap_block *vb)
707{
708 struct vmap_block *tmp;
709 unsigned long vb_idx;
710
711 spin_lock(&vb->vbq->lock);
712 if (!list_empty(&vb->free_list))
713 list_del(&vb->free_list);
714 if (!list_empty(&vb->dirty_list))
715 list_del(&vb->dirty_list);
716 spin_unlock(&vb->vbq->lock);
717
718 vb_idx = addr_to_vb_idx(vb->va->va_start);
719 spin_lock(&vmap_block_tree_lock);
720 tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
721 spin_unlock(&vmap_block_tree_lock);
722 BUG_ON(tmp != vb);
723
724 free_unmap_vmap_area(vb->va);
725 call_rcu(&vb->rcu_head, rcu_free_vb);
726}
727
728static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
729{
730 struct vmap_block_queue *vbq;
731 struct vmap_block *vb;
732 unsigned long addr = 0;
733 unsigned int order;
734
735 BUG_ON(size & ~PAGE_MASK);
736 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
737 order = get_order(size);
738
739again:
740 rcu_read_lock();
741 vbq = &get_cpu_var(vmap_block_queue);
742 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
743 int i;
744
745 spin_lock(&vb->lock);
746 i = bitmap_find_free_region(vb->alloc_map,
747 VMAP_BBMAP_BITS, order);
748
749 if (i >= 0) {
750 addr = vb->va->va_start + (i << PAGE_SHIFT);
751 BUG_ON(addr_to_vb_idx(addr) !=
752 addr_to_vb_idx(vb->va->va_start));
753 vb->free -= 1UL << order;
754 if (vb->free == 0) {
755 spin_lock(&vbq->lock);
756 list_del_init(&vb->free_list);
757 spin_unlock(&vbq->lock);
758 }
759 spin_unlock(&vb->lock);
760 break;
761 }
762 spin_unlock(&vb->lock);
763 }
764 put_cpu_var(vmap_cpu_blocks);
765 rcu_read_unlock();
766
767 if (!addr) {
768 vb = new_vmap_block(gfp_mask);
769 if (IS_ERR(vb))
770 return vb;
771 goto again;
772 }
773
774 return (void *)addr;
775}
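/*
 * Sketch of the order-sized bitmap allocation that vb_alloc() relies on: a
 * userspace stand-in for bitmap_find_free_region(), which claims a naturally
 * aligned run of 1 << order free bits and returns its offset, or -1 if no
 * such run exists. NBITS stands in for VMAP_BBMAP_BITS.
 */
#include <stdio.h>
#include <string.h>

#define NBITS 64

static int test_bit_model(const unsigned char *map, int bit)
{
	return map[bit / 8] & (1 << (bit % 8));
}

static void set_bit_model(unsigned char *map, int bit)
{
	map[bit / 8] |= 1 << (bit % 8);
}

static int find_free_region(unsigned char *map, int nbits, int order)
{
	int step = 1 << order;

	for (int pos = 0; pos + step <= nbits; pos += step) {	/* aligned to the region size */
		int busy = 0;

		for (int i = 0; i < step && !busy; i++)
			busy = test_bit_model(map, pos + i);
		if (!busy) {
			for (int i = 0; i < step; i++)
				set_bit_model(map, pos + i);
			return pos;
		}
	}
	return -1;
}

int main(void)
{
	unsigned char map[NBITS / 8];

	memset(map, 0, sizeof(map));
	printf("order 2 -> bit %d\n", find_free_region(map, NBITS, 2));	/* 0 */
	printf("order 0 -> bit %d\n", find_free_region(map, NBITS, 0));	/* 4 */
	printf("order 2 -> bit %d\n", find_free_region(map, NBITS, 2));	/* 8 */
	return 0;
}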
776
777static void vb_free(const void *addr, unsigned long size)
778{
779 unsigned long offset;
780 unsigned long vb_idx;
781 unsigned int order;
782 struct vmap_block *vb;
783
784 BUG_ON(size & ~PAGE_MASK);
785 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
786 order = get_order(size);
787
788 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
789
790 vb_idx = addr_to_vb_idx((unsigned long)addr);
791 rcu_read_lock();
792 vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
793 rcu_read_unlock();
794 BUG_ON(!vb);
795
796 spin_lock(&vb->lock);
797 bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
798 if (!vb->dirty) {
799 spin_lock(&vb->vbq->lock);
800 list_add(&vb->dirty_list, &vb->vbq->dirty);
801 spin_unlock(&vb->vbq->lock);
802 }
803 vb->dirty += 1UL << order;
804 if (vb->dirty == VMAP_BBMAP_BITS) {
805 BUG_ON(vb->free || !list_empty(&vb->free_list));
806 spin_unlock(&vb->lock);
807 free_vmap_block(vb);
808 } else
809 spin_unlock(&vb->lock);
810}
811
812/**
813 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
814 *
815 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
816 * to amortize TLB flushing overheads. What this means is that any page you
817 * have now, may, in a former life, have been mapped into kernel virtual
818 * address by the vmap layer and so there might be some CPUs with TLB entries
819 * still referencing that page (additional to the regular 1:1 kernel mapping).
820 *
821 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
822 * be sure that none of the pages we have control over will have any aliases
823 * from the vmap layer.
824 */
825void vm_unmap_aliases(void)
826{
827 unsigned long start = ULONG_MAX, end = 0;
828 int cpu;
829 int flush = 0;
830
831 for_each_possible_cpu(cpu) {
832 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
833 struct vmap_block *vb;
834
835 rcu_read_lock();
836 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
837 int i;
838
839 spin_lock(&vb->lock);
840 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
841 while (i < VMAP_BBMAP_BITS) {
842 unsigned long s, e;
843 int j;
844 j = find_next_zero_bit(vb->dirty_map,
845 VMAP_BBMAP_BITS, i);
846
847 s = vb->va->va_start + (i << PAGE_SHIFT);
848 e = vb->va->va_start + (j << PAGE_SHIFT);
849 vunmap_page_range(s, e);
850 flush = 1;
851
852 if (s < start)
853 start = s;
854 if (e > end)
855 end = e;
856
857 i = j;
858 i = find_next_bit(vb->dirty_map,
859 VMAP_BBMAP_BITS, i);
860 }
861 spin_unlock(&vb->lock);
862 }
863 rcu_read_unlock();
864 }
865
866 __purge_vmap_area_lazy(&start, &end, 1, flush);
867}
868EXPORT_SYMBOL_GPL(vm_unmap_aliases);
869
870/**
871 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
872 * @mem: the pointer returned by vm_map_ram
873 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
874 */
875void vm_unmap_ram(const void *mem, unsigned int count)
876{
877 unsigned long size = count << PAGE_SHIFT;
878 unsigned long addr = (unsigned long)mem;
879
880 BUG_ON(!addr);
881 BUG_ON(addr < VMALLOC_START);
882 BUG_ON(addr > VMALLOC_END);
883 BUG_ON(addr & (PAGE_SIZE-1));
884
885 debug_check_no_locks_freed(mem, size);
886
887 if (likely(count <= VMAP_MAX_ALLOC))
888 vb_free(mem, size);
889 else
890 free_unmap_vmap_area_addr(addr);
891}
892EXPORT_SYMBOL(vm_unmap_ram);
893
894/**
895 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
896 * @pages: an array of pointers to the pages to be mapped
897 * @count: number of pages
898 * @node: prefer to allocate data structures on this node
899 * @prot: memory protection to use. PAGE_KERNEL for regular RAM
900 * @returns: a pointer to the address that has been mapped, or NULL on failure
901 */
902void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
903{
904 unsigned long size = count << PAGE_SHIFT;
215 unsigned long addr; 905 unsigned long addr;
906 void *mem;
907
908 if (likely(count <= VMAP_MAX_ALLOC)) {
909 mem = vb_alloc(size, GFP_KERNEL);
910 if (IS_ERR(mem))
911 return NULL;
912 addr = (unsigned long)mem;
913 } else {
914 struct vmap_area *va;
915 va = alloc_vmap_area(size, PAGE_SIZE,
916 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
917 if (IS_ERR(va))
918 return NULL;
919
920 addr = va->va_start;
921 mem = (void *)addr;
922 }
923 if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
924 vm_unmap_ram(mem, count);
925 return NULL;
926 }
927 return mem;
928}
929EXPORT_SYMBOL(vm_map_ram);
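/*
 * Hedged usage sketch, not part of the patch: the intended pairing of the
 * new vm_map_ram()/vm_unmap_ram() interfaces documented above. The helper
 * names are invented, a node of -1 is used for "no preference", and error
 * handling plus the page allocation strategy are left to the caller.
 */
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *map_four_pages_example(struct page *pages[4])
{
	/* the count passed here must be remembered for vm_unmap_ram() */
	return vm_map_ram(pages, 4, -1, PAGE_KERNEL);
}

static void unmap_four_pages_example(void *mem)
{
	vm_unmap_ram(mem, 4);
}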
930
931void __init vmalloc_init(void)
932{
933 int i;
934
935 for_each_possible_cpu(i) {
936 struct vmap_block_queue *vbq;
937
938 vbq = &per_cpu(vmap_block_queue, i);
939 spin_lock_init(&vbq->lock);
940 INIT_LIST_HEAD(&vbq->free);
941 INIT_LIST_HEAD(&vbq->dirty);
942 vbq->nr_dirty = 0;
943 }
944}
945
946void unmap_kernel_range(unsigned long addr, unsigned long size)
947{
948 unsigned long end = addr + size;
949 vunmap_page_range(addr, end);
950 flush_tlb_kernel_range(addr, end);
951}
952
953int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
954{
955 unsigned long addr = (unsigned long)area->addr;
956 unsigned long end = addr + area->size - PAGE_SIZE;
957 int err;
958
959 err = vmap_page_range(addr, end, prot, *pages);
960 if (err > 0) {
961 *pages += err;
962 err = 0;
963 }
964
965 return err;
966}
967EXPORT_SYMBOL_GPL(map_vm_area);
968
969/*** Old vmalloc interfaces ***/
970DEFINE_RWLOCK(vmlist_lock);
971struct vm_struct *vmlist;
972
973static struct vm_struct *__get_vm_area_node(unsigned long size,
974 unsigned long flags, unsigned long start, unsigned long end,
975 int node, gfp_t gfp_mask, void *caller)
976{
977 static struct vmap_area *va;
978 struct vm_struct *area;
979 struct vm_struct *tmp, **p;
980 unsigned long align = 1;
216 981
217 BUG_ON(in_interrupt()); 982 BUG_ON(in_interrupt());
218 if (flags & VM_IOREMAP) { 983 if (flags & VM_IOREMAP) {
@@ -225,13 +990,12 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
225 990
226 align = 1ul << bit; 991 align = 1ul << bit;
227 } 992 }
228 addr = ALIGN(start, align); 993
229 size = PAGE_ALIGN(size); 994 size = PAGE_ALIGN(size);
230 if (unlikely(!size)) 995 if (unlikely(!size))
231 return NULL; 996 return NULL;
232 997
233 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 998 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
234
235 if (unlikely(!area)) 999 if (unlikely(!area))
236 return NULL; 1000 return NULL;
237 1001
@@ -240,48 +1004,32 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
240 */ 1004 */
241 size += PAGE_SIZE; 1005 size += PAGE_SIZE;
242 1006
243 write_lock(&vmlist_lock); 1007 va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
244 for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) { 1008 if (IS_ERR(va)) {
245 if ((unsigned long)tmp->addr < addr) { 1009 kfree(area);
246 if((unsigned long)tmp->addr + tmp->size >= addr) 1010 return NULL;
247 addr = ALIGN(tmp->size +
248 (unsigned long)tmp->addr, align);
249 continue;
250 }
251 if ((size + addr) < addr)
252 goto out;
253 if (size + addr <= (unsigned long)tmp->addr)
254 goto found;
255 addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align);
256 if (addr > end - size)
257 goto out;
258 } 1011 }
259 if ((size + addr) < addr)
260 goto out;
261 if (addr > end - size)
262 goto out;
263
264found:
265 area->next = *p;
266 *p = area;
267 1012
268 area->flags = flags; 1013 area->flags = flags;
269 area->addr = (void *)addr; 1014 area->addr = (void *)va->va_start;
270 area->size = size; 1015 area->size = size;
271 area->pages = NULL; 1016 area->pages = NULL;
272 area->nr_pages = 0; 1017 area->nr_pages = 0;
273 area->phys_addr = 0; 1018 area->phys_addr = 0;
274 area->caller = caller; 1019 area->caller = caller;
1020 va->private = area;
1021 va->flags |= VM_VM_AREA;
1022
1023 write_lock(&vmlist_lock);
1024 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1025 if (tmp->addr >= area->addr)
1026 break;
1027 }
1028 area->next = *p;
1029 *p = area;
275 write_unlock(&vmlist_lock); 1030 write_unlock(&vmlist_lock);
276 1031
277 return area; 1032 return area;
278
279out:
280 write_unlock(&vmlist_lock);
281 kfree(area);
282 if (printk_ratelimit())
283 printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n");
284 return NULL;
285} 1033}
286 1034
287struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 1035struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
@@ -321,39 +1069,15 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
321 gfp_mask, __builtin_return_address(0)); 1069 gfp_mask, __builtin_return_address(0));
322} 1070}
323 1071
324/* Caller must hold vmlist_lock */ 1072static struct vm_struct *find_vm_area(const void *addr)
325static struct vm_struct *__find_vm_area(const void *addr)
326{ 1073{
327 struct vm_struct *tmp; 1074 struct vmap_area *va;
328 1075
329 for (tmp = vmlist; tmp != NULL; tmp = tmp->next) { 1076 va = find_vmap_area((unsigned long)addr);
330 if (tmp->addr == addr) 1077 if (va && va->flags & VM_VM_AREA)
331 break; 1078 return va->private;
332 }
333 1079
334 return tmp;
335}
336
337/* Caller must hold vmlist_lock */
338static struct vm_struct *__remove_vm_area(const void *addr)
339{
340 struct vm_struct **p, *tmp;
341
342 for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) {
343 if (tmp->addr == addr)
344 goto found;
345 }
346 return NULL; 1080 return NULL;
347
348found:
349 unmap_vm_area(tmp);
350 *p = tmp->next;
351
352 /*
353 * Remove the guard page.
354 */
355 tmp->size -= PAGE_SIZE;
356 return tmp;
357} 1081}
358 1082
359/** 1083/**
@@ -366,11 +1090,24 @@ found:
366 */ 1090 */
367struct vm_struct *remove_vm_area(const void *addr) 1091struct vm_struct *remove_vm_area(const void *addr)
368{ 1092{
369 struct vm_struct *v; 1093 struct vmap_area *va;
370 write_lock(&vmlist_lock); 1094
371 v = __remove_vm_area(addr); 1095 va = find_vmap_area((unsigned long)addr);
372 write_unlock(&vmlist_lock); 1096 if (va && va->flags & VM_VM_AREA) {
373 return v; 1097 struct vm_struct *vm = va->private;
1098 struct vm_struct *tmp, **p;
1099 free_unmap_vmap_area(va);
1100 vm->size -= PAGE_SIZE;
1101
1102 write_lock(&vmlist_lock);
1103 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1104 ;
1105 *p = tmp->next;
1106 write_unlock(&vmlist_lock);
1107
1108 return vm;
1109 }
1110 return NULL;
374} 1111}
375 1112
376static void __vunmap(const void *addr, int deallocate_pages) 1113static void __vunmap(const void *addr, int deallocate_pages)
@@ -381,16 +1118,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
381 return; 1118 return;
382 1119
383 if ((PAGE_SIZE-1) & (unsigned long)addr) { 1120 if ((PAGE_SIZE-1) & (unsigned long)addr) {
384 printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); 1121 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
385 WARN_ON(1);
386 return; 1122 return;
387 } 1123 }
388 1124
389 area = remove_vm_area(addr); 1125 area = remove_vm_area(addr);
390 if (unlikely(!area)) { 1126 if (unlikely(!area)) {
391 printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 1127 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
392 addr); 1128 addr);
393 WARN_ON(1);
394 return; 1129 return;
395 } 1130 }
396 1131
@@ -482,6 +1217,8 @@ void *vmap(struct page **pages, unsigned int count,
482} 1217}
483EXPORT_SYMBOL(vmap); 1218EXPORT_SYMBOL(vmap);
484 1219
1220static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1221 int node, void *caller);
485static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1222static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
486 pgprot_t prot, int node, void *caller) 1223 pgprot_t prot, int node, void *caller)
487{ 1224{
@@ -608,10 +1345,8 @@ void *vmalloc_user(unsigned long size)
608 1345
609 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); 1346 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
610 if (ret) { 1347 if (ret) {
611 write_lock(&vmlist_lock); 1348 area = find_vm_area(ret);
612 area = __find_vm_area(ret);
613 area->flags |= VM_USERMAP; 1349 area->flags |= VM_USERMAP;
614 write_unlock(&vmlist_lock);
615 } 1350 }
616 return ret; 1351 return ret;
617} 1352}
@@ -691,10 +1426,8 @@ void *vmalloc_32_user(unsigned long size)
691 1426
692 ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); 1427 ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL);
693 if (ret) { 1428 if (ret) {
694 write_lock(&vmlist_lock); 1429 area = find_vm_area(ret);
695 area = __find_vm_area(ret);
696 area->flags |= VM_USERMAP; 1430 area->flags |= VM_USERMAP;
697 write_unlock(&vmlist_lock);
698 } 1431 }
699 return ret; 1432 return ret;
700} 1433}
@@ -795,26 +1528,25 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
795 struct vm_struct *area; 1528 struct vm_struct *area;
796 unsigned long uaddr = vma->vm_start; 1529 unsigned long uaddr = vma->vm_start;
797 unsigned long usize = vma->vm_end - vma->vm_start; 1530 unsigned long usize = vma->vm_end - vma->vm_start;
798 int ret;
799 1531
800 if ((PAGE_SIZE-1) & (unsigned long)addr) 1532 if ((PAGE_SIZE-1) & (unsigned long)addr)
801 return -EINVAL; 1533 return -EINVAL;
802 1534
803 read_lock(&vmlist_lock); 1535 area = find_vm_area(addr);
804 area = __find_vm_area(addr);
805 if (!area) 1536 if (!area)
806 goto out_einval_locked; 1537 return -EINVAL;
807 1538
808 if (!(area->flags & VM_USERMAP)) 1539 if (!(area->flags & VM_USERMAP))
809 goto out_einval_locked; 1540 return -EINVAL;
810 1541
811 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) 1542 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
812 goto out_einval_locked; 1543 return -EINVAL;
813 read_unlock(&vmlist_lock);
814 1544
815 addr += pgoff << PAGE_SHIFT; 1545 addr += pgoff << PAGE_SHIFT;
816 do { 1546 do {
817 struct page *page = vmalloc_to_page(addr); 1547 struct page *page = vmalloc_to_page(addr);
1548 int ret;
1549
818 ret = vm_insert_page(vma, uaddr, page); 1550 ret = vm_insert_page(vma, uaddr, page);
819 if (ret) 1551 if (ret)
820 return ret; 1552 return ret;
@@ -827,11 +1559,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
827 /* Prevent "things" like memory migration? VM_flags need a cleanup... */ 1559 /* Prevent "things" like memory migration? VM_flags need a cleanup... */
828 vma->vm_flags |= VM_RESERVED; 1560 vma->vm_flags |= VM_RESERVED;
829 1561
830 return ret; 1562 return 0;
831
832out_einval_locked:
833 read_unlock(&vmlist_lock);
834 return -EINVAL;
835} 1563}
836EXPORT_SYMBOL(remap_vmalloc_range); 1564EXPORT_SYMBOL(remap_vmalloc_range);
837 1565
@@ -931,6 +1659,25 @@ static void s_stop(struct seq_file *m, void *p)
931 read_unlock(&vmlist_lock); 1659 read_unlock(&vmlist_lock);
932} 1660}
933 1661
1662static void show_numa_info(struct seq_file *m, struct vm_struct *v)
1663{
1664 if (NUMA_BUILD) {
1665 unsigned int nr, *counters = m->private;
1666
1667 if (!counters)
1668 return;
1669
1670 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
1671
1672 for (nr = 0; nr < v->nr_pages; nr++)
1673 counters[page_to_nid(v->pages[nr])]++;
1674
1675 for_each_node_state(nr, N_HIGH_MEMORY)
1676 if (counters[nr])
1677 seq_printf(m, " N%u=%u", nr, counters[nr]);
1678 }
1679}
1680
934static int s_show(struct seq_file *m, void *p) 1681static int s_show(struct seq_file *m, void *p)
935{ 1682{
936 struct vm_struct *v = p; 1683 struct vm_struct *v = p;
@@ -967,15 +1714,46 @@ static int s_show(struct seq_file *m, void *p)
967 if (v->flags & VM_VPAGES) 1714 if (v->flags & VM_VPAGES)
968 seq_printf(m, " vpages"); 1715 seq_printf(m, " vpages");
969 1716
1717 show_numa_info(m, v);
970 seq_putc(m, '\n'); 1718 seq_putc(m, '\n');
971 return 0; 1719 return 0;
972} 1720}
973 1721
974const struct seq_operations vmalloc_op = { 1722static const struct seq_operations vmalloc_op = {
975 .start = s_start, 1723 .start = s_start,
976 .next = s_next, 1724 .next = s_next,
977 .stop = s_stop, 1725 .stop = s_stop,
978 .show = s_show, 1726 .show = s_show,
979}; 1727};
1728
1729static int vmalloc_open(struct inode *inode, struct file *file)
1730{
1731 unsigned int *ptr = NULL;
1732 int ret;
1733
1734 if (NUMA_BUILD)
1735 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
1736 ret = seq_open(file, &vmalloc_op);
1737 if (!ret) {
1738 struct seq_file *m = file->private_data;
1739 m->private = ptr;
1740 } else
1741 kfree(ptr);
1742 return ret;
1743}
1744
1745static const struct file_operations proc_vmalloc_operations = {
1746 .open = vmalloc_open,
1747 .read = seq_read,
1748 .llseek = seq_lseek,
1749 .release = seq_release_private,
1750};
1751
1752static int __init proc_vmalloc_init(void)
1753{
1754 proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
1755 return 0;
1756}
1757module_init(proc_vmalloc_init);
980#endif 1758#endif
981 1759
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 967d30ccd92b..3b5860294bb6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -38,6 +38,8 @@
38#include <linux/kthread.h> 38#include <linux/kthread.h>
39#include <linux/freezer.h> 39#include <linux/freezer.h>
40#include <linux/memcontrol.h> 40#include <linux/memcontrol.h>
41#include <linux/delayacct.h>
42#include <linux/sysctl.h>
41 43
42#include <asm/tlbflush.h> 44#include <asm/tlbflush.h>
43#include <asm/div64.h> 45#include <asm/div64.h>
@@ -77,7 +79,7 @@ struct scan_control {
77 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, 79 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
78 unsigned long *scanned, int order, int mode, 80 unsigned long *scanned, int order, int mode,
79 struct zone *z, struct mem_cgroup *mem_cont, 81 struct zone *z, struct mem_cgroup *mem_cont,
80 int active); 82 int active, int file);
81}; 83};
82 84
83#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 85#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -390,17 +392,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
390} 392}
391 393
392/* 394/*
393 * Attempt to detach a locked page from its ->mapping. If it is dirty or if 395 * Same as remove_mapping, but if the page is removed from the mapping, it
394 * someone else has a ref on the page, abort and return 0. If it was 396 * gets returned with a refcount of 0.
395 * successfully detached, return 1. Assumes the caller has a single ref on
396 * this page.
397 */ 397 */
398int remove_mapping(struct address_space *mapping, struct page *page) 398static int __remove_mapping(struct address_space *mapping, struct page *page)
399{ 399{
400 BUG_ON(!PageLocked(page)); 400 BUG_ON(!PageLocked(page));
401 BUG_ON(mapping != page_mapping(page)); 401 BUG_ON(mapping != page_mapping(page));
402 402
403 write_lock_irq(&mapping->tree_lock); 403 spin_lock_irq(&mapping->tree_lock);
404 /* 404 /*
405 * The non racy check for a busy page. 405 * The non racy check for a busy page.
406 * 406 *
@@ -426,32 +426,131 @@ int remove_mapping(struct address_space *mapping, struct page *page)
426 * Note that if SetPageDirty is always performed via set_page_dirty, 426 * Note that if SetPageDirty is always performed via set_page_dirty,
427 * and thus under tree_lock, then this ordering is not required. 427 * and thus under tree_lock, then this ordering is not required.
428 */ 428 */
429 if (unlikely(page_count(page) != 2)) 429 if (!page_freeze_refs(page, 2))
430 goto cannot_free; 430 goto cannot_free;
431 smp_rmb(); 431 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
432 if (unlikely(PageDirty(page))) 432 if (unlikely(PageDirty(page))) {
433 page_unfreeze_refs(page, 2);
433 goto cannot_free; 434 goto cannot_free;
435 }
434 436
435 if (PageSwapCache(page)) { 437 if (PageSwapCache(page)) {
436 swp_entry_t swap = { .val = page_private(page) }; 438 swp_entry_t swap = { .val = page_private(page) };
437 __delete_from_swap_cache(page); 439 __delete_from_swap_cache(page);
438 write_unlock_irq(&mapping->tree_lock); 440 spin_unlock_irq(&mapping->tree_lock);
439 swap_free(swap); 441 swap_free(swap);
440 __put_page(page); /* The pagecache ref */ 442 } else {
441 return 1; 443 __remove_from_page_cache(page);
444 spin_unlock_irq(&mapping->tree_lock);
442 } 445 }
443 446
444 __remove_from_page_cache(page);
445 write_unlock_irq(&mapping->tree_lock);
446 __put_page(page);
447 return 1; 447 return 1;
448 448
449cannot_free: 449cannot_free:
450 write_unlock_irq(&mapping->tree_lock); 450 spin_unlock_irq(&mapping->tree_lock);
451 return 0; 451 return 0;
452} 452}
453 453
454/* 454/*
455 * Attempt to detach a locked page from its ->mapping. If it is dirty or if
456 * someone else has a ref on the page, abort and return 0. If it was
457 * successfully detached, return 1. Assumes the caller has a single ref on
458 * this page.
459 */
460int remove_mapping(struct address_space *mapping, struct page *page)
461{
462 if (__remove_mapping(mapping, page)) {
463 /*
464 * Unfreezing the refcount with 1 rather than 2 effectively
465 * drops the pagecache ref for us without requiring another
466 * atomic operation.
467 */
468 page_unfreeze_refs(page, 1);
469 return 1;
470 }
471 return 0;
472}
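/*
 * Userspace model (sketch only; C11 atomics instead of page_freeze_refs()
 * and page_unfreeze_refs()) of the refcount trick used by __remove_mapping()
 * above: freezing succeeds only if the count is exactly the expected value
 * and drops it to 0, and unfreezing with expected-1 re-publishes the count
 * with the pagecache reference already gone, saving a second atomic op.
 */
#include <stdatomic.h>
#include <stdio.h>

static int freeze_refs(atomic_int *count, int expected)
{
	/* succeeds only when nobody else holds an extra reference */
	return atomic_compare_exchange_strong(count, &expected, 0);
}

static void unfreeze_refs(atomic_int *count, int value)
{
	atomic_store(count, value);
}

int main(void)
{
	atomic_int refs = 2;	/* caller's ref + pagecache ref */

	if (freeze_refs(&refs, 2)) {
		/* ... page removed from the mapping while frozen ... */
		unfreeze_refs(&refs, 1);	/* pagecache ref dropped implicitly */
	}
	printf("refcount now %d\n", atomic_load(&refs));	/* 1 */
	return 0;
}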
473
474/**
475 * putback_lru_page - put previously isolated page onto appropriate LRU list
476 * @page: page to be put back to appropriate lru list
477 *
478 * Add previously isolated @page to appropriate LRU list.
479 * Page may still be unevictable for other reasons.
480 *
481 * lru_lock must not be held, interrupts must be enabled.
482 */
483#ifdef CONFIG_UNEVICTABLE_LRU
484void putback_lru_page(struct page *page)
485{
486 int lru;
487 int active = !!TestClearPageActive(page);
488 int was_unevictable = PageUnevictable(page);
489
490 VM_BUG_ON(PageLRU(page));
491
492redo:
493 ClearPageUnevictable(page);
494
495 if (page_evictable(page, NULL)) {
496 /*
497 * For evictable pages, we can use the cache.
498 * In event of a race, worst case is we end up with an
499 * unevictable page on [in]active list.
500 * We know how to handle that.
501 */
502 lru = active + page_is_file_cache(page);
503 lru_cache_add_lru(page, lru);
504 } else {
505 /*
506 * Put unevictable pages directly on zone's unevictable
507 * list.
508 */
509 lru = LRU_UNEVICTABLE;
510 add_page_to_unevictable_list(page);
511 }
512 mem_cgroup_move_lists(page, lru);
513
514 /*
515 * page's status can change while we move it among lru. If an evictable
516 * page is on unevictable list, it never be freed. To avoid that,
517 * check after we added it to the list, again.
518 */
519 if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
520 if (!isolate_lru_page(page)) {
521 put_page(page);
522 goto redo;
523 }
524 /* This means someone else dropped this page from LRU
525 * So, it will be freed or putback to LRU again. There is
526 * nothing to do here.
527 */
528 }
529
530 if (was_unevictable && lru != LRU_UNEVICTABLE)
531 count_vm_event(UNEVICTABLE_PGRESCUED);
532 else if (!was_unevictable && lru == LRU_UNEVICTABLE)
533 count_vm_event(UNEVICTABLE_PGCULLED);
534
535 put_page(page); /* drop ref from isolate */
536}
537
538#else /* CONFIG_UNEVICTABLE_LRU */
539
540void putback_lru_page(struct page *page)
541{
542 int lru;
543 VM_BUG_ON(PageLRU(page));
544
545 lru = !!TestClearPageActive(page) + page_is_file_cache(page);
546 lru_cache_add_lru(page, lru);
547 mem_cgroup_move_lists(page, lru);
548 put_page(page);
549}
550#endif /* CONFIG_UNEVICTABLE_LRU */
551
552
553/*
455 * shrink_page_list() returns the number of reclaimed pages 554 * shrink_page_list() returns the number of reclaimed pages
456 */ 555 */
457static unsigned long shrink_page_list(struct list_head *page_list, 556static unsigned long shrink_page_list(struct list_head *page_list,
@@ -477,13 +576,16 @@ static unsigned long shrink_page_list(struct list_head *page_list,
477 page = lru_to_page(page_list); 576 page = lru_to_page(page_list);
478 list_del(&page->lru); 577 list_del(&page->lru);
479 578
480 if (TestSetPageLocked(page)) 579 if (!trylock_page(page))
481 goto keep; 580 goto keep;
482 581
483 VM_BUG_ON(PageActive(page)); 582 VM_BUG_ON(PageActive(page));
484 583
485 sc->nr_scanned++; 584 sc->nr_scanned++;
486 585
586 if (unlikely(!page_evictable(page, NULL)))
587 goto cull_mlocked;
588
487 if (!sc->may_swap && page_mapped(page)) 589 if (!sc->may_swap && page_mapped(page))
488 goto keep_locked; 590 goto keep_locked;
489 591
@@ -520,9 +622,19 @@ static unsigned long shrink_page_list(struct list_head *page_list,
520 * Anonymous process memory has backing store? 622 * Anonymous process memory has backing store?
521 * Try to allocate it some swap space here. 623 * Try to allocate it some swap space here.
522 */ 624 */
523 if (PageAnon(page) && !PageSwapCache(page)) 625 if (PageAnon(page) && !PageSwapCache(page)) {
626 switch (try_to_munlock(page)) {
627 case SWAP_FAIL: /* shouldn't happen */
628 case SWAP_AGAIN:
629 goto keep_locked;
630 case SWAP_MLOCK:
631 goto cull_mlocked;
632 case SWAP_SUCCESS:
633 ; /* fall thru'; add to swap cache */
634 }
524 if (!add_to_swap(page, GFP_ATOMIC)) 635 if (!add_to_swap(page, GFP_ATOMIC))
525 goto activate_locked; 636 goto activate_locked;
637 }
526#endif /* CONFIG_SWAP */ 638#endif /* CONFIG_SWAP */
527 639
528 mapping = page_mapping(page); 640 mapping = page_mapping(page);
@@ -537,6 +649,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
537 goto activate_locked; 649 goto activate_locked;
538 case SWAP_AGAIN: 650 case SWAP_AGAIN:
539 goto keep_locked; 651 goto keep_locked;
652 case SWAP_MLOCK:
653 goto cull_mlocked;
540 case SWAP_SUCCESS: 654 case SWAP_SUCCESS:
541 ; /* try to free the page below */ 655 ; /* try to free the page below */
542 } 656 }
@@ -563,7 +677,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
563 * A synchronous write - probably a ramdisk. Go 677 * A synchronous write - probably a ramdisk. Go
564 * ahead and try to reclaim the page. 678 * ahead and try to reclaim the page.
565 */ 679 */
566 if (TestSetPageLocked(page)) 680 if (!trylock_page(page))
567 goto keep; 681 goto keep;
568 if (PageDirty(page) || PageWriteback(page)) 682 if (PageDirty(page) || PageWriteback(page))
569 goto keep_locked; 683 goto keep_locked;
@@ -583,7 +697,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
583 * possible for a page to have PageDirty set, but it is actually 697 * possible for a page to have PageDirty set, but it is actually
584 * clean (all its buffers are clean). This happens if the 698 * clean (all its buffers are clean). This happens if the
585 * buffers were written out directly, with submit_bh(). ext3 699 * buffers were written out directly, with submit_bh(). ext3
586 * will do this, as well as the blockdev mapping. 700 * will do this, as well as the blockdev mapping.
587 * try_to_release_page() will discover that cleanness and will 701 * try_to_release_page() will discover that cleanness and will
588 * drop the buffers and mark the page clean - it can be freed. 702 * drop the buffers and mark the page clean - it can be freed.
589 * 703 *
@@ -597,32 +711,64 @@ static unsigned long shrink_page_list(struct list_head *page_list,
597 if (PagePrivate(page)) { 711 if (PagePrivate(page)) {
598 if (!try_to_release_page(page, sc->gfp_mask)) 712 if (!try_to_release_page(page, sc->gfp_mask))
599 goto activate_locked; 713 goto activate_locked;
600 if (!mapping && page_count(page) == 1) 714 if (!mapping && page_count(page) == 1) {
601 goto free_it; 715 unlock_page(page);
716 if (put_page_testzero(page))
717 goto free_it;
718 else {
719 /*
720 * rare race with speculative reference.
721 * the speculative reference will free
722 * this page shortly, so we may
723 * increment nr_reclaimed here (and
724 * leave it off the LRU).
725 */
726 nr_reclaimed++;
727 continue;
728 }
729 }
602 } 730 }
603 731
604 if (!mapping || !remove_mapping(mapping, page)) 732 if (!mapping || !__remove_mapping(mapping, page))
605 goto keep_locked; 733 goto keep_locked;
606 734
735 /*
736 * At this point, we have no other references and there is
737 * no way to pick any more up (removed from LRU, removed
738 * from pagecache). Can use non-atomic bitops now (and
739 * we obviously don't have to worry about waking up a process
740 * waiting on the page lock, because there are no references.
741 */
742 __clear_page_locked(page);
607free_it: 743free_it:
608 unlock_page(page);
609 nr_reclaimed++; 744 nr_reclaimed++;
610 if (!pagevec_add(&freed_pvec, page)) 745 if (!pagevec_add(&freed_pvec, page)) {
611 __pagevec_release_nonlru(&freed_pvec); 746 __pagevec_free(&freed_pvec);
747 pagevec_reinit(&freed_pvec);
748 }
749 continue;
750
751cull_mlocked:
752 unlock_page(page);
753 putback_lru_page(page);
612 continue; 754 continue;
613 755
614activate_locked: 756activate_locked:
757 /* Not a candidate for swapping, so reclaim swap space. */
758 if (PageSwapCache(page) && vm_swap_full())
759 remove_exclusive_swap_page_ref(page);
760 VM_BUG_ON(PageActive(page));
615 SetPageActive(page); 761 SetPageActive(page);
616 pgactivate++; 762 pgactivate++;
617keep_locked: 763keep_locked:
618 unlock_page(page); 764 unlock_page(page);
619keep: 765keep:
620 list_add(&page->lru, &ret_pages); 766 list_add(&page->lru, &ret_pages);
621 VM_BUG_ON(PageLRU(page)); 767 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
622 } 768 }
623 list_splice(&ret_pages, page_list); 769 list_splice(&ret_pages, page_list);
624 if (pagevec_count(&freed_pvec)) 770 if (pagevec_count(&freed_pvec))
625 __pagevec_release_nonlru(&freed_pvec); 771 __pagevec_free(&freed_pvec);
626 count_vm_events(PGACTIVATE, pgactivate); 772 count_vm_events(PGACTIVATE, pgactivate);
627 return nr_reclaimed; 773 return nr_reclaimed;
628} 774}
@@ -642,7 +788,7 @@ keep:
642 * 788 *
643 * returns 0 on success, -ve errno on failure. 789 * returns 0 on success, -ve errno on failure.
644 */ 790 */
645int __isolate_lru_page(struct page *page, int mode) 791int __isolate_lru_page(struct page *page, int mode, int file)
646{ 792{
647 int ret = -EINVAL; 793 int ret = -EINVAL;
648 794
@@ -658,6 +804,17 @@ int __isolate_lru_page(struct page *page, int mode)
658 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 804 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
659 return ret; 805 return ret;
660 806
807 if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
808 return ret;
809
810 /*
811 * When this function is being called for lumpy reclaim, we
812 * initially look into all LRU pages, active, inactive and
813 * unevictable; only give shrink_page_list evictable pages.
814 */
815 if (PageUnevictable(page))
816 return ret;
817
661 ret = -EBUSY; 818 ret = -EBUSY;
662 if (likely(get_page_unless_zero(page))) { 819 if (likely(get_page_unless_zero(page))) {
663 /* 820 /*
@@ -688,12 +845,13 @@ int __isolate_lru_page(struct page *page, int mode)
688 * @scanned: The number of pages that were scanned. 845 * @scanned: The number of pages that were scanned.
689 * @order: The caller's attempted allocation order 846 * @order: The caller's attempted allocation order
690 * @mode: One of the LRU isolation modes 847 * @mode: One of the LRU isolation modes
848 * @file: True [1] if isolating file [!anon] pages
691 * 849 *
692 * returns how many pages were moved onto *@dst. 850 * returns how many pages were moved onto *@dst.
693 */ 851 */
694static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 852static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
695 struct list_head *src, struct list_head *dst, 853 struct list_head *src, struct list_head *dst,
696 unsigned long *scanned, int order, int mode) 854 unsigned long *scanned, int order, int mode, int file)
697{ 855{
698 unsigned long nr_taken = 0; 856 unsigned long nr_taken = 0;
699 unsigned long scan; 857 unsigned long scan;
@@ -710,7 +868,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
710 868
711 VM_BUG_ON(!PageLRU(page)); 869 VM_BUG_ON(!PageLRU(page));
712 870
713 switch (__isolate_lru_page(page, mode)) { 871 switch (__isolate_lru_page(page, mode, file)) {
714 case 0: 872 case 0:
715 list_move(&page->lru, dst); 873 list_move(&page->lru, dst);
716 nr_taken++; 874 nr_taken++;
@@ -753,10 +911,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
753 break; 911 break;
754 912
755 cursor_page = pfn_to_page(pfn); 913 cursor_page = pfn_to_page(pfn);
914
756 /* Check that we have not crossed a zone boundary. */ 915 /* Check that we have not crossed a zone boundary. */
757 if (unlikely(page_zone_id(cursor_page) != zone_id)) 916 if (unlikely(page_zone_id(cursor_page) != zone_id))
758 continue; 917 continue;
759 switch (__isolate_lru_page(cursor_page, mode)) { 918 switch (__isolate_lru_page(cursor_page, mode, file)) {
760 case 0: 919 case 0:
761 list_move(&cursor_page->lru, dst); 920 list_move(&cursor_page->lru, dst);
762 nr_taken++; 921 nr_taken++;
@@ -767,7 +926,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
767 /* else it is being freed elsewhere */ 926 /* else it is being freed elsewhere */
768 list_move(&cursor_page->lru, src); 927 list_move(&cursor_page->lru, src);
769 default: 928 default:
770 break; 929 break; /* ! on LRU or wrong list */
771 } 930 }
772 } 931 }
773 } 932 }
@@ -781,40 +940,93 @@ static unsigned long isolate_pages_global(unsigned long nr,
781 unsigned long *scanned, int order, 940 unsigned long *scanned, int order,
782 int mode, struct zone *z, 941 int mode, struct zone *z,
783 struct mem_cgroup *mem_cont, 942 struct mem_cgroup *mem_cont,
784 int active) 943 int active, int file)
785{ 944{
945 int lru = LRU_BASE;
786 if (active) 946 if (active)
787 return isolate_lru_pages(nr, &z->active_list, dst, 947 lru += LRU_ACTIVE;
788 scanned, order, mode); 948 if (file)
789 else 949 lru += LRU_FILE;
790 return isolate_lru_pages(nr, &z->inactive_list, dst, 950 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
791 scanned, order, mode); 951 mode, !!file);
792} 952}
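
The hunk above collapses the old per-zone active/inactive list pair into an array of LRU lists indexed as a base plus LRU_ACTIVE and LRU_FILE offsets. A minimal userspace sketch of that indexing, assuming the conventional base/offset layout; the enum values and pick_lru() here are illustrative stand-ins for the kernel's definitions (added elsewhere in this series), not copies of them:

#include <stdio.h>

/* Illustrative constants only; the real LRU_* values come from the series. */
enum lru_list {
        LRU_INACTIVE_ANON,      /* LRU_BASE */
        LRU_ACTIVE_ANON,        /* LRU_BASE + LRU_ACTIVE */
        LRU_INACTIVE_FILE,      /* LRU_BASE + LRU_FILE */
        LRU_ACTIVE_FILE,        /* LRU_BASE + LRU_ACTIVE + LRU_FILE */
        NR_LRU_LISTS
};
#define LRU_BASE        LRU_INACTIVE_ANON
#define LRU_ACTIVE      1
#define LRU_FILE        2

/* Same shape as the index computation in isolate_pages_global() above. */
static int pick_lru(int active, int file)
{
        int lru = LRU_BASE;

        if (active)
                lru += LRU_ACTIVE;
        if (file)
                lru += LRU_FILE;
        return lru;
}

int main(void)
{
        static const char *name[NR_LRU_LISTS] = {
                "inactive_anon", "active_anon", "inactive_file", "active_file"
        };
        int active, file;

        for (file = 0; file <= 1; file++)
                for (active = 0; active <= 1; active++)
                        printf("active=%d file=%d -> %s\n",
                               active, file, name[pick_lru(active, file)]);
        return 0;
}

Running it prints the four (active, file) combinations mapped to their list names, which is exactly what the two if statements in isolate_pages_global() compute.
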
793 953
794/* 954/*
795 * clear_active_flags() is a helper for shrink_active_list(), clearing 955 * clear_active_flags() is a helper for shrink_active_list(), clearing
796 * any active bits from the pages in the list. 956 * any active bits from the pages in the list.
797 */ 957 */
798static unsigned long clear_active_flags(struct list_head *page_list) 958static unsigned long clear_active_flags(struct list_head *page_list,
959 unsigned int *count)
799{ 960{
800 int nr_active = 0; 961 int nr_active = 0;
962 int lru;
801 struct page *page; 963 struct page *page;
802 964
803 list_for_each_entry(page, page_list, lru) 965 list_for_each_entry(page, page_list, lru) {
966 lru = page_is_file_cache(page);
804 if (PageActive(page)) { 967 if (PageActive(page)) {
968 lru += LRU_ACTIVE;
805 ClearPageActive(page); 969 ClearPageActive(page);
806 nr_active++; 970 nr_active++;
807 } 971 }
972 count[lru]++;
973 }
808 974
809 return nr_active; 975 return nr_active;
810} 976}
811 977
978/**
979 * isolate_lru_page - tries to isolate a page from its LRU list
980 * @page: page to isolate from its LRU list
981 *
982 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
983 * vmstat statistic corresponding to whatever LRU list the page was on.
984 *
985 * Returns 0 if the page was removed from an LRU list.
986 * Returns -EBUSY if the page was not on an LRU list.
987 *
988 * The returned page will have PageLRU() cleared. If it was found on
989 * the active list, it will have PageActive set. If it was found on
990 * the unevictable list, it will have the PageUnevictable bit set. That flag
991 * may need to be cleared by the caller before letting the page go.
992 *
993 * The vmstat statistic corresponding to the list on which the page was
994 * found will be decremented.
995 *
996 * Restrictions:
997 * (1) Must be called with an elevated refcount on the page. This is a
 998 * fundamental difference from isolate_lru_pages (which is called
999 * without a stable reference).
1000 * (2) the lru_lock must not be held.
1001 * (3) interrupts must be enabled.
1002 */
1003int isolate_lru_page(struct page *page)
1004{
1005 int ret = -EBUSY;
1006
1007 if (PageLRU(page)) {
1008 struct zone *zone = page_zone(page);
1009
1010 spin_lock_irq(&zone->lru_lock);
1011 if (PageLRU(page) && get_page_unless_zero(page)) {
1012 int lru = page_lru(page);
1013 ret = 0;
1014 ClearPageLRU(page);
1015
1016 del_page_from_lru_list(zone, page, lru);
1017 }
1018 spin_unlock_irq(&zone->lru_lock);
1019 }
1020 return ret;
1021}
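
isolate_lru_page() above follows a check/lock/recheck pattern: PageLRU is tested once without the lock as a cheap filter, then tested again under zone->lru_lock before the page is pinned and taken off its list. A userspace sketch of that shape; struct toy_page, toy_isolate_lru_page() and the pthread mutex are stand-ins for the kernel's page flags, isolate_lru_page() and lru_lock, not real kernel interfaces:

#include <pthread.h>
#include <stdio.h>

/* Toy page: an "on the LRU" flag plus a reference count. */
struct toy_page {
        int on_lru;
        int refcount;
};

/* Stand-in for zone->lru_lock (an IRQ-disabling spinlock in the kernel). */
static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;

static int toy_isolate_lru_page(struct toy_page *page)
{
        int ret = -1;                           /* -EBUSY in the kernel */

        if (page->on_lru) {                     /* cheap unlocked filter */
                pthread_mutex_lock(&lru_lock);
                if (page->on_lru && page->refcount > 0) {
                        /* get_page_unless_zero() + ClearPageLRU + list del */
                        page->refcount++;
                        page->on_lru = 0;
                        ret = 0;
                }
                pthread_mutex_unlock(&lru_lock);
        }
        return ret;
}

int main(void)
{
        struct toy_page page = { .on_lru = 1, .refcount = 1 };

        printf("first isolate:  %d\n", toy_isolate_lru_page(&page));    /* 0 */
        printf("second isolate: %d\n", toy_isolate_lru_page(&page));    /* -1 */
        return 0;
}

The unlocked test only filters out pages that are clearly not on any LRU; the authoritative check and the actual removal always happen under the lock.
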
1022
812/* 1023/*
813 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1024 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
814 * of reclaimed pages 1025 * of reclaimed pages
815 */ 1026 */
816static unsigned long shrink_inactive_list(unsigned long max_scan, 1027static unsigned long shrink_inactive_list(unsigned long max_scan,
817 struct zone *zone, struct scan_control *sc) 1028 struct zone *zone, struct scan_control *sc,
1029 int priority, int file)
818{ 1030{
819 LIST_HEAD(page_list); 1031 LIST_HEAD(page_list);
820 struct pagevec pvec; 1032 struct pagevec pvec;
@@ -831,20 +1043,43 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
831 unsigned long nr_scan; 1043 unsigned long nr_scan;
832 unsigned long nr_freed; 1044 unsigned long nr_freed;
833 unsigned long nr_active; 1045 unsigned long nr_active;
1046 unsigned int count[NR_LRU_LISTS] = { 0, };
1047 int mode = ISOLATE_INACTIVE;
1048
1049 /*
1050 * If we need a large contiguous chunk of memory, or have
1051 * trouble getting a small set of contiguous pages, we
1052 * will reclaim both active and inactive pages.
1053 *
1054 * We use the same threshold as pageout congestion_wait below.
1055 */
1056 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1057 mode = ISOLATE_BOTH;
1058 else if (sc->order && priority < DEF_PRIORITY - 2)
1059 mode = ISOLATE_BOTH;
834 1060
835 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 1061 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
836 &page_list, &nr_scan, sc->order, 1062 &page_list, &nr_scan, sc->order, mode,
837 (sc->order > PAGE_ALLOC_COSTLY_ORDER)? 1063 zone, sc->mem_cgroup, 0, file);
838 ISOLATE_BOTH : ISOLATE_INACTIVE, 1064 nr_active = clear_active_flags(&page_list, count);
839 zone, sc->mem_cgroup, 0);
840 nr_active = clear_active_flags(&page_list);
841 __count_vm_events(PGDEACTIVATE, nr_active); 1065 __count_vm_events(PGDEACTIVATE, nr_active);
842 1066
843 __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); 1067 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
844 __mod_zone_page_state(zone, NR_INACTIVE, 1068 -count[LRU_ACTIVE_FILE]);
845 -(nr_taken - nr_active)); 1069 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
846 if (scan_global_lru(sc)) 1070 -count[LRU_INACTIVE_FILE]);
1071 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1072 -count[LRU_ACTIVE_ANON]);
1073 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1074 -count[LRU_INACTIVE_ANON]);
1075
1076 if (scan_global_lru(sc)) {
847 zone->pages_scanned += nr_scan; 1077 zone->pages_scanned += nr_scan;
1078 zone->recent_scanned[0] += count[LRU_INACTIVE_ANON];
1079 zone->recent_scanned[0] += count[LRU_ACTIVE_ANON];
1080 zone->recent_scanned[1] += count[LRU_INACTIVE_FILE];
1081 zone->recent_scanned[1] += count[LRU_ACTIVE_FILE];
1082 }
848 spin_unlock_irq(&zone->lru_lock); 1083 spin_unlock_irq(&zone->lru_lock);
849 1084
850 nr_scanned += nr_scan; 1085 nr_scanned += nr_scan;
@@ -864,7 +1099,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
864 * The attempt at page out may have made some 1099 * The attempt at page out may have made some
865 * of the pages active, mark them inactive again. 1100 * of the pages active, mark them inactive again.
866 */ 1101 */
867 nr_active = clear_active_flags(&page_list); 1102 nr_active = clear_active_flags(&page_list, count);
868 count_vm_events(PGDEACTIVATE, nr_active); 1103 count_vm_events(PGDEACTIVATE, nr_active);
869 1104
870 nr_freed += shrink_page_list(&page_list, sc, 1105 nr_freed += shrink_page_list(&page_list, sc,
@@ -889,14 +1124,24 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
889 * Put back any unfreeable pages. 1124 * Put back any unfreeable pages.
890 */ 1125 */
891 while (!list_empty(&page_list)) { 1126 while (!list_empty(&page_list)) {
1127 int lru;
892 page = lru_to_page(&page_list); 1128 page = lru_to_page(&page_list);
893 VM_BUG_ON(PageLRU(page)); 1129 VM_BUG_ON(PageLRU(page));
894 SetPageLRU(page);
895 list_del(&page->lru); 1130 list_del(&page->lru);
896 if (PageActive(page)) 1131 if (unlikely(!page_evictable(page, NULL))) {
897 add_page_to_active_list(zone, page); 1132 spin_unlock_irq(&zone->lru_lock);
898 else 1133 putback_lru_page(page);
899 add_page_to_inactive_list(zone, page); 1134 spin_lock_irq(&zone->lru_lock);
1135 continue;
1136 }
1137 SetPageLRU(page);
1138 lru = page_lru(page);
1139 add_page_to_lru_list(zone, page, lru);
1140 mem_cgroup_move_lists(page, lru);
1141 if (PageActive(page) && scan_global_lru(sc)) {
1142 int file = !!page_is_file_cache(page);
1143 zone->recent_rotated[file]++;
1144 }
900 if (!pagevec_add(&pvec, page)) { 1145 if (!pagevec_add(&pvec, page)) {
901 spin_unlock_irq(&zone->lru_lock); 1146 spin_unlock_irq(&zone->lru_lock);
902 __pagevec_release(&pvec); 1147 __pagevec_release(&pvec);
@@ -927,115 +1172,7 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
927 1172
928static inline int zone_is_near_oom(struct zone *zone) 1173static inline int zone_is_near_oom(struct zone *zone)
929{ 1174{
930 return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE) 1175 return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
931 + zone_page_state(zone, NR_INACTIVE))*3;
932}
933
934/*
935 * Determine we should try to reclaim mapped pages.
936 * This is called only when sc->mem_cgroup is NULL.
937 */
938static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
939 int priority)
940{
941 long mapped_ratio;
942 long distress;
943 long swap_tendency;
944 long imbalance;
945 int reclaim_mapped = 0;
946 int prev_priority;
947
948 if (scan_global_lru(sc) && zone_is_near_oom(zone))
949 return 1;
950 /*
951 * `distress' is a measure of how much trouble we're having
952 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
953 */
954 if (scan_global_lru(sc))
955 prev_priority = zone->prev_priority;
956 else
957 prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
958
959 distress = 100 >> min(prev_priority, priority);
960
961 /*
962 * The point of this algorithm is to decide when to start
963 * reclaiming mapped memory instead of just pagecache. Work out
964 * how much memory
965 * is mapped.
966 */
967 if (scan_global_lru(sc))
968 mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
969 global_page_state(NR_ANON_PAGES)) * 100) /
970 vm_total_pages;
971 else
972 mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
973
974 /*
975 * Now decide how much we really want to unmap some pages. The
976 * mapped ratio is downgraded - just because there's a lot of
977 * mapped memory doesn't necessarily mean that page reclaim
978 * isn't succeeding.
979 *
980 * The distress ratio is important - we don't want to start
981 * going oom.
982 *
983 * A 100% value of vm_swappiness overrides this algorithm
984 * altogether.
985 */
986 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
987
988 /*
989 * If there's huge imbalance between active and inactive
990 * (think active 100 times larger than inactive) we should
991 * become more permissive, or the system will take too much
992 * cpu before it start swapping during memory pressure.
993 * Distress is about avoiding early-oom, this is about
994 * making swappiness graceful despite setting it to low
995 * values.
996 *
997 * Avoid div by zero with nr_inactive+1, and max resulting
998 * value is vm_total_pages.
999 */
1000 if (scan_global_lru(sc)) {
1001 imbalance = zone_page_state(zone, NR_ACTIVE);
1002 imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
1003 } else
1004 imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
1005
1006 /*
1007 * Reduce the effect of imbalance if swappiness is low,
1008 * this means for a swappiness very low, the imbalance
1009 * must be much higher than 100 for this logic to make
1010 * the difference.
1011 *
1012 * Max temporary value is vm_total_pages*100.
1013 */
1014 imbalance *= (vm_swappiness + 1);
1015 imbalance /= 100;
1016
1017 /*
1018 * If not much of the ram is mapped, makes the imbalance
1019 * less relevant, it's high priority we refill the inactive
1020 * list with mapped pages only in presence of high ratio of
1021 * mapped pages.
1022 *
1023 * Max temporary value is vm_total_pages*100.
1024 */
1025 imbalance *= mapped_ratio;
1026 imbalance /= 100;
1027
1028 /* apply imbalance feedback to swap_tendency */
1029 swap_tendency += imbalance;
1030
1031 /*
1032 * Now use this metric to decide whether to start moving mapped
1033 * memory onto the inactive list.
1034 */
1035 if (swap_tendency >= 100)
1036 reclaim_mapped = 1;
1037
1038 return reclaim_mapped;
1039} 1176}
1040 1177
1041/* 1178/*
@@ -1058,53 +1195,71 @@ static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
1058 1195
1059 1196
1060static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1197static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1061 struct scan_control *sc, int priority) 1198 struct scan_control *sc, int priority, int file)
1062{ 1199{
1063 unsigned long pgmoved; 1200 unsigned long pgmoved;
1064 int pgdeactivate = 0; 1201 int pgdeactivate = 0;
1065 unsigned long pgscanned; 1202 unsigned long pgscanned;
1066 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1203 LIST_HEAD(l_hold); /* The pages which were snipped off */
1067 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ 1204 LIST_HEAD(l_inactive);
1068 LIST_HEAD(l_active); /* Pages to go onto the active_list */
1069 struct page *page; 1205 struct page *page;
1070 struct pagevec pvec; 1206 struct pagevec pvec;
1071 int reclaim_mapped = 0; 1207 enum lru_list lru;
1072
1073 if (sc->may_swap)
1074 reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
1075 1208
1076 lru_add_drain(); 1209 lru_add_drain();
1077 spin_lock_irq(&zone->lru_lock); 1210 spin_lock_irq(&zone->lru_lock);
1078 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, 1211 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1079 ISOLATE_ACTIVE, zone, 1212 ISOLATE_ACTIVE, zone,
1080 sc->mem_cgroup, 1); 1213 sc->mem_cgroup, 1, file);
1081 /* 1214 /*
1082 * zone->pages_scanned is used to detect zone's oom 1215 * zone->pages_scanned is used to detect zone's oom
1083 * mem_cgroup remembers nr_scan by itself. 1216 * mem_cgroup remembers nr_scan by itself.
1084 */ 1217 */
1085 if (scan_global_lru(sc)) 1218 if (scan_global_lru(sc)) {
1086 zone->pages_scanned += pgscanned; 1219 zone->pages_scanned += pgscanned;
1220 zone->recent_scanned[!!file] += pgmoved;
1221 }
1087 1222
1088 __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); 1223 if (file)
1224 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
1225 else
1226 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
1089 spin_unlock_irq(&zone->lru_lock); 1227 spin_unlock_irq(&zone->lru_lock);
1090 1228
1229 pgmoved = 0;
1091 while (!list_empty(&l_hold)) { 1230 while (!list_empty(&l_hold)) {
1092 cond_resched(); 1231 cond_resched();
1093 page = lru_to_page(&l_hold); 1232 page = lru_to_page(&l_hold);
1094 list_del(&page->lru); 1233 list_del(&page->lru);
1095 if (page_mapped(page)) { 1234
1096 if (!reclaim_mapped || 1235 if (unlikely(!page_evictable(page, NULL))) {
1097 (total_swap_pages == 0 && PageAnon(page)) || 1236 putback_lru_page(page);
1098 page_referenced(page, 0, sc->mem_cgroup)) { 1237 continue;
1099 list_add(&page->lru, &l_active);
1100 continue;
1101 }
1102 } 1238 }
1239
1240 /* page_referenced clears PageReferenced */
1241 if (page_mapping_inuse(page) &&
1242 page_referenced(page, 0, sc->mem_cgroup))
1243 pgmoved++;
1244
1103 list_add(&page->lru, &l_inactive); 1245 list_add(&page->lru, &l_inactive);
1104 } 1246 }
1105 1247
1248 /*
1249 * Count referenced pages from currently used mappings as
1250 * rotated, even though they are moved to the inactive list.
1251 * This helps balance scan pressure between file and anonymous
1252 * pages in get_scan_ratio.
1253 */
1254 zone->recent_rotated[!!file] += pgmoved;
1255
1256 /*
1257 * Move the pages to the [file or anon] inactive list.
1258 */
1106 pagevec_init(&pvec, 1); 1259 pagevec_init(&pvec, 1);
1260
1107 pgmoved = 0; 1261 pgmoved = 0;
1262 lru = LRU_BASE + file * LRU_FILE;
1108 spin_lock_irq(&zone->lru_lock); 1263 spin_lock_irq(&zone->lru_lock);
1109 while (!list_empty(&l_inactive)) { 1264 while (!list_empty(&l_inactive)) {
1110 page = lru_to_page(&l_inactive); 1265 page = lru_to_page(&l_inactive);
@@ -1114,11 +1269,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1114 VM_BUG_ON(!PageActive(page)); 1269 VM_BUG_ON(!PageActive(page));
1115 ClearPageActive(page); 1270 ClearPageActive(page);
1116 1271
1117 list_move(&page->lru, &zone->inactive_list); 1272 list_move(&page->lru, &zone->lru[lru].list);
1118 mem_cgroup_move_lists(page, false); 1273 mem_cgroup_move_lists(page, lru);
1119 pgmoved++; 1274 pgmoved++;
1120 if (!pagevec_add(&pvec, page)) { 1275 if (!pagevec_add(&pvec, page)) {
1121 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1276 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1122 spin_unlock_irq(&zone->lru_lock); 1277 spin_unlock_irq(&zone->lru_lock);
1123 pgdeactivate += pgmoved; 1278 pgdeactivate += pgmoved;
1124 pgmoved = 0; 1279 pgmoved = 0;
@@ -1128,104 +1283,189 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1128 spin_lock_irq(&zone->lru_lock); 1283 spin_lock_irq(&zone->lru_lock);
1129 } 1284 }
1130 } 1285 }
1131 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1286 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1132 pgdeactivate += pgmoved; 1287 pgdeactivate += pgmoved;
1133 if (buffer_heads_over_limit) { 1288 if (buffer_heads_over_limit) {
1134 spin_unlock_irq(&zone->lru_lock); 1289 spin_unlock_irq(&zone->lru_lock);
1135 pagevec_strip(&pvec); 1290 pagevec_strip(&pvec);
1136 spin_lock_irq(&zone->lru_lock); 1291 spin_lock_irq(&zone->lru_lock);
1137 } 1292 }
1138
1139 pgmoved = 0;
1140 while (!list_empty(&l_active)) {
1141 page = lru_to_page(&l_active);
1142 prefetchw_prev_lru_page(page, &l_active, flags);
1143 VM_BUG_ON(PageLRU(page));
1144 SetPageLRU(page);
1145 VM_BUG_ON(!PageActive(page));
1146
1147 list_move(&page->lru, &zone->active_list);
1148 mem_cgroup_move_lists(page, true);
1149 pgmoved++;
1150 if (!pagevec_add(&pvec, page)) {
1151 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
1152 pgmoved = 0;
1153 spin_unlock_irq(&zone->lru_lock);
1154 __pagevec_release(&pvec);
1155 spin_lock_irq(&zone->lru_lock);
1156 }
1157 }
1158 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
1159
1160 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1293 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1161 __count_vm_events(PGDEACTIVATE, pgdeactivate); 1294 __count_vm_events(PGDEACTIVATE, pgdeactivate);
1162 spin_unlock_irq(&zone->lru_lock); 1295 spin_unlock_irq(&zone->lru_lock);
1296 if (vm_swap_full())
1297 pagevec_swap_free(&pvec);
1163 1298
1164 pagevec_release(&pvec); 1299 pagevec_release(&pvec);
1165} 1300}
1166 1301
1302static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1303 struct zone *zone, struct scan_control *sc, int priority)
1304{
1305 int file = is_file_lru(lru);
1306
1307 if (lru == LRU_ACTIVE_FILE) {
1308 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1309 return 0;
1310 }
1311
1312 if (lru == LRU_ACTIVE_ANON &&
1313 (!scan_global_lru(sc) || inactive_anon_is_low(zone))) {
1314 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1315 return 0;
1316 }
1317 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1318}
1319
1320/*
1321 * Determine how aggressively the anon and file LRU lists should be
1322 * scanned. The relative value of each set of LRU lists is determined
1323 * by looking at the fraction of the pages scanned we did rotate back
1324 * onto the active list instead of evicting.
1325 *
1326 * percent[0] specifies how much pressure to put on ram/swap backed
1327 * memory, while percent[1] determines pressure on the file LRUs.
1328 */
1329static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1330 unsigned long *percent)
1331{
1332 unsigned long anon, file, free;
1333 unsigned long anon_prio, file_prio;
1334 unsigned long ap, fp;
1335
1336 anon = zone_page_state(zone, NR_ACTIVE_ANON) +
1337 zone_page_state(zone, NR_INACTIVE_ANON);
1338 file = zone_page_state(zone, NR_ACTIVE_FILE) +
1339 zone_page_state(zone, NR_INACTIVE_FILE);
1340 free = zone_page_state(zone, NR_FREE_PAGES);
1341
1342 /* If we have no swap space, do not bother scanning anon pages. */
1343 if (nr_swap_pages <= 0) {
1344 percent[0] = 0;
1345 percent[1] = 100;
1346 return;
1347 }
1348
1349 /* If we have very few page cache pages, force-scan anon pages. */
1350 if (unlikely(file + free <= zone->pages_high)) {
1351 percent[0] = 100;
1352 percent[1] = 0;
1353 return;
1354 }
1355
1356 /*
1357 * OK, so we have swap space and a fair amount of page cache
1358 * pages. We use the recently rotated / recently scanned
1359 * ratios to determine how valuable each cache is.
1360 *
1361 * Because workloads change over time (and to avoid overflow)
1362 * we keep these statistics as a floating average, which ends
1363 * up weighing recent references more than old ones.
1364 *
1365 * anon in [0], file in [1]
1366 */
1367 if (unlikely(zone->recent_scanned[0] > anon / 4)) {
1368 spin_lock_irq(&zone->lru_lock);
1369 zone->recent_scanned[0] /= 2;
1370 zone->recent_rotated[0] /= 2;
1371 spin_unlock_irq(&zone->lru_lock);
1372 }
1373
1374 if (unlikely(zone->recent_scanned[1] > file / 4)) {
1375 spin_lock_irq(&zone->lru_lock);
1376 zone->recent_scanned[1] /= 2;
1377 zone->recent_rotated[1] /= 2;
1378 spin_unlock_irq(&zone->lru_lock);
1379 }
1380
1381 /*
1382 * With swappiness at 100, anonymous and file have the same priority.
1383 * This scanning priority is essentially the inverse of IO cost.
1384 */
1385 anon_prio = sc->swappiness;
1386 file_prio = 200 - sc->swappiness;
1387
1388 /*
1389 *                  anon       recent_rotated[0]
1390 * %anon = 100 * ----------- / ----------------- * IO cost
1391 *               anon + file      rotate_sum
1392 */
1393 ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1);
1394 ap /= zone->recent_rotated[0] + 1;
1395
1396 fp = (file_prio + 1) * (zone->recent_scanned[1] + 1);
1397 fp /= zone->recent_rotated[1] + 1;
1398
1399 /* Normalize to percentages */
1400 percent[0] = 100 * ap / (ap + fp + 1);
1401 percent[1] = 100 - percent[0];
1402}
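
get_scan_ratio() above boils swappiness and the recent_scanned/recent_rotated counters down to two percentages that split reclaim pressure between anon and file pages. A standalone sketch of just that arithmetic; scan_ratio() and the sample counter values are invented for illustration, while the formula itself follows the lines shown above:

#include <stdio.h>

/*
 * Model of the percentage split computed above. The variable names follow
 * the kernel code; the inputs are arbitrary sample values.
 */
static void scan_ratio(unsigned long swappiness,
                       unsigned long recent_scanned_anon,
                       unsigned long recent_rotated_anon,
                       unsigned long recent_scanned_file,
                       unsigned long recent_rotated_file,
                       unsigned long percent[2])
{
        unsigned long anon_prio = swappiness;           /* 0..100 */
        unsigned long file_prio = 200 - swappiness;
        unsigned long ap, fp;

        ap = (anon_prio + 1) * (recent_scanned_anon + 1);
        ap /= recent_rotated_anon + 1;

        fp = (file_prio + 1) * (recent_scanned_file + 1);
        fp /= recent_rotated_file + 1;

        percent[0] = 100 * ap / (ap + fp + 1);          /* anon */
        percent[1] = 100 - percent[0];                  /* file */
}

int main(void)
{
        unsigned long percent[2];

        /* Mostly-rotated anon, mostly-evicted file cache, default swappiness. */
        scan_ratio(60, 1000, 900, 1000, 100, percent);
        printf("anon %lu%%  file %lu%%\n", percent[0], percent[1]);
        return 0;
}

With mostly-rotated anon pages and mostly-evicted file pages, the split leans heavily toward scanning the file lists, which is the behaviour the comment block above describes.
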
1403
1404
1167/* 1405/*
1168 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1406 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1169 */ 1407 */
1170static unsigned long shrink_zone(int priority, struct zone *zone, 1408static unsigned long shrink_zone(int priority, struct zone *zone,
1171 struct scan_control *sc) 1409 struct scan_control *sc)
1172{ 1410{
1173 unsigned long nr_active; 1411 unsigned long nr[NR_LRU_LISTS];
1174 unsigned long nr_inactive;
1175 unsigned long nr_to_scan; 1412 unsigned long nr_to_scan;
1176 unsigned long nr_reclaimed = 0; 1413 unsigned long nr_reclaimed = 0;
1414 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1415 enum lru_list l;
1177 1416
1178 if (scan_global_lru(sc)) { 1417 get_scan_ratio(zone, sc, percent);
1179 /*
1180 * Add one to nr_to_scan just to make sure that the kernel
1181 * will slowly sift through the active list.
1182 */
1183 zone->nr_scan_active +=
1184 (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
1185 nr_active = zone->nr_scan_active;
1186 zone->nr_scan_inactive +=
1187 (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
1188 nr_inactive = zone->nr_scan_inactive;
1189 if (nr_inactive >= sc->swap_cluster_max)
1190 zone->nr_scan_inactive = 0;
1191 else
1192 nr_inactive = 0;
1193
1194 if (nr_active >= sc->swap_cluster_max)
1195 zone->nr_scan_active = 0;
1196 else
1197 nr_active = 0;
1198 } else {
1199 /*
1200 * This reclaim occurs not because zone memory shortage but
1201 * because memory controller hits its limit.
1202 * Then, don't modify zone reclaim related data.
1203 */
1204 nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup,
1205 zone, priority);
1206
1207 nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup,
1208 zone, priority);
1209 }
1210 1418
1419 for_each_evictable_lru(l) {
1420 if (scan_global_lru(sc)) {
1421 int file = is_file_lru(l);
1422 int scan;
1211 1423
1212 while (nr_active || nr_inactive) { 1424 scan = zone_page_state(zone, NR_LRU_BASE + l);
1213 if (nr_active) { 1425 if (priority) {
1214 nr_to_scan = min(nr_active, 1426 scan >>= priority;
1215 (unsigned long)sc->swap_cluster_max); 1427 scan = (scan * percent[file]) / 100;
1216 nr_active -= nr_to_scan; 1428 }
1217 shrink_active_list(nr_to_scan, zone, sc, priority); 1429 zone->lru[l].nr_scan += scan;
1430 nr[l] = zone->lru[l].nr_scan;
1431 if (nr[l] >= sc->swap_cluster_max)
1432 zone->lru[l].nr_scan = 0;
1433 else
1434 nr[l] = 0;
1435 } else {
1436 /*
1437 * This reclaim occurs not because zone memory shortage
1438 * but because memory controller hits its limit.
1439 * Don't modify zone reclaim related data.
1440 */
1441 nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
1442 priority, l);
1218 } 1443 }
1444 }
1219 1445
1220 if (nr_inactive) { 1446 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1221 nr_to_scan = min(nr_inactive, 1447 nr[LRU_INACTIVE_FILE]) {
1448 for_each_evictable_lru(l) {
1449 if (nr[l]) {
1450 nr_to_scan = min(nr[l],
1222 (unsigned long)sc->swap_cluster_max); 1451 (unsigned long)sc->swap_cluster_max);
1223 nr_inactive -= nr_to_scan; 1452 nr[l] -= nr_to_scan;
1224 nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, 1453
1225 sc); 1454 nr_reclaimed += shrink_list(l, nr_to_scan,
1455 zone, sc, priority);
1456 }
1226 } 1457 }
1227 } 1458 }
1228 1459
1460 /*
1461 * Even if we did not try to evict anon pages at all, we want to
1462 * rebalance the anon lru active/inactive ratio.
1463 */
1464 if (!scan_global_lru(sc) || inactive_anon_is_low(zone))
1465 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1466 else if (!scan_global_lru(sc))
1467 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1468
1229 throttle_vm_writeout(sc->gfp_mask); 1469 throttle_vm_writeout(sc->gfp_mask);
1230 return nr_reclaimed; 1470 return nr_reclaimed;
1231} 1471}
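
The rewritten shrink_zone() drains the per-list scan targets in chunks of sc->swap_cluster_max, round-robin across the evictable lists, so no single list monopolizes a reclaim pass. A toy model of that draining loop, as a sketch only: the list names, CHUNK value and starting targets are made up, and the real loop keys its exit condition on just three of the four lists because active anon aging is handled separately at the end of the function:

#include <stdio.h>

#define NR_LISTS        4
#define CHUNK           32UL    /* plays the role of sc->swap_cluster_max */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* Scan targets per list, as get_scan_ratio()/priority would set them. */
        unsigned long nr[NR_LISTS] = { 100, 0, 250, 40 };
        static const char *name[NR_LISTS] = {
                "inactive_anon", "active_anon", "inactive_file", "active_file"
        };
        unsigned long nr_to_scan;
        int l, work = 1;

        while (work) {
                work = 0;
                for (l = 0; l < NR_LISTS; l++) {
                        if (!nr[l])
                                continue;
                        nr_to_scan = min_ul(nr[l], CHUNK);
                        nr[l] -= nr_to_scan;
                        work = 1;
                        printf("scan %3lu from %s (left %lu)\n",
                               nr_to_scan, name[l], nr[l]);
                }
        }
        return 0;
}
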
@@ -1286,7 +1526,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1286 1526
1287 return nr_reclaimed; 1527 return nr_reclaimed;
1288} 1528}
1289 1529
1290/* 1530/*
1291 * This is the main entry point to direct page reclaim. 1531 * This is the main entry point to direct page reclaim.
1292 * 1532 *
@@ -1316,6 +1556,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1316 struct zone *zone; 1556 struct zone *zone;
1317 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1557 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1318 1558
1559 delayacct_freepages_start();
1560
1319 if (scan_global_lru(sc)) 1561 if (scan_global_lru(sc))
1320 count_vm_event(ALLOCSTALL); 1562 count_vm_event(ALLOCSTALL);
1321 /* 1563 /*
@@ -1327,8 +1569,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1327 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1569 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1328 continue; 1570 continue;
1329 1571
1330 lru_pages += zone_page_state(zone, NR_ACTIVE) 1572 lru_pages += zone_lru_pages(zone);
1331 + zone_page_state(zone, NR_INACTIVE);
1332 } 1573 }
1333 } 1574 }
1334 1575
@@ -1371,7 +1612,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1371 if (sc->nr_scanned && priority < DEF_PRIORITY - 2) 1612 if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
1372 congestion_wait(WRITE, HZ/10); 1613 congestion_wait(WRITE, HZ/10);
1373 } 1614 }
1374 /* top priority shrink_caches still had more to do? don't OOM, then */ 1615 /* top priority shrink_zones still had more to do? don't OOM, then */
1375 if (!sc->all_unreclaimable && scan_global_lru(sc)) 1616 if (!sc->all_unreclaimable && scan_global_lru(sc))
1376 ret = nr_reclaimed; 1617 ret = nr_reclaimed;
1377out: 1618out:
@@ -1396,6 +1637,8 @@ out:
1396 } else 1637 } else
1397 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); 1638 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1398 1639
1640 delayacct_freepages_end();
1641
1399 return ret; 1642 return ret;
1400} 1643}
1401 1644
@@ -1516,6 +1759,14 @@ loop_again:
1516 priority != DEF_PRIORITY) 1759 priority != DEF_PRIORITY)
1517 continue; 1760 continue;
1518 1761
1762 /*
1763 * Do some background aging of the anon list, to give
1764 * pages a chance to be referenced before reclaiming.
1765 */
1766 if (inactive_anon_is_low(zone))
1767 shrink_active_list(SWAP_CLUSTER_MAX, zone,
1768 &sc, priority, 0);
1769
1519 if (!zone_watermark_ok(zone, order, zone->pages_high, 1770 if (!zone_watermark_ok(zone, order, zone->pages_high,
1520 0, 0)) { 1771 0, 0)) {
1521 end_zone = i; 1772 end_zone = i;
@@ -1528,8 +1779,7 @@ loop_again:
1528 for (i = 0; i <= end_zone; i++) { 1779 for (i = 0; i <= end_zone; i++) {
1529 struct zone *zone = pgdat->node_zones + i; 1780 struct zone *zone = pgdat->node_zones + i;
1530 1781
1531 lru_pages += zone_page_state(zone, NR_ACTIVE) 1782 lru_pages += zone_lru_pages(zone);
1532 + zone_page_state(zone, NR_INACTIVE);
1533 } 1783 }
1534 1784
1535 /* 1785 /*
@@ -1573,8 +1823,7 @@ loop_again:
1573 if (zone_is_all_unreclaimable(zone)) 1823 if (zone_is_all_unreclaimable(zone))
1574 continue; 1824 continue;
1575 if (nr_slab == 0 && zone->pages_scanned >= 1825 if (nr_slab == 0 && zone->pages_scanned >=
1576 (zone_page_state(zone, NR_ACTIVE) 1826 (zone_lru_pages(zone) * 6))
1577 + zone_page_state(zone, NR_INACTIVE)) * 6)
1578 zone_set_flag(zone, 1827 zone_set_flag(zone,
1579 ZONE_ALL_UNRECLAIMABLE); 1828 ZONE_ALL_UNRECLAIMABLE);
1580 /* 1829 /*
@@ -1628,7 +1877,7 @@ out:
1628 1877
1629/* 1878/*
1630 * The background pageout daemon, started as a kernel thread 1879 * The background pageout daemon, started as a kernel thread
1631 * from the init process. 1880 * from the init process.
1632 * 1881 *
1633 * This basically trickles out pages so that we have _some_ 1882 * This basically trickles out pages so that we have _some_
1634 * free memory available even if there is no other activity 1883 * free memory available even if there is no other activity
@@ -1722,6 +1971,14 @@ void wakeup_kswapd(struct zone *zone, int order)
1722 wake_up_interruptible(&pgdat->kswapd_wait); 1971 wake_up_interruptible(&pgdat->kswapd_wait);
1723} 1972}
1724 1973
1974unsigned long global_lru_pages(void)
1975{
1976 return global_page_state(NR_ACTIVE_ANON)
1977 + global_page_state(NR_ACTIVE_FILE)
1978 + global_page_state(NR_INACTIVE_ANON)
1979 + global_page_state(NR_INACTIVE_FILE);
1980}
1981
1725#ifdef CONFIG_PM 1982#ifdef CONFIG_PM
1726/* 1983/*
1727 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 1984 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
@@ -1735,6 +1992,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1735{ 1992{
1736 struct zone *zone; 1993 struct zone *zone;
1737 unsigned long nr_to_scan, ret = 0; 1994 unsigned long nr_to_scan, ret = 0;
1995 enum lru_list l;
1738 1996
1739 for_each_zone(zone) { 1997 for_each_zone(zone) {
1740 1998
@@ -1744,38 +2002,31 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1744 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) 2002 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
1745 continue; 2003 continue;
1746 2004
1747 /* For pass = 0 we don't shrink the active list */ 2005 for_each_evictable_lru(l) {
1748 if (pass > 0) { 2006 /* For pass = 0, we don't shrink the active list */
1749 zone->nr_scan_active += 2007 if (pass == 0 &&
1750 (zone_page_state(zone, NR_ACTIVE) >> prio) + 1; 2008 (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE))
1751 if (zone->nr_scan_active >= nr_pages || pass > 3) { 2009 continue;
1752 zone->nr_scan_active = 0; 2010
2011 zone->lru[l].nr_scan +=
2012 (zone_page_state(zone, NR_LRU_BASE + l)
2013 >> prio) + 1;
2014 if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
2015 zone->lru[l].nr_scan = 0;
1753 nr_to_scan = min(nr_pages, 2016 nr_to_scan = min(nr_pages,
1754 zone_page_state(zone, NR_ACTIVE)); 2017 zone_page_state(zone,
1755 shrink_active_list(nr_to_scan, zone, sc, prio); 2018 NR_LRU_BASE + l));
2019 ret += shrink_list(l, nr_to_scan, zone,
2020 sc, prio);
2021 if (ret >= nr_pages)
2022 return ret;
1756 } 2023 }
1757 } 2024 }
1758
1759 zone->nr_scan_inactive +=
1760 (zone_page_state(zone, NR_INACTIVE) >> prio) + 1;
1761 if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
1762 zone->nr_scan_inactive = 0;
1763 nr_to_scan = min(nr_pages,
1764 zone_page_state(zone, NR_INACTIVE));
1765 ret += shrink_inactive_list(nr_to_scan, zone, sc);
1766 if (ret >= nr_pages)
1767 return ret;
1768 }
1769 } 2025 }
1770 2026
1771 return ret; 2027 return ret;
1772} 2028}
1773 2029
1774static unsigned long count_lru_pages(void)
1775{
1776 return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE);
1777}
1778
1779/* 2030/*
1780 * Try to free `nr_pages' of memory, system-wide, and return the number of 2031 * Try to free `nr_pages' of memory, system-wide, and return the number of
1781 * freed pages. 2032 * freed pages.
@@ -1801,7 +2052,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1801 2052
1802 current->reclaim_state = &reclaim_state; 2053 current->reclaim_state = &reclaim_state;
1803 2054
1804 lru_pages = count_lru_pages(); 2055 lru_pages = global_lru_pages();
1805 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 2056 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
1806 /* If slab caches are huge, it's better to hit them first */ 2057 /* If slab caches are huge, it's better to hit them first */
1807 while (nr_slab >= lru_pages) { 2058 while (nr_slab >= lru_pages) {
@@ -1844,7 +2095,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1844 2095
1845 reclaim_state.reclaimed_slab = 0; 2096 reclaim_state.reclaimed_slab = 0;
1846 shrink_slab(sc.nr_scanned, sc.gfp_mask, 2097 shrink_slab(sc.nr_scanned, sc.gfp_mask,
1847 count_lru_pages()); 2098 global_lru_pages());
1848 ret += reclaim_state.reclaimed_slab; 2099 ret += reclaim_state.reclaimed_slab;
1849 if (ret >= nr_pages) 2100 if (ret >= nr_pages)
1850 goto out; 2101 goto out;
@@ -1861,7 +2112,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1861 if (!ret) { 2112 if (!ret) {
1862 do { 2113 do {
1863 reclaim_state.reclaimed_slab = 0; 2114 reclaim_state.reclaimed_slab = 0;
1864 shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); 2115 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
1865 ret += reclaim_state.reclaimed_slab; 2116 ret += reclaim_state.reclaimed_slab;
1866 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); 2117 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
1867 } 2118 }
@@ -1940,7 +2191,7 @@ module_init(kswapd_init)
1940int zone_reclaim_mode __read_mostly; 2191int zone_reclaim_mode __read_mostly;
1941 2192
1942#define RECLAIM_OFF 0 2193#define RECLAIM_OFF 0
1943#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ 2194#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
1944#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 2195#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
1945#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 2196#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
1946 2197
@@ -2089,3 +2340,285 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2089 return ret; 2340 return ret;
2090} 2341}
2091#endif 2342#endif
2343
2344#ifdef CONFIG_UNEVICTABLE_LRU
2345/*
2346 * page_evictable - test whether a page is evictable
2347 * @page: the page to test
2348 * @vma: the VMA in which the page is or will be mapped, may be NULL
2349 *
2350 * Test whether page is evictable--i.e., should be placed on active/inactive
2351 * lists vs unevictable list. The vma argument is !NULL when called from the
2352 * fault path to determine how to instantiate a new page.
2353 *
2354 * Reasons page might not be evictable:
2355 * (1) page's mapping marked unevictable
2356 * (2) page is part of an mlocked VMA
2357 *
2358 */
2359int page_evictable(struct page *page, struct vm_area_struct *vma)
2360{
2361
2362 if (mapping_unevictable(page_mapping(page)))
2363 return 0;
2364
2365 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
2366 return 0;
2367
2368 return 1;
2369}
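
The two reasons listed in the comment above reduce to a small predicate: a page stays off the normal LRUs when its mapping has been marked unevictable (e.g. a SHM_LOCKed segment) or when it belongs to an mlocked VMA. A trivial model of that decision; toy_page and its two flags are invented for illustration and stand in for the real page and mapping state:

#include <stdio.h>

/* Toy page state: just the two properties page_evictable() looks at. */
struct toy_page {
        int mapping_unevictable;        /* mapping flagged unevictable */
        int mlocked;                    /* page belongs to an mlocked VMA */
};

static int toy_page_evictable(const struct toy_page *page)
{
        if (page->mapping_unevictable)
                return 0;
        if (page->mlocked)
                return 0;
        return 1;
}

int main(void)
{
        struct toy_page locked_mapping = { 1, 0 };
        struct toy_page mlocked        = { 0, 1 };
        struct toy_page ordinary       = { 0, 0 };

        printf("locked mapping: %d, mlocked: %d, ordinary: %d\n",
               toy_page_evictable(&locked_mapping),
               toy_page_evictable(&mlocked),
               toy_page_evictable(&ordinary));
        return 0;
}
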
2370
2371static void show_page_path(struct page *page)
2372{
2373 char buf[256];
2374 if (page_is_file_cache(page)) {
2375 struct address_space *mapping = page->mapping;
2376 struct dentry *dentry;
2377 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
2378
2379 spin_lock(&mapping->i_mmap_lock);
2380 dentry = d_find_alias(mapping->host);
2381 printk(KERN_INFO "rescued: %s %lu\n",
2382 dentry_path(dentry, buf, 256), pgoff);
2383 spin_unlock(&mapping->i_mmap_lock);
2384 } else {
2385#if defined(CONFIG_MM_OWNER) && defined(CONFIG_MMU)
2386 struct anon_vma *anon_vma;
2387 struct vm_area_struct *vma;
2388
2389 anon_vma = page_lock_anon_vma(page);
2390 if (!anon_vma)
2391 return;
2392
2393 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
2394 printk(KERN_INFO "rescued: anon %s\n",
2395 vma->vm_mm->owner->comm);
2396 break;
2397 }
2398 page_unlock_anon_vma(anon_vma);
2399#endif
2400 }
2401}
2402
2403
2404/**
2405 * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
2406 * @page: page to check evictability and move to appropriate lru list
2407 * @zone: zone page is in
2408 *
2409 * Checks a page for evictability and moves the page to the appropriate
2410 * zone lru list.
2411 *
2412 * Restrictions: zone->lru_lock must be held, page must be on LRU and must
2413 * have PageUnevictable set.
2414 */
2415static void check_move_unevictable_page(struct page *page, struct zone *zone)
2416{
2417 VM_BUG_ON(PageActive(page));
2418
2419retry:
2420 ClearPageUnevictable(page);
2421 if (page_evictable(page, NULL)) {
2422 enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page);
2423
2424 show_page_path(page);
2425
2426 __dec_zone_state(zone, NR_UNEVICTABLE);
2427 list_move(&page->lru, &zone->lru[l].list);
2428 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
2429 __count_vm_event(UNEVICTABLE_PGRESCUED);
2430 } else {
2431 /*
2432 * rotate unevictable list
2433 */
2434 SetPageUnevictable(page);
2435 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
2436 if (page_evictable(page, NULL))
2437 goto retry;
2438 }
2439}
2440
2441/**
2442 * scan_mapping_unevictable_pages - scan an address space for evictable pages
2443 * @mapping: struct address_space to scan for evictable pages
2444 *
2445 * Scan all pages in mapping. Check unevictable pages for
2446 * evictability and move them to the appropriate zone lru list.
2447 */
2448void scan_mapping_unevictable_pages(struct address_space *mapping)
2449{
2450 pgoff_t next = 0;
2451 pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
2452 PAGE_CACHE_SHIFT;
2453 struct zone *zone;
2454 struct pagevec pvec;
2455
2456 if (mapping->nrpages == 0)
2457 return;
2458
2459 pagevec_init(&pvec, 0);
2460 while (next < end &&
2461 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
2462 int i;
2463 int pg_scanned = 0;
2464
2465 zone = NULL;
2466
2467 for (i = 0; i < pagevec_count(&pvec); i++) {
2468 struct page *page = pvec.pages[i];
2469 pgoff_t page_index = page->index;
2470 struct zone *pagezone = page_zone(page);
2471
2472 pg_scanned++;
2473 if (page_index > next)
2474 next = page_index;
2475 next++;
2476
2477 if (pagezone != zone) {
2478 if (zone)
2479 spin_unlock_irq(&zone->lru_lock);
2480 zone = pagezone;
2481 spin_lock_irq(&zone->lru_lock);
2482 }
2483
2484 if (PageLRU(page) && PageUnevictable(page))
2485 check_move_unevictable_page(page, zone);
2486 }
2487 if (zone)
2488 spin_unlock_irq(&zone->lru_lock);
2489 pagevec_release(&pvec);
2490
2491 count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
2492 }
2493
2494}
2495
2496/**
2497 * scan_zone_unevictable_pages - check unevictable list for evictable pages
2498 * @zone - zone of which to scan the unevictable list
2499 *
2500 * Scan @zone's unevictable LRU lists to check for pages that have become
2501 * evictable. Move those that have to @zone's inactive list where they
2502 * become candidates for reclaim, unless shrink_inactive_zone() decides
2503 * to reactivate them. Pages that are still unevictable are rotated
2504 * back onto @zone's unevictable list.
2505 */
2506#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
2507void scan_zone_unevictable_pages(struct zone *zone)
2508{
2509 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
2510 unsigned long scan;
2511 unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
2512
2513 while (nr_to_scan > 0) {
2514 unsigned long batch_size = min(nr_to_scan,
2515 SCAN_UNEVICTABLE_BATCH_SIZE);
2516
2517 spin_lock_irq(&zone->lru_lock);
2518 for (scan = 0; scan < batch_size; scan++) {
2519 struct page *page = lru_to_page(l_unevictable);
2520
2521 if (!trylock_page(page))
2522 continue;
2523
2524 prefetchw_prev_lru_page(page, l_unevictable, flags);
2525
2526 if (likely(PageLRU(page) && PageUnevictable(page)))
2527 check_move_unevictable_page(page, zone);
2528
2529 unlock_page(page);
2530 }
2531 spin_unlock_irq(&zone->lru_lock);
2532
2533 nr_to_scan -= batch_size;
2534 }
2535}
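
scan_zone_unevictable_pages() may walk a very long list under zone->lru_lock, so it holds the lock for at most SCAN_UNEVICTABLE_BATCH_SIZE pages at a time and releases it between batches. A userspace sketch of that batching pattern; the lock()/unlock()/check_one() placeholders and the page count are illustrative only:

#include <stdio.h>

#define BATCH 16UL      /* mirrors SCAN_UNEVICTABLE_BATCH_SIZE */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

/* Placeholders for the lock and the per-page evictability check. */
static void lock(void)      { printf("  lock\n"); }
static void unlock(void)    { printf("  unlock\n"); }
static void check_one(void) { /* would test and maybe move one page */ }

int main(void)
{
        unsigned long nr_to_scan = 40;  /* pages currently on the unevictable list */

        while (nr_to_scan > 0) {
                unsigned long batch_size = min_ul(nr_to_scan, BATCH);
                unsigned long scan;

                lock();
                for (scan = 0; scan < batch_size; scan++)
                        check_one();
                unlock();

                printf("processed batch of %lu\n", batch_size);
                nr_to_scan -= batch_size;
        }
        return 0;
}

Dropping the lock between batches bounds how long interrupts stay disabled (the real lock is taken with spin_lock_irq) while still making forward progress through the list.
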
2536
2537
2538/**
2539 * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
2540 *
2541 * A really big hammer: scan all zones' unevictable LRU lists to check for
2542 * pages that have become evictable. Move those back to the zones'
2543 * inactive list where they become candidates for reclaim.
2544 * This occurs when, e.g., we have unswappable pages on the unevictable lists,
2545 * and we add swap to the system. As such, it runs in the context of a task
2546 * that has possibly/probably made some previously unevictable pages
2547 * evictable.
2548 */
2549void scan_all_zones_unevictable_pages(void)
2550{
2551 struct zone *zone;
2552
2553 for_each_zone(zone) {
2554 scan_zone_unevictable_pages(zone);
2555 }
2556}
2557
2558/*
2559 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
2560 * all nodes' unevictable lists for evictable pages
2561 */
2562unsigned long scan_unevictable_pages;
2563
2564int scan_unevictable_handler(struct ctl_table *table, int write,
2565 struct file *file, void __user *buffer,
2566 size_t *length, loff_t *ppos)
2567{
2568 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
2569
2570 if (write && *(unsigned long *)table->data)
2571 scan_all_zones_unevictable_pages();
2572
2573 scan_unevictable_pages = 0;
2574 return 0;
2575}
2576
2577/*
2578 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
2579 * a specified node's per zone unevictable lists for evictable pages.
2580 */
2581
2582static ssize_t read_scan_unevictable_node(struct sys_device *dev,
2583 struct sysdev_attribute *attr,
2584 char *buf)
2585{
2586 return sprintf(buf, "0\n"); /* always zero; should fit... */
2587}
2588
2589static ssize_t write_scan_unevictable_node(struct sys_device *dev,
2590 struct sysdev_attribute *attr,
2591 const char *buf, size_t count)
2592{
2593 struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
2594 struct zone *zone;
2595 unsigned long res;
2596 unsigned long req = strict_strtoul(buf, 10, &res);
2597
2598 if (!req)
2599 return 1; /* zero is no-op */
2600
2601 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2602 if (!populated_zone(zone))
2603 continue;
2604 scan_zone_unevictable_pages(zone);
2605 }
2606 return 1;
2607}
2608
2609
2610static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
2611 read_scan_unevictable_node,
2612 write_scan_unevictable_node);
2613
2614int scan_unevictable_register_node(struct node *node)
2615{
2616 return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
2617}
2618
2619void scan_unevictable_unregister_node(struct node *node)
2620{
2621 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
2622}
2623
2624#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index db9eabb2c5b3..c3ccfda23adc 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -8,11 +8,12 @@
8 * Copyright (C) 2006 Silicon Graphics, Inc., 8 * Copyright (C) 2006 Silicon Graphics, Inc.,
9 * Christoph Lameter <christoph@lameter.com> 9 * Christoph Lameter <christoph@lameter.com>
10 */ 10 */
11 11#include <linux/fs.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/err.h> 13#include <linux/err.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/vmstat.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17 18
18#ifdef CONFIG_VM_EVENT_COUNTERS 19#ifdef CONFIG_VM_EVENT_COUNTERS
@@ -26,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
26 27
27 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); 28 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
28 29
29 for_each_cpu_mask(cpu, *cpumask) { 30 for_each_cpu_mask_nr(cpu, *cpumask) {
30 struct vm_event_state *this = &per_cpu(vm_event_states, cpu); 31 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
31 32
32 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 33 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
@@ -383,7 +384,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
383#endif 384#endif
384 385
385#ifdef CONFIG_PROC_FS 386#ifdef CONFIG_PROC_FS
386 387#include <linux/proc_fs.h>
387#include <linux/seq_file.h> 388#include <linux/seq_file.h>
388 389
389static char * const migratetype_names[MIGRATE_TYPES] = { 390static char * const migratetype_names[MIGRATE_TYPES] = {
@@ -515,9 +516,26 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
515 continue; 516 continue;
516 517
517 page = pfn_to_page(pfn); 518 page = pfn_to_page(pfn);
519#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES
520 /*
521 * Ordinarily, memory holes in flatmem still have a valid
522 * memmap for the PFN range. However, an architecture for
523 * embedded systems (e.g. ARM) can free up the memmap backing
524 * holes to save memory on the assumption the memmap is
525 * never used. The page_zone linkages are then broken even
526 * though pfn_valid() returns true. Skip the page if the
527 * linkages are broken. Even if this test passed, the impact
528 * is that the counters for the movable type are off but
529 * fragmentation monitoring is likely meaningless on small
530 * systems.
531 */
532 if (page_zone(page) != zone)
533 continue;
534#endif
518 mtype = get_pageblock_migratetype(page); 535 mtype = get_pageblock_migratetype(page);
519 536
520 count[mtype]++; 537 if (mtype < MIGRATE_TYPES)
538 count[mtype]++;
521 } 539 }
522 540
523 /* Print counts */ 541 /* Print counts */
@@ -563,20 +581,44 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
563 return 0; 581 return 0;
564} 582}
565 583
566const struct seq_operations fragmentation_op = { 584static const struct seq_operations fragmentation_op = {
567 .start = frag_start, 585 .start = frag_start,
568 .next = frag_next, 586 .next = frag_next,
569 .stop = frag_stop, 587 .stop = frag_stop,
570 .show = frag_show, 588 .show = frag_show,
571}; 589};
572 590
573const struct seq_operations pagetypeinfo_op = { 591static int fragmentation_open(struct inode *inode, struct file *file)
592{
593 return seq_open(file, &fragmentation_op);
594}
595
596static const struct file_operations fragmentation_file_operations = {
597 .open = fragmentation_open,
598 .read = seq_read,
599 .llseek = seq_lseek,
600 .release = seq_release,
601};
602
603static const struct seq_operations pagetypeinfo_op = {
574 .start = frag_start, 604 .start = frag_start,
575 .next = frag_next, 605 .next = frag_next,
576 .stop = frag_stop, 606 .stop = frag_stop,
577 .show = pagetypeinfo_show, 607 .show = pagetypeinfo_show,
578}; 608};
579 609
610static int pagetypeinfo_open(struct inode *inode, struct file *file)
611{
612 return seq_open(file, &pagetypeinfo_op);
613}
614
615static const struct file_operations pagetypeinfo_file_ops = {
616 .open = pagetypeinfo_open,
617 .read = seq_read,
618 .llseek = seq_lseek,
619 .release = seq_release,
620};
621
580#ifdef CONFIG_ZONE_DMA 622#ifdef CONFIG_ZONE_DMA
581#define TEXT_FOR_DMA(xx) xx "_dma", 623#define TEXT_FOR_DMA(xx) xx "_dma",
582#else 624#else
@@ -601,8 +643,14 @@ const struct seq_operations pagetypeinfo_op = {
601static const char * const vmstat_text[] = { 643static const char * const vmstat_text[] = {
602 /* Zoned VM counters */ 644 /* Zoned VM counters */
603 "nr_free_pages", 645 "nr_free_pages",
604 "nr_inactive", 646 "nr_inactive_anon",
605 "nr_active", 647 "nr_active_anon",
648 "nr_inactive_file",
649 "nr_active_file",
650#ifdef CONFIG_UNEVICTABLE_LRU
651 "nr_unevictable",
652 "nr_mlock",
653#endif
606 "nr_anon_pages", 654 "nr_anon_pages",
607 "nr_mapped", 655 "nr_mapped",
608 "nr_file_pages", 656 "nr_file_pages",
@@ -657,6 +705,16 @@ static const char * const vmstat_text[] = {
657 "htlb_buddy_alloc_success", 705 "htlb_buddy_alloc_success",
658 "htlb_buddy_alloc_fail", 706 "htlb_buddy_alloc_fail",
659#endif 707#endif
708#ifdef CONFIG_UNEVICTABLE_LRU
709 "unevictable_pgs_culled",
710 "unevictable_pgs_scanned",
711 "unevictable_pgs_rescued",
712 "unevictable_pgs_mlocked",
713 "unevictable_pgs_munlocked",
714 "unevictable_pgs_cleared",
715 "unevictable_pgs_stranded",
716 "unevictable_pgs_mlockfreed",
717#endif
660#endif 718#endif
661}; 719};
662 720
@@ -670,7 +728,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
670 "\n min %lu" 728 "\n min %lu"
671 "\n low %lu" 729 "\n low %lu"
672 "\n high %lu" 730 "\n high %lu"
673 "\n scanned %lu (a: %lu i: %lu)" 731 "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)"
674 "\n spanned %lu" 732 "\n spanned %lu"
675 "\n present %lu", 733 "\n present %lu",
676 zone_page_state(zone, NR_FREE_PAGES), 734 zone_page_state(zone, NR_FREE_PAGES),
@@ -678,7 +736,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
678 zone->pages_low, 736 zone->pages_low,
679 zone->pages_high, 737 zone->pages_high,
680 zone->pages_scanned, 738 zone->pages_scanned,
681 zone->nr_scan_active, zone->nr_scan_inactive, 739 zone->lru[LRU_ACTIVE_ANON].nr_scan,
740 zone->lru[LRU_INACTIVE_ANON].nr_scan,
741 zone->lru[LRU_ACTIVE_FILE].nr_scan,
742 zone->lru[LRU_INACTIVE_FILE].nr_scan,
682 zone->spanned_pages, 743 zone->spanned_pages,
683 zone->present_pages); 744 zone->present_pages);
684 745
@@ -715,10 +776,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
715 seq_printf(m, 776 seq_printf(m,
716 "\n all_unreclaimable: %u" 777 "\n all_unreclaimable: %u"
717 "\n prev_priority: %i" 778 "\n prev_priority: %i"
718 "\n start_pfn: %lu", 779 "\n start_pfn: %lu"
780 "\n inactive_ratio: %u",
719 zone_is_all_unreclaimable(zone), 781 zone_is_all_unreclaimable(zone),
720 zone->prev_priority, 782 zone->prev_priority,
721 zone->zone_start_pfn); 783 zone->zone_start_pfn,
784 zone->inactive_ratio);
722 seq_putc(m, '\n'); 785 seq_putc(m, '\n');
723} 786}
724 787
@@ -732,7 +795,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
732 return 0; 795 return 0;
733} 796}
734 797
735const struct seq_operations zoneinfo_op = { 798static const struct seq_operations zoneinfo_op = {
736 .start = frag_start, /* iterate over all zones. The same as in 799 .start = frag_start, /* iterate over all zones. The same as in
737 * fragmentation. */ 800 * fragmentation. */
738 .next = frag_next, 801 .next = frag_next,
@@ -740,6 +803,18 @@ const struct seq_operations zoneinfo_op = {
740 .show = zoneinfo_show, 803 .show = zoneinfo_show,
741}; 804};
742 805
806static int zoneinfo_open(struct inode *inode, struct file *file)
807{
808 return seq_open(file, &zoneinfo_op);
809}
810
811static const struct file_operations proc_zoneinfo_file_operations = {
812 .open = zoneinfo_open,
813 .read = seq_read,
814 .llseek = seq_lseek,
815 .release = seq_release,
816};
817
743static void *vmstat_start(struct seq_file *m, loff_t *pos) 818static void *vmstat_start(struct seq_file *m, loff_t *pos)
744{ 819{
745 unsigned long *v; 820 unsigned long *v;
@@ -795,13 +870,24 @@ static void vmstat_stop(struct seq_file *m, void *arg)
795 m->private = NULL; 870 m->private = NULL;
796} 871}
797 872
798const struct seq_operations vmstat_op = { 873static const struct seq_operations vmstat_op = {
799 .start = vmstat_start, 874 .start = vmstat_start,
800 .next = vmstat_next, 875 .next = vmstat_next,
801 .stop = vmstat_stop, 876 .stop = vmstat_stop,
802 .show = vmstat_show, 877 .show = vmstat_show,
803}; 878};
804 879
880static int vmstat_open(struct inode *inode, struct file *file)
881{
882 return seq_open(file, &vmstat_op);
883}
884
885static const struct file_operations proc_vmstat_file_operations = {
886 .open = vmstat_open,
887 .read = seq_read,
888 .llseek = seq_lseek,
889 .release = seq_release,
890};
805#endif /* CONFIG_PROC_FS */ 891#endif /* CONFIG_PROC_FS */
806 892
807#ifdef CONFIG_SMP 893#ifdef CONFIG_SMP
@@ -859,9 +945,11 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
859 945
860static struct notifier_block __cpuinitdata vmstat_notifier = 946static struct notifier_block __cpuinitdata vmstat_notifier =
861 { &vmstat_cpuup_callback, NULL, 0 }; 947 { &vmstat_cpuup_callback, NULL, 0 };
948#endif
862 949
863static int __init setup_vmstat(void) 950static int __init setup_vmstat(void)
864{ 951{
952#ifdef CONFIG_SMP
865 int cpu; 953 int cpu;
866 954
867 refresh_zone_stat_thresholds(); 955 refresh_zone_stat_thresholds();
@@ -869,7 +957,13 @@ static int __init setup_vmstat(void)
869 957
870 for_each_online_cpu(cpu) 958 for_each_online_cpu(cpu)
871 start_cpu_timer(cpu); 959 start_cpu_timer(cpu);
960#endif
961#ifdef CONFIG_PROC_FS
962 proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
963 proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
964 proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
965 proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
966#endif
872 return 0; 967 return 0;
873} 968}
874module_init(setup_vmstat) 969module_init(setup_vmstat)
875#endif