96 files changed, 2792 insertions, 1697 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 71c4da413444..a25cb3fafeba 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -144,9 +144,9 @@ Figure 1 shows the important aspects of the controller
144 | 3. Each page has a pointer to the page_cgroup, which in turn knows the | 144 | 3. Each page has a pointer to the page_cgroup, which in turn knows the |
145 | cgroup it belongs to | 145 | cgroup it belongs to |
146 | 146 | ||
147 | The accounting is done as follows: mem_cgroup_charge() is invoked to set up | 147 | The accounting is done as follows: mem_cgroup_charge_common() is invoked to |
148 | the necessary data structures and check if the cgroup that is being charged | 148 | set up the necessary data structures and check if the cgroup that is being |
149 | is over its limit. If it is, then reclaim is invoked on the cgroup. | 149 | charged is over its limit. If it is, then reclaim is invoked on the cgroup. |
150 | More details can be found in the reclaim section of this document. | 150 | More details can be found in the reclaim section of this document. |
151 | If everything goes well, a page meta-data-structure called page_cgroup is | 151 | If everything goes well, a page meta-data-structure called page_cgroup is |
152 | updated. page_cgroup has its own LRU on cgroup. | 152 | updated. page_cgroup has its own LRU on cgroup. |
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 6d0c2519cf47..c6f993d491b5 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -161,7 +161,8 @@ a recent addition and not present on older kernels.
161 | in the memory block. | 161 | in the memory block. |
162 | 'state' : read-write | 162 | 'state' : read-write |
163 | at read: contains online/offline state of memory. | 163 | at read: contains online/offline state of memory. |
164 | at write: user can specify "online", "offline" command | 164 | at write: user can specify "online_kernel", |
165 | "online_movable", "online", "offline" command | ||
165 | which will be performed on al sections in the block. | 166 | which will be performed on al sections in the block. |
166 | 'phys_device' : read-only: designed to show the name of physical memory | 167 | 'phys_device' : read-only: designed to show the name of physical memory |
167 | device. This is not well implemented now. | 168 | device. This is not well implemented now. |
@@ -255,6 +256,17 @@ For onlining, you have to write "online" to the section's state file as:
255 | 256 | ||
256 | % echo online > /sys/devices/system/memory/memoryXXX/state | 257 | % echo online > /sys/devices/system/memory/memoryXXX/state |
257 | 258 | ||
259 | This onlining will not change the ZONE type of the target memory section, | ||
260 | If the memory section is in ZONE_NORMAL, you can change it to ZONE_MOVABLE: | ||
261 | |||
262 | % echo online_movable > /sys/devices/system/memory/memoryXXX/state | ||
263 | (NOTE: current limit: this memory section must be adjacent to ZONE_MOVABLE) | ||
264 | |||
265 | And if the memory section is in ZONE_MOVABLE, you can change it to ZONE_NORMAL: | ||
266 | |||
267 | % echo online_kernel > /sys/devices/system/memory/memoryXXX/state | ||
268 | (NOTE: current limit: this memory section must be adjacent to ZONE_NORMAL) | ||
269 | |||
258 | After this, section memoryXXX's state will be 'online' and the amount of | 270 | After this, section memoryXXX's state will be 'online' and the amount of |
259 | available memory will be increased. | 271 | available memory will be increased. |
260 | 272 | ||
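The state file accepts the same strings from any writer, not only the shell one-liners above. Below is a rough userspace sketch, not part of this patch, that performs the write from C; the memory block name and the requested state come from the command line, so no particular memoryXXX block is assumed.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        char path[256];
        const char *state = (argc > 2) ? argv[2] : "online";
        int fd;

        if (argc < 2) {
                fprintf(stderr, "usage: %s <memory block> [online|online_kernel|online_movable|offline]\n",
                        argv[0]);
                return 1;
        }

        /* e.g. /sys/devices/system/memory/memory42/state */
        snprintf(path, sizeof(path), "/sys/devices/system/memory/%s/state", argv[1]);

        fd = open(path, O_WRONLY);
        if (fd < 0 || write(fd, state, strlen(state)) < 0) {
                perror(path);
                return 1;
        }
        close(fd);
        return 0;
}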
@@ -377,15 +389,18 @@ The third argument is passed by pointer of struct memory_notify.
377 | struct memory_notify { | 389 | struct memory_notify { |
378 | unsigned long start_pfn; | 390 | unsigned long start_pfn; |
379 | unsigned long nr_pages; | 391 | unsigned long nr_pages; |
392 | int status_change_nid_normal; | ||
380 | int status_change_nid; | 393 | int status_change_nid; |
381 | } | 394 | } |
382 | 395 | ||
383 | start_pfn is start_pfn of online/offline memory. | 396 | start_pfn is start_pfn of online/offline memory. |
384 | nr_pages is # of pages of online/offline memory. | 397 | nr_pages is # of pages of online/offline memory. |
398 | status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask | ||
399 | is (will be) set/clear, if this is -1, then nodemask status is not changed. | ||
385 | status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be) | 400 | status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be) |
386 | set/clear. It means a new(memoryless) node gets new memory by online and a | 401 | set/clear. It means a new(memoryless) node gets new memory by online and a |
387 | node loses all memory. If this is -1, then nodemask status is not changed. | 402 | node loses all memory. If this is -1, then nodemask status is not changed. |
388 | If status_changed_nid >= 0, callback should create/discard structures for the | 403 | If status_changed_nid* >= 0, callback should create/discard structures for the |
389 | node if necessary. | 404 | node if necessary. |
390 | 405 | ||
391 | -------------- | 406 | -------------- |
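Memory hotplug notifier callbacks are the consumers of the new status_change_nid_normal field. The following is only a hedged sketch of such a callback: the example_* names are placeholders, while the struct fields and MEM_* actions are the ones documented above, and register_memory_notifier()/NOTIFY_OK come from the existing notifier API.

#include <linux/init.h>
#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/printk.h>

static int example_mem_callback(struct notifier_block *self,
                                unsigned long action, void *arg)
{
        struct memory_notify *mn = arg;

        switch (action) {
        case MEM_ONLINE:
                if (mn->status_change_nid_normal >= 0)
                        pr_info("node %d gains N_NORMAL_MEMORY\n",
                                mn->status_change_nid_normal);
                if (mn->status_change_nid >= 0)
                        pr_info("node %d gets its first memory\n",
                                mn->status_change_nid);
                break;
        case MEM_OFFLINE:
                if (mn->status_change_nid >= 0)
                        pr_info("node %d loses all memory\n",
                                mn->status_change_nid);
                break;
        default:
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block example_mem_nb = {
        .notifier_call = example_mem_callback,
};

static int __init example_init(void)
{
        return register_memory_notifier(&example_mem_nb);
}
device_initcall(example_init);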
diff --git a/arch/alpha/include/asm/mman.h b/arch/alpha/include/asm/mman.h
index cbeb3616a28e..0086b472bc2b 100644
--- a/arch/alpha/include/asm/mman.h
+++ b/arch/alpha/include/asm/mman.h
@@ -63,4 +63,15 @@
63 | /* compatibility flags */ | 63 | /* compatibility flags */ |
64 | #define MAP_FILE 0 | 64 | #define MAP_FILE 0 |
65 | 65 | ||
66 | /* | ||
67 | * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. | ||
68 | * This gives us 6 bits, which is enough until someone invents 128 bit address | ||
69 | * spaces. | ||
70 | * | ||
71 | * Assume these are all power of twos. | ||
72 | * When 0 use the default page size. | ||
73 | */ | ||
74 | #define MAP_HUGE_SHIFT 26 | ||
75 | #define MAP_HUGE_MASK 0x3f | ||
76 | |||
66 | #endif /* __ALPHA_MMAN_H__ */ | 77 | #endif /* __ALPHA_MMAN_H__ */ |
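The identical MAP_HUGE_SHIFT/MAP_HUGE_MASK block is added to each architecture's mman.h in this series. Here is a hedged userspace sketch of the encoding, assuming a kernel with 2 MB huge pages configured and a libc that exposes MAP_HUGETLB; MAP_HUGE_SHIFT is defined locally in case the installed headers predate this change.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_HUGE_SHIFT
#define MAP_HUGE_SHIFT 26
#endif

int main(void)
{
        size_t len = 2UL * 1024 * 1024;
        /* bits [26:31] of the flags carry log2 of the requested huge page size */
        int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
                    (21 << MAP_HUGE_SHIFT);   /* 2^21 = 2 MB pages */
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, flags, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap(MAP_HUGETLB)");
                return 1;
        }
        munmap(p, len);
        return 0;
}

Passing 30 instead of 21 selects 1 GB pages, which is what the MAP_HUGE_2MB/MAP_HUGE_1GB helpers added to the x86 mman.h later in this patch expand to.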
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 89f2b7f7b042..10062ceadd1c 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -11,18 +11,6 @@
11 | #include <linux/random.h> | 11 | #include <linux/random.h> |
12 | #include <asm/cachetype.h> | 12 | #include <asm/cachetype.h> |
13 | 13 | ||
14 | static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr, | ||
15 | unsigned long pgoff) | ||
16 | { | ||
17 | unsigned long base = addr & ~(SHMLBA-1); | ||
18 | unsigned long off = (pgoff << PAGE_SHIFT) & (SHMLBA-1); | ||
19 | |||
20 | if (base + off <= addr) | ||
21 | return base + off; | ||
22 | |||
23 | return base - off; | ||
24 | } | ||
25 | |||
26 | #define COLOUR_ALIGN(addr,pgoff) \ | 14 | #define COLOUR_ALIGN(addr,pgoff) \ |
27 | ((((addr)+SHMLBA-1)&~(SHMLBA-1)) + \ | 15 | ((((addr)+SHMLBA-1)&~(SHMLBA-1)) + \ |
28 | (((pgoff)<<PAGE_SHIFT) & (SHMLBA-1))) | 16 | (((pgoff)<<PAGE_SHIFT) & (SHMLBA-1))) |
@@ -69,9 +57,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
69 | { | 57 | { |
70 | struct mm_struct *mm = current->mm; | 58 | struct mm_struct *mm = current->mm; |
71 | struct vm_area_struct *vma; | 59 | struct vm_area_struct *vma; |
72 | unsigned long start_addr; | ||
73 | int do_align = 0; | 60 | int do_align = 0; |
74 | int aliasing = cache_is_vipt_aliasing(); | 61 | int aliasing = cache_is_vipt_aliasing(); |
62 | struct vm_unmapped_area_info info; | ||
75 | 63 | ||
76 | /* | 64 | /* |
77 | * We only need to do colour alignment if either the I or D | 65 | * We only need to do colour alignment if either the I or D |
@@ -104,46 +92,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
104 | (!vma || addr + len <= vma->vm_start)) | 92 | (!vma || addr + len <= vma->vm_start)) |
105 | return addr; | 93 | return addr; |
106 | } | 94 | } |
107 | if (len > mm->cached_hole_size) { | ||
108 | start_addr = addr = mm->free_area_cache; | ||
109 | } else { | ||
110 | start_addr = addr = mm->mmap_base; | ||
111 | mm->cached_hole_size = 0; | ||
112 | } | ||
113 | 95 | ||
114 | full_search: | 96 | info.flags = 0; |
115 | if (do_align) | 97 | info.length = len; |
116 | addr = COLOUR_ALIGN(addr, pgoff); | 98 | info.low_limit = mm->mmap_base; |
117 | else | 99 | info.high_limit = TASK_SIZE; |
118 | addr = PAGE_ALIGN(addr); | 100 | info.align_mask = do_align ? (PAGE_MASK & (SHMLBA - 1)) : 0; |
119 | 101 | info.align_offset = pgoff << PAGE_SHIFT; | |
120 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | 102 | return vm_unmapped_area(&info); |
121 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
122 | if (TASK_SIZE - len < addr) { | ||
123 | /* | ||
124 | * Start a new search - just in case we missed | ||
125 | * some holes. | ||
126 | */ | ||
127 | if (start_addr != TASK_UNMAPPED_BASE) { | ||
128 | start_addr = addr = TASK_UNMAPPED_BASE; | ||
129 | mm->cached_hole_size = 0; | ||
130 | goto full_search; | ||
131 | } | ||
132 | return -ENOMEM; | ||
133 | } | ||
134 | if (!vma || addr + len <= vma->vm_start) { | ||
135 | /* | ||
136 | * Remember the place where we stopped the search: | ||
137 | */ | ||
138 | mm->free_area_cache = addr + len; | ||
139 | return addr; | ||
140 | } | ||
141 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
142 | mm->cached_hole_size = vma->vm_start - addr; | ||
143 | addr = vma->vm_end; | ||
144 | if (do_align) | ||
145 | addr = COLOUR_ALIGN(addr, pgoff); | ||
146 | } | ||
147 | } | 103 | } |
148 | 104 | ||
149 | unsigned long | 105 | unsigned long |
@@ -156,6 +112,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
156 | unsigned long addr = addr0; | 112 | unsigned long addr = addr0; |
157 | int do_align = 0; | 113 | int do_align = 0; |
158 | int aliasing = cache_is_vipt_aliasing(); | 114 | int aliasing = cache_is_vipt_aliasing(); |
115 | struct vm_unmapped_area_info info; | ||
159 | 116 | ||
160 | /* | 117 | /* |
161 | * We only need to do colour alignment if either the I or D | 118 | * We only need to do colour alignment if either the I or D |
@@ -187,70 +144,27 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
187 | return addr; | 144 | return addr; |
188 | } | 145 | } |
189 | 146 | ||
190 | /* check if free_area_cache is useful for us */ | 147 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
191 | if (len <= mm->cached_hole_size) { | 148 | info.length = len; |
192 | mm->cached_hole_size = 0; | 149 | info.low_limit = PAGE_SIZE; |
193 | mm->free_area_cache = mm->mmap_base; | 150 | info.high_limit = mm->mmap_base; |
194 | } | 151 | info.align_mask = do_align ? (PAGE_MASK & (SHMLBA - 1)) : 0; |
195 | 152 | info.align_offset = pgoff << PAGE_SHIFT; | |
196 | /* either no address requested or can't fit in requested address hole */ | 153 | addr = vm_unmapped_area(&info); |
197 | addr = mm->free_area_cache; | ||
198 | if (do_align) { | ||
199 | unsigned long base = COLOUR_ALIGN_DOWN(addr - len, pgoff); | ||
200 | addr = base + len; | ||
201 | } | ||
202 | |||
203 | /* make sure it can fit in the remaining address space */ | ||
204 | if (addr > len) { | ||
205 | vma = find_vma(mm, addr-len); | ||
206 | if (!vma || addr <= vma->vm_start) | ||
207 | /* remember the address as a hint for next time */ | ||
208 | return (mm->free_area_cache = addr-len); | ||
209 | } | ||
210 | |||
211 | if (mm->mmap_base < len) | ||
212 | goto bottomup; | ||
213 | |||
214 | addr = mm->mmap_base - len; | ||
215 | if (do_align) | ||
216 | addr = COLOUR_ALIGN_DOWN(addr, pgoff); | ||
217 | |||
218 | do { | ||
219 | /* | ||
220 | * Lookup failure means no vma is above this address, | ||
221 | * else if new region fits below vma->vm_start, | ||
222 | * return with success: | ||
223 | */ | ||
224 | vma = find_vma(mm, addr); | ||
225 | if (!vma || addr+len <= vma->vm_start) | ||
226 | /* remember the address as a hint for next time */ | ||
227 | return (mm->free_area_cache = addr); | ||
228 | 154 | ||
229 | /* remember the largest hole we saw so far */ | ||
230 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
231 | mm->cached_hole_size = vma->vm_start - addr; | ||
232 | |||
233 | /* try just below the current vma->vm_start */ | ||
234 | addr = vma->vm_start - len; | ||
235 | if (do_align) | ||
236 | addr = COLOUR_ALIGN_DOWN(addr, pgoff); | ||
237 | } while (len < vma->vm_start); | ||
238 | |||
239 | bottomup: | ||
240 | /* | 155 | /* |
241 | * A failed mmap() very likely causes application failure, | 156 | * A failed mmap() very likely causes application failure, |
242 | * so fall back to the bottom-up function here. This scenario | 157 | * so fall back to the bottom-up function here. This scenario |
243 | * can happen with large stack limits and large mmap() | 158 | * can happen with large stack limits and large mmap() |
244 | * allocations. | 159 | * allocations. |
245 | */ | 160 | */ |
246 | mm->cached_hole_size = ~0UL; | 161 | if (addr & ~PAGE_MASK) { |
247 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 162 | VM_BUG_ON(addr != -ENOMEM); |
248 | addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); | 163 | info.flags = 0; |
249 | /* | 164 | info.low_limit = mm->mmap_base; |
250 | * Restore the topdown base: | 165 | info.high_limit = TASK_SIZE; |
251 | */ | 166 | addr = vm_unmapped_area(&info); |
252 | mm->free_area_cache = mm->mmap_base; | 167 | } |
253 | mm->cached_hole_size = ~0UL; | ||
254 | 168 | ||
255 | return addr; | 169 | return addr; |
256 | } | 170 | } |
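The same conversion pattern repeats for every architecture below, so it is worth stating the conventions these callers rely on. Judging purely by how this patch fills the structure (not an authoritative description of vm_unmapped_area()): length plus low_limit/high_limit bound the search, VM_UNMAPPED_AREA_TOPDOWN selects the direction, and align_mask/align_offset request an address whose cache-colour bits match pgoff << PAGE_SHIFT, which is what COLOUR_ALIGN() used to arrange by hand. The fallback test works because error values are small negative numbers and therefore never page aligned; a minimal userspace illustration of that check, with a 4 KiB page size assumed:

#include <stdio.h>
#include <errno.h>

#define PAGE_MASK (~0xfffUL)    /* illustration only: 4 KiB pages assumed */

int main(void)
{
        unsigned long ok  = 0x7f1234560000UL;        /* a page-aligned success value */
        unsigned long err = (unsigned long)-ENOMEM;  /* what a failure looks like */

        printf("ok  & ~PAGE_MASK = %#lx\n", ok & ~PAGE_MASK);   /* 0, so not an error */
        printf("err & ~PAGE_MASK = %#lx\n", err & ~PAGE_MASK);  /* non-zero, so an error */
        return 0;
}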
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index 06e73bf665e9..c2bbc9a72222 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -193,9 +193,6 @@ source "kernel/Kconfig.preempt"
193 | config QUICKLIST | 193 | config QUICKLIST |
194 | def_bool y | 194 | def_bool y |
195 | 195 | ||
196 | config HAVE_ARCH_BOOTMEM | ||
197 | def_bool n | ||
198 | |||
199 | config ARCH_HAVE_MEMORY_PRESENT | 196 | config ARCH_HAVE_MEMORY_PRESENT |
200 | def_bool n | 197 | def_bool n |
201 | 198 | ||
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 46d3da0d4b92..9a936ac9a942 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -87,4 +87,15 @@
87 | /* compatibility flags */ | 87 | /* compatibility flags */ |
88 | #define MAP_FILE 0 | 88 | #define MAP_FILE 0 |
89 | 89 | ||
90 | /* | ||
91 | * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. | ||
92 | * This gives us 6 bits, which is enough until someone invents 128 bit address | ||
93 | * spaces. | ||
94 | * | ||
95 | * Assume these are all power of twos. | ||
96 | * When 0 use the default page size. | ||
97 | */ | ||
98 | #define MAP_HUGE_SHIFT 26 | ||
99 | #define MAP_HUGE_MASK 0x3f | ||
100 | |||
90 | #endif /* _ASM_MMAN_H */ | 101 | #endif /* _ASM_MMAN_H */ |
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index 302d779d5b0d..d9be7540a6be 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -45,18 +45,6 @@ static unsigned long mmap_base(unsigned long rnd)
45 | return PAGE_ALIGN(TASK_SIZE - gap - rnd); | 45 | return PAGE_ALIGN(TASK_SIZE - gap - rnd); |
46 | } | 46 | } |
47 | 47 | ||
48 | static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr, | ||
49 | unsigned long pgoff) | ||
50 | { | ||
51 | unsigned long base = addr & ~shm_align_mask; | ||
52 | unsigned long off = (pgoff << PAGE_SHIFT) & shm_align_mask; | ||
53 | |||
54 | if (base + off <= addr) | ||
55 | return base + off; | ||
56 | |||
57 | return base - off; | ||
58 | } | ||
59 | |||
60 | #define COLOUR_ALIGN(addr, pgoff) \ | 48 | #define COLOUR_ALIGN(addr, pgoff) \ |
61 | ((((addr) + shm_align_mask) & ~shm_align_mask) + \ | 49 | ((((addr) + shm_align_mask) & ~shm_align_mask) + \ |
62 | (((pgoff) << PAGE_SHIFT) & shm_align_mask)) | 50 | (((pgoff) << PAGE_SHIFT) & shm_align_mask)) |
@@ -71,6 +59,7 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
71 | struct vm_area_struct *vma; | 59 | struct vm_area_struct *vma; |
72 | unsigned long addr = addr0; | 60 | unsigned long addr = addr0; |
73 | int do_color_align; | 61 | int do_color_align; |
62 | struct vm_unmapped_area_info info; | ||
74 | 63 | ||
75 | if (unlikely(len > TASK_SIZE)) | 64 | if (unlikely(len > TASK_SIZE)) |
76 | return -ENOMEM; | 65 | return -ENOMEM; |
@@ -107,97 +96,31 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
107 | return addr; | 96 | return addr; |
108 | } | 97 | } |
109 | 98 | ||
110 | if (dir == UP) { | 99 | info.length = len; |
111 | addr = mm->mmap_base; | 100 | info.align_mask = do_color_align ? (PAGE_MASK & shm_align_mask) : 0; |
112 | if (do_color_align) | 101 | info.align_offset = pgoff << PAGE_SHIFT; |
113 | addr = COLOUR_ALIGN(addr, pgoff); | ||
114 | else | ||
115 | addr = PAGE_ALIGN(addr); | ||
116 | 102 | ||
117 | for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) { | 103 | if (dir == DOWN) { |
118 | /* At this point: (!vma || addr < vma->vm_end). */ | 104 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
119 | if (TASK_SIZE - len < addr) | 105 | info.low_limit = PAGE_SIZE; |
120 | return -ENOMEM; | 106 | info.high_limit = mm->mmap_base; |
121 | if (!vma || addr + len <= vma->vm_start) | 107 | addr = vm_unmapped_area(&info); |
122 | return addr; | 108 | |
123 | addr = vma->vm_end; | 109 | if (!(addr & ~PAGE_MASK)) |
124 | if (do_color_align) | 110 | return addr; |
125 | addr = COLOUR_ALIGN(addr, pgoff); | ||
126 | } | ||
127 | } else { | ||
128 | /* check if free_area_cache is useful for us */ | ||
129 | if (len <= mm->cached_hole_size) { | ||
130 | mm->cached_hole_size = 0; | ||
131 | mm->free_area_cache = mm->mmap_base; | ||
132 | } | ||
133 | 111 | ||
134 | /* | ||
135 | * either no address requested, or the mapping can't fit into | ||
136 | * the requested address hole | ||
137 | */ | ||
138 | addr = mm->free_area_cache; | ||
139 | if (do_color_align) { | ||
140 | unsigned long base = | ||
141 | COLOUR_ALIGN_DOWN(addr - len, pgoff); | ||
142 | addr = base + len; | ||
143 | } | ||
144 | |||
145 | /* make sure it can fit in the remaining address space */ | ||
146 | if (likely(addr > len)) { | ||
147 | vma = find_vma(mm, addr - len); | ||
148 | if (!vma || addr <= vma->vm_start) { | ||
149 | /* cache the address as a hint for next time */ | ||
150 | return mm->free_area_cache = addr - len; | ||
151 | } | ||
152 | } | ||
153 | |||
154 | if (unlikely(mm->mmap_base < len)) | ||
155 | goto bottomup; | ||
156 | |||
157 | addr = mm->mmap_base - len; | ||
158 | if (do_color_align) | ||
159 | addr = COLOUR_ALIGN_DOWN(addr, pgoff); | ||
160 | |||
161 | do { | ||
162 | /* | ||
163 | * Lookup failure means no vma is above this address, | ||
164 | * else if new region fits below vma->vm_start, | ||
165 | * return with success: | ||
166 | */ | ||
167 | vma = find_vma(mm, addr); | ||
168 | if (likely(!vma || addr + len <= vma->vm_start)) { | ||
169 | /* cache the address as a hint for next time */ | ||
170 | return mm->free_area_cache = addr; | ||
171 | } | ||
172 | |||
173 | /* remember the largest hole we saw so far */ | ||
174 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
175 | mm->cached_hole_size = vma->vm_start - addr; | ||
176 | |||
177 | /* try just below the current vma->vm_start */ | ||
178 | addr = vma->vm_start - len; | ||
179 | if (do_color_align) | ||
180 | addr = COLOUR_ALIGN_DOWN(addr, pgoff); | ||
181 | } while (likely(len < vma->vm_start)); | ||
182 | |||
183 | bottomup: | ||
184 | /* | 112 | /* |
185 | * A failed mmap() very likely causes application failure, | 113 | * A failed mmap() very likely causes application failure, |
186 | * so fall back to the bottom-up function here. This scenario | 114 | * so fall back to the bottom-up function here. This scenario |
187 | * can happen with large stack limits and large mmap() | 115 | * can happen with large stack limits and large mmap() |
188 | * allocations. | 116 | * allocations. |
189 | */ | 117 | */ |
190 | mm->cached_hole_size = ~0UL; | ||
191 | mm->free_area_cache = TASK_UNMAPPED_BASE; | ||
192 | addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); | ||
193 | /* | ||
194 | * Restore the topdown base: | ||
195 | */ | ||
196 | mm->free_area_cache = mm->mmap_base; | ||
197 | mm->cached_hole_size = ~0UL; | ||
198 | |||
199 | return addr; | ||
200 | } | 118 | } |
119 | |||
120 | info.flags = 0; | ||
121 | info.low_limit = mm->mmap_base; | ||
122 | info.high_limit = TASK_SIZE; | ||
123 | return vm_unmapped_area(&info); | ||
201 | } | 124 | } |
202 | 125 | ||
203 | unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0, | 126 | unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0, |
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 12219ebce869..294d251ca7b2 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -70,4 +70,15 @@
70 | #define MAP_FILE 0 | 70 | #define MAP_FILE 0 |
71 | #define MAP_VARIABLE 0 | 71 | #define MAP_VARIABLE 0 |
72 | 72 | ||
73 | /* | ||
74 | * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. | ||
75 | * This gives us 6 bits, which is enough until someone invents 128 bit address | ||
76 | * spaces. | ||
77 | * | ||
78 | * Assume these are all power of twos. | ||
79 | * When 0 use the default page size. | ||
80 | */ | ||
81 | #define MAP_HUGE_SHIFT 26 | ||
82 | #define MAP_HUGE_MASK 0x3f | ||
83 | |||
73 | #endif /* __PARISC_MMAN_H__ */ | 84 | #endif /* __PARISC_MMAN_H__ */ |
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index cf357a059ddb..3ce1f864c2d3 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -607,7 +607,7 @@ static void register_nodes(void)
607 | 607 | ||
608 | int sysfs_add_device_to_node(struct device *dev, int nid) | 608 | int sysfs_add_device_to_node(struct device *dev, int nid) |
609 | { | 609 | { |
610 | struct node *node = &node_devices[nid]; | 610 | struct node *node = node_devices[nid]; |
611 | return sysfs_create_link(&node->dev.kobj, &dev->kobj, | 611 | return sysfs_create_link(&node->dev.kobj, &dev->kobj, |
612 | kobject_name(&dev->kobj)); | 612 | kobject_name(&dev->kobj)); |
613 | } | 613 | } |
@@ -615,7 +615,7 @@ EXPORT_SYMBOL_GPL(sysfs_add_device_to_node);
615 | 615 | ||
616 | void sysfs_remove_device_from_node(struct device *dev, int nid) | 616 | void sysfs_remove_device_from_node(struct device *dev, int nid) |
617 | { | 617 | { |
618 | struct node *node = &node_devices[nid]; | 618 | struct node *node = node_devices[nid]; |
619 | sysfs_remove_link(&node->dev.kobj, kobject_name(&dev->kobj)); | 619 | sysfs_remove_link(&node->dev.kobj, kobject_name(&dev->kobj)); |
620 | } | 620 | } |
621 | EXPORT_SYMBOL_GPL(sysfs_remove_device_from_node); | 621 | EXPORT_SYMBOL_GPL(sysfs_remove_device_from_node); |
diff --git a/arch/powerpc/platforms/cell/celleb_pci.c b/arch/powerpc/platforms/cell/celleb_pci.c
index abc8af43ea7c..173568140a32 100644
--- a/arch/powerpc/platforms/cell/celleb_pci.c
+++ b/arch/powerpc/platforms/cell/celleb_pci.c
@@ -401,11 +401,11 @@ error:
401 | } else { | 401 | } else { |
402 | if (config && *config) { | 402 | if (config && *config) { |
403 | size = 256; | 403 | size = 256; |
404 | free_bootmem((unsigned long)(*config), size); | 404 | free_bootmem(__pa(*config), size); |
405 | } | 405 | } |
406 | if (res && *res) { | 406 | if (res && *res) { |
407 | size = sizeof(struct celleb_pci_resource); | 407 | size = sizeof(struct celleb_pci_resource); |
408 | free_bootmem((unsigned long)(*res), size); | 408 | free_bootmem(__pa(*res), size); |
409 | } | 409 | } |
410 | } | 410 | } |
411 | 411 | ||
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 6d5367060a56..39faa4ac9660 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -158,6 +158,9 @@ static inline int page_reset_referenced(unsigned long addr)
158 | * race against modification of the referenced bit. This function | 158 | * race against modification of the referenced bit. This function |
159 | * should therefore only be called if it is not mapped in any | 159 | * should therefore only be called if it is not mapped in any |
160 | * address space. | 160 | * address space. |
161 | * | ||
162 | * Note that the bit gets set whenever page content is changed. That means | ||
163 | * also when the page is modified by DMA or from inside the kernel. | ||
161 | */ | 164 | */ |
162 | #define __HAVE_ARCH_PAGE_TEST_AND_CLEAR_DIRTY | 165 | #define __HAVE_ARCH_PAGE_TEST_AND_CLEAR_DIRTY |
163 | static inline int page_test_and_clear_dirty(unsigned long pfn, int mapped) | 166 | static inline int page_test_and_clear_dirty(unsigned long pfn, int mapped) |
diff --git a/arch/sh/mm/mmap.c b/arch/sh/mm/mmap.c
index 80bf494ddbcb..6777177807c2 100644
--- a/arch/sh/mm/mmap.c
+++ b/arch/sh/mm/mmap.c
@@ -30,25 +30,13 @@ static inline unsigned long COLOUR_ALIGN(unsigned long addr,
30 | return base + off; | 30 | return base + off; |
31 | } | 31 | } |
32 | 32 | ||
33 | static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr, | ||
34 | unsigned long pgoff) | ||
35 | { | ||
36 | unsigned long base = addr & ~shm_align_mask; | ||
37 | unsigned long off = (pgoff << PAGE_SHIFT) & shm_align_mask; | ||
38 | |||
39 | if (base + off <= addr) | ||
40 | return base + off; | ||
41 | |||
42 | return base - off; | ||
43 | } | ||
44 | |||
45 | unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, | 33 | unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, |
46 | unsigned long len, unsigned long pgoff, unsigned long flags) | 34 | unsigned long len, unsigned long pgoff, unsigned long flags) |
47 | { | 35 | { |
48 | struct mm_struct *mm = current->mm; | 36 | struct mm_struct *mm = current->mm; |
49 | struct vm_area_struct *vma; | 37 | struct vm_area_struct *vma; |
50 | unsigned long start_addr; | ||
51 | int do_colour_align; | 38 | int do_colour_align; |
39 | struct vm_unmapped_area_info info; | ||
52 | 40 | ||
53 | if (flags & MAP_FIXED) { | 41 | if (flags & MAP_FIXED) { |
54 | /* We do not accept a shared mapping if it would violate | 42 | /* We do not accept a shared mapping if it would violate |
@@ -79,47 +67,13 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
79 | return addr; | 67 | return addr; |
80 | } | 68 | } |
81 | 69 | ||
82 | if (len > mm->cached_hole_size) { | 70 | info.flags = 0; |
83 | start_addr = addr = mm->free_area_cache; | 71 | info.length = len; |
84 | } else { | 72 | info.low_limit = TASK_UNMAPPED_BASE; |
85 | mm->cached_hole_size = 0; | 73 | info.high_limit = TASK_SIZE; |
86 | start_addr = addr = TASK_UNMAPPED_BASE; | 74 | info.align_mask = do_colour_align ? (PAGE_MASK & shm_align_mask) : 0; |
87 | } | 75 | info.align_offset = pgoff << PAGE_SHIFT; |
88 | 76 | return vm_unmapped_area(&info); | |
89 | full_search: | ||
90 | if (do_colour_align) | ||
91 | addr = COLOUR_ALIGN(addr, pgoff); | ||
92 | else | ||
93 | addr = PAGE_ALIGN(mm->free_area_cache); | ||
94 | |||
95 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
96 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
97 | if (unlikely(TASK_SIZE - len < addr)) { | ||
98 | /* | ||
99 | * Start a new search - just in case we missed | ||
100 | * some holes. | ||
101 | */ | ||
102 | if (start_addr != TASK_UNMAPPED_BASE) { | ||
103 | start_addr = addr = TASK_UNMAPPED_BASE; | ||
104 | mm->cached_hole_size = 0; | ||
105 | goto full_search; | ||
106 | } | ||
107 | return -ENOMEM; | ||
108 | } | ||
109 | if (likely(!vma || addr + len <= vma->vm_start)) { | ||
110 | /* | ||
111 | * Remember the place where we stopped the search: | ||
112 | */ | ||
113 | mm->free_area_cache = addr + len; | ||
114 | return addr; | ||
115 | } | ||
116 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
117 | mm->cached_hole_size = vma->vm_start - addr; | ||
118 | |||
119 | addr = vma->vm_end; | ||
120 | if (do_colour_align) | ||
121 | addr = COLOUR_ALIGN(addr, pgoff); | ||
122 | } | ||
123 | } | 77 | } |
124 | 78 | ||
125 | unsigned long | 79 | unsigned long |
@@ -131,6 +85,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
131 | struct mm_struct *mm = current->mm; | 85 | struct mm_struct *mm = current->mm; |
132 | unsigned long addr = addr0; | 86 | unsigned long addr = addr0; |
133 | int do_colour_align; | 87 | int do_colour_align; |
88 | struct vm_unmapped_area_info info; | ||
134 | 89 | ||
135 | if (flags & MAP_FIXED) { | 90 | if (flags & MAP_FIXED) { |
136 | /* We do not accept a shared mapping if it would violate | 91 | /* We do not accept a shared mapping if it would violate |
@@ -162,73 +117,27 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
162 | return addr; | 117 | return addr; |
163 | } | 118 | } |
164 | 119 | ||
165 | /* check if free_area_cache is useful for us */ | 120 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
166 | if (len <= mm->cached_hole_size) { | 121 | info.length = len; |
167 | mm->cached_hole_size = 0; | 122 | info.low_limit = PAGE_SIZE; |
168 | mm->free_area_cache = mm->mmap_base; | 123 | info.high_limit = mm->mmap_base; |
169 | } | 124 | info.align_mask = do_colour_align ? (PAGE_MASK & shm_align_mask) : 0; |
170 | 125 | info.align_offset = pgoff << PAGE_SHIFT; | |
171 | /* either no address requested or can't fit in requested address hole */ | 126 | addr = vm_unmapped_area(&info); |
172 | addr = mm->free_area_cache; | ||
173 | if (do_colour_align) { | ||
174 | unsigned long base = COLOUR_ALIGN_DOWN(addr-len, pgoff); | ||
175 | 127 | ||
176 | addr = base + len; | ||
177 | } | ||
178 | |||
179 | /* make sure it can fit in the remaining address space */ | ||
180 | if (likely(addr > len)) { | ||
181 | vma = find_vma(mm, addr-len); | ||
182 | if (!vma || addr <= vma->vm_start) { | ||
183 | /* remember the address as a hint for next time */ | ||
184 | return (mm->free_area_cache = addr-len); | ||
185 | } | ||
186 | } | ||
187 | |||
188 | if (unlikely(mm->mmap_base < len)) | ||
189 | goto bottomup; | ||
190 | |||
191 | addr = mm->mmap_base-len; | ||
192 | if (do_colour_align) | ||
193 | addr = COLOUR_ALIGN_DOWN(addr, pgoff); | ||
194 | |||
195 | do { | ||
196 | /* | ||
197 | * Lookup failure means no vma is above this address, | ||
198 | * else if new region fits below vma->vm_start, | ||
199 | * return with success: | ||
200 | */ | ||
201 | vma = find_vma(mm, addr); | ||
202 | if (likely(!vma || addr+len <= vma->vm_start)) { | ||
203 | /* remember the address as a hint for next time */ | ||
204 | return (mm->free_area_cache = addr); | ||
205 | } | ||
206 | |||
207 | /* remember the largest hole we saw so far */ | ||
208 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
209 | mm->cached_hole_size = vma->vm_start - addr; | ||
210 | |||
211 | /* try just below the current vma->vm_start */ | ||
212 | addr = vma->vm_start-len; | ||
213 | if (do_colour_align) | ||
214 | addr = COLOUR_ALIGN_DOWN(addr, pgoff); | ||
215 | } while (likely(len < vma->vm_start)); | ||
216 | |||
217 | bottomup: | ||
218 | /* | 128 | /* |
219 | * A failed mmap() very likely causes application failure, | 129 | * A failed mmap() very likely causes application failure, |
220 | * so fall back to the bottom-up function here. This scenario | 130 | * so fall back to the bottom-up function here. This scenario |
221 | * can happen with large stack limits and large mmap() | 131 | * can happen with large stack limits and large mmap() |
222 | * allocations. | 132 | * allocations. |
223 | */ | 133 | */ |
224 | mm->cached_hole_size = ~0UL; | 134 | if (addr & ~PAGE_MASK) { |
225 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 135 | VM_BUG_ON(addr != -ENOMEM); |
226 | addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); | 136 | info.flags = 0; |
227 | /* | 137 | info.low_limit = TASK_UNMAPPED_BASE; |
228 | * Restore the topdown base: | 138 | info.high_limit = TASK_SIZE; |
229 | */ | 139 | addr = vm_unmapped_area(&info); |
230 | mm->free_area_cache = mm->mmap_base; | 140 | } |
231 | mm->cached_hole_size = ~0UL; | ||
232 | 141 | ||
233 | return addr; | 142 | return addr; |
234 | } | 143 | } |
diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c
index 0c9b31b22e07..57277c830151 100644
--- a/arch/sparc/kernel/sys_sparc_32.c
+++ b/arch/sparc/kernel/sys_sparc_32.c
@@ -34,11 +34,9 @@ asmlinkage unsigned long sys_getpagesize(void)
34 | return PAGE_SIZE; /* Possibly older binaries want 8192 on sun4's? */ | 34 | return PAGE_SIZE; /* Possibly older binaries want 8192 on sun4's? */ |
35 | } | 35 | } |
36 | 36 | ||
37 | #define COLOUR_ALIGN(addr) (((addr)+SHMLBA-1)&~(SHMLBA-1)) | ||
38 | |||
39 | unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) | 37 | unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) |
40 | { | 38 | { |
41 | struct vm_area_struct * vmm; | 39 | struct vm_unmapped_area_info info; |
42 | 40 | ||
43 | if (flags & MAP_FIXED) { | 41 | if (flags & MAP_FIXED) { |
44 | /* We do not accept a shared mapping if it would violate | 42 | /* We do not accept a shared mapping if it would violate |
@@ -56,21 +54,14 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
56 | if (!addr) | 54 | if (!addr) |
57 | addr = TASK_UNMAPPED_BASE; | 55 | addr = TASK_UNMAPPED_BASE; |
58 | 56 | ||
59 | if (flags & MAP_SHARED) | 57 | info.flags = 0; |
60 | addr = COLOUR_ALIGN(addr); | 58 | info.length = len; |
61 | else | 59 | info.low_limit = addr; |
62 | addr = PAGE_ALIGN(addr); | 60 | info.high_limit = TASK_SIZE; |
63 | 61 | info.align_mask = (flags & MAP_SHARED) ? | |
64 | for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) { | 62 | (PAGE_MASK & (SHMLBA - 1)) : 0; |
65 | /* At this point: (!vmm || addr < vmm->vm_end). */ | 63 | info.align_offset = pgoff << PAGE_SHIFT; |
66 | if (TASK_SIZE - PAGE_SIZE - len < addr) | 64 | return vm_unmapped_area(&info); |
67 | return -ENOMEM; | ||
68 | if (!vmm || addr + len <= vmm->vm_start) | ||
69 | return addr; | ||
70 | addr = vmm->vm_end; | ||
71 | if (flags & MAP_SHARED) | ||
72 | addr = COLOUR_ALIGN(addr); | ||
73 | } | ||
74 | } | 65 | } |
75 | 66 | ||
76 | /* | 67 | /* |
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index 878ef3d5fec5..97309c0ec533 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -75,7 +75,7 @@ static inline int invalid_64bit_range(unsigned long addr, unsigned long len)
75 | * the spitfire/niagara VA-hole. | 75 | * the spitfire/niagara VA-hole. |
76 | */ | 76 | */ |
77 | 77 | ||
78 | static inline unsigned long COLOUR_ALIGN(unsigned long addr, | 78 | static inline unsigned long COLOR_ALIGN(unsigned long addr, |
79 | unsigned long pgoff) | 79 | unsigned long pgoff) |
80 | { | 80 | { |
81 | unsigned long base = (addr+SHMLBA-1)&~(SHMLBA-1); | 81 | unsigned long base = (addr+SHMLBA-1)&~(SHMLBA-1); |
@@ -84,24 +84,13 @@ static inline unsigned long COLOUR_ALIGN(unsigned long addr,
84 | return base + off; | 84 | return base + off; |
85 | } | 85 | } |
86 | 86 | ||
87 | static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr, | ||
88 | unsigned long pgoff) | ||
89 | { | ||
90 | unsigned long base = addr & ~(SHMLBA-1); | ||
91 | unsigned long off = (pgoff<<PAGE_SHIFT) & (SHMLBA-1); | ||
92 | |||
93 | if (base + off <= addr) | ||
94 | return base + off; | ||
95 | return base - off; | ||
96 | } | ||
97 | |||
98 | unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) | 87 | unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) |
99 | { | 88 | { |
100 | struct mm_struct *mm = current->mm; | 89 | struct mm_struct *mm = current->mm; |
101 | struct vm_area_struct * vma; | 90 | struct vm_area_struct * vma; |
102 | unsigned long task_size = TASK_SIZE; | 91 | unsigned long task_size = TASK_SIZE; |
103 | unsigned long start_addr; | ||
104 | int do_color_align; | 92 | int do_color_align; |
93 | struct vm_unmapped_area_info info; | ||
105 | 94 | ||
106 | if (flags & MAP_FIXED) { | 95 | if (flags & MAP_FIXED) { |
107 | /* We do not accept a shared mapping if it would violate | 96 | /* We do not accept a shared mapping if it would violate |
@@ -124,7 +113,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
124 | 113 | ||
125 | if (addr) { | 114 | if (addr) { |
126 | if (do_color_align) | 115 | if (do_color_align) |
127 | addr = COLOUR_ALIGN(addr, pgoff); | 116 | addr = COLOR_ALIGN(addr, pgoff); |
128 | else | 117 | else |
129 | addr = PAGE_ALIGN(addr); | 118 | addr = PAGE_ALIGN(addr); |
130 | 119 | ||
@@ -134,50 +123,22 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
134 | return addr; | 123 | return addr; |
135 | } | 124 | } |
136 | 125 | ||
137 | if (len > mm->cached_hole_size) { | 126 | info.flags = 0; |
138 | start_addr = addr = mm->free_area_cache; | 127 | info.length = len; |
139 | } else { | 128 | info.low_limit = TASK_UNMAPPED_BASE; |
140 | start_addr = addr = TASK_UNMAPPED_BASE; | 129 | info.high_limit = min(task_size, VA_EXCLUDE_START); |
141 | mm->cached_hole_size = 0; | 130 | info.align_mask = do_color_align ? (PAGE_MASK & (SHMLBA - 1)) : 0; |
131 | info.align_offset = pgoff << PAGE_SHIFT; | ||
132 | addr = vm_unmapped_area(&info); | ||
133 | |||
134 | if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) { | ||
135 | VM_BUG_ON(addr != -ENOMEM); | ||
136 | info.low_limit = VA_EXCLUDE_END; | ||
137 | info.high_limit = task_size; | ||
138 | addr = vm_unmapped_area(&info); | ||
142 | } | 139 | } |
143 | 140 | ||
144 | task_size -= len; | 141 | return addr; |
145 | |||
146 | full_search: | ||
147 | if (do_color_align) | ||
148 | addr = COLOUR_ALIGN(addr, pgoff); | ||
149 | else | ||
150 | addr = PAGE_ALIGN(addr); | ||
151 | |||
152 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
153 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
154 | if (addr < VA_EXCLUDE_START && | ||
155 | (addr + len) >= VA_EXCLUDE_START) { | ||
156 | addr = VA_EXCLUDE_END; | ||
157 | vma = find_vma(mm, VA_EXCLUDE_END); | ||
158 | } | ||
159 | if (unlikely(task_size < addr)) { | ||
160 | if (start_addr != TASK_UNMAPPED_BASE) { | ||
161 | start_addr = addr = TASK_UNMAPPED_BASE; | ||
162 | mm->cached_hole_size = 0; | ||
163 | goto full_search; | ||
164 | } | ||
165 | return -ENOMEM; | ||
166 | } | ||
167 | if (likely(!vma || addr + len <= vma->vm_start)) { | ||
168 | /* | ||
169 | * Remember the place where we stopped the search: | ||
170 | */ | ||
171 | mm->free_area_cache = addr + len; | ||
172 | return addr; | ||
173 | } | ||
174 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
175 | mm->cached_hole_size = vma->vm_start - addr; | ||
176 | |||
177 | addr = vma->vm_end; | ||
178 | if (do_color_align) | ||
179 | addr = COLOUR_ALIGN(addr, pgoff); | ||
180 | } | ||
181 | } | 142 | } |
182 | 143 | ||
183 | unsigned long | 144 | unsigned long |
@@ -190,6 +151,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
190 | unsigned long task_size = STACK_TOP32; | 151 | unsigned long task_size = STACK_TOP32; |
191 | unsigned long addr = addr0; | 152 | unsigned long addr = addr0; |
192 | int do_color_align; | 153 | int do_color_align; |
154 | struct vm_unmapped_area_info info; | ||
193 | 155 | ||
194 | /* This should only ever run for 32-bit processes. */ | 156 | /* This should only ever run for 32-bit processes. */ |
195 | BUG_ON(!test_thread_flag(TIF_32BIT)); | 157 | BUG_ON(!test_thread_flag(TIF_32BIT)); |
@@ -214,7 +176,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
214 | /* requesting a specific address */ | 176 | /* requesting a specific address */ |
215 | if (addr) { | 177 | if (addr) { |
216 | if (do_color_align) | 178 | if (do_color_align) |
217 | addr = COLOUR_ALIGN(addr, pgoff); | 179 | addr = COLOR_ALIGN(addr, pgoff); |
218 | else | 180 | else |
219 | addr = PAGE_ALIGN(addr); | 181 | addr = PAGE_ALIGN(addr); |
220 | 182 | ||
@@ -224,73 +186,27 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
224 | return addr; | 186 | return addr; |
225 | } | 187 | } |
226 | 188 | ||
227 | /* check if free_area_cache is useful for us */ | 189 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
228 | if (len <= mm->cached_hole_size) { | 190 | info.length = len; |
229 | mm->cached_hole_size = 0; | 191 | info.low_limit = PAGE_SIZE; |
230 | mm->free_area_cache = mm->mmap_base; | 192 | info.high_limit = mm->mmap_base; |
231 | } | 193 | info.align_mask = do_color_align ? (PAGE_MASK & (SHMLBA - 1)) : 0; |
232 | 194 | info.align_offset = pgoff << PAGE_SHIFT; | |
233 | /* either no address requested or can't fit in requested address hole */ | 195 | addr = vm_unmapped_area(&info); |
234 | addr = mm->free_area_cache; | ||
235 | if (do_color_align) { | ||
236 | unsigned long base = COLOUR_ALIGN_DOWN(addr-len, pgoff); | ||
237 | 196 | ||
238 | addr = base + len; | ||
239 | } | ||
240 | |||
241 | /* make sure it can fit in the remaining address space */ | ||
242 | if (likely(addr > len)) { | ||
243 | vma = find_vma(mm, addr-len); | ||
244 | if (!vma || addr <= vma->vm_start) { | ||
245 | /* remember the address as a hint for next time */ | ||
246 | return (mm->free_area_cache = addr-len); | ||
247 | } | ||
248 | } | ||
249 | |||
250 | if (unlikely(mm->mmap_base < len)) | ||
251 | goto bottomup; | ||
252 | |||
253 | addr = mm->mmap_base-len; | ||
254 | if (do_color_align) | ||
255 | addr = COLOUR_ALIGN_DOWN(addr, pgoff); | ||
256 | |||
257 | do { | ||
258 | /* | ||
259 | * Lookup failure means no vma is above this address, | ||
260 | * else if new region fits below vma->vm_start, | ||
261 | * return with success: | ||
262 | */ | ||
263 | vma = find_vma(mm, addr); | ||
264 | if (likely(!vma || addr+len <= vma->vm_start)) { | ||
265 | /* remember the address as a hint for next time */ | ||
266 | return (mm->free_area_cache = addr); | ||
267 | } | ||
268 | |||
269 | /* remember the largest hole we saw so far */ | ||
270 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
271 | mm->cached_hole_size = vma->vm_start - addr; | ||
272 | |||
273 | /* try just below the current vma->vm_start */ | ||
274 | addr = vma->vm_start-len; | ||
275 | if (do_color_align) | ||
276 | addr = COLOUR_ALIGN_DOWN(addr, pgoff); | ||
277 | } while (likely(len < vma->vm_start)); | ||
278 | |||
279 | bottomup: | ||
280 | /* | 197 | /* |
281 | * A failed mmap() very likely causes application failure, | 198 | * A failed mmap() very likely causes application failure, |
282 | * so fall back to the bottom-up function here. This scenario | 199 | * so fall back to the bottom-up function here. This scenario |
283 | * can happen with large stack limits and large mmap() | 200 | * can happen with large stack limits and large mmap() |
284 | * allocations. | 201 | * allocations. |
285 | */ | 202 | */ |
286 | mm->cached_hole_size = ~0UL; | 203 | if (addr & ~PAGE_MASK) { |
287 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 204 | VM_BUG_ON(addr != -ENOMEM); |
288 | addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); | 205 | info.flags = 0; |
289 | /* | 206 | info.low_limit = TASK_UNMAPPED_BASE; |
290 | * Restore the topdown base: | 207 | info.high_limit = STACK_TOP32; |
291 | */ | 208 | addr = vm_unmapped_area(&info); |
292 | mm->free_area_cache = mm->mmap_base; | 209 | } |
293 | mm->cached_hole_size = ~0UL; | ||
294 | 210 | ||
295 | return addr; | 211 | return addr; |
296 | } | 212 | } |
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index f76f83d5ac63..d2b59441ebdd 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -30,55 +30,28 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp,
30 | unsigned long pgoff, | 30 | unsigned long pgoff, |
31 | unsigned long flags) | 31 | unsigned long flags) |
32 | { | 32 | { |
33 | struct mm_struct *mm = current->mm; | ||
34 | struct vm_area_struct * vma; | ||
35 | unsigned long task_size = TASK_SIZE; | 33 | unsigned long task_size = TASK_SIZE; |
36 | unsigned long start_addr; | 34 | struct vm_unmapped_area_info info; |
37 | 35 | ||
38 | if (test_thread_flag(TIF_32BIT)) | 36 | if (test_thread_flag(TIF_32BIT)) |
39 | task_size = STACK_TOP32; | 37 | task_size = STACK_TOP32; |
40 | if (unlikely(len >= VA_EXCLUDE_START)) | ||
41 | return -ENOMEM; | ||
42 | 38 | ||
43 | if (len > mm->cached_hole_size) { | 39 | info.flags = 0; |
44 | start_addr = addr = mm->free_area_cache; | 40 | info.length = len; |
45 | } else { | 41 | info.low_limit = TASK_UNMAPPED_BASE; |
46 | start_addr = addr = TASK_UNMAPPED_BASE; | 42 | info.high_limit = min(task_size, VA_EXCLUDE_START); |
47 | mm->cached_hole_size = 0; | 43 | info.align_mask = PAGE_MASK & ~HPAGE_MASK; |
44 | info.align_offset = 0; | ||
45 | addr = vm_unmapped_area(&info); | ||
46 | |||
47 | if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) { | ||
48 | VM_BUG_ON(addr != -ENOMEM); | ||
49 | info.low_limit = VA_EXCLUDE_END; | ||
50 | info.high_limit = task_size; | ||
51 | addr = vm_unmapped_area(&info); | ||
48 | } | 52 | } |
49 | 53 | ||
50 | task_size -= len; | 54 | return addr; |
51 | |||
52 | full_search: | ||
53 | addr = ALIGN(addr, HPAGE_SIZE); | ||
54 | |||
55 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
56 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
57 | if (addr < VA_EXCLUDE_START && | ||
58 | (addr + len) >= VA_EXCLUDE_START) { | ||
59 | addr = VA_EXCLUDE_END; | ||
60 | vma = find_vma(mm, VA_EXCLUDE_END); | ||
61 | } | ||
62 | if (unlikely(task_size < addr)) { | ||
63 | if (start_addr != TASK_UNMAPPED_BASE) { | ||
64 | start_addr = addr = TASK_UNMAPPED_BASE; | ||
65 | mm->cached_hole_size = 0; | ||
66 | goto full_search; | ||
67 | } | ||
68 | return -ENOMEM; | ||
69 | } | ||
70 | if (likely(!vma || addr + len <= vma->vm_start)) { | ||
71 | /* | ||
72 | * Remember the place where we stopped the search: | ||
73 | */ | ||
74 | mm->free_area_cache = addr + len; | ||
75 | return addr; | ||
76 | } | ||
77 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
78 | mm->cached_hole_size = vma->vm_start - addr; | ||
79 | |||
80 | addr = ALIGN(vma->vm_end, HPAGE_SIZE); | ||
81 | } | ||
82 | } | 55 | } |
83 | 56 | ||
84 | static unsigned long | 57 | static unsigned long |
@@ -87,71 +60,34 @@ hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
87 | const unsigned long pgoff, | 60 | const unsigned long pgoff, |
88 | const unsigned long flags) | 61 | const unsigned long flags) |
89 | { | 62 | { |
90 | struct vm_area_struct *vma; | ||
91 | struct mm_struct *mm = current->mm; | 63 | struct mm_struct *mm = current->mm; |
92 | unsigned long addr = addr0; | 64 | unsigned long addr = addr0; |
65 | struct vm_unmapped_area_info info; | ||
93 | 66 | ||
94 | /* This should only ever run for 32-bit processes. */ | 67 | /* This should only ever run for 32-bit processes. */ |
95 | BUG_ON(!test_thread_flag(TIF_32BIT)); | 68 | BUG_ON(!test_thread_flag(TIF_32BIT)); |
96 | 69 | ||
97 | /* check if free_area_cache is useful for us */ | 70 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
98 | if (len <= mm->cached_hole_size) { | 71 | info.length = len; |
99 | mm->cached_hole_size = 0; | 72 | info.low_limit = PAGE_SIZE; |
100 | mm->free_area_cache = mm->mmap_base; | 73 | info.high_limit = mm->mmap_base; |
101 | } | 74 | info.align_mask = PAGE_MASK & ~HPAGE_MASK; |
102 | 75 | info.align_offset = 0; | |
103 | /* either no address requested or can't fit in requested address hole */ | 76 | addr = vm_unmapped_area(&info); |
104 | addr = mm->free_area_cache & HPAGE_MASK; | ||
105 | |||
106 | /* make sure it can fit in the remaining address space */ | ||
107 | if (likely(addr > len)) { | ||
108 | vma = find_vma(mm, addr-len); | ||
109 | if (!vma || addr <= vma->vm_start) { | ||
110 | /* remember the address as a hint for next time */ | ||
111 | return (mm->free_area_cache = addr-len); | ||
112 | } | ||
113 | } | ||
114 | |||
115 | if (unlikely(mm->mmap_base < len)) | ||
116 | goto bottomup; | ||
117 | |||
118 | addr = (mm->mmap_base-len) & HPAGE_MASK; | ||
119 | |||
120 | do { | ||
121 | /* | ||
122 | * Lookup failure means no vma is above this address, | ||
123 | * else if new region fits below vma->vm_start, | ||
124 | * return with success: | ||
125 | */ | ||
126 | vma = find_vma(mm, addr); | ||
127 | if (likely(!vma || addr+len <= vma->vm_start)) { | ||
128 | /* remember the address as a hint for next time */ | ||
129 | return (mm->free_area_cache = addr); | ||
130 | } | ||
131 | |||
132 | /* remember the largest hole we saw so far */ | ||
133 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
134 | mm->cached_hole_size = vma->vm_start - addr; | ||
135 | |||
136 | /* try just below the current vma->vm_start */ | ||
137 | addr = (vma->vm_start-len) & HPAGE_MASK; | ||
138 | } while (likely(len < vma->vm_start)); | ||
139 | 77 | ||
140 | bottomup: | ||
141 | /* | 78 | /* |
142 | * A failed mmap() very likely causes application failure, | 79 | * A failed mmap() very likely causes application failure, |
143 | * so fall back to the bottom-up function here. This scenario | 80 | * so fall back to the bottom-up function here. This scenario |
144 | * can happen with large stack limits and large mmap() | 81 | * can happen with large stack limits and large mmap() |
145 | * allocations. | 82 | * allocations. |
146 | */ | 83 | */ |
147 | mm->cached_hole_size = ~0UL; | 84 | if (addr & ~PAGE_MASK) { |
148 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 85 | VM_BUG_ON(addr != -ENOMEM); |
149 | addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); | 86 | info.flags = 0; |
150 | /* | 87 | info.low_limit = TASK_UNMAPPED_BASE; |
151 | * Restore the topdown base: | 88 | info.high_limit = STACK_TOP32; |
152 | */ | 89 | addr = vm_unmapped_area(&info); |
153 | mm->free_area_cache = mm->mmap_base; | 90 | } |
154 | mm->cached_hole_size = ~0UL; | ||
155 | 91 | ||
156 | return addr; | 92 | return addr; |
157 | } | 93 | } |
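In the hugetlb paths above the alignment request is built as align_mask = PAGE_MASK & ~HPAGE_MASK, i.e. exactly the bits between the base page size and the huge page size must match align_offset (zero here). A small illustration of that arithmetic, with 4 KiB base pages and 2 MB huge pages assumed purely for readability (sparc64 and tile use different shifts, but the construction is identical):

#include <stdio.h>

int main(void)
{
        unsigned long page_mask  = ~0xfffUL;     /* PAGE_MASK,  4 KiB base pages assumed */
        unsigned long hpage_mask = ~0x1fffffUL;  /* HPAGE_MASK, 2 MB huge pages assumed  */

        /* bits 12..20 must be zero in the returned address: 2 MB alignment */
        printf("align_mask = %#lx\n", page_mask & ~hpage_mask);  /* prints 0x1ff000 */
        return 0;
}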
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index 812e2d037972..650ccff8378c 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -231,42 +231,15 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
231 | unsigned long pgoff, unsigned long flags) | 231 | unsigned long pgoff, unsigned long flags) |
232 | { | 232 | { |
233 | struct hstate *h = hstate_file(file); | 233 | struct hstate *h = hstate_file(file); |
234 | struct mm_struct *mm = current->mm; | 234 | struct vm_unmapped_area_info info; |
235 | struct vm_area_struct *vma; | 235 | |
236 | unsigned long start_addr; | 236 | info.flags = 0; |
237 | 237 | info.length = len; | |
238 | if (len > mm->cached_hole_size) { | 238 | info.low_limit = TASK_UNMAPPED_BASE; |
239 | start_addr = mm->free_area_cache; | 239 | info.high_limit = TASK_SIZE; |
240 | } else { | 240 | info.align_mask = PAGE_MASK & ~huge_page_mask(h); |
241 | start_addr = TASK_UNMAPPED_BASE; | 241 | info.align_offset = 0; |
242 | mm->cached_hole_size = 0; | 242 | return vm_unmapped_area(&info); |
243 | } | ||
244 | |||
245 | full_search: | ||
246 | addr = ALIGN(start_addr, huge_page_size(h)); | ||
247 | |||
248 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
249 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
250 | if (TASK_SIZE - len < addr) { | ||
251 | /* | ||
252 | * Start a new search - just in case we missed | ||
253 | * some holes. | ||
254 | */ | ||
255 | if (start_addr != TASK_UNMAPPED_BASE) { | ||
256 | start_addr = TASK_UNMAPPED_BASE; | ||
257 | mm->cached_hole_size = 0; | ||
258 | goto full_search; | ||
259 | } | ||
260 | return -ENOMEM; | ||
261 | } | ||
262 | if (!vma || addr + len <= vma->vm_start) { | ||
263 | mm->free_area_cache = addr + len; | ||
264 | return addr; | ||
265 | } | ||
266 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
267 | mm->cached_hole_size = vma->vm_start - addr; | ||
268 | addr = ALIGN(vma->vm_end, huge_page_size(h)); | ||
269 | } | ||
270 | } | 243 | } |
271 | 244 | ||
272 | static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, | 245 | static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, |
@@ -274,92 +247,30 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
274 | unsigned long pgoff, unsigned long flags) | 247 | unsigned long pgoff, unsigned long flags) |
275 | { | 248 | { |
276 | struct hstate *h = hstate_file(file); | 249 | struct hstate *h = hstate_file(file); |
277 | struct mm_struct *mm = current->mm; | 250 | struct vm_unmapped_area_info info; |
278 | struct vm_area_struct *vma, *prev_vma; | 251 | unsigned long addr; |
279 | unsigned long base = mm->mmap_base, addr = addr0; | ||
280 | unsigned long largest_hole = mm->cached_hole_size; | ||
281 | int first_time = 1; | ||
282 | |||
283 | /* don't allow allocations above current base */ | ||
284 | if (mm->free_area_cache > base) | ||
285 | mm->free_area_cache = base; | ||
286 | |||
287 | if (len <= largest_hole) { | ||
288 | largest_hole = 0; | ||
289 | mm->free_area_cache = base; | ||
290 | } | ||
291 | try_again: | ||
292 | /* make sure it can fit in the remaining address space */ | ||
293 | if (mm->free_area_cache < len) | ||
294 | goto fail; | ||
295 | |||
296 | /* either no address requested or can't fit in requested address hole */ | ||
297 | addr = (mm->free_area_cache - len) & huge_page_mask(h); | ||
298 | do { | ||
299 | /* | ||
300 | * Lookup failure means no vma is above this address, | ||
301 | * i.e. return with success: | ||
302 | */ | ||
303 | vma = find_vma_prev(mm, addr, &prev_vma); | ||
304 | if (!vma) { | ||
305 | return addr; | ||
306 | break; | ||
307 | } | ||
308 | |||
309 | /* | ||
310 | * new region fits between prev_vma->vm_end and | ||
311 | * vma->vm_start, use it: | ||
312 | */ | ||
313 | if (addr + len <= vma->vm_start && | ||
314 | (!prev_vma || (addr >= prev_vma->vm_end))) { | ||
315 | /* remember the address as a hint for next time */ | ||
316 | mm->cached_hole_size = largest_hole; | ||
317 | mm->free_area_cache = addr; | ||
318 | return addr; | ||
319 | } else { | ||
320 | /* pull free_area_cache down to the first hole */ | ||
321 | if (mm->free_area_cache == vma->vm_end) { | ||
322 | mm->free_area_cache = vma->vm_start; | ||
323 | mm->cached_hole_size = largest_hole; | ||
324 | } | ||
325 | } | ||
326 | 252 | ||
327 | /* remember the largest hole we saw so far */ | 253 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
328 | if (addr + largest_hole < vma->vm_start) | 254 | info.length = len; |
329 | largest_hole = vma->vm_start - addr; | 255 | info.low_limit = PAGE_SIZE; |
256 | info.high_limit = current->mm->mmap_base; | ||
257 | info.align_mask = PAGE_MASK & ~huge_page_mask(h); | ||
258 | info.align_offset = 0; | ||
259 | addr = vm_unmapped_area(&info); | ||
330 | 260 | ||
331 | /* try just below the current vma->vm_start */ | ||
332 | addr = (vma->vm_start - len) & huge_page_mask(h); | ||
333 | |||
334 | } while (len <= vma->vm_start); | ||
335 | |||
336 | fail: | ||
337 | /* | ||
338 | * if hint left us with no space for the requested | ||
339 | * mapping then try again: | ||
340 | */ | ||
341 | if (first_time) { | ||
342 | mm->free_area_cache = base; | ||
343 | largest_hole = 0; | ||
344 | first_time = 0; | ||
345 | goto try_again; | ||
346 | } | ||
347 | /* | 261 | /* |
348 | * A failed mmap() very likely causes application failure, | 262 | * A failed mmap() very likely causes application failure, |
349 | * so fall back to the bottom-up function here. This scenario | 263 | * so fall back to the bottom-up function here. This scenario |
350 | * can happen with large stack limits and large mmap() | 264 | * can happen with large stack limits and large mmap() |
351 | * allocations. | 265 | * allocations. |
352 | */ | 266 | */ |
353 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 267 | if (addr & ~PAGE_MASK) { |
354 | mm->cached_hole_size = ~0UL; | 268 | VM_BUG_ON(addr != -ENOMEM); |
355 | addr = hugetlb_get_unmapped_area_bottomup(file, addr0, | 269 | info.flags = 0; |
356 | len, pgoff, flags); | 270 | info.low_limit = TASK_UNMAPPED_BASE; |
357 | 271 | info.high_limit = TASK_SIZE; | |
358 | /* | 272 | addr = vm_unmapped_area(&info); |
359 | * Restore the topdown base: | 273 | } |
360 | */ | ||
361 | mm->free_area_cache = base; | ||
362 | mm->cached_hole_size = ~0UL; | ||
363 | 274 | ||
364 | return addr; | 275 | return addr; |
365 | } | 276 | } |
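
The conversion above funnels the topdown search through vm_unmapped_area() and keeps the bottom-up fallback. Its failure test relies on vm_unmapped_area() reporting errors as negative errno values, which are never page aligned, so addr & ~PAGE_MASK is enough to detect -ENOMEM. A minimal standalone sketch of that point (plain userspace C, not part of the patch; a 4 KiB PAGE_SIZE is assumed):

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))
    #define ENOMEM     12

    int main(void)
    {
        unsigned long ok   = 0x700000000000UL;       /* a page-aligned mapping address */
        unsigned long fail = (unsigned long)-ENOMEM; /* error return, cast as the kernel does */

        printf("ok   & ~PAGE_MASK = %#lx\n", ok & ~PAGE_MASK);   /* 0: valid address */
        printf("fail & ~PAGE_MASK = %#lx\n", fail & ~PAGE_MASK); /* 0xff4: error detected */
        return 0;
    }
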
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 5939f44fe0c0..9c999c1674fa 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h | |||
@@ -354,12 +354,10 @@ static inline int mmap_is_ia32(void) | |||
354 | return 0; | 354 | return 0; |
355 | } | 355 | } |
356 | 356 | ||
357 | /* The first two values are special, do not change. See align_addr() */ | 357 | /* Do not change the values. See get_align_mask() */ |
358 | enum align_flags { | 358 | enum align_flags { |
359 | ALIGN_VA_32 = BIT(0), | 359 | ALIGN_VA_32 = BIT(0), |
360 | ALIGN_VA_64 = BIT(1), | 360 | ALIGN_VA_64 = BIT(1), |
361 | ALIGN_VDSO = BIT(2), | ||
362 | ALIGN_TOPDOWN = BIT(3), | ||
363 | }; | 361 | }; |
364 | 362 | ||
365 | struct va_alignment { | 363 | struct va_alignment { |
@@ -368,5 +366,5 @@ struct va_alignment { | |||
368 | } ____cacheline_aligned; | 366 | } ____cacheline_aligned; |
369 | 367 | ||
370 | extern struct va_alignment va_align; | 368 | extern struct va_alignment va_align; |
371 | extern unsigned long align_addr(unsigned long, struct file *, enum align_flags); | 369 | extern unsigned long align_vdso_addr(unsigned long); |
372 | #endif /* _ASM_X86_ELF_H */ | 370 | #endif /* _ASM_X86_ELF_H */ |
diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h index 593e51d4643f..513b05f15bb4 100644 --- a/arch/x86/include/asm/mman.h +++ b/arch/x86/include/asm/mman.h | |||
@@ -3,6 +3,9 @@ | |||
3 | 3 | ||
4 | #define MAP_32BIT 0x40 /* only give out 32bit addresses */ | 4 | #define MAP_32BIT 0x40 /* only give out 32bit addresses */ |
5 | 5 | ||
6 | #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) | ||
7 | #define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) | ||
8 | |||
6 | #include <asm-generic/mman.h> | 9 | #include <asm-generic/mman.h> |
7 | 10 | ||
8 | #endif /* _ASM_X86_MMAN_H */ | 11 | #endif /* _ASM_X86_MMAN_H */ |
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index b4d3c3927dd8..97ef74b88e0f 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c | |||
@@ -21,37 +21,23 @@ | |||
21 | 21 | ||
22 | /* | 22 | /* |
23 | * Align a virtual address to avoid aliasing in the I$ on AMD F15h. | 23 | * Align a virtual address to avoid aliasing in the I$ on AMD F15h. |
24 | * | ||
25 | * @flags denotes the allocation direction - bottomup or topdown - | ||
26 | * or vDSO; see call sites below. | ||
27 | */ | 24 | */ |
28 | unsigned long align_addr(unsigned long addr, struct file *filp, | 25 | static unsigned long get_align_mask(void) |
29 | enum align_flags flags) | ||
30 | { | 26 | { |
31 | unsigned long tmp_addr; | ||
32 | |||
33 | /* handle 32- and 64-bit case with a single conditional */ | 27 | /* handle 32- and 64-bit case with a single conditional */ |
34 | if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) | 28 | if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) |
35 | return addr; | 29 | return 0; |
36 | 30 | ||
37 | if (!(current->flags & PF_RANDOMIZE)) | 31 | if (!(current->flags & PF_RANDOMIZE)) |
38 | return addr; | 32 | return 0; |
39 | |||
40 | if (!((flags & ALIGN_VDSO) || filp)) | ||
41 | return addr; | ||
42 | |||
43 | tmp_addr = addr; | ||
44 | |||
45 | /* | ||
46 | * We need an address which is <= than the original | ||
47 | * one only when in topdown direction. | ||
48 | */ | ||
49 | if (!(flags & ALIGN_TOPDOWN)) | ||
50 | tmp_addr += va_align.mask; | ||
51 | 33 | ||
52 | tmp_addr &= ~va_align.mask; | 34 | return va_align.mask; |
35 | } | ||
53 | 36 | ||
54 | return tmp_addr; | 37 | unsigned long align_vdso_addr(unsigned long addr) |
38 | { | ||
39 | unsigned long align_mask = get_align_mask(); | ||
40 | return (addr + align_mask) & ~align_mask; | ||
55 | } | 41 | } |
56 | 42 | ||
57 | static int __init control_va_addr_alignment(char *str) | 43 | static int __init control_va_addr_alignment(char *str) |
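
get_align_mask() keeps the "handle 32- and 64-bit case with a single conditional" trick: mmap_is_ia32() is 0 or 1, so 2 - mmap_is_ia32() selects ALIGN_VA_64 for 64-bit tasks and ALIGN_VA_32 for 32-bit ones. A small standalone illustration (userspace C, not part of the patch):

    #include <stdio.h>

    #define ALIGN_VA_32 (1U << 0)
    #define ALIGN_VA_64 (1U << 1)

    int main(void)
    {
        for (int is_ia32 = 0; is_ia32 <= 1; is_ia32++) {
            /* 2 - 0 selects ALIGN_VA_64 for 64-bit, 2 - 1 selects ALIGN_VA_32 for 32-bit */
            unsigned int bit = 2 - is_ia32;
            printf("ia32=%d tests flag %#x (%s)\n", is_ia32, bit,
                   bit == ALIGN_VA_64 ? "ALIGN_VA_64" : "ALIGN_VA_32");
        }
        return 0;
    }
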
@@ -126,7 +112,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
126 | { | 112 | { |
127 | struct mm_struct *mm = current->mm; | 113 | struct mm_struct *mm = current->mm; |
128 | struct vm_area_struct *vma; | 114 | struct vm_area_struct *vma; |
129 | unsigned long start_addr; | 115 | struct vm_unmapped_area_info info; |
130 | unsigned long begin, end; | 116 | unsigned long begin, end; |
131 | 117 | ||
132 | if (flags & MAP_FIXED) | 118 | if (flags & MAP_FIXED) |
@@ -144,50 +130,16 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
144 | (!vma || addr + len <= vma->vm_start)) | 130 | (!vma || addr + len <= vma->vm_start)) |
145 | return addr; | 131 | return addr; |
146 | } | 132 | } |
147 | if (((flags & MAP_32BIT) || test_thread_flag(TIF_ADDR32)) | ||
148 | && len <= mm->cached_hole_size) { | ||
149 | mm->cached_hole_size = 0; | ||
150 | mm->free_area_cache = begin; | ||
151 | } | ||
152 | addr = mm->free_area_cache; | ||
153 | if (addr < begin) | ||
154 | addr = begin; | ||
155 | start_addr = addr; | ||
156 | |||
157 | full_search: | ||
158 | |||
159 | addr = align_addr(addr, filp, 0); | ||
160 | |||
161 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
162 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
163 | if (end - len < addr) { | ||
164 | /* | ||
165 | * Start a new search - just in case we missed | ||
166 | * some holes. | ||
167 | */ | ||
168 | if (start_addr != begin) { | ||
169 | start_addr = addr = begin; | ||
170 | mm->cached_hole_size = 0; | ||
171 | goto full_search; | ||
172 | } | ||
173 | return -ENOMEM; | ||
174 | } | ||
175 | if (!vma || addr + len <= vma->vm_start) { | ||
176 | /* | ||
177 | * Remember the place where we stopped the search: | ||
178 | */ | ||
179 | mm->free_area_cache = addr + len; | ||
180 | return addr; | ||
181 | } | ||
182 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
183 | mm->cached_hole_size = vma->vm_start - addr; | ||
184 | 133 | ||
185 | addr = vma->vm_end; | 134 | info.flags = 0; |
186 | addr = align_addr(addr, filp, 0); | 135 | info.length = len; |
187 | } | 136 | info.low_limit = begin; |
137 | info.high_limit = end; | ||
138 | info.align_mask = filp ? get_align_mask() : 0; | ||
139 | info.align_offset = pgoff << PAGE_SHIFT; | ||
140 | return vm_unmapped_area(&info); | ||
188 | } | 141 | } |
189 | 142 | ||
190 | |||
191 | unsigned long | 143 | unsigned long |
192 | arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | 144 | arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, |
193 | const unsigned long len, const unsigned long pgoff, | 145 | const unsigned long len, const unsigned long pgoff, |
@@ -195,7 +147,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
195 | { | 147 | { |
196 | struct vm_area_struct *vma; | 148 | struct vm_area_struct *vma; |
197 | struct mm_struct *mm = current->mm; | 149 | struct mm_struct *mm = current->mm; |
198 | unsigned long addr = addr0, start_addr; | 150 | unsigned long addr = addr0; |
151 | struct vm_unmapped_area_info info; | ||
199 | 152 | ||
200 | /* requested length too big for entire address space */ | 153 | /* requested length too big for entire address space */ |
201 | if (len > TASK_SIZE) | 154 | if (len > TASK_SIZE) |
@@ -217,51 +170,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
217 | return addr; | 170 | return addr; |
218 | } | 171 | } |
219 | 172 | ||
220 | /* check if free_area_cache is useful for us */ | 173 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
221 | if (len <= mm->cached_hole_size) { | 174 | info.length = len; |
222 | mm->cached_hole_size = 0; | 175 | info.low_limit = PAGE_SIZE; |
223 | mm->free_area_cache = mm->mmap_base; | 176 | info.high_limit = mm->mmap_base; |
224 | } | 177 | info.align_mask = filp ? get_align_mask() : 0; |
225 | 178 | info.align_offset = pgoff << PAGE_SHIFT; | |
226 | try_again: | 179 | addr = vm_unmapped_area(&info); |
227 | /* either no address requested or can't fit in requested address hole */ | 180 | if (!(addr & ~PAGE_MASK)) |
228 | start_addr = addr = mm->free_area_cache; | 181 | return addr; |
229 | 182 | VM_BUG_ON(addr != -ENOMEM); | |
230 | if (addr < len) | ||
231 | goto fail; | ||
232 | |||
233 | addr -= len; | ||
234 | do { | ||
235 | addr = align_addr(addr, filp, ALIGN_TOPDOWN); | ||
236 | |||
237 | /* | ||
238 | * Lookup failure means no vma is above this address, | ||
239 | * else if new region fits below vma->vm_start, | ||
240 | * return with success: | ||
241 | */ | ||
242 | vma = find_vma(mm, addr); | ||
243 | if (!vma || addr+len <= vma->vm_start) | ||
244 | /* remember the address as a hint for next time */ | ||
245 | return mm->free_area_cache = addr; | ||
246 | |||
247 | /* remember the largest hole we saw so far */ | ||
248 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
249 | mm->cached_hole_size = vma->vm_start - addr; | ||
250 | |||
251 | /* try just below the current vma->vm_start */ | ||
252 | addr = vma->vm_start-len; | ||
253 | } while (len < vma->vm_start); | ||
254 | |||
255 | fail: | ||
256 | /* | ||
257 | * if hint left us with no space for the requested | ||
258 | * mapping then try again: | ||
259 | */ | ||
260 | if (start_addr != mm->mmap_base) { | ||
261 | mm->free_area_cache = mm->mmap_base; | ||
262 | mm->cached_hole_size = 0; | ||
263 | goto try_again; | ||
264 | } | ||
265 | 183 | ||
266 | bottomup: | 184 | bottomup: |
267 | /* | 185 | /* |
@@ -270,14 +188,5 @@ bottomup: | |||
270 | * can happen with large stack limits and large mmap() | 188 | * can happen with large stack limits and large mmap() |
271 | * allocations. | 189 | * allocations. |
272 | */ | 190 | */ |
273 | mm->cached_hole_size = ~0UL; | 191 | return arch_get_unmapped_area(filp, addr0, len, pgoff, flags); |
274 | mm->free_area_cache = TASK_UNMAPPED_BASE; | ||
275 | addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); | ||
276 | /* | ||
277 | * Restore the topdown base: | ||
278 | */ | ||
279 | mm->free_area_cache = mm->mmap_base; | ||
280 | mm->cached_hole_size = ~0UL; | ||
281 | |||
282 | return addr; | ||
283 | } | 192 | } |
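
Both allocators above now reduce to filling a vm_unmapped_area_info and calling vm_unmapped_area(), which replaces the per-arch free_area_cache bookkeeping. A minimal sketch of the calling pattern, mirroring the fields used in these hunks (illustrative only, not compilable on its own; example_get_unmapped_area is a hypothetical caller, and the structure is the one introduced elsewhere in this series):

    /* Sketch of the common conversion pattern; not part of the patch. */
    static unsigned long example_get_unmapped_area(struct file *filp, unsigned long len,
                                                   unsigned long pgoff)
    {
        struct vm_unmapped_area_info info;

        info.flags = 0;                                 /* or VM_UNMAPPED_AREA_TOPDOWN */
        info.length = len;                              /* size of the gap we need */
        info.low_limit = TASK_UNMAPPED_BASE;            /* lowest acceptable address */
        info.high_limit = TASK_SIZE;                    /* search no higher than this */
        info.align_mask = filp ? get_align_mask() : 0;  /* address bits forced by alignment */
        info.align_offset = pgoff << PAGE_SHIFT;        /* keep the file-offset colouring */

        return vm_unmapped_area(&info);                 /* aligned address or -ENOMEM */
    }
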
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 937bff5cdaa7..ae1aa71d0115 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c | |||
@@ -274,42 +274,15 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, | |||
274 | unsigned long pgoff, unsigned long flags) | 274 | unsigned long pgoff, unsigned long flags) |
275 | { | 275 | { |
276 | struct hstate *h = hstate_file(file); | 276 | struct hstate *h = hstate_file(file); |
277 | struct mm_struct *mm = current->mm; | 277 | struct vm_unmapped_area_info info; |
278 | struct vm_area_struct *vma; | 278 | |
279 | unsigned long start_addr; | 279 | info.flags = 0; |
280 | 280 | info.length = len; | |
281 | if (len > mm->cached_hole_size) { | 281 | info.low_limit = TASK_UNMAPPED_BASE; |
282 | start_addr = mm->free_area_cache; | 282 | info.high_limit = TASK_SIZE; |
283 | } else { | 283 | info.align_mask = PAGE_MASK & ~huge_page_mask(h); |
284 | start_addr = TASK_UNMAPPED_BASE; | 284 | info.align_offset = 0; |
285 | mm->cached_hole_size = 0; | 285 | return vm_unmapped_area(&info); |
286 | } | ||
287 | |||
288 | full_search: | ||
289 | addr = ALIGN(start_addr, huge_page_size(h)); | ||
290 | |||
291 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
292 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
293 | if (TASK_SIZE - len < addr) { | ||
294 | /* | ||
295 | * Start a new search - just in case we missed | ||
296 | * some holes. | ||
297 | */ | ||
298 | if (start_addr != TASK_UNMAPPED_BASE) { | ||
299 | start_addr = TASK_UNMAPPED_BASE; | ||
300 | mm->cached_hole_size = 0; | ||
301 | goto full_search; | ||
302 | } | ||
303 | return -ENOMEM; | ||
304 | } | ||
305 | if (!vma || addr + len <= vma->vm_start) { | ||
306 | mm->free_area_cache = addr + len; | ||
307 | return addr; | ||
308 | } | ||
309 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
310 | mm->cached_hole_size = vma->vm_start - addr; | ||
311 | addr = ALIGN(vma->vm_end, huge_page_size(h)); | ||
312 | } | ||
313 | } | 286 | } |
314 | 287 | ||
315 | static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, | 288 | static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, |
@@ -317,83 +290,30 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, | |||
317 | unsigned long pgoff, unsigned long flags) | 290 | unsigned long pgoff, unsigned long flags) |
318 | { | 291 | { |
319 | struct hstate *h = hstate_file(file); | 292 | struct hstate *h = hstate_file(file); |
320 | struct mm_struct *mm = current->mm; | 293 | struct vm_unmapped_area_info info; |
321 | struct vm_area_struct *vma; | 294 | unsigned long addr; |
322 | unsigned long base = mm->mmap_base; | ||
323 | unsigned long addr = addr0; | ||
324 | unsigned long largest_hole = mm->cached_hole_size; | ||
325 | unsigned long start_addr; | ||
326 | |||
327 | /* don't allow allocations above current base */ | ||
328 | if (mm->free_area_cache > base) | ||
329 | mm->free_area_cache = base; | ||
330 | |||
331 | if (len <= largest_hole) { | ||
332 | largest_hole = 0; | ||
333 | mm->free_area_cache = base; | ||
334 | } | ||
335 | try_again: | ||
336 | start_addr = mm->free_area_cache; | ||
337 | |||
338 | /* make sure it can fit in the remaining address space */ | ||
339 | if (mm->free_area_cache < len) | ||
340 | goto fail; | ||
341 | |||
342 | /* either no address requested or can't fit in requested address hole */ | ||
343 | addr = (mm->free_area_cache - len) & huge_page_mask(h); | ||
344 | do { | ||
345 | /* | ||
346 | * Lookup failure means no vma is above this address, | ||
347 | * i.e. return with success: | ||
348 | */ | ||
349 | vma = find_vma(mm, addr); | ||
350 | if (!vma) | ||
351 | return addr; | ||
352 | 295 | ||
353 | if (addr + len <= vma->vm_start) { | 296 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
354 | /* remember the address as a hint for next time */ | 297 | info.length = len; |
355 | mm->cached_hole_size = largest_hole; | 298 | info.low_limit = PAGE_SIZE; |
356 | return (mm->free_area_cache = addr); | 299 | info.high_limit = current->mm->mmap_base; |
357 | } else if (mm->free_area_cache == vma->vm_end) { | 300 | info.align_mask = PAGE_MASK & ~huge_page_mask(h); |
358 | /* pull free_area_cache down to the first hole */ | 301 | info.align_offset = 0; |
359 | mm->free_area_cache = vma->vm_start; | 302 | addr = vm_unmapped_area(&info); |
360 | mm->cached_hole_size = largest_hole; | ||
361 | } | ||
362 | 303 | ||
363 | /* remember the largest hole we saw so far */ | ||
364 | if (addr + largest_hole < vma->vm_start) | ||
365 | largest_hole = vma->vm_start - addr; | ||
366 | |||
367 | /* try just below the current vma->vm_start */ | ||
368 | addr = (vma->vm_start - len) & huge_page_mask(h); | ||
369 | } while (len <= vma->vm_start); | ||
370 | |||
371 | fail: | ||
372 | /* | ||
373 | * if hint left us with no space for the requested | ||
374 | * mapping then try again: | ||
375 | */ | ||
376 | if (start_addr != base) { | ||
377 | mm->free_area_cache = base; | ||
378 | largest_hole = 0; | ||
379 | goto try_again; | ||
380 | } | ||
381 | /* | 304 | /* |
382 | * A failed mmap() very likely causes application failure, | 305 | * A failed mmap() very likely causes application failure, |
383 | * so fall back to the bottom-up function here. This scenario | 306 | * so fall back to the bottom-up function here. This scenario |
384 | * can happen with large stack limits and large mmap() | 307 | * can happen with large stack limits and large mmap() |
385 | * allocations. | 308 | * allocations. |
386 | */ | 309 | */ |
387 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 310 | if (addr & ~PAGE_MASK) { |
388 | mm->cached_hole_size = ~0UL; | 311 | VM_BUG_ON(addr != -ENOMEM); |
389 | addr = hugetlb_get_unmapped_area_bottomup(file, addr0, | 312 | info.flags = 0; |
390 | len, pgoff, flags); | 313 | info.low_limit = TASK_UNMAPPED_BASE; |
391 | 314 | info.high_limit = TASK_SIZE; | |
392 | /* | 315 | addr = vm_unmapped_area(&info); |
393 | * Restore the topdown base: | 316 | } |
394 | */ | ||
395 | mm->free_area_cache = base; | ||
396 | mm->cached_hole_size = ~0UL; | ||
397 | 317 | ||
398 | return addr; | 318 | return addr; |
399 | } | 319 | } |
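
For the hugetlb paths the only extra ingredient is the alignment mask, PAGE_MASK & ~huge_page_mask(h): it marks the address bits between the base-page and huge-page boundaries, which vm_unmapped_area() must force to align_offset (zero here). A standalone worked example for 2 MiB huge pages on 4 KiB base pages (userspace C, not part of the patch):

    #include <stdio.h>

    int main(void)
    {
        unsigned long page_mask = ~((1UL << 12) - 1); /* 4 KiB base pages */
        unsigned long huge_mask = ~((1UL << 21) - 1); /* 2 MiB huge pages */

        /* Same expression as info.align_mask above: bits 12..20 must be
         * clear in the returned address, i.e. it must be 2 MiB aligned. */
        unsigned long align_mask = page_mask & ~huge_mask;

        printf("align_mask = %#lx\n", align_mask); /* prints 0x1ff000 */
        return 0;
    }
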
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 00aaf047b39f..431e87544411 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c | |||
@@ -141,7 +141,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) | |||
141 | * unaligned here as a result of stack start randomization. | 141 | * unaligned here as a result of stack start randomization. |
142 | */ | 142 | */ |
143 | addr = PAGE_ALIGN(addr); | 143 | addr = PAGE_ALIGN(addr); |
144 | addr = align_addr(addr, NULL, ALIGN_VDSO); | 144 | addr = align_vdso_addr(addr); |
145 | 145 | ||
146 | return addr; | 146 | return addr; |
147 | } | 147 | } |
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 25bc6c1309c3..00eed6786d7e 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h | |||
@@ -93,4 +93,15 @@ | |||
93 | /* compatibility flags */ | 93 | /* compatibility flags */ |
94 | #define MAP_FILE 0 | 94 | #define MAP_FILE 0 |
95 | 95 | ||
96 | /* | ||
97 | * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. | ||
98 | * This gives us 6 bits, which is enough until someone invents 128 bit address | ||
99 | * spaces. | ||
100 | * | ||
101 | * Assume these are all powers of two. | ||

102 | * When 0 use the default page size. | ||
103 | */ | ||
104 | #define MAP_HUGE_SHIFT 26 | ||
105 | #define MAP_HUGE_MASK 0x3f | ||
106 | |||
96 | #endif /* _XTENSA_MMAN_H */ | 107 | #endif /* _XTENSA_MMAN_H */ |
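
The MAP_HUGE_* plumbing added here (and in the x86 mman.h hunk above) encodes log2 of the requested huge page size in bits [26:31] of the mmap() flags word. A standalone illustration of the encode/decode arithmetic (userspace C, not part of the patch):

    #include <stdio.h>

    #define MAP_HUGE_SHIFT 26
    #define MAP_HUGE_MASK  0x3f

    int main(void)
    {
        unsigned int map_huge_2mb = 21 << MAP_HUGE_SHIFT; /* 2 MiB = 2^21 bytes */
        unsigned int map_huge_1gb = 30 << MAP_HUGE_SHIFT; /* 1 GiB = 2^30 bytes */

        /* Decoding recovers the page-size exponent from the flags word. */
        printf("2MB flags %#x -> 2^%u\n", map_huge_2mb,
               (map_huge_2mb >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
        printf("1GB flags %#x -> 2^%u\n", map_huge_1gb,
               (map_huge_1gb >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
        return 0;
    }
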
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 86c88216a503..987604d56c83 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c | |||
@@ -70,6 +70,13 @@ void unregister_memory_isolate_notifier(struct notifier_block *nb) | |||
70 | } | 70 | } |
71 | EXPORT_SYMBOL(unregister_memory_isolate_notifier); | 71 | EXPORT_SYMBOL(unregister_memory_isolate_notifier); |
72 | 72 | ||
73 | static void memory_block_release(struct device *dev) | ||
74 | { | ||
75 | struct memory_block *mem = container_of(dev, struct memory_block, dev); | ||
76 | |||
77 | kfree(mem); | ||
78 | } | ||
79 | |||
73 | /* | 80 | /* |
74 | * register_memory - Setup a sysfs device for a memory block | 81 | * register_memory - Setup a sysfs device for a memory block |
75 | */ | 82 | */ |
@@ -80,6 +87,7 @@ int register_memory(struct memory_block *memory) | |||
80 | 87 | ||
81 | memory->dev.bus = &memory_subsys; | 88 | memory->dev.bus = &memory_subsys; |
82 | memory->dev.id = memory->start_section_nr / sections_per_block; | 89 | memory->dev.id = memory->start_section_nr / sections_per_block; |
90 | memory->dev.release = memory_block_release; | ||
83 | 91 | ||
84 | error = device_register(&memory->dev); | 92 | error = device_register(&memory->dev); |
85 | return error; | 93 | return error; |
@@ -246,7 +254,7 @@ static bool pages_correctly_reserved(unsigned long start_pfn, | |||
246 | * OK to have direct references to sparsemem variables in here. | 254 | * OK to have direct references to sparsemem variables in here. |
247 | */ | 255 | */ |
248 | static int | 256 | static int |
249 | memory_block_action(unsigned long phys_index, unsigned long action) | 257 | memory_block_action(unsigned long phys_index, unsigned long action, int online_type) |
250 | { | 258 | { |
251 | unsigned long start_pfn; | 259 | unsigned long start_pfn; |
252 | unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; | 260 | unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; |
@@ -261,7 +269,7 @@ memory_block_action(unsigned long phys_index, unsigned long action) | |||
261 | if (!pages_correctly_reserved(start_pfn, nr_pages)) | 269 | if (!pages_correctly_reserved(start_pfn, nr_pages)) |
262 | return -EBUSY; | 270 | return -EBUSY; |
263 | 271 | ||
264 | ret = online_pages(start_pfn, nr_pages); | 272 | ret = online_pages(start_pfn, nr_pages, online_type); |
265 | break; | 273 | break; |
266 | case MEM_OFFLINE: | 274 | case MEM_OFFLINE: |
267 | ret = offline_pages(start_pfn, nr_pages); | 275 | ret = offline_pages(start_pfn, nr_pages); |
@@ -276,7 +284,8 @@ memory_block_action(unsigned long phys_index, unsigned long action) | |||
276 | } | 284 | } |
277 | 285 | ||
278 | static int __memory_block_change_state(struct memory_block *mem, | 286 | static int __memory_block_change_state(struct memory_block *mem, |
279 | unsigned long to_state, unsigned long from_state_req) | 287 | unsigned long to_state, unsigned long from_state_req, |
288 | int online_type) | ||
280 | { | 289 | { |
281 | int ret = 0; | 290 | int ret = 0; |
282 | 291 | ||
@@ -288,7 +297,7 @@ static int __memory_block_change_state(struct memory_block *mem, | |||
288 | if (to_state == MEM_OFFLINE) | 297 | if (to_state == MEM_OFFLINE) |
289 | mem->state = MEM_GOING_OFFLINE; | 298 | mem->state = MEM_GOING_OFFLINE; |
290 | 299 | ||
291 | ret = memory_block_action(mem->start_section_nr, to_state); | 300 | ret = memory_block_action(mem->start_section_nr, to_state, online_type); |
292 | 301 | ||
293 | if (ret) { | 302 | if (ret) { |
294 | mem->state = from_state_req; | 303 | mem->state = from_state_req; |
@@ -311,12 +320,14 @@ out: | |||
311 | } | 320 | } |
312 | 321 | ||
313 | static int memory_block_change_state(struct memory_block *mem, | 322 | static int memory_block_change_state(struct memory_block *mem, |
314 | unsigned long to_state, unsigned long from_state_req) | 323 | unsigned long to_state, unsigned long from_state_req, |
324 | int online_type) | ||
315 | { | 325 | { |
316 | int ret; | 326 | int ret; |
317 | 327 | ||
318 | mutex_lock(&mem->state_mutex); | 328 | mutex_lock(&mem->state_mutex); |
319 | ret = __memory_block_change_state(mem, to_state, from_state_req); | 329 | ret = __memory_block_change_state(mem, to_state, from_state_req, |
330 | online_type); | ||
320 | mutex_unlock(&mem->state_mutex); | 331 | mutex_unlock(&mem->state_mutex); |
321 | 332 | ||
322 | return ret; | 333 | return ret; |
@@ -330,10 +341,18 @@ store_mem_state(struct device *dev, | |||
330 | 341 | ||
331 | mem = container_of(dev, struct memory_block, dev); | 342 | mem = container_of(dev, struct memory_block, dev); |
332 | 343 | ||
333 | if (!strncmp(buf, "online", min((int)count, 6))) | 344 | if (!strncmp(buf, "online_kernel", min_t(int, count, 13))) |
334 | ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); | 345 | ret = memory_block_change_state(mem, MEM_ONLINE, |
335 | else if(!strncmp(buf, "offline", min((int)count, 7))) | 346 | MEM_OFFLINE, ONLINE_KERNEL); |
336 | ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); | 347 | else if (!strncmp(buf, "online_movable", min_t(int, count, 14))) |
348 | ret = memory_block_change_state(mem, MEM_ONLINE, | ||
349 | MEM_OFFLINE, ONLINE_MOVABLE); | ||
350 | else if (!strncmp(buf, "online", min_t(int, count, 6))) | ||
351 | ret = memory_block_change_state(mem, MEM_ONLINE, | ||
352 | MEM_OFFLINE, ONLINE_KEEP); | ||
353 | else if(!strncmp(buf, "offline", min_t(int, count, 7))) | ||
354 | ret = memory_block_change_state(mem, MEM_OFFLINE, | ||
355 | MEM_ONLINE, -1); | ||
337 | 356 | ||
338 | if (ret) | 357 | if (ret) |
339 | return ret; | 358 | return ret; |
@@ -635,7 +654,6 @@ int remove_memory_block(unsigned long node_id, struct mem_section *section, | |||
635 | mem_remove_simple_file(mem, phys_device); | 654 | mem_remove_simple_file(mem, phys_device); |
636 | mem_remove_simple_file(mem, removable); | 655 | mem_remove_simple_file(mem, removable); |
637 | unregister_memory(mem); | 656 | unregister_memory(mem); |
638 | kfree(mem); | ||
639 | } else | 657 | } else |
640 | kobject_put(&mem->dev.kobj); | 658 | kobject_put(&mem->dev.kobj); |
641 | 659 | ||
@@ -669,7 +687,7 @@ int offline_memory_block(struct memory_block *mem) | |||
669 | 687 | ||
670 | mutex_lock(&mem->state_mutex); | 688 | mutex_lock(&mem->state_mutex); |
671 | if (mem->state != MEM_OFFLINE) | 689 | if (mem->state != MEM_OFFLINE) |
672 | ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); | 690 | ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE, -1); |
673 | mutex_unlock(&mem->state_mutex); | 691 | mutex_unlock(&mem->state_mutex); |
674 | 692 | ||
675 | return ret; | 693 | return ret; |
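
store_mem_state() now recognizes four commands, and the order of the strncmp() tests matters: "online" is a prefix of "online_kernel" and "online_movable", so the longer commands must be checked first. A small standalone illustration of that parsing order (userspace C with fixed lengths, not the driver code):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        const char *buf = "online_movable\n"; /* what userspace writes to 'state' */

        /* Longer, more specific commands first; a 6-byte compare against
         * "online" would also match "online_kernel" and "online_movable". */
        if (!strncmp(buf, "online_kernel", 13))
            puts("-> ONLINE_KERNEL");
        else if (!strncmp(buf, "online_movable", 14))
            puts("-> ONLINE_MOVABLE");
        else if (!strncmp(buf, "online", 6))
            puts("-> ONLINE_KEEP");
        else if (!strncmp(buf, "offline", 7))
            puts("-> offline");
        return 0;
    }
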
diff --git a/drivers/base/node.c b/drivers/base/node.c index af1a177216f1..294e31626210 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c | |||
@@ -252,6 +252,24 @@ static inline void hugetlb_register_node(struct node *node) {} | |||
252 | static inline void hugetlb_unregister_node(struct node *node) {} | 252 | static inline void hugetlb_unregister_node(struct node *node) {} |
253 | #endif | 253 | #endif |
254 | 254 | ||
255 | static void node_device_release(struct device *dev) | ||
256 | { | ||
257 | struct node *node = to_node(dev); | ||
258 | |||
259 | #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS) | ||
260 | /* | ||
261 | * We schedule the work only when a memory section is | ||
262 | * onlined/offlined on this node. When we come here, | ||
263 | * all the memory on this node has been offlined, | ||
264 | * so no new work will be queued for this node. | ||
265 | * | ||
266 | * The work is using node->node_work, so we should | ||
267 | * flush work before freeing the memory. | ||
268 | */ | ||
269 | flush_work(&node->node_work); | ||
270 | #endif | ||
271 | kfree(node); | ||
272 | } | ||
255 | 273 | ||
256 | /* | 274 | /* |
257 | * register_node - Setup a sysfs device for a node. | 275 | * register_node - Setup a sysfs device for a node. |
@@ -259,12 +277,13 @@ static inline void hugetlb_unregister_node(struct node *node) {} | |||
259 | * | 277 | * |
260 | * Initialize and register the node device. | 278 | * Initialize and register the node device. |
261 | */ | 279 | */ |
262 | int register_node(struct node *node, int num, struct node *parent) | 280 | static int register_node(struct node *node, int num, struct node *parent) |
263 | { | 281 | { |
264 | int error; | 282 | int error; |
265 | 283 | ||
266 | node->dev.id = num; | 284 | node->dev.id = num; |
267 | node->dev.bus = &node_subsys; | 285 | node->dev.bus = &node_subsys; |
286 | node->dev.release = node_device_release; | ||
268 | error = device_register(&node->dev); | 287 | error = device_register(&node->dev); |
269 | 288 | ||
270 | if (!error){ | 289 | if (!error){ |
@@ -306,7 +325,7 @@ void unregister_node(struct node *node) | |||
306 | device_unregister(&node->dev); | 325 | device_unregister(&node->dev); |
307 | } | 326 | } |
308 | 327 | ||
309 | struct node node_devices[MAX_NUMNODES]; | 328 | struct node *node_devices[MAX_NUMNODES]; |
310 | 329 | ||
311 | /* | 330 | /* |
312 | * register cpu under node | 331 | * register cpu under node |
@@ -323,15 +342,15 @@ int register_cpu_under_node(unsigned int cpu, unsigned int nid) | |||
323 | if (!obj) | 342 | if (!obj) |
324 | return 0; | 343 | return 0; |
325 | 344 | ||
326 | ret = sysfs_create_link(&node_devices[nid].dev.kobj, | 345 | ret = sysfs_create_link(&node_devices[nid]->dev.kobj, |
327 | &obj->kobj, | 346 | &obj->kobj, |
328 | kobject_name(&obj->kobj)); | 347 | kobject_name(&obj->kobj)); |
329 | if (ret) | 348 | if (ret) |
330 | return ret; | 349 | return ret; |
331 | 350 | ||
332 | return sysfs_create_link(&obj->kobj, | 351 | return sysfs_create_link(&obj->kobj, |
333 | &node_devices[nid].dev.kobj, | 352 | &node_devices[nid]->dev.kobj, |
334 | kobject_name(&node_devices[nid].dev.kobj)); | 353 | kobject_name(&node_devices[nid]->dev.kobj)); |
335 | } | 354 | } |
336 | 355 | ||
337 | int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) | 356 | int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) |
@@ -345,10 +364,10 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) | |||
345 | if (!obj) | 364 | if (!obj) |
346 | return 0; | 365 | return 0; |
347 | 366 | ||
348 | sysfs_remove_link(&node_devices[nid].dev.kobj, | 367 | sysfs_remove_link(&node_devices[nid]->dev.kobj, |
349 | kobject_name(&obj->kobj)); | 368 | kobject_name(&obj->kobj)); |
350 | sysfs_remove_link(&obj->kobj, | 369 | sysfs_remove_link(&obj->kobj, |
351 | kobject_name(&node_devices[nid].dev.kobj)); | 370 | kobject_name(&node_devices[nid]->dev.kobj)); |
352 | 371 | ||
353 | return 0; | 372 | return 0; |
354 | } | 373 | } |
@@ -390,15 +409,15 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, int nid) | |||
390 | continue; | 409 | continue; |
391 | if (page_nid != nid) | 410 | if (page_nid != nid) |
392 | continue; | 411 | continue; |
393 | ret = sysfs_create_link_nowarn(&node_devices[nid].dev.kobj, | 412 | ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, |
394 | &mem_blk->dev.kobj, | 413 | &mem_blk->dev.kobj, |
395 | kobject_name(&mem_blk->dev.kobj)); | 414 | kobject_name(&mem_blk->dev.kobj)); |
396 | if (ret) | 415 | if (ret) |
397 | return ret; | 416 | return ret; |
398 | 417 | ||
399 | return sysfs_create_link_nowarn(&mem_blk->dev.kobj, | 418 | return sysfs_create_link_nowarn(&mem_blk->dev.kobj, |
400 | &node_devices[nid].dev.kobj, | 419 | &node_devices[nid]->dev.kobj, |
401 | kobject_name(&node_devices[nid].dev.kobj)); | 420 | kobject_name(&node_devices[nid]->dev.kobj)); |
402 | } | 421 | } |
403 | /* mem section does not span the specified node */ | 422 | /* mem section does not span the specified node */ |
404 | return 0; | 423 | return 0; |
@@ -431,10 +450,10 @@ int unregister_mem_sect_under_nodes(struct memory_block *mem_blk, | |||
431 | continue; | 450 | continue; |
432 | if (node_test_and_set(nid, *unlinked_nodes)) | 451 | if (node_test_and_set(nid, *unlinked_nodes)) |
433 | continue; | 452 | continue; |
434 | sysfs_remove_link(&node_devices[nid].dev.kobj, | 453 | sysfs_remove_link(&node_devices[nid]->dev.kobj, |
435 | kobject_name(&mem_blk->dev.kobj)); | 454 | kobject_name(&mem_blk->dev.kobj)); |
436 | sysfs_remove_link(&mem_blk->dev.kobj, | 455 | sysfs_remove_link(&mem_blk->dev.kobj, |
437 | kobject_name(&node_devices[nid].dev.kobj)); | 456 | kobject_name(&node_devices[nid]->dev.kobj)); |
438 | } | 457 | } |
439 | NODEMASK_FREE(unlinked_nodes); | 458 | NODEMASK_FREE(unlinked_nodes); |
440 | return 0; | 459 | return 0; |
@@ -500,7 +519,7 @@ static void node_hugetlb_work(struct work_struct *work) | |||
500 | 519 | ||
501 | static void init_node_hugetlb_work(int nid) | 520 | static void init_node_hugetlb_work(int nid) |
502 | { | 521 | { |
503 | INIT_WORK(&node_devices[nid].node_work, node_hugetlb_work); | 522 | INIT_WORK(&node_devices[nid]->node_work, node_hugetlb_work); |
504 | } | 523 | } |
505 | 524 | ||
506 | static int node_memory_callback(struct notifier_block *self, | 525 | static int node_memory_callback(struct notifier_block *self, |
@@ -517,7 +536,7 @@ static int node_memory_callback(struct notifier_block *self, | |||
517 | * when transitioning to/from memoryless state. | 536 | * when transitioning to/from memoryless state. |
518 | */ | 537 | */ |
519 | if (nid != NUMA_NO_NODE) | 538 | if (nid != NUMA_NO_NODE) |
520 | schedule_work(&node_devices[nid].node_work); | 539 | schedule_work(&node_devices[nid]->node_work); |
521 | break; | 540 | break; |
522 | 541 | ||
523 | case MEM_GOING_ONLINE: | 542 | case MEM_GOING_ONLINE: |
@@ -558,9 +577,13 @@ int register_one_node(int nid) | |||
558 | struct node *parent = NULL; | 577 | struct node *parent = NULL; |
559 | 578 | ||
560 | if (p_node != nid) | 579 | if (p_node != nid) |
561 | parent = &node_devices[p_node]; | 580 | parent = node_devices[p_node]; |
581 | |||
582 | node_devices[nid] = kzalloc(sizeof(struct node), GFP_KERNEL); | ||
583 | if (!node_devices[nid]) | ||
584 | return -ENOMEM; | ||
562 | 585 | ||
563 | error = register_node(&node_devices[nid], nid, parent); | 586 | error = register_node(node_devices[nid], nid, parent); |
564 | 587 | ||
565 | /* link cpu under this node */ | 588 | /* link cpu under this node */ |
566 | for_each_present_cpu(cpu) { | 589 | for_each_present_cpu(cpu) { |
@@ -581,7 +604,8 @@ int register_one_node(int nid) | |||
581 | 604 | ||
582 | void unregister_one_node(int nid) | 605 | void unregister_one_node(int nid) |
583 | { | 606 | { |
584 | unregister_node(&node_devices[nid]); | 607 | unregister_node(node_devices[nid]); |
608 | node_devices[nid] = NULL; | ||
585 | } | 609 | } |
586 | 610 | ||
587 | /* | 611 | /* |
@@ -614,23 +638,23 @@ static ssize_t show_node_state(struct device *dev, | |||
614 | { __ATTR(name, 0444, show_node_state, NULL), state } | 638 | { __ATTR(name, 0444, show_node_state, NULL), state } |
615 | 639 | ||
616 | static struct node_attr node_state_attr[] = { | 640 | static struct node_attr node_state_attr[] = { |
617 | _NODE_ATTR(possible, N_POSSIBLE), | 641 | [N_POSSIBLE] = _NODE_ATTR(possible, N_POSSIBLE), |
618 | _NODE_ATTR(online, N_ONLINE), | 642 | [N_ONLINE] = _NODE_ATTR(online, N_ONLINE), |
619 | _NODE_ATTR(has_normal_memory, N_NORMAL_MEMORY), | 643 | [N_NORMAL_MEMORY] = _NODE_ATTR(has_normal_memory, N_NORMAL_MEMORY), |
620 | _NODE_ATTR(has_cpu, N_CPU), | ||
621 | #ifdef CONFIG_HIGHMEM | 644 | #ifdef CONFIG_HIGHMEM |
622 | _NODE_ATTR(has_high_memory, N_HIGH_MEMORY), | 645 | [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY), |
623 | #endif | 646 | #endif |
647 | [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), | ||
624 | }; | 648 | }; |
625 | 649 | ||
626 | static struct attribute *node_state_attrs[] = { | 650 | static struct attribute *node_state_attrs[] = { |
627 | &node_state_attr[0].attr.attr, | 651 | &node_state_attr[N_POSSIBLE].attr.attr, |
628 | &node_state_attr[1].attr.attr, | 652 | &node_state_attr[N_ONLINE].attr.attr, |
629 | &node_state_attr[2].attr.attr, | 653 | &node_state_attr[N_NORMAL_MEMORY].attr.attr, |
630 | &node_state_attr[3].attr.attr, | ||
631 | #ifdef CONFIG_HIGHMEM | 654 | #ifdef CONFIG_HIGHMEM |
632 | &node_state_attr[4].attr.attr, | 655 | &node_state_attr[N_HIGH_MEMORY].attr.attr, |
633 | #endif | 656 | #endif |
657 | &node_state_attr[N_CPU].attr.attr, | ||
634 | NULL | 658 | NULL |
635 | }; | 659 | }; |
636 | 660 | ||
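
The node_state_attr conversion above switches from positional to designated initializers, so each slot is keyed by its node_states value; entries can then be reordered or hidden behind #ifdef without silently shifting their neighbours. A compact standalone illustration of the idiom (the enum values here are simplified stand-ins, not the kernel's):

    #include <stdio.h>

    enum node_states { N_POSSIBLE, N_ONLINE, N_NORMAL_MEMORY, N_CPU, NR_NODE_STATES };

    static const char *state_name[NR_NODE_STATES] = {
        [N_CPU]           = "has_cpu",     /* listed out of order on purpose */
        [N_POSSIBLE]      = "possible",
        [N_ONLINE]        = "online",
        [N_NORMAL_MEMORY] = "has_normal_memory",
    };

    int main(void)
    {
        /* Each name still lands in the slot named by its enum value. */
        printf("%s\n", state_name[N_ONLINE]); /* prints "online" */
        return 0;
    }
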
diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c index 7d5a6b40b31c..196368009001 100644 --- a/drivers/macintosh/smu.c +++ b/drivers/macintosh/smu.c | |||
@@ -565,7 +565,7 @@ fail_msg_node: | |||
565 | fail_db_node: | 565 | fail_db_node: |
566 | of_node_put(smu->db_node); | 566 | of_node_put(smu->db_node); |
567 | fail_bootmem: | 567 | fail_bootmem: |
568 | free_bootmem((unsigned long)smu, sizeof(struct smu_device)); | 568 | free_bootmem(__pa(smu), sizeof(struct smu_device)); |
569 | smu = NULL; | 569 | smu = NULL; |
570 | fail_np: | 570 | fail_np: |
571 | of_node_put(np); | 571 | of_node_put(np); |
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index b91e4bc332a7..3b91b0fd4de3 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c | |||
@@ -40,7 +40,7 @@ | |||
40 | #include <linux/notifier.h> | 40 | #include <linux/notifier.h> |
41 | 41 | ||
42 | static uint32_t lowmem_debug_level = 2; | 42 | static uint32_t lowmem_debug_level = 2; |
43 | static int lowmem_adj[6] = { | 43 | static short lowmem_adj[6] = { |
44 | 0, | 44 | 0, |
45 | 1, | 45 | 1, |
46 | 6, | 46 | 6, |
@@ -70,9 +70,9 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) | |||
70 | int rem = 0; | 70 | int rem = 0; |
71 | int tasksize; | 71 | int tasksize; |
72 | int i; | 72 | int i; |
73 | int min_score_adj = OOM_SCORE_ADJ_MAX + 1; | 73 | short min_score_adj = OOM_SCORE_ADJ_MAX + 1; |
74 | int selected_tasksize = 0; | 74 | int selected_tasksize = 0; |
75 | int selected_oom_score_adj; | 75 | short selected_oom_score_adj; |
76 | int array_size = ARRAY_SIZE(lowmem_adj); | 76 | int array_size = ARRAY_SIZE(lowmem_adj); |
77 | int other_free = global_page_state(NR_FREE_PAGES); | 77 | int other_free = global_page_state(NR_FREE_PAGES); |
78 | int other_file = global_page_state(NR_FILE_PAGES) - | 78 | int other_file = global_page_state(NR_FILE_PAGES) - |
@@ -90,7 +90,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) | |||
90 | } | 90 | } |
91 | } | 91 | } |
92 | if (sc->nr_to_scan > 0) | 92 | if (sc->nr_to_scan > 0) |
93 | lowmem_print(3, "lowmem_shrink %lu, %x, ofree %d %d, ma %d\n", | 93 | lowmem_print(3, "lowmem_shrink %lu, %x, ofree %d %d, ma %hd\n", |
94 | sc->nr_to_scan, sc->gfp_mask, other_free, | 94 | sc->nr_to_scan, sc->gfp_mask, other_free, |
95 | other_file, min_score_adj); | 95 | other_file, min_score_adj); |
96 | rem = global_page_state(NR_ACTIVE_ANON) + | 96 | rem = global_page_state(NR_ACTIVE_ANON) + |
@@ -107,7 +107,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) | |||
107 | rcu_read_lock(); | 107 | rcu_read_lock(); |
108 | for_each_process(tsk) { | 108 | for_each_process(tsk) { |
109 | struct task_struct *p; | 109 | struct task_struct *p; |
110 | int oom_score_adj; | 110 | short oom_score_adj; |
111 | 111 | ||
112 | if (tsk->flags & PF_KTHREAD) | 112 | if (tsk->flags & PF_KTHREAD) |
113 | continue; | 113 | continue; |
@@ -141,11 +141,11 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) | |||
141 | selected = p; | 141 | selected = p; |
142 | selected_tasksize = tasksize; | 142 | selected_tasksize = tasksize; |
143 | selected_oom_score_adj = oom_score_adj; | 143 | selected_oom_score_adj = oom_score_adj; |
144 | lowmem_print(2, "select %d (%s), adj %d, size %d, to kill\n", | 144 | lowmem_print(2, "select %d (%s), adj %hd, size %d, to kill\n", |
145 | p->pid, p->comm, oom_score_adj, tasksize); | 145 | p->pid, p->comm, oom_score_adj, tasksize); |
146 | } | 146 | } |
147 | if (selected) { | 147 | if (selected) { |
148 | lowmem_print(1, "send sigkill to %d (%s), adj %d, size %d\n", | 148 | lowmem_print(1, "send sigkill to %d (%s), adj %hd, size %d\n", |
149 | selected->pid, selected->comm, | 149 | selected->pid, selected->comm, |
150 | selected_oom_score_adj, selected_tasksize); | 150 | selected_oom_score_adj, selected_tasksize); |
151 | lowmem_deathpending_timeout = jiffies + HZ; | 151 | lowmem_deathpending_timeout = jiffies + HZ; |
@@ -176,7 +176,7 @@ static void __exit lowmem_exit(void) | |||
176 | } | 176 | } |
177 | 177 | ||
178 | module_param_named(cost, lowmem_shrinker.seeks, int, S_IRUGO | S_IWUSR); | 178 | module_param_named(cost, lowmem_shrinker.seeks, int, S_IRUGO | S_IWUSR); |
179 | module_param_array_named(adj, lowmem_adj, int, &lowmem_adj_size, | 179 | module_param_array_named(adj, lowmem_adj, short, &lowmem_adj_size, |
180 | S_IRUGO | S_IWUSR); | 180 | S_IRUGO | S_IWUSR); |
181 | module_param_array_named(minfree, lowmem_minfree, uint, &lowmem_minfree_size, | 181 | module_param_array_named(minfree, lowmem_minfree, uint, &lowmem_minfree_size, |
182 | S_IRUGO | S_IWUSR); | 182 | S_IRUGO | S_IWUSR); |
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 0908e6044333..2a70558b36ea 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c | |||
@@ -27,13 +27,15 @@ | |||
27 | #include <linux/delay.h> | 27 | #include <linux/delay.h> |
28 | #include <linux/slab.h> | 28 | #include <linux/slab.h> |
29 | #include <linux/module.h> | 29 | #include <linux/module.h> |
30 | #include <linux/balloon_compaction.h> | ||
30 | 31 | ||
31 | /* | 32 | /* |
32 | * Balloon device works in 4K page units. So each page is pointed to by | 33 | * Balloon device works in 4K page units. So each page is pointed to by |
33 | * multiple balloon pages. All memory counters in this driver are in balloon | 34 | * multiple balloon pages. All memory counters in this driver are in balloon |
34 | * page units. | 35 | * page units. |
35 | */ | 36 | */ |
36 | #define VIRTIO_BALLOON_PAGES_PER_PAGE (PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT) | 37 | #define VIRTIO_BALLOON_PAGES_PER_PAGE (unsigned)(PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT) |
38 | #define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256 | ||
37 | 39 | ||
38 | struct virtio_balloon | 40 | struct virtio_balloon |
39 | { | 41 | { |
@@ -52,15 +54,19 @@ struct virtio_balloon | |||
52 | /* Number of balloon pages we've told the Host we're not using. */ | 54 | /* Number of balloon pages we've told the Host we're not using. */ |
53 | unsigned int num_pages; | 55 | unsigned int num_pages; |
54 | /* | 56 | /* |
55 | * The pages we've told the Host we're not using. | 57 | * The pages we've told the Host we're not using are enqueued |
58 | * at vb_dev_info->pages list. | ||
56 | * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE | 59 | * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE |
57 | * to num_pages above. | 60 | * to num_pages above. |
58 | */ | 61 | */ |
59 | struct list_head pages; | 62 | struct balloon_dev_info *vb_dev_info; |
63 | |||
64 | /* Synchronize access/update to this struct virtio_balloon elements */ | ||
65 | struct mutex balloon_lock; | ||
60 | 66 | ||
61 | /* The array of pfns we tell the Host about. */ | 67 | /* The array of pfns we tell the Host about. */ |
62 | unsigned int num_pfns; | 68 | unsigned int num_pfns; |
63 | u32 pfns[256]; | 69 | u32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX]; |
64 | 70 | ||
65 | /* Memory statistics */ | 71 | /* Memory statistics */ |
66 | int need_stats_update; | 72 | int need_stats_update; |
@@ -122,18 +128,21 @@ static void set_page_pfns(u32 pfns[], struct page *page) | |||
122 | 128 | ||
123 | static void fill_balloon(struct virtio_balloon *vb, size_t num) | 129 | static void fill_balloon(struct virtio_balloon *vb, size_t num) |
124 | { | 130 | { |
131 | struct balloon_dev_info *vb_dev_info = vb->vb_dev_info; | ||
132 | |||
125 | /* We can only do one array worth at a time. */ | 133 | /* We can only do one array worth at a time. */ |
126 | num = min(num, ARRAY_SIZE(vb->pfns)); | 134 | num = min(num, ARRAY_SIZE(vb->pfns)); |
127 | 135 | ||
136 | mutex_lock(&vb->balloon_lock); | ||
128 | for (vb->num_pfns = 0; vb->num_pfns < num; | 137 | for (vb->num_pfns = 0; vb->num_pfns < num; |
129 | vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { | 138 | vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { |
130 | struct page *page = alloc_page(GFP_HIGHUSER | __GFP_NORETRY | | 139 | struct page *page = balloon_page_enqueue(vb_dev_info); |
131 | __GFP_NOMEMALLOC | __GFP_NOWARN); | 140 | |
132 | if (!page) { | 141 | if (!page) { |
133 | if (printk_ratelimit()) | 142 | if (printk_ratelimit()) |
134 | dev_printk(KERN_INFO, &vb->vdev->dev, | 143 | dev_printk(KERN_INFO, &vb->vdev->dev, |
135 | "Out of puff! Can't get %zu pages\n", | 144 | "Out of puff! Can't get %u pages\n", |
136 | num); | 145 | VIRTIO_BALLOON_PAGES_PER_PAGE); |
137 | /* Sleep for at least 1/5 of a second before retry. */ | 146 | /* Sleep for at least 1/5 of a second before retry. */ |
138 | msleep(200); | 147 | msleep(200); |
139 | break; | 148 | break; |
@@ -141,14 +150,12 @@ static void fill_balloon(struct virtio_balloon *vb, size_t num) | |||
141 | set_page_pfns(vb->pfns + vb->num_pfns, page); | 150 | set_page_pfns(vb->pfns + vb->num_pfns, page); |
142 | vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; | 151 | vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; |
143 | totalram_pages--; | 152 | totalram_pages--; |
144 | list_add(&page->lru, &vb->pages); | ||
145 | } | 153 | } |
146 | 154 | ||
147 | /* Didn't get any? Oh well. */ | 155 | /* Did we get any? */ |
148 | if (vb->num_pfns == 0) | 156 | if (vb->num_pfns != 0) |
149 | return; | 157 | tell_host(vb, vb->inflate_vq); |
150 | 158 | mutex_unlock(&vb->balloon_lock); | |
151 | tell_host(vb, vb->inflate_vq); | ||
152 | } | 159 | } |
153 | 160 | ||
154 | static void release_pages_by_pfn(const u32 pfns[], unsigned int num) | 161 | static void release_pages_by_pfn(const u32 pfns[], unsigned int num) |
@@ -157,7 +164,7 @@ static void release_pages_by_pfn(const u32 pfns[], unsigned int num) | |||
157 | 164 | ||
158 | /* Find pfns pointing at start of each page, get pages and free them. */ | 165 | /* Find pfns pointing at start of each page, get pages and free them. */ |
159 | for (i = 0; i < num; i += VIRTIO_BALLOON_PAGES_PER_PAGE) { | 166 | for (i = 0; i < num; i += VIRTIO_BALLOON_PAGES_PER_PAGE) { |
160 | __free_page(balloon_pfn_to_page(pfns[i])); | 167 | balloon_page_free(balloon_pfn_to_page(pfns[i])); |
161 | totalram_pages++; | 168 | totalram_pages++; |
162 | } | 169 | } |
163 | } | 170 | } |
@@ -165,14 +172,17 @@ static void release_pages_by_pfn(const u32 pfns[], unsigned int num) | |||
165 | static void leak_balloon(struct virtio_balloon *vb, size_t num) | 172 | static void leak_balloon(struct virtio_balloon *vb, size_t num) |
166 | { | 173 | { |
167 | struct page *page; | 174 | struct page *page; |
175 | struct balloon_dev_info *vb_dev_info = vb->vb_dev_info; | ||
168 | 176 | ||
169 | /* We can only do one array worth at a time. */ | 177 | /* We can only do one array worth at a time. */ |
170 | num = min(num, ARRAY_SIZE(vb->pfns)); | 178 | num = min(num, ARRAY_SIZE(vb->pfns)); |
171 | 179 | ||
180 | mutex_lock(&vb->balloon_lock); | ||
172 | for (vb->num_pfns = 0; vb->num_pfns < num; | 181 | for (vb->num_pfns = 0; vb->num_pfns < num; |
173 | vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { | 182 | vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { |
174 | page = list_first_entry(&vb->pages, struct page, lru); | 183 | page = balloon_page_dequeue(vb_dev_info); |
175 | list_del(&page->lru); | 184 | if (!page) |
185 | break; | ||
176 | set_page_pfns(vb->pfns + vb->num_pfns, page); | 186 | set_page_pfns(vb->pfns + vb->num_pfns, page); |
177 | vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE; | 187 | vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE; |
178 | } | 188 | } |
@@ -183,6 +193,7 @@ static void leak_balloon(struct virtio_balloon *vb, size_t num) | |||
183 | * is true, we *have* to do it in this order | 193 | * is true, we *have* to do it in this order |
184 | */ | 194 | */ |
185 | tell_host(vb, vb->deflate_vq); | 195 | tell_host(vb, vb->deflate_vq); |
196 | mutex_unlock(&vb->balloon_lock); | ||
186 | release_pages_by_pfn(vb->pfns, vb->num_pfns); | 197 | release_pages_by_pfn(vb->pfns, vb->num_pfns); |
187 | } | 198 | } |
188 | 199 | ||
@@ -339,9 +350,84 @@ static int init_vqs(struct virtio_balloon *vb) | |||
339 | return 0; | 350 | return 0; |
340 | } | 351 | } |
341 | 352 | ||
353 | static const struct address_space_operations virtio_balloon_aops; | ||
354 | #ifdef CONFIG_BALLOON_COMPACTION | ||
355 | /* | ||
356 | * virtballoon_migratepage - perform the balloon page migration on behalf of | ||
357 | * a compaction thread. (called under page lock) | ||
358 | * @mapping: the page->mapping which will be assigned to the new migrated page. | ||
359 | * @newpage: page that will replace the isolated page after migration finishes. | ||
360 | * @page : the isolated (old) page that is about to be migrated to newpage. | ||
361 | * @mode : compaction mode -- not used for balloon page migration. | ||
362 | * | ||
363 | * After a ballooned page gets isolated by compaction procedures, this is the | ||
364 | * function that performs the page migration on behalf of a compaction thread | ||
365 | * The page migration for virtio balloon is done in a simple swap fashion which | ||
366 | * follows these two macro steps: | ||
367 | * 1) insert newpage into vb->pages list and update the host about it; | ||
368 | * 2) update the host about the old page removed from vb->pages list; | ||
369 | * | ||
370 | * This function performs the balloon page migration task. | ||
371 | * Called through balloon_mapping->a_ops->migratepage | ||
372 | */ | ||
373 | int virtballoon_migratepage(struct address_space *mapping, | ||
374 | struct page *newpage, struct page *page, enum migrate_mode mode) | ||
375 | { | ||
376 | struct balloon_dev_info *vb_dev_info = balloon_page_device(page); | ||
377 | struct virtio_balloon *vb; | ||
378 | unsigned long flags; | ||
379 | |||
380 | BUG_ON(!vb_dev_info); | ||
381 | |||
382 | vb = vb_dev_info->balloon_device; | ||
383 | |||
384 | /* | ||
385 | * In order to avoid lock contention while migrating pages concurrently | ||
386 | * to leak_balloon() or fill_balloon() we just give up the balloon_lock | ||
387 | * this turn, as it is easier to retry the page migration later. | ||
388 | * This also prevents fill_balloon() getting stuck into a mutex | ||
389 | * recursion in the case it ends up triggering memory compaction | ||
390 | * while it is attempting to inflate the balloon. | ||
391 | */ | ||
392 | if (!mutex_trylock(&vb->balloon_lock)) | ||
393 | return -EAGAIN; | ||
394 | |||
395 | /* balloon's page migration 1st step -- inflate "newpage" */ | ||
396 | spin_lock_irqsave(&vb_dev_info->pages_lock, flags); | ||
397 | balloon_page_insert(newpage, mapping, &vb_dev_info->pages); | ||
398 | vb_dev_info->isolated_pages--; | ||
399 | spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); | ||
400 | vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; | ||
401 | set_page_pfns(vb->pfns, newpage); | ||
402 | tell_host(vb, vb->inflate_vq); | ||
403 | |||
404 | /* | ||
405 | * balloon's page migration 2nd step -- deflate "page" | ||
406 | * | ||
407 | * It's safe to delete page->lru here because this page is at | ||
408 | * an isolated migration list, and this step is expected to happen here | ||
409 | */ | ||
410 | balloon_page_delete(page); | ||
411 | vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; | ||
412 | set_page_pfns(vb->pfns, page); | ||
413 | tell_host(vb, vb->deflate_vq); | ||
414 | |||
415 | mutex_unlock(&vb->balloon_lock); | ||
416 | |||
417 | return MIGRATEPAGE_BALLOON_SUCCESS; | ||
418 | } | ||
419 | |||
420 | /* define the balloon_mapping->a_ops callback to allow balloon page migration */ | ||
421 | static const struct address_space_operations virtio_balloon_aops = { | ||
422 | .migratepage = virtballoon_migratepage, | ||
423 | }; | ||
424 | #endif /* CONFIG_BALLOON_COMPACTION */ | ||
425 | |||
342 | static int virtballoon_probe(struct virtio_device *vdev) | 426 | static int virtballoon_probe(struct virtio_device *vdev) |
343 | { | 427 | { |
344 | struct virtio_balloon *vb; | 428 | struct virtio_balloon *vb; |
429 | struct address_space *vb_mapping; | ||
430 | struct balloon_dev_info *vb_devinfo; | ||
345 | int err; | 431 | int err; |
346 | 432 | ||
347 | vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL); | 433 | vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL); |
@@ -350,16 +436,37 @@ static int virtballoon_probe(struct virtio_device *vdev) | |||
350 | goto out; | 436 | goto out; |
351 | } | 437 | } |
352 | 438 | ||
353 | INIT_LIST_HEAD(&vb->pages); | ||
354 | vb->num_pages = 0; | 439 | vb->num_pages = 0; |
440 | mutex_init(&vb->balloon_lock); | ||
355 | init_waitqueue_head(&vb->config_change); | 441 | init_waitqueue_head(&vb->config_change); |
356 | init_waitqueue_head(&vb->acked); | 442 | init_waitqueue_head(&vb->acked); |
357 | vb->vdev = vdev; | 443 | vb->vdev = vdev; |
358 | vb->need_stats_update = 0; | 444 | vb->need_stats_update = 0; |
359 | 445 | ||
446 | vb_devinfo = balloon_devinfo_alloc(vb); | ||
447 | if (IS_ERR(vb_devinfo)) { | ||
448 | err = PTR_ERR(vb_devinfo); | ||
449 | goto out_free_vb; | ||
450 | } | ||
451 | |||
452 | vb_mapping = balloon_mapping_alloc(vb_devinfo, | ||
453 | (balloon_compaction_check()) ? | ||
454 | &virtio_balloon_aops : NULL); | ||
455 | if (IS_ERR(vb_mapping)) { | ||
456 | /* | ||
457 | * IS_ERR(vb_mapping) && PTR_ERR(vb_mapping) == -EOPNOTSUPP | ||
458 | * This means !CONFIG_BALLOON_COMPACTION, otherwise we bail out. | ||
459 | */ | ||
460 | err = PTR_ERR(vb_mapping); | ||
461 | if (err != -EOPNOTSUPP) | ||
462 | goto out_free_vb_devinfo; | ||
463 | } | ||
464 | |||
465 | vb->vb_dev_info = vb_devinfo; | ||
466 | |||
360 | err = init_vqs(vb); | 467 | err = init_vqs(vb); |
361 | if (err) | 468 | if (err) |
362 | goto out_free_vb; | 469 | goto out_free_vb_mapping; |
363 | 470 | ||
364 | vb->thread = kthread_run(balloon, vb, "vballoon"); | 471 | vb->thread = kthread_run(balloon, vb, "vballoon"); |
365 | if (IS_ERR(vb->thread)) { | 472 | if (IS_ERR(vb->thread)) { |
@@ -371,6 +478,10 @@ static int virtballoon_probe(struct virtio_device *vdev) | |||
371 | 478 | ||
372 | out_del_vqs: | 479 | out_del_vqs: |
373 | vdev->config->del_vqs(vdev); | 480 | vdev->config->del_vqs(vdev); |
481 | out_free_vb_mapping: | ||
482 | balloon_mapping_free(vb_mapping); | ||
483 | out_free_vb_devinfo: | ||
484 | balloon_devinfo_free(vb_devinfo); | ||
374 | out_free_vb: | 485 | out_free_vb: |
375 | kfree(vb); | 486 | kfree(vb); |
376 | out: | 487 | out: |
@@ -396,6 +507,8 @@ static void __devexit virtballoon_remove(struct virtio_device *vdev) | |||
396 | 507 | ||
397 | kthread_stop(vb->thread); | 508 | kthread_stop(vb->thread); |
398 | remove_common(vb); | 509 | remove_common(vb); |
510 | balloon_mapping_free(vb->vb_dev_info->mapping); | ||
511 | balloon_devinfo_free(vb->vb_dev_info); | ||
399 | kfree(vb); | 512 | kfree(vb); |
400 | } | 513 | } |
401 | 514 | ||
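
virtballoon_migratepage() deliberately uses mutex_trylock() rather than mutex_lock(): as the comment in the hunk above explains, this avoids lock contention with fill_balloon()/leak_balloon() and the mutex recursion that could occur if the lock holder itself triggers compaction, while failing with -EAGAIN simply lets compaction retry the page later. A minimal sketch of that locking rule (illustrative only, not driver code; example_migrate_step is a hypothetical helper):

    /* Sketch: never sleep on balloon_lock from a path that compaction can
     * enter, since the current lock holder may be the one who triggered
     * compaction in the first place. */
    static int example_migrate_step(struct virtio_balloon *vb)
    {
        if (!mutex_trylock(&vb->balloon_lock))
            return -EAGAIN;          /* busy: ask compaction to retry later */

        /* ... inflate the new page, deflate the old one, tell the host ... */

        mutex_unlock(&vb->balloon_lock);
        return 0;
    }
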
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7cda51995c1e..22a0439e5a86 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
@@ -3416,8 +3416,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) | |||
3416 | num_dirty = root->fs_info->dirty_metadata_bytes; | 3416 | num_dirty = root->fs_info->dirty_metadata_bytes; |
3417 | 3417 | ||
3418 | if (num_dirty > thresh) { | 3418 | if (num_dirty > thresh) { |
3419 | balance_dirty_pages_ratelimited_nr( | 3419 | balance_dirty_pages_ratelimited( |
3420 | root->fs_info->btree_inode->i_mapping, 1); | 3420 | root->fs_info->btree_inode->i_mapping); |
3421 | } | 3421 | } |
3422 | return; | 3422 | return; |
3423 | } | 3423 | } |
@@ -3437,8 +3437,8 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) | |||
3437 | num_dirty = root->fs_info->dirty_metadata_bytes; | 3437 | num_dirty = root->fs_info->dirty_metadata_bytes; |
3438 | 3438 | ||
3439 | if (num_dirty > thresh) { | 3439 | if (num_dirty > thresh) { |
3440 | balance_dirty_pages_ratelimited_nr( | 3440 | balance_dirty_pages_ratelimited( |
3441 | root->fs_info->btree_inode->i_mapping, 1); | 3441 | root->fs_info->btree_inode->i_mapping); |
3442 | } | 3442 | } |
3443 | return; | 3443 | return; |
3444 | } | 3444 | } |
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9ab1bed88116..a8ee75cb96ee 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -1346,8 +1346,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, | |||
1346 | 1346 | ||
1347 | cond_resched(); | 1347 | cond_resched(); |
1348 | 1348 | ||
1349 | balance_dirty_pages_ratelimited_nr(inode->i_mapping, | 1349 | balance_dirty_pages_ratelimited(inode->i_mapping); |
1350 | dirty_pages); | ||
1351 | if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) | 1350 | if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) |
1352 | btrfs_btree_balance_dirty(root, 1); | 1351 | btrfs_btree_balance_dirty(root, 1); |
1353 | 1352 | ||
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8fcf9a59c28d..5b3429ab8ec1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
@@ -1225,7 +1225,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, | |||
1225 | } | 1225 | } |
1226 | 1226 | ||
1227 | defrag_count += ret; | 1227 | defrag_count += ret; |
1228 | balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); | 1228 | balance_dirty_pages_ratelimited(inode->i_mapping); |
1229 | mutex_unlock(&inode->i_mutex); | 1229 | mutex_unlock(&inode->i_mutex); |
1230 | 1230 | ||
1231 | if (newer_than) { | 1231 | if (newer_than) { |
diff --git a/fs/buffer.c b/fs/buffer.c index ec0aca8ba6bf..6e9ed48064fc 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -555,7 +555,7 @@ void emergency_thaw_all(void) | |||
555 | */ | 555 | */ |
556 | int sync_mapping_buffers(struct address_space *mapping) | 556 | int sync_mapping_buffers(struct address_space *mapping) |
557 | { | 557 | { |
558 | struct address_space *buffer_mapping = mapping->assoc_mapping; | 558 | struct address_space *buffer_mapping = mapping->private_data; |
559 | 559 | ||
560 | if (buffer_mapping == NULL || list_empty(&mapping->private_list)) | 560 | if (buffer_mapping == NULL || list_empty(&mapping->private_list)) |
561 | return 0; | 561 | return 0; |
@@ -588,10 +588,10 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) | |||
588 | struct address_space *buffer_mapping = bh->b_page->mapping; | 588 | struct address_space *buffer_mapping = bh->b_page->mapping; |
589 | 589 | ||
590 | mark_buffer_dirty(bh); | 590 | mark_buffer_dirty(bh); |
591 | if (!mapping->assoc_mapping) { | 591 | if (!mapping->private_data) { |
592 | mapping->assoc_mapping = buffer_mapping; | 592 | mapping->private_data = buffer_mapping; |
593 | } else { | 593 | } else { |
594 | BUG_ON(mapping->assoc_mapping != buffer_mapping); | 594 | BUG_ON(mapping->private_data != buffer_mapping); |
595 | } | 595 | } |
596 | if (!bh->b_assoc_map) { | 596 | if (!bh->b_assoc_map) { |
597 | spin_lock(&buffer_mapping->private_lock); | 597 | spin_lock(&buffer_mapping->private_lock); |
@@ -788,7 +788,7 @@ void invalidate_inode_buffers(struct inode *inode) | |||
788 | if (inode_has_buffers(inode)) { | 788 | if (inode_has_buffers(inode)) { |
789 | struct address_space *mapping = &inode->i_data; | 789 | struct address_space *mapping = &inode->i_data; |
790 | struct list_head *list = &mapping->private_list; | 790 | struct list_head *list = &mapping->private_list; |
791 | struct address_space *buffer_mapping = mapping->assoc_mapping; | 791 | struct address_space *buffer_mapping = mapping->private_data; |
792 | 792 | ||
793 | spin_lock(&buffer_mapping->private_lock); | 793 | spin_lock(&buffer_mapping->private_lock); |
794 | while (!list_empty(list)) | 794 | while (!list_empty(list)) |
@@ -811,7 +811,7 @@ int remove_inode_buffers(struct inode *inode) | |||
811 | if (inode_has_buffers(inode)) { | 811 | if (inode_has_buffers(inode)) { |
812 | struct address_space *mapping = &inode->i_data; | 812 | struct address_space *mapping = &inode->i_data; |
813 | struct list_head *list = &mapping->private_list; | 813 | struct list_head *list = &mapping->private_list; |
814 | struct address_space *buffer_mapping = mapping->assoc_mapping; | 814 | struct address_space *buffer_mapping = mapping->private_data; |
815 | 815 | ||
816 | spin_lock(&buffer_mapping->private_lock); | 816 | spin_lock(&buffer_mapping->private_lock); |
817 | while (!list_empty(list)) { | 817 | while (!list_empty(list)) { |
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index e6c2fd53cab2..0f22d09f358d 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c | |||
@@ -768,7 +768,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, | |||
768 | mapping->host = s->s_bdev->bd_inode; | 768 | mapping->host = s->s_bdev->bd_inode; |
769 | mapping->flags = 0; | 769 | mapping->flags = 0; |
770 | mapping_set_gfp_mask(mapping, GFP_NOFS); | 770 | mapping_set_gfp_mask(mapping, GFP_NOFS); |
771 | mapping->assoc_mapping = NULL; | 771 | mapping->private_data = NULL; |
772 | mapping->backing_dev_info = s->s_bdi; | 772 | mapping->backing_dev_info = s->s_bdi; |
773 | mapping->writeback_index = 0; | 773 | mapping->writeback_index = 0; |
774 | } | 774 | } |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c5bc355d8243..4a55f35a6ced 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -151,8 +151,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | |||
151 | { | 151 | { |
152 | struct mm_struct *mm = current->mm; | 152 | struct mm_struct *mm = current->mm; |
153 | struct vm_area_struct *vma; | 153 | struct vm_area_struct *vma; |
154 | unsigned long start_addr; | ||
155 | struct hstate *h = hstate_file(file); | 154 | struct hstate *h = hstate_file(file); |
155 | struct vm_unmapped_area_info info; | ||
156 | 156 | ||
157 | if (len & ~huge_page_mask(h)) | 157 | if (len & ~huge_page_mask(h)) |
158 | return -EINVAL; | 158 | return -EINVAL; |
@@ -173,39 +173,13 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | |||
173 | return addr; | 173 | return addr; |
174 | } | 174 | } |
175 | 175 | ||
176 | if (len > mm->cached_hole_size) | 176 | info.flags = 0; |
177 | start_addr = mm->free_area_cache; | 177 | info.length = len; |
178 | else { | 178 | info.low_limit = TASK_UNMAPPED_BASE; |
179 | start_addr = TASK_UNMAPPED_BASE; | 179 | info.high_limit = TASK_SIZE; |
180 | mm->cached_hole_size = 0; | 180 | info.align_mask = PAGE_MASK & ~huge_page_mask(h); |
181 | } | 181 | info.align_offset = 0; |
182 | 182 | return vm_unmapped_area(&info); | |
183 | full_search: | ||
184 | addr = ALIGN(start_addr, huge_page_size(h)); | ||
185 | |||
186 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
187 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
188 | if (TASK_SIZE - len < addr) { | ||
189 | /* | ||
190 | * Start a new search - just in case we missed | ||
191 | * some holes. | ||
192 | */ | ||
193 | if (start_addr != TASK_UNMAPPED_BASE) { | ||
194 | start_addr = TASK_UNMAPPED_BASE; | ||
195 | mm->cached_hole_size = 0; | ||
196 | goto full_search; | ||
197 | } | ||
198 | return -ENOMEM; | ||
199 | } | ||
200 | |||
201 | if (!vma || addr + len <= vma->vm_start) { | ||
202 | mm->free_area_cache = addr + len; | ||
203 | return addr; | ||
204 | } | ||
205 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
206 | mm->cached_hole_size = vma->vm_start - addr; | ||
207 | addr = ALIGN(vma->vm_end, huge_page_size(h)); | ||
208 | } | ||
209 | } | 183 | } |
210 | #endif | 184 | #endif |
211 | 185 | ||
@@ -608,11 +582,11 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, | |||
608 | int rc; | 582 | int rc; |
609 | 583 | ||
610 | rc = migrate_huge_page_move_mapping(mapping, newpage, page); | 584 | rc = migrate_huge_page_move_mapping(mapping, newpage, page); |
611 | if (rc) | 585 | if (rc != MIGRATEPAGE_SUCCESS) |
612 | return rc; | 586 | return rc; |
613 | migrate_page_copy(newpage, page); | 587 | migrate_page_copy(newpage, page); |
614 | 588 | ||
615 | return 0; | 589 | return MIGRATEPAGE_SUCCESS; |
616 | } | 590 | } |
617 | 591 | ||
618 | static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) | 592 | static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) |
@@ -923,7 +897,7 @@ static struct file_system_type hugetlbfs_fs_type = { | |||
923 | .kill_sb = kill_litter_super, | 897 | .kill_sb = kill_litter_super, |
924 | }; | 898 | }; |
925 | 899 | ||
926 | static struct vfsmount *hugetlbfs_vfsmount; | 900 | static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; |
927 | 901 | ||
928 | static int can_do_hugetlb_shm(void) | 902 | static int can_do_hugetlb_shm(void) |
929 | { | 903 | { |
@@ -932,9 +906,22 @@ static int can_do_hugetlb_shm(void) | |||
932 | return capable(CAP_IPC_LOCK) || in_group_p(shm_group); | 906 | return capable(CAP_IPC_LOCK) || in_group_p(shm_group); |
933 | } | 907 | } |
934 | 908 | ||
909 | static int get_hstate_idx(int page_size_log) | ||
910 | { | ||
911 | struct hstate *h; | ||
912 | |||
913 | if (!page_size_log) | ||
914 | return default_hstate_idx; | ||
915 | h = size_to_hstate(1 << page_size_log); | ||
916 | if (!h) | ||
917 | return -1; | ||
918 | return h - hstates; | ||
919 | } | ||
920 | |||
935 | struct file *hugetlb_file_setup(const char *name, unsigned long addr, | 921 | struct file *hugetlb_file_setup(const char *name, unsigned long addr, |
936 | size_t size, vm_flags_t acctflag, | 922 | size_t size, vm_flags_t acctflag, |
937 | struct user_struct **user, int creat_flags) | 923 | struct user_struct **user, |
924 | int creat_flags, int page_size_log) | ||
938 | { | 925 | { |
939 | int error = -ENOMEM; | 926 | int error = -ENOMEM; |
940 | struct file *file; | 927 | struct file *file; |
@@ -944,9 +931,14 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, | |||
944 | struct qstr quick_string; | 931 | struct qstr quick_string; |
945 | struct hstate *hstate; | 932 | struct hstate *hstate; |
946 | unsigned long num_pages; | 933 | unsigned long num_pages; |
934 | int hstate_idx; | ||
935 | |||
936 | hstate_idx = get_hstate_idx(page_size_log); | ||
937 | if (hstate_idx < 0) | ||
938 | return ERR_PTR(-ENODEV); | ||
947 | 939 | ||
948 | *user = NULL; | 940 | *user = NULL; |
949 | if (!hugetlbfs_vfsmount) | 941 | if (!hugetlbfs_vfsmount[hstate_idx]) |
950 | return ERR_PTR(-ENOENT); | 942 | return ERR_PTR(-ENOENT); |
951 | 943 | ||
952 | if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { | 944 | if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { |
@@ -963,7 +955,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, | |||
963 | } | 955 | } |
964 | } | 956 | } |
965 | 957 | ||
966 | root = hugetlbfs_vfsmount->mnt_root; | 958 | root = hugetlbfs_vfsmount[hstate_idx]->mnt_root; |
967 | quick_string.name = name; | 959 | quick_string.name = name; |
968 | quick_string.len = strlen(quick_string.name); | 960 | quick_string.len = strlen(quick_string.name); |
969 | quick_string.hash = 0; | 961 | quick_string.hash = 0; |
@@ -971,7 +963,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, | |||
971 | if (!path.dentry) | 963 | if (!path.dentry) |
972 | goto out_shm_unlock; | 964 | goto out_shm_unlock; |
973 | 965 | ||
974 | path.mnt = mntget(hugetlbfs_vfsmount); | 966 | path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]); |
975 | error = -ENOSPC; | 967 | error = -ENOSPC; |
976 | inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0); | 968 | inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0); |
977 | if (!inode) | 969 | if (!inode) |
@@ -1011,8 +1003,9 @@ out_shm_unlock: | |||
1011 | 1003 | ||
1012 | static int __init init_hugetlbfs_fs(void) | 1004 | static int __init init_hugetlbfs_fs(void) |
1013 | { | 1005 | { |
1006 | struct hstate *h; | ||
1014 | int error; | 1007 | int error; |
1015 | struct vfsmount *vfsmount; | 1008 | int i; |
1016 | 1009 | ||
1017 | error = bdi_init(&hugetlbfs_backing_dev_info); | 1010 | error = bdi_init(&hugetlbfs_backing_dev_info); |
1018 | if (error) | 1011 | if (error) |
@@ -1029,14 +1022,26 @@ static int __init init_hugetlbfs_fs(void) | |||
1029 | if (error) | 1022 | if (error) |
1030 | goto out; | 1023 | goto out; |
1031 | 1024 | ||
1032 | vfsmount = kern_mount(&hugetlbfs_fs_type); | 1025 | i = 0; |
1026 | for_each_hstate(h) { | ||
1027 | char buf[50]; | ||
1028 | unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10); | ||
1033 | 1029 | ||
1034 | if (!IS_ERR(vfsmount)) { | 1030 | snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb); |
1035 | hugetlbfs_vfsmount = vfsmount; | 1031 | hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type, |
1036 | return 0; | 1032 | buf); |
1037 | } | ||
1038 | 1033 | ||
1039 | error = PTR_ERR(vfsmount); | 1034 | if (IS_ERR(hugetlbfs_vfsmount[i])) { |
1035 | pr_err("hugetlb: Cannot mount internal hugetlbfs for " | ||
1036 | "page size %uK", ps_kb); | ||
1037 | error = PTR_ERR(hugetlbfs_vfsmount[i]); | ||
1038 | hugetlbfs_vfsmount[i] = NULL; | ||
1039 | } | ||
1040 | i++; | ||
1041 | } | ||
1042 | /* Non default hstates are optional */ | ||
1043 | if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx])) | ||
1044 | return 0; | ||
1040 | 1045 | ||
1041 | out: | 1046 | out: |
1042 | kmem_cache_destroy(hugetlbfs_inode_cachep); | 1047 | kmem_cache_destroy(hugetlbfs_inode_cachep); |
@@ -1047,13 +1052,19 @@ static int __init init_hugetlbfs_fs(void) | |||
1047 | 1052 | ||
1048 | static void __exit exit_hugetlbfs_fs(void) | 1053 | static void __exit exit_hugetlbfs_fs(void) |
1049 | { | 1054 | { |
1055 | struct hstate *h; | ||
1056 | int i; | ||
1057 | |||
1058 | |||
1050 | /* | 1059 | /* |
1051 | * Make sure all delayed rcu free inodes are flushed before we | 1060 | * Make sure all delayed rcu free inodes are flushed before we |
1052 | * destroy cache. | 1061 | * destroy cache. |
1053 | */ | 1062 | */ |
1054 | rcu_barrier(); | 1063 | rcu_barrier(); |
1055 | kmem_cache_destroy(hugetlbfs_inode_cachep); | 1064 | kmem_cache_destroy(hugetlbfs_inode_cachep); |
1056 | kern_unmount(hugetlbfs_vfsmount); | 1065 | i = 0; |
1066 | for_each_hstate(h) | ||
1067 | kern_unmount(hugetlbfs_vfsmount[i++]); | ||
1057 | unregister_filesystem(&hugetlbfs_fs_type); | 1068 | unregister_filesystem(&hugetlbfs_fs_type); |
1058 | bdi_destroy(&hugetlbfs_backing_dev_info); | 1069 | bdi_destroy(&hugetlbfs_backing_dev_info); |
1059 | } | 1070 | } |
diff --git a/fs/inode.c b/fs/inode.c index 64999f144153..14084b72b259 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -165,7 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) | |||
165 | mapping->host = inode; | 165 | mapping->host = inode; |
166 | mapping->flags = 0; | 166 | mapping->flags = 0; |
167 | mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); | 167 | mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); |
168 | mapping->assoc_mapping = NULL; | 168 | mapping->private_data = NULL; |
169 | mapping->backing_dev_info = &default_backing_dev_info; | 169 | mapping->backing_dev_info = &default_backing_dev_info; |
170 | mapping->writeback_index = 0; | 170 | mapping->writeback_index = 0; |
171 | 171 | ||
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 3e7b2a0dc0c8..07f76db04ec7 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c | |||
@@ -431,7 +431,7 @@ void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, | |||
431 | mapping->host = inode; | 431 | mapping->host = inode; |
432 | mapping->flags = 0; | 432 | mapping->flags = 0; |
433 | mapping_set_gfp_mask(mapping, GFP_NOFS); | 433 | mapping_set_gfp_mask(mapping, GFP_NOFS); |
434 | mapping->assoc_mapping = NULL; | 434 | mapping->private_data = NULL; |
435 | mapping->backing_dev_info = bdi; | 435 | mapping->backing_dev_info = bdi; |
436 | mapping->a_ops = &empty_aops; | 436 | mapping->a_ops = &empty_aops; |
437 | } | 437 | } |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5a4ee77cec51..dda089804942 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -2513,18 +2513,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, | |||
2513 | ret = sd.num_spliced; | 2513 | ret = sd.num_spliced; |
2514 | 2514 | ||
2515 | if (ret > 0) { | 2515 | if (ret > 0) { |
2516 | unsigned long nr_pages; | ||
2517 | int err; | 2516 | int err; |
2518 | 2517 | ||
2519 | nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
2520 | |||
2521 | err = generic_write_sync(out, *ppos, ret); | 2518 | err = generic_write_sync(out, *ppos, ret); |
2522 | if (err) | 2519 | if (err) |
2523 | ret = err; | 2520 | ret = err; |
2524 | else | 2521 | else |
2525 | *ppos += ret; | 2522 | *ppos += ret; |
2526 | 2523 | ||
2527 | balance_dirty_pages_ratelimited_nr(mapping, nr_pages); | 2524 | balance_dirty_pages_ratelimited(mapping); |
2528 | } | 2525 | } |
2529 | 2526 | ||
2530 | return ret; | 2527 | return ret; |
diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e28356a959a..aa63d25157b8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -985,7 +985,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, | |||
985 | { | 985 | { |
986 | struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); | 986 | struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); |
987 | char buffer[PROC_NUMBUF]; | 987 | char buffer[PROC_NUMBUF]; |
988 | int oom_score_adj = OOM_SCORE_ADJ_MIN; | 988 | short oom_score_adj = OOM_SCORE_ADJ_MIN; |
989 | unsigned long flags; | 989 | unsigned long flags; |
990 | size_t len; | 990 | size_t len; |
991 | 991 | ||
@@ -996,7 +996,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, | |||
996 | unlock_task_sighand(task, &flags); | 996 | unlock_task_sighand(task, &flags); |
997 | } | 997 | } |
998 | put_task_struct(task); | 998 | put_task_struct(task); |
999 | len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); | 999 | len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); |
1000 | return simple_read_from_buffer(buf, count, ppos, buffer, len); | 1000 | return simple_read_from_buffer(buf, count, ppos, buffer, len); |
1001 | } | 1001 | } |
1002 | 1002 | ||
@@ -1043,15 +1043,15 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, | |||
1043 | goto err_task_lock; | 1043 | goto err_task_lock; |
1044 | } | 1044 | } |
1045 | 1045 | ||
1046 | if (oom_score_adj < task->signal->oom_score_adj_min && | 1046 | if ((short)oom_score_adj < task->signal->oom_score_adj_min && |
1047 | !capable(CAP_SYS_RESOURCE)) { | 1047 | !capable(CAP_SYS_RESOURCE)) { |
1048 | err = -EACCES; | 1048 | err = -EACCES; |
1049 | goto err_sighand; | 1049 | goto err_sighand; |
1050 | } | 1050 | } |
1051 | 1051 | ||
1052 | task->signal->oom_score_adj = oom_score_adj; | 1052 | task->signal->oom_score_adj = (short)oom_score_adj; |
1053 | if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) | 1053 | if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) |
1054 | task->signal->oom_score_adj_min = oom_score_adj; | 1054 | task->signal->oom_score_adj_min = (short)oom_score_adj; |
1055 | trace_oom_score_adj_update(task); | 1055 | trace_oom_score_adj_update(task); |
1056 | 1056 | ||
1057 | err_sighand: | 1057 | err_sighand: |
diff --git a/fs/splice.c b/fs/splice.c index 13e5b4776e7a..8890604e3fcd 100644 --- a/fs/splice.c +++ b/fs/splice.c | |||
@@ -1024,17 +1024,14 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, | |||
1024 | ret = sd.num_spliced; | 1024 | ret = sd.num_spliced; |
1025 | 1025 | ||
1026 | if (ret > 0) { | 1026 | if (ret > 0) { |
1027 | unsigned long nr_pages; | ||
1028 | int err; | 1027 | int err; |
1029 | 1028 | ||
1030 | nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1031 | |||
1032 | err = generic_write_sync(out, *ppos, ret); | 1029 | err = generic_write_sync(out, *ppos, ret); |
1033 | if (err) | 1030 | if (err) |
1034 | ret = err; | 1031 | ret = err; |
1035 | else | 1032 | else |
1036 | *ppos += ret; | 1033 | *ppos += ret; |
1037 | balance_dirty_pages_ratelimited_nr(mapping, nr_pages); | 1034 | balance_dirty_pages_ratelimited(mapping); |
1038 | } | 1035 | } |
1039 | sb_end_write(inode->i_sb); | 1036 | sb_end_write(inode->i_sb); |
1040 | 1037 | ||
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h new file mode 100644 index 000000000000..f7f1d7169b11 --- /dev/null +++ b/include/linux/balloon_compaction.h | |||
@@ -0,0 +1,272 @@ | |||
1 | /* | ||
2 | * include/linux/balloon_compaction.h | ||
3 | * | ||
4 | * Common interface definitions for making balloon pages movable by compaction. | ||
5 | * | ||
6 | * Although it is perfectly possible to migrate ballooned pages, they are a | ||
7 | * special corner case for compaction scans because balloon pages are not | ||
8 | * enlisted on any LRU list like the other pages we compact / migrate. | ||
9 | * | ||
10 | * Because the page isolation scan a compaction thread performs is lockless | ||
11 | * (from a page standpoint), it can race with balloon page compaction. In | ||
12 | * order to sort out these racy scenarios and safely perform balloon page | ||
13 | * compaction and migration we must always follow these three | ||
14 | * simple rules: | ||
15 | * | ||
16 | * i. when updating a balloon's page ->mapping element, strictly do it under | ||
17 | * the following lock order, independently of the far superior | ||
18 | * locking scheme (lru_lock, balloon_lock): | ||
19 | * +-page_lock(page); | ||
20 | * +--spin_lock_irq(&b_dev_info->pages_lock); | ||
21 | * ... page->mapping updates here ... | ||
22 | * | ||
23 | * ii. before isolating or dequeueing a balloon page from the balloon device | ||
24 | * pages list, the page reference counter must be raised by one and the | ||
25 | * extra refcount must be dropped when the page is enqueued back into | ||
26 | * the balloon device page list, thus a balloon page keeps its reference | ||
27 | * counter raised only while it is under our special handling; | ||
28 | * | ||
29 | * iii. after the lockless scan step has selected a potential balloon page for | ||
30 | * isolation, re-test the page->mapping flags and the page ref counter | ||
31 | * under the proper page lock, to ensure isolating a valid balloon page | ||
32 | * (not yet isolated, nor under release procedure) | ||
33 | * | ||
34 | * The functions provided by this interface are placed here to help cope with | ||
35 | * the aforementioned balloon page corner case, as well as to ensure the simple | ||
36 | * set of exposed rules is satisfied while we are dealing with balloon page | ||
37 | * compaction / migration. | ||
38 | * | ||
39 | * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com> | ||
40 | */ | ||
41 | #ifndef _LINUX_BALLOON_COMPACTION_H | ||
42 | #define _LINUX_BALLOON_COMPACTION_H | ||
43 | #include <linux/pagemap.h> | ||
44 | #include <linux/page-flags.h> | ||
45 | #include <linux/migrate.h> | ||
46 | #include <linux/gfp.h> | ||
47 | #include <linux/err.h> | ||
48 | |||
49 | /* | ||
50 | * Balloon device information descriptor. | ||
51 | * This struct is used to allow the common balloon compaction interface | ||
52 | * procedures to find the proper balloon device holding the memory pages they | ||
53 | * have to cope with for page compaction / migration; it also serves the | ||
54 | * balloon driver as a page book-keeper for its registered balloon devices. | ||
55 | */ | ||
56 | struct balloon_dev_info { | ||
57 | void *balloon_device; /* balloon device descriptor */ | ||
58 | struct address_space *mapping; /* balloon special page->mapping */ | ||
59 | unsigned long isolated_pages; /* # of isolated pages for migration */ | ||
60 | spinlock_t pages_lock; /* Protection to pages list */ | ||
61 | struct list_head pages; /* Pages enqueued & handled to Host */ | ||
62 | }; | ||
63 | |||
64 | extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info); | ||
65 | extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info); | ||
66 | extern struct balloon_dev_info *balloon_devinfo_alloc( | ||
67 | void *balloon_dev_descriptor); | ||
68 | |||
69 | static inline void balloon_devinfo_free(struct balloon_dev_info *b_dev_info) | ||
70 | { | ||
71 | kfree(b_dev_info); | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * balloon_page_free - release a balloon page back to the page free lists | ||
76 | * @page: ballooned page to be set free | ||
77 | * | ||
78 | * This function must be used to properly free an isolated/dequeued balloon | ||
79 | * page at the end of a successful page migration, or at the balloon driver's | ||
80 | * page release procedure. | ||
81 | */ | ||
82 | static inline void balloon_page_free(struct page *page) | ||
83 | { | ||
84 | /* | ||
85 | * Balloon pages always get an extra refcount before being isolated | ||
86 | * and before being dequeued to help sort out fortuitous collisions | ||
87 | * between a thread attempting to isolate and another thread attempting | ||
88 | * to release the very same balloon page. | ||
89 | * | ||
90 | * Before we hand the page back to Buddy, let's drop its extra refcnt. | ||
91 | */ | ||
92 | put_page(page); | ||
93 | __free_page(page); | ||
94 | } | ||
95 | |||
96 | #ifdef CONFIG_BALLOON_COMPACTION | ||
97 | extern bool balloon_page_isolate(struct page *page); | ||
98 | extern void balloon_page_putback(struct page *page); | ||
99 | extern int balloon_page_migrate(struct page *newpage, | ||
100 | struct page *page, enum migrate_mode mode); | ||
101 | extern struct address_space | ||
102 | *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info, | ||
103 | const struct address_space_operations *a_ops); | ||
104 | |||
105 | static inline void balloon_mapping_free(struct address_space *balloon_mapping) | ||
106 | { | ||
107 | kfree(balloon_mapping); | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * page_flags_cleared - helper to perform balloon @page ->flags tests. | ||
112 | * | ||
113 | * As balloon pages are obtained from buddy and we do not play with page->flags | ||
114 | * at driver level (exception made when we get the page lock for compaction), | ||
115 | * we can safely identify a ballooned page by checking if the | ||
116 | * PAGE_FLAGS_CHECK_AT_PREP page->flags are all cleared. This approach also | ||
117 | * helps us skip ballooned pages that are locked for compaction or release, thus | ||
118 | * mitigating their racy check at balloon_page_movable() | ||
119 | */ | ||
120 | static inline bool page_flags_cleared(struct page *page) | ||
121 | { | ||
122 | return !(page->flags & PAGE_FLAGS_CHECK_AT_PREP); | ||
123 | } | ||
124 | |||
125 | /* | ||
126 | * __is_movable_balloon_page - helper to perform @page mapping->flags tests | ||
127 | */ | ||
128 | static inline bool __is_movable_balloon_page(struct page *page) | ||
129 | { | ||
130 | struct address_space *mapping = page->mapping; | ||
131 | return mapping_balloon(mapping); | ||
132 | } | ||
133 | |||
134 | /* | ||
135 | * balloon_page_movable - test page->mapping->flags to identify balloon pages | ||
136 | * that can be moved by compaction/migration. | ||
137 | * | ||
138 | * This function is used at core compaction's page isolation scheme, therefore | ||
139 | * most pages exposed to it are not enlisted as balloon pages and so, to avoid | ||
140 | * undesired side effects like racing against __free_pages(), we cannot afford | ||
141 | * holding the page locked while testing page->mapping->flags here. | ||
142 | * | ||
143 | * As we might return false positives in the case of a balloon page being just | ||
144 | * released under us, the page->mapping->flags need to be re-tested later, | ||
145 | * under the proper page lock, at the functions that will be coping with the | ||
146 | * balloon page case. | ||
147 | */ | ||
148 | static inline bool balloon_page_movable(struct page *page) | ||
149 | { | ||
150 | /* | ||
151 | * Before dereferencing and testing mapping->flags, let's make sure | ||
152 | * this is not a page that uses ->mapping in a different way | ||
153 | */ | ||
154 | if (page_flags_cleared(page) && !page_mapped(page) && | ||
155 | page_count(page) == 1) | ||
156 | return __is_movable_balloon_page(page); | ||
157 | |||
158 | return false; | ||
159 | } | ||
160 | |||
161 | /* | ||
162 | * balloon_page_insert - insert a page into the balloon's page list and make | ||
163 | * the page->mapping assignment accordingly. | ||
164 | * @page : page to be assigned as a 'balloon page' | ||
165 | * @mapping : allocated special 'balloon_mapping' | ||
166 | * @head : balloon's device page list head | ||
167 | * | ||
168 | * Caller must ensure the page is locked and the spin_lock protecting balloon | ||
169 | * pages list is held before inserting a page into the balloon device. | ||
170 | */ | ||
171 | static inline void balloon_page_insert(struct page *page, | ||
172 | struct address_space *mapping, | ||
173 | struct list_head *head) | ||
174 | { | ||
175 | page->mapping = mapping; | ||
176 | list_add(&page->lru, head); | ||
177 | } | ||
178 | |||
179 | /* | ||
180 | * balloon_page_delete - delete a page from balloon's page list and clear | ||
181 | * the page->mapping assignment accordingly. | ||
182 | * @page : page to be released from balloon's page list | ||
183 | * | ||
184 | * Caller must ensure the page is locked and the spin_lock protecting balloon | ||
185 | * pages list is held before deleting a page from the balloon device. | ||
186 | */ | ||
187 | static inline void balloon_page_delete(struct page *page) | ||
188 | { | ||
189 | page->mapping = NULL; | ||
190 | list_del(&page->lru); | ||
191 | } | ||
192 | |||
193 | /* | ||
194 | * balloon_page_device - get the b_dev_info descriptor for the balloon device | ||
195 | * that enqueues the given page. | ||
196 | */ | ||
197 | static inline struct balloon_dev_info *balloon_page_device(struct page *page) | ||
198 | { | ||
199 | struct address_space *mapping = page->mapping; | ||
200 | if (likely(mapping)) | ||
201 | return mapping->private_data; | ||
202 | |||
203 | return NULL; | ||
204 | } | ||
205 | |||
206 | static inline gfp_t balloon_mapping_gfp_mask(void) | ||
207 | { | ||
208 | return GFP_HIGHUSER_MOVABLE; | ||
209 | } | ||
210 | |||
211 | static inline bool balloon_compaction_check(void) | ||
212 | { | ||
213 | return true; | ||
214 | } | ||
215 | |||
216 | #else /* !CONFIG_BALLOON_COMPACTION */ | ||
217 | |||
218 | static inline void *balloon_mapping_alloc(void *balloon_device, | ||
219 | const struct address_space_operations *a_ops) | ||
220 | { | ||
221 | return ERR_PTR(-EOPNOTSUPP); | ||
222 | } | ||
223 | |||
224 | static inline void balloon_mapping_free(struct address_space *balloon_mapping) | ||
225 | { | ||
226 | return; | ||
227 | } | ||
228 | |||
229 | static inline void balloon_page_insert(struct page *page, | ||
230 | struct address_space *mapping, | ||
231 | struct list_head *head) | ||
232 | { | ||
233 | list_add(&page->lru, head); | ||
234 | } | ||
235 | |||
236 | static inline void balloon_page_delete(struct page *page) | ||
237 | { | ||
238 | list_del(&page->lru); | ||
239 | } | ||
240 | |||
241 | static inline bool balloon_page_movable(struct page *page) | ||
242 | { | ||
243 | return false; | ||
244 | } | ||
245 | |||
246 | static inline bool balloon_page_isolate(struct page *page) | ||
247 | { | ||
248 | return false; | ||
249 | } | ||
250 | |||
251 | static inline void balloon_page_putback(struct page *page) | ||
252 | { | ||
253 | return; | ||
254 | } | ||
255 | |||
256 | static inline int balloon_page_migrate(struct page *newpage, | ||
257 | struct page *page, enum migrate_mode mode) | ||
258 | { | ||
259 | return 0; | ||
260 | } | ||
261 | |||
262 | static inline gfp_t balloon_mapping_gfp_mask(void) | ||
263 | { | ||
264 | return GFP_HIGHUSER; | ||
265 | } | ||
266 | |||
267 | static inline bool balloon_compaction_check(void) | ||
268 | { | ||
269 | return false; | ||
270 | } | ||
271 | #endif /* CONFIG_BALLOON_COMPACTION */ | ||
272 | #endif /* _LINUX_BALLOON_COMPACTION_H */ | ||
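To make the calling pattern above concrete, here is a minimal sketch of how a balloon driver might wire itself to this interface. The my_balloon_dev type and the example_* function names are illustrative assumptions, the a_ops instance is left empty, and a real driver would also install its own ->migratepage callback; only the balloon_* calls come from this header.

/* Hedged sketch of driver glue against the interface declared above. */
#include <linux/balloon_compaction.h>

struct my_balloon_dev {				/* assumed driver-private type */
	struct balloon_dev_info *b_dev_info;
	struct address_space *mapping;
};

static const struct address_space_operations my_balloon_aops = {
	/* a real driver supplies its own ->migratepage callback here */
};

static int example_balloon_init(struct my_balloon_dev *dev)
{
	dev->b_dev_info = balloon_devinfo_alloc(dev);
	if (IS_ERR(dev->b_dev_info))
		return PTR_ERR(dev->b_dev_info);

	dev->mapping = balloon_mapping_alloc(dev->b_dev_info, &my_balloon_aops);
	if (IS_ERR(dev->mapping)) {
		balloon_devinfo_free(dev->b_dev_info);
		return PTR_ERR(dev->mapping);
	}
	return 0;
}

static void example_balloon_inflate_one(struct my_balloon_dev *dev)
{
	/* allocates a page, links it on b_dev_info->pages, sets ->mapping */
	struct page *page = balloon_page_enqueue(dev->b_dev_info);

	if (page) {
		/* report pfn(page) to the host here */
	}
}

static void example_balloon_deflate_one(struct my_balloon_dev *dev)
{
	/* removes a page from b_dev_info->pages so it can be handed back */
	struct page *page = balloon_page_dequeue(dev->b_dev_info);

	if (page)
		balloon_page_free(page);
}

Note that with CONFIG_BALLOON_COMPACTION disabled, balloon_mapping_alloc() returns ERR_PTR(-EOPNOTSUPP), so a driver that wants to keep working in that configuration will want to treat that error as "no compaction support" rather than a fatal failure.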
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 6d6795d46a75..7b74452c5317 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h | |||
@@ -51,8 +51,8 @@ extern unsigned long free_all_bootmem(void); | |||
51 | extern void free_bootmem_node(pg_data_t *pgdat, | 51 | extern void free_bootmem_node(pg_data_t *pgdat, |
52 | unsigned long addr, | 52 | unsigned long addr, |
53 | unsigned long size); | 53 | unsigned long size); |
54 | extern void free_bootmem(unsigned long addr, unsigned long size); | 54 | extern void free_bootmem(unsigned long physaddr, unsigned long size); |
55 | extern void free_bootmem_late(unsigned long addr, unsigned long size); | 55 | extern void free_bootmem_late(unsigned long physaddr, unsigned long size); |
56 | 56 | ||
57 | /* | 57 | /* |
58 | * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, | 58 | * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, |
diff --git a/include/linux/fs.h b/include/linux/fs.h index 75fe9a134803..408fb1e77a0a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -418,7 +418,7 @@ struct address_space { | |||
418 | struct backing_dev_info *backing_dev_info; /* device readahead, etc */ | 418 | struct backing_dev_info *backing_dev_info; /* device readahead, etc */ |
419 | spinlock_t private_lock; /* for use by the address_space */ | 419 | spinlock_t private_lock; /* for use by the address_space */ |
420 | struct list_head private_list; /* ditto */ | 420 | struct list_head private_list; /* ditto */ |
421 | struct address_space *assoc_mapping; /* ditto */ | 421 | void *private_data; /* ditto */ |
422 | } __attribute__((aligned(sizeof(long)))); | 422 | } __attribute__((aligned(sizeof(long)))); |
423 | /* | 423 | /* |
424 | * On most architectures that alignment is already the case; but | 424 | * On most architectures that alignment is already the case; but |
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index d0a79678f169..31e8041274f6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h | |||
@@ -266,7 +266,7 @@ static inline enum zone_type gfp_zone(gfp_t flags) | |||
266 | 266 | ||
267 | static inline int gfp_zonelist(gfp_t flags) | 267 | static inline int gfp_zonelist(gfp_t flags) |
268 | { | 268 | { |
269 | if (NUMA_BUILD && unlikely(flags & __GFP_THISNODE)) | 269 | if (IS_ENABLED(CONFIG_NUMA) && unlikely(flags & __GFP_THISNODE)) |
270 | return 1; | 270 | return 1; |
271 | 271 | ||
272 | return 0; | 272 | return 0; |
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b31cb7da0346..1af477552459 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -8,6 +8,10 @@ extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
8 | extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 8 | extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
9 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, | 9 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, |
10 | struct vm_area_struct *vma); | 10 | struct vm_area_struct *vma); |
11 | extern void huge_pmd_set_accessed(struct mm_struct *mm, | ||
12 | struct vm_area_struct *vma, | ||
13 | unsigned long address, pmd_t *pmd, | ||
14 | pmd_t orig_pmd, int dirty); | ||
11 | extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | 15 | extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, |
12 | unsigned long address, pmd_t *pmd, | 16 | unsigned long address, pmd_t *pmd, |
13 | pmd_t orig_pmd); | 17 | pmd_t orig_pmd); |
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 225164842ab6..3e7fa1acf09c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
@@ -183,7 +183,8 @@ extern const struct file_operations hugetlbfs_file_operations; | |||
183 | extern const struct vm_operations_struct hugetlb_vm_ops; | 183 | extern const struct vm_operations_struct hugetlb_vm_ops; |
184 | struct file *hugetlb_file_setup(const char *name, unsigned long addr, | 184 | struct file *hugetlb_file_setup(const char *name, unsigned long addr, |
185 | size_t size, vm_flags_t acct, | 185 | size_t size, vm_flags_t acct, |
186 | struct user_struct **user, int creat_flags); | 186 | struct user_struct **user, int creat_flags, |
187 | int page_size_log); | ||
187 | 188 | ||
188 | static inline int is_file_hugepages(struct file *file) | 189 | static inline int is_file_hugepages(struct file *file) |
189 | { | 190 | { |
@@ -195,12 +196,14 @@ static inline int is_file_hugepages(struct file *file) | |||
195 | return 0; | 196 | return 0; |
196 | } | 197 | } |
197 | 198 | ||
199 | |||
198 | #else /* !CONFIG_HUGETLBFS */ | 200 | #else /* !CONFIG_HUGETLBFS */ |
199 | 201 | ||
200 | #define is_file_hugepages(file) 0 | 202 | #define is_file_hugepages(file) 0 |
201 | static inline struct file * | 203 | static inline struct file * |
202 | hugetlb_file_setup(const char *name, unsigned long addr, size_t size, | 204 | hugetlb_file_setup(const char *name, unsigned long addr, size_t size, |
203 | vm_flags_t acctflag, struct user_struct **user, int creat_flags) | 205 | vm_flags_t acctflag, struct user_struct **user, int creat_flags, |
206 | int page_size_log) | ||
204 | { | 207 | { |
205 | return ERR_PTR(-ENOSYS); | 208 | return ERR_PTR(-ENOSYS); |
206 | } | 209 | } |
diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7d8dfc7392f1..dd9900cabf89 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h | |||
@@ -687,20 +687,6 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } | |||
687 | /* Trap pasters of __FUNCTION__ at compile-time */ | 687 | /* Trap pasters of __FUNCTION__ at compile-time */ |
688 | #define __FUNCTION__ (__func__) | 688 | #define __FUNCTION__ (__func__) |
689 | 689 | ||
690 | /* This helps us to avoid #ifdef CONFIG_NUMA */ | ||
691 | #ifdef CONFIG_NUMA | ||
692 | #define NUMA_BUILD 1 | ||
693 | #else | ||
694 | #define NUMA_BUILD 0 | ||
695 | #endif | ||
696 | |||
697 | /* This helps us avoid #ifdef CONFIG_COMPACTION */ | ||
698 | #ifdef CONFIG_COMPACTION | ||
699 | #define COMPACTION_BUILD 1 | ||
700 | #else | ||
701 | #define COMPACTION_BUILD 0 | ||
702 | #endif | ||
703 | |||
704 | /* This helps us to avoid #ifdef CONFIG_SYMBOL_PREFIX */ | 690 | /* This helps us to avoid #ifdef CONFIG_SYMBOL_PREFIX */ |
705 | #ifdef CONFIG_SYMBOL_PREFIX | 691 | #ifdef CONFIG_SYMBOL_PREFIX |
706 | #define SYMBOL_PREFIX CONFIG_SYMBOL_PREFIX | 692 | #define SYMBOL_PREFIX CONFIG_SYMBOL_PREFIX |
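With NUMA_BUILD and COMPACTION_BUILD removed, call sites use IS_ENABLED() instead, which also folds to a compile-time 0 or 1 so dead branches are still discarded. The gfp_zonelist() hunk above is the pattern for the conversion:

/* before: hand-rolled constant from kernel.h */
if (NUMA_BUILD && unlikely(flags & __GFP_THISNODE))
	return 1;

/* after: same code generation, one less macro to maintain */
if (IS_ENABLED(CONFIG_NUMA) && unlikely(flags & __GFP_THISNODE))
	return 1;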
diff --git a/include/linux/memory.h b/include/linux/memory.h index ff9a9f8e0ed9..a09216d0dcc7 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h | |||
@@ -53,6 +53,7 @@ int arch_get_memory_phys_device(unsigned long start_pfn); | |||
53 | struct memory_notify { | 53 | struct memory_notify { |
54 | unsigned long start_pfn; | 54 | unsigned long start_pfn; |
55 | unsigned long nr_pages; | 55 | unsigned long nr_pages; |
56 | int status_change_nid_normal; | ||
56 | int status_change_nid; | 57 | int status_change_nid; |
57 | }; | 58 | }; |
58 | 59 | ||
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 95573ec4ee6c..4a45c4e50025 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h | |||
@@ -26,6 +26,13 @@ enum { | |||
26 | MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, | 26 | MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, |
27 | }; | 27 | }; |
28 | 28 | ||
29 | /* Types for controlling the zone type of onlined memory */ | ||
30 | enum { | ||
31 | ONLINE_KEEP, | ||
32 | ONLINE_KERNEL, | ||
33 | ONLINE_MOVABLE, | ||
34 | }; | ||
35 | |||
29 | /* | 36 | /* |
30 | * pgdat resizing functions | 37 | * pgdat resizing functions |
31 | */ | 38 | */ |
@@ -46,6 +53,10 @@ void pgdat_resize_init(struct pglist_data *pgdat) | |||
46 | } | 53 | } |
47 | /* | 54 | /* |
48 | * Zone resizing functions | 55 | * Zone resizing functions |
56 | * | ||
57 | * Note: any attempt to resize a zone should has pgdat_resize_lock() | ||
58 | * zone_span_writelock() both held. This ensure the size of a zone | ||
59 | * can't be changed while pgdat_resize_lock() held. | ||
49 | */ | 60 | */ |
50 | static inline unsigned zone_span_seqbegin(struct zone *zone) | 61 | static inline unsigned zone_span_seqbegin(struct zone *zone) |
51 | { | 62 | { |
@@ -71,7 +82,7 @@ extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); | |||
71 | extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); | 82 | extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); |
72 | extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); | 83 | extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); |
73 | /* VM interface that may be used by firmware interface */ | 84 | /* VM interface that may be used by firmware interface */ |
74 | extern int online_pages(unsigned long, unsigned long); | 85 | extern int online_pages(unsigned long, unsigned long, int); |
75 | extern void __offline_isolated_pages(unsigned long, unsigned long); | 86 | extern void __offline_isolated_pages(unsigned long, unsigned long); |
76 | 87 | ||
77 | typedef void (*online_page_callback_t)(struct page *page); | 88 | typedef void (*online_page_callback_t)(struct page *page); |
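online_pages() now takes an online type so the sysfs 'state' file can steer which zone newly onlined memory joins. The sketch below is a rough illustration of the string-to-enum mapping a store method would perform; the helper name is an assumption and the real parsing lives in drivers/base/memory.c.

#include <linux/string.h>
#include <linux/memory_hotplug.h>

/* Illustrative mapping from the user-visible strings to the new enum. */
static int example_online_type(const char *buf)
{
	if (sysfs_streq(buf, "online_kernel"))
		return ONLINE_KERNEL;	/* bring the range into ZONE_NORMAL  */
	if (sysfs_streq(buf, "online_movable"))
		return ONLINE_MOVABLE;	/* bring the range into ZONE_MOVABLE */
	if (sysfs_streq(buf, "online"))
		return ONLINE_KEEP;	/* keep the current zone type        */
	return -EINVAL;
}

/* then, for a block spanning [start_pfn, start_pfn + nr_pages): */
/* ret = online_pages(start_pfn, nr_pages, example_online_type(buf)); */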
diff --git a/include/linux/migrate.h b/include/linux/migrate.h index ce7e6671968b..0b5865c61efd 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h | |||
@@ -7,9 +7,27 @@ | |||
7 | 7 | ||
8 | typedef struct page *new_page_t(struct page *, unsigned long private, int **); | 8 | typedef struct page *new_page_t(struct page *, unsigned long private, int **); |
9 | 9 | ||
10 | /* | ||
11 | * Return values from address_space_operations.migratepage(): | ||
12 | * - negative errno on page migration failure; | ||
13 | * - zero on page migration success; | ||
14 | * | ||
15 | * The balloon page migration introduces this special case where a 'distinct' | ||
16 | * return code is used to flag a successful page migration to unmap_and_move(). | ||
17 | * This approach is necessary because page migration can race against the | ||
18 | * balloon deflation procedure; in that case we could introduce a nasty page leak | ||
19 | * if a successfully migrated balloon page gets released concurrently with | ||
20 | * migration's unmap_and_move() wrap-up steps. | ||
21 | */ | ||
22 | #define MIGRATEPAGE_SUCCESS 0 | ||
23 | #define MIGRATEPAGE_BALLOON_SUCCESS 1 /* special ret code for balloon page | ||
24 | * successful migration case. | ||
25 | */ | ||
26 | |||
10 | #ifdef CONFIG_MIGRATION | 27 | #ifdef CONFIG_MIGRATION |
11 | 28 | ||
12 | extern void putback_lru_pages(struct list_head *l); | 29 | extern void putback_lru_pages(struct list_head *l); |
30 | extern void putback_movable_pages(struct list_head *l); | ||
13 | extern int migrate_page(struct address_space *, | 31 | extern int migrate_page(struct address_space *, |
14 | struct page *, struct page *, enum migrate_mode); | 32 | struct page *, struct page *, enum migrate_mode); |
15 | extern int migrate_pages(struct list_head *l, new_page_t x, | 33 | extern int migrate_pages(struct list_head *l, new_page_t x, |
@@ -33,6 +51,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
33 | #else | 51 | #else |
34 | 52 | ||
35 | static inline void putback_lru_pages(struct list_head *l) {} | 53 | static inline void putback_lru_pages(struct list_head *l) {} |
54 | static inline void putback_movable_pages(struct list_head *l) {} | ||
36 | static inline int migrate_pages(struct list_head *l, new_page_t x, | 55 | static inline int migrate_pages(struct list_head *l, new_page_t x, |
37 | unsigned long private, bool offlining, | 56 | unsigned long private, bool offlining, |
38 | enum migrate_mode mode) { return -ENOSYS; } | 57 | enum migrate_mode mode) { return -ENOSYS; } |
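For a filesystem whose pages need no special treatment, the documented convention reduces to delegating to migrate_page(), which returns a negative errno on failure and zero (MIGRATEPAGE_SUCCESS) on success. The aops and function names below are illustrative:

#include <linux/fs.h>
#include <linux/migrate.h>

static int example_migratepage(struct address_space *mapping,
			       struct page *newpage, struct page *page,
			       enum migrate_mode mode)
{
	return migrate_page(mapping, newpage, page, mode);
}

static const struct address_space_operations example_aops = {
	.migratepage	= example_migratepage,
};

MIGRATEPAGE_BALLOON_SUCCESS, by contrast, is intended to be produced and consumed only by the migration core and the balloon callbacks, not by ordinary filesystem ->migratepage() implementations.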
diff --git a/include/linux/mm.h b/include/linux/mm.h index bcaab4e6fe91..4af4f0b1be4c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -1456,6 +1456,37 @@ extern unsigned long vm_mmap(struct file *, unsigned long, | |||
1456 | unsigned long, unsigned long, | 1456 | unsigned long, unsigned long, |
1457 | unsigned long, unsigned long); | 1457 | unsigned long, unsigned long); |
1458 | 1458 | ||
1459 | struct vm_unmapped_area_info { | ||
1460 | #define VM_UNMAPPED_AREA_TOPDOWN 1 | ||
1461 | unsigned long flags; | ||
1462 | unsigned long length; | ||
1463 | unsigned long low_limit; | ||
1464 | unsigned long high_limit; | ||
1465 | unsigned long align_mask; | ||
1466 | unsigned long align_offset; | ||
1467 | }; | ||
1468 | |||
1469 | extern unsigned long unmapped_area(struct vm_unmapped_area_info *info); | ||
1470 | extern unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); | ||
1471 | |||
1472 | /* | ||
1473 | * Search for an unmapped address range. | ||
1474 | * | ||
1475 | * We are looking for a range that: | ||
1476 | * - does not intersect with any VMA; | ||
1477 | * - is contained within the [low_limit, high_limit) interval; | ||
1478 | * - is at least the desired size. | ||
1479 | * - satisfies (begin_addr & align_mask) == (align_offset & align_mask) | ||
1480 | */ | ||
1481 | static inline unsigned long | ||
1482 | vm_unmapped_area(struct vm_unmapped_area_info *info) | ||
1483 | { | ||
1484 | if (!(info->flags & VM_UNMAPPED_AREA_TOPDOWN)) | ||
1485 | return unmapped_area(info); | ||
1486 | else | ||
1487 | return unmapped_area_topdown(info); | ||
1488 | } | ||
1489 | |||
1459 | /* truncate.c */ | 1490 | /* truncate.c */ |
1460 | extern void truncate_inode_pages(struct address_space *, loff_t); | 1491 | extern void truncate_inode_pages(struct address_space *, loff_t); |
1461 | extern void truncate_inode_pages_range(struct address_space *, | 1492 | extern void truncate_inode_pages_range(struct address_space *, |
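A bottom-up caller of the new helper ends up looking much like the hugetlbfs conversion earlier in this diff. The sketch below assumes a 2 MB alignment requirement purely for illustration; setting VM_UNMAPPED_AREA_TOPDOWN in info.flags would search downward from high_limit instead.

/* Find an unmapped, len-byte range aligned to 2 MB (illustrative). */
static unsigned long example_get_unmapped_area(unsigned long len)
{
	struct vm_unmapped_area_info info;

	info.flags = 0;				/* bottom-up search */
	info.length = len;
	info.low_limit = TASK_UNMAPPED_BASE;
	info.high_limit = TASK_SIZE;
	/* page-granular mask of the bits that must match align_offset */
	info.align_mask = PAGE_MASK & ((1UL << 21) - 1);
	info.align_offset = 0;

	return vm_unmapped_area(&info);
}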
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 31f8a3af7d94..7ade2731b5d6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -224,7 +224,8 @@ struct vm_region { | |||
224 | * library, the executable area etc). | 224 | * library, the executable area etc). |
225 | */ | 225 | */ |
226 | struct vm_area_struct { | 226 | struct vm_area_struct { |
227 | struct mm_struct * vm_mm; /* The address space we belong to. */ | 227 | /* The first cache line has the info for VMA tree walking. */ |
228 | |||
228 | unsigned long vm_start; /* Our start address within vm_mm. */ | 229 | unsigned long vm_start; /* Our start address within vm_mm. */ |
229 | unsigned long vm_end; /* The first byte after our end address | 230 | unsigned long vm_end; /* The first byte after our end address |
230 | within vm_mm. */ | 231 | within vm_mm. */ |
@@ -232,11 +233,22 @@ struct vm_area_struct { | |||
232 | /* linked list of VM areas per task, sorted by address */ | 233 | /* linked list of VM areas per task, sorted by address */ |
233 | struct vm_area_struct *vm_next, *vm_prev; | 234 | struct vm_area_struct *vm_next, *vm_prev; |
234 | 235 | ||
236 | struct rb_node vm_rb; | ||
237 | |||
238 | /* | ||
239 | * Largest free memory gap in bytes to the left of this VMA. | ||
240 | * Either between this VMA and vma->vm_prev, or between one of the | ||
241 | * VMAs below us in the VMA rbtree and its ->vm_prev. This helps | ||
242 | * get_unmapped_area find a free area of the right size. | ||
243 | */ | ||
244 | unsigned long rb_subtree_gap; | ||
245 | |||
246 | /* Second cache line starts here. */ | ||
247 | |||
248 | struct mm_struct *vm_mm; /* The address space we belong to. */ | ||
235 | pgprot_t vm_page_prot; /* Access permissions of this VMA. */ | 249 | pgprot_t vm_page_prot; /* Access permissions of this VMA. */ |
236 | unsigned long vm_flags; /* Flags, see mm.h. */ | 250 | unsigned long vm_flags; /* Flags, see mm.h. */ |
237 | 251 | ||
238 | struct rb_node vm_rb; | ||
239 | |||
240 | /* | 252 | /* |
241 | * For areas with an address space and backing store, | 253 | * For areas with an address space and backing store, |
242 | * linkage into the address_space->i_mmap interval tree, or | 254 | * linkage into the address_space->i_mmap interval tree, or |
@@ -322,6 +334,7 @@ struct mm_struct { | |||
322 | unsigned long task_size; /* size of task vm space */ | 334 | unsigned long task_size; /* size of task vm space */ |
323 | unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ | 335 | unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ |
324 | unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */ | 336 | unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */ |
337 | unsigned long highest_vm_end; /* highest vma end address */ | ||
325 | pgd_t * pgd; | 338 | pgd_t * pgd; |
326 | atomic_t mm_users; /* How many users with user space? */ | 339 | atomic_t mm_users; /* How many users with user space? */ |
327 | atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ | 340 | atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a23923ba8263..0c0b1d608a69 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -63,10 +63,8 @@ enum { | |||
63 | 63 | ||
64 | #ifdef CONFIG_CMA | 64 | #ifdef CONFIG_CMA |
65 | # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) | 65 | # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) |
66 | # define cma_wmark_pages(zone) zone->min_cma_pages | ||
67 | #else | 66 | #else |
68 | # define is_migrate_cma(migratetype) false | 67 | # define is_migrate_cma(migratetype) false |
69 | # define cma_wmark_pages(zone) 0 | ||
70 | #endif | 68 | #endif |
71 | 69 | ||
72 | #define for_each_migratetype_order(order, type) \ | 70 | #define for_each_migratetype_order(order, type) \ |
@@ -383,13 +381,6 @@ struct zone { | |||
383 | /* see spanned/present_pages for more description */ | 381 | /* see spanned/present_pages for more description */ |
384 | seqlock_t span_seqlock; | 382 | seqlock_t span_seqlock; |
385 | #endif | 383 | #endif |
386 | #ifdef CONFIG_CMA | ||
387 | /* | ||
388 | * CMA needs to increase watermark levels during the allocation | ||
389 | * process to make sure that the system is not starved. | ||
390 | */ | ||
391 | unsigned long min_cma_pages; | ||
392 | #endif | ||
393 | struct free_area free_area[MAX_ORDER]; | 384 | struct free_area free_area[MAX_ORDER]; |
394 | 385 | ||
395 | #ifndef CONFIG_SPARSEMEM | 386 | #ifndef CONFIG_SPARSEMEM |
diff --git a/include/linux/node.h b/include/linux/node.h index 624e53cecc02..2115ad5d6f19 100644 --- a/include/linux/node.h +++ b/include/linux/node.h | |||
@@ -27,10 +27,9 @@ struct node { | |||
27 | }; | 27 | }; |
28 | 28 | ||
29 | struct memory_block; | 29 | struct memory_block; |
30 | extern struct node node_devices[]; | 30 | extern struct node *node_devices[]; |
31 | typedef void (*node_registration_func_t)(struct node *); | 31 | typedef void (*node_registration_func_t)(struct node *); |
32 | 32 | ||
33 | extern int register_node(struct node *, int, struct node *); | ||
34 | extern void unregister_node(struct node *node); | 33 | extern void unregister_node(struct node *node); |
35 | #ifdef CONFIG_NUMA | 34 | #ifdef CONFIG_NUMA |
36 | extern int register_one_node(int nid); | 35 | extern int register_one_node(int nid); |
diff --git a/include/linux/oom.h b/include/linux/oom.h index fb9826847b89..da60007075b5 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h | |||
@@ -29,8 +29,23 @@ enum oom_scan_t { | |||
29 | OOM_SCAN_SELECT, /* always select this thread first */ | 29 | OOM_SCAN_SELECT, /* always select this thread first */ |
30 | }; | 30 | }; |
31 | 31 | ||
32 | extern void compare_swap_oom_score_adj(int old_val, int new_val); | 32 | /* Thread is the potential origin of an oom condition; kill first on oom */ |
33 | extern int test_set_oom_score_adj(int new_val); | 33 | #define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1) |
34 | |||
35 | static inline void set_current_oom_origin(void) | ||
36 | { | ||
37 | current->signal->oom_flags |= OOM_FLAG_ORIGIN; | ||
38 | } | ||
39 | |||
40 | static inline void clear_current_oom_origin(void) | ||
41 | { | ||
42 | current->signal->oom_flags &= ~OOM_FLAG_ORIGIN; | ||
43 | } | ||
44 | |||
45 | static inline bool oom_task_origin(const struct task_struct *p) | ||
46 | { | ||
47 | return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN); | ||
48 | } | ||
34 | 49 | ||
35 | extern unsigned long oom_badness(struct task_struct *p, | 50 | extern unsigned long oom_badness(struct task_struct *p, |
36 | struct mem_cgroup *memcg, const nodemask_t *nodemask, | 51 | struct mem_cgroup *memcg, const nodemask_t *nodemask, |
@@ -49,8 +64,6 @@ extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | |||
49 | extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | 64 | extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, |
50 | unsigned long totalpages, const nodemask_t *nodemask, | 65 | unsigned long totalpages, const nodemask_t *nodemask, |
51 | bool force_kill); | 66 | bool force_kill); |
52 | extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | ||
53 | int order); | ||
54 | 67 | ||
55 | extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | 68 | extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, |
56 | int order, nodemask_t *mask, bool force_kill); | 69 | int order, nodemask_t *mask, bool force_kill); |
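The removed compare_swap_oom_score_adj()/test_set_oom_score_adj() pair is replaced by this per-signal flag. A caller that is about to consume a large amount of memory (swapoff, for example) brackets the work as sketched below; the worker function is a made-up placeholder. On the scanning side, oom_scan_process_thread() is expected to prefer tasks for which oom_task_origin() returns true.

#include <linux/oom.h>

static int example_do_the_work(void);	/* assumed placeholder */

static int example_memory_hungry_operation(void)
{
	int err;

	/* make this task the OOM killer's first choice while we run */
	set_current_oom_origin();

	err = example_do_the_work();

	clear_current_oom_origin();
	return err;
}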
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 76a9539cfd3f..a92061e08d48 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h | |||
@@ -2,7 +2,8 @@ | |||
2 | #define __LINUX_PAGEISOLATION_H | 2 | #define __LINUX_PAGEISOLATION_H |
3 | 3 | ||
4 | 4 | ||
5 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count); | 5 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count, |
6 | bool skip_hwpoisoned_pages); | ||
6 | void set_pageblock_migratetype(struct page *page, int migratetype); | 7 | void set_pageblock_migratetype(struct page *page, int migratetype); |
7 | int move_freepages_block(struct zone *zone, struct page *page, | 8 | int move_freepages_block(struct zone *zone, struct page *page, |
8 | int migratetype); | 9 | int migratetype); |
@@ -21,7 +22,7 @@ int move_freepages(struct zone *zone, | |||
21 | */ | 22 | */ |
22 | int | 23 | int |
23 | start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | 24 | start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, |
24 | unsigned migratetype); | 25 | unsigned migratetype, bool skip_hwpoisoned_pages); |
25 | 26 | ||
26 | /* | 27 | /* |
27 | * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. | 28 | * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. |
@@ -34,12 +35,13 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | |||
34 | /* | 35 | /* |
35 | * Test all pages in [start_pfn, end_pfn) are isolated or not. | 36 | * Test all pages in [start_pfn, end_pfn) are isolated or not. |
36 | */ | 37 | */ |
37 | int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn); | 38 | int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, |
39 | bool skip_hwpoisoned_pages); | ||
38 | 40 | ||
39 | /* | 41 | /* |
40 | * Internal functions. Changes pageblock's migrate type. | 42 | * Internal functions. Changes pageblock's migrate type. |
41 | */ | 43 | */ |
42 | int set_migratetype_isolate(struct page *page); | 44 | int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages); |
43 | void unset_migratetype_isolate(struct page *page, unsigned migratetype); | 45 | void unset_migratetype_isolate(struct page *page, unsigned migratetype); |
44 | struct page *alloc_migrate_target(struct page *page, unsigned long private, | 46 | struct page *alloc_migrate_target(struct page *page, unsigned long private, |
45 | int **resultp); | 47 | int **resultp); |
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e42c762f0dc7..6da609d14c15 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h | |||
@@ -24,6 +24,7 @@ enum mapping_flags { | |||
24 | AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ | 24 | AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ |
25 | AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ | 25 | AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ |
26 | AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ | 26 | AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ |
27 | AS_BALLOON_MAP = __GFP_BITS_SHIFT + 4, /* balloon page special map */ | ||
27 | }; | 28 | }; |
28 | 29 | ||
29 | static inline void mapping_set_error(struct address_space *mapping, int error) | 30 | static inline void mapping_set_error(struct address_space *mapping, int error) |
@@ -53,6 +54,21 @@ static inline int mapping_unevictable(struct address_space *mapping) | |||
53 | return !!mapping; | 54 | return !!mapping; |
54 | } | 55 | } |
55 | 56 | ||
57 | static inline void mapping_set_balloon(struct address_space *mapping) | ||
58 | { | ||
59 | set_bit(AS_BALLOON_MAP, &mapping->flags); | ||
60 | } | ||
61 | |||
62 | static inline void mapping_clear_balloon(struct address_space *mapping) | ||
63 | { | ||
64 | clear_bit(AS_BALLOON_MAP, &mapping->flags); | ||
65 | } | ||
66 | |||
67 | static inline int mapping_balloon(struct address_space *mapping) | ||
68 | { | ||
69 | return mapping && test_bit(AS_BALLOON_MAP, &mapping->flags); | ||
70 | } | ||
71 | |||
56 | static inline gfp_t mapping_gfp_mask(struct address_space * mapping) | 72 | static inline gfp_t mapping_gfp_mask(struct address_space * mapping) |
57 | { | 73 | { |
58 | return (__force gfp_t)mapping->flags & __GFP_BITS_MASK; | 74 | return (__force gfp_t)mapping->flags & __GFP_BITS_MASK; |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 0dd42a02df2e..3e387df065fc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -631,9 +631,10 @@ struct signal_struct { | |||
631 | struct rw_semaphore group_rwsem; | 631 | struct rw_semaphore group_rwsem; |
632 | #endif | 632 | #endif |
633 | 633 | ||
634 | int oom_score_adj; /* OOM kill score adjustment */ | 634 | oom_flags_t oom_flags; |
635 | int oom_score_adj_min; /* OOM kill score adjustment minimum value. | 635 | short oom_score_adj; /* OOM kill score adjustment */ |
636 | * Only settable by CAP_SYS_RESOURCE. */ | 636 | short oom_score_adj_min; /* OOM kill score adjustment min value. |
637 | * Only settable by CAP_SYS_RESOURCE. */ | ||
637 | 638 | ||
638 | struct mutex cred_guard_mutex; /* guard against foreign influences on | 639 | struct mutex cred_guard_mutex; /* guard against foreign influences on |
639 | * credential calculations | 640 | * credential calculations |
diff --git a/include/linux/shm.h b/include/linux/shm.h index bcf8a6a3ec00..429c1995d756 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h | |||
@@ -29,6 +29,21 @@ struct shmid_kernel /* private to the kernel */ | |||
29 | #define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ | 29 | #define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ |
30 | #define SHM_NORESERVE 010000 /* don't check for reservations */ | 30 | #define SHM_NORESERVE 010000 /* don't check for reservations */ |
31 | 31 | ||
32 | /* Bits [26:31] are reserved */ | ||
33 | |||
34 | /* | ||
35 | * When SHM_HUGETLB is set, bits [26:31] encode the log2 of the huge page size. | ||
36 | * This gives us 6 bits, which is enough until someone invents 128-bit address | ||
37 | * spaces. | ||
38 | * | ||
39 | * Assume these are all powers of two. | ||
40 | * When 0, use the default page size. | ||
41 | */ | ||
42 | #define SHM_HUGE_SHIFT 26 | ||
43 | #define SHM_HUGE_MASK 0x3f | ||
44 | #define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT) | ||
45 | #define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) | ||
46 | |||
32 | #ifdef CONFIG_SYSVIPC | 47 | #ifdef CONFIG_SYSVIPC |
33 | long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, | 48 | long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, |
34 | unsigned long shmlba); | 49 | unsigned long shmlba); |
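A worked example of the encoding above: the huge page size is passed as its log2 in shmflg bits [26:31], so a 2MB page (2^21 bytes) is requested as 21 << SHM_HUGE_SHIFT, which is exactly what SHM_HUGE_2MB expands to; newseg() in the ipc/shm.c hunk further down decodes it with (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK. A userspace sketch, assuming a C library whose headers expose the patched SHM_HUGE_* definitions (not part of the diff; error handling elided):

    #include <sys/ipc.h>
    #include <sys/shm.h>

    static int demo_huge_shmget(void)
    {
            /* SHM_HUGE_2MB == 21 << SHM_HUGE_SHIFT, i.e. log2(2MB) in bits [26:31] */
            return shmget(IPC_PRIVATE, 8 * 1024 * 1024,
                          IPC_CREAT | 0600 | SHM_HUGETLB | SHM_HUGE_2MB);
    }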
diff --git a/include/linux/types.h b/include/linux/types.h index 1cc0e4b9a048..4d118ba11349 100644 --- a/include/linux/types.h +++ b/include/linux/types.h | |||
@@ -156,6 +156,7 @@ typedef u32 dma_addr_t; | |||
156 | #endif | 156 | #endif |
157 | typedef unsigned __bitwise__ gfp_t; | 157 | typedef unsigned __bitwise__ gfp_t; |
158 | typedef unsigned __bitwise__ fmode_t; | 158 | typedef unsigned __bitwise__ fmode_t; |
159 | typedef unsigned __bitwise__ oom_flags_t; | ||
159 | 160 | ||
160 | #ifdef CONFIG_PHYS_ADDR_T_64BIT | 161 | #ifdef CONFIG_PHYS_ADDR_T_64BIT |
161 | typedef u64 phys_addr_t; | 162 | typedef u64 phys_addr_t; |
diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 50c3e8fa06a8..b82a83aba311 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h | |||
@@ -161,14 +161,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi, | |||
161 | unsigned long start_time); | 161 | unsigned long start_time); |
162 | 162 | ||
163 | void page_writeback_init(void); | 163 | void page_writeback_init(void); |
164 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | 164 | void balance_dirty_pages_ratelimited(struct address_space *mapping); |
165 | unsigned long nr_pages_dirtied); | ||
166 | |||
167 | static inline void | ||
168 | balance_dirty_pages_ratelimited(struct address_space *mapping) | ||
169 | { | ||
170 | balance_dirty_pages_ratelimited_nr(mapping, 1); | ||
171 | } | ||
172 | 165 | ||
173 | typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, | 166 | typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, |
174 | void *data); | 167 | void *data); |
diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h index dd4ba3b92002..1e974983757e 100644 --- a/include/trace/events/oom.h +++ b/include/trace/events/oom.h | |||
@@ -14,7 +14,7 @@ TRACE_EVENT(oom_score_adj_update, | |||
14 | TP_STRUCT__entry( | 14 | TP_STRUCT__entry( |
15 | __field( pid_t, pid) | 15 | __field( pid_t, pid) |
16 | __array( char, comm, TASK_COMM_LEN ) | 16 | __array( char, comm, TASK_COMM_LEN ) |
17 | __field( int, oom_score_adj) | 17 | __field( short, oom_score_adj) |
18 | ), | 18 | ), |
19 | 19 | ||
20 | TP_fast_assign( | 20 | TP_fast_assign( |
@@ -23,7 +23,7 @@ TRACE_EVENT(oom_score_adj_update, | |||
23 | __entry->oom_score_adj = task->signal->oom_score_adj; | 23 | __entry->oom_score_adj = task->signal->oom_score_adj; |
24 | ), | 24 | ), |
25 | 25 | ||
26 | TP_printk("pid=%d comm=%s oom_score_adj=%d", | 26 | TP_printk("pid=%d comm=%s oom_score_adj=%hd", |
27 | __entry->pid, __entry->comm, __entry->oom_score_adj) | 27 | __entry->pid, __entry->comm, __entry->oom_score_adj) |
28 | ); | 28 | ); |
29 | 29 | ||
diff --git a/include/trace/events/task.h b/include/trace/events/task.h index b53add02e929..102a646e1996 100644 --- a/include/trace/events/task.h +++ b/include/trace/events/task.h | |||
@@ -15,7 +15,7 @@ TRACE_EVENT(task_newtask, | |||
15 | __field( pid_t, pid) | 15 | __field( pid_t, pid) |
16 | __array( char, comm, TASK_COMM_LEN) | 16 | __array( char, comm, TASK_COMM_LEN) |
17 | __field( unsigned long, clone_flags) | 17 | __field( unsigned long, clone_flags) |
18 | __field( int, oom_score_adj) | 18 | __field( short, oom_score_adj) |
19 | ), | 19 | ), |
20 | 20 | ||
21 | TP_fast_assign( | 21 | TP_fast_assign( |
@@ -25,7 +25,7 @@ TRACE_EVENT(task_newtask, | |||
25 | __entry->oom_score_adj = task->signal->oom_score_adj; | 25 | __entry->oom_score_adj = task->signal->oom_score_adj; |
26 | ), | 26 | ), |
27 | 27 | ||
28 | TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%d", | 28 | TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%hd", |
29 | __entry->pid, __entry->comm, | 29 | __entry->pid, __entry->comm, |
30 | __entry->clone_flags, __entry->oom_score_adj) | 30 | __entry->clone_flags, __entry->oom_score_adj) |
31 | ); | 31 | ); |
@@ -40,7 +40,7 @@ TRACE_EVENT(task_rename, | |||
40 | __field( pid_t, pid) | 40 | __field( pid_t, pid) |
41 | __array( char, oldcomm, TASK_COMM_LEN) | 41 | __array( char, oldcomm, TASK_COMM_LEN) |
42 | __array( char, newcomm, TASK_COMM_LEN) | 42 | __array( char, newcomm, TASK_COMM_LEN) |
43 | __field( int, oom_score_adj) | 43 | __field( short, oom_score_adj) |
44 | ), | 44 | ), |
45 | 45 | ||
46 | TP_fast_assign( | 46 | TP_fast_assign( |
@@ -50,7 +50,7 @@ TRACE_EVENT(task_rename, | |||
50 | __entry->oom_score_adj = task->signal->oom_score_adj; | 50 | __entry->oom_score_adj = task->signal->oom_score_adj; |
51 | ), | 51 | ), |
52 | 52 | ||
53 | TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%d", | 53 | TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%hd", |
54 | __entry->pid, __entry->oldcomm, | 54 | __entry->pid, __entry->oldcomm, |
55 | __entry->newcomm, __entry->oom_score_adj) | 55 | __entry->newcomm, __entry->oom_score_adj) |
56 | ); | 56 | ); |
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index d030d2c2647a..4164529a94f9 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h | |||
@@ -55,4 +55,15 @@ | |||
55 | /* compatibility flags */ | 55 | /* compatibility flags */ |
56 | #define MAP_FILE 0 | 56 | #define MAP_FILE 0 |
57 | 57 | ||
58 | /* | ||
59 | * When MAP_HUGETLB is set, bits [26:31] encode the log2 of the huge page size. | ||
60 | * This gives us 6 bits, which is enough until someone invents 128-bit address | ||
61 | * spaces. | ||
62 | * | ||
63 | * Assume these are all powers of two. | ||
64 | * When 0, use the default page size. | ||
65 | */ | ||
66 | #define MAP_HUGE_SHIFT 26 | ||
67 | #define MAP_HUGE_MASK 0x3f | ||
68 | |||
58 | #endif /* __ASM_GENERIC_MMAN_COMMON_H */ | 69 | #endif /* __ASM_GENERIC_MMAN_COMMON_H */ |
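MAP_HUGE_SHIFT/MAP_HUGE_MASK mirror the SHM_HUGE_* scheme: mmap() callers select a huge page size by placing its log2 in bits [26:31] of the flags, and 0 keeps the default size. A sketch (not part of the diff; convenience names such as MAP_HUGE_2MB are not defined in this hunk, so the shift is written out):

    #define _GNU_SOURCE
    #include <sys/mman.h>

    static void *demo_huge_mmap(size_t len)
    {
            /* 21 == log2(2MB), encoded in bits [26:31] of the mmap flags */
            return mmap(NULL, len, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
                        (21 << MAP_HUGE_SHIFT),
                        -1, 0);
    }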
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h index 32c8bd6a196d..e9fe6fd2a074 100644 --- a/include/uapi/asm-generic/mman.h +++ b/include/uapi/asm-generic/mman.h | |||
@@ -13,6 +13,8 @@ | |||
13 | #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ | 13 | #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ |
14 | #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ | 14 | #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ |
15 | 15 | ||
16 | /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */ | ||
17 | |||
16 | #define MCL_CURRENT 1 /* lock all current mappings */ | 18 | #define MCL_CURRENT 1 /* lock all current mappings */ |
17 | #define MCL_FUTURE 2 /* lock all future mappings */ | 19 | #define MCL_FUTURE 2 /* lock all future mappings */ |
18 | 20 | ||
@@ -495,7 +495,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) | |||
495 | if (shmflg & SHM_NORESERVE) | 495 | if (shmflg & SHM_NORESERVE) |
496 | acctflag = VM_NORESERVE; | 496 | acctflag = VM_NORESERVE; |
497 | file = hugetlb_file_setup(name, 0, size, acctflag, | 497 | file = hugetlb_file_setup(name, 0, size, acctflag, |
498 | &shp->mlock_user, HUGETLB_SHMFS_INODE); | 498 | &shp->mlock_user, HUGETLB_SHMFS_INODE, |
499 | (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); | ||
499 | } else { | 500 | } else { |
500 | /* | 501 | /* |
501 | * Do not allow no accounting for OVERCOMMIT_NEVER, even | 502 | * Do not allow no accounting for OVERCOMMIT_NEVER, even |
diff --git a/lib/cpumask.c b/lib/cpumask.c index 402a54ac35cb..d327b87c99b7 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c | |||
@@ -161,6 +161,6 @@ EXPORT_SYMBOL(free_cpumask_var); | |||
161 | */ | 161 | */ |
162 | void __init free_bootmem_cpumask_var(cpumask_var_t mask) | 162 | void __init free_bootmem_cpumask_var(cpumask_var_t mask) |
163 | { | 163 | { |
164 | free_bootmem((unsigned long)mask, cpumask_size()); | 164 | free_bootmem(__pa(mask), cpumask_size()); |
165 | } | 165 | } |
166 | #endif | 166 | #endif |
diff --git a/mm/Kconfig b/mm/Kconfig index a3f8dddaaab3..e6651c5de14f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -188,6 +188,21 @@ config SPLIT_PTLOCK_CPUS | |||
188 | default "4" | 188 | default "4" |
189 | 189 | ||
190 | # | 190 | # |
191 | # support for memory balloon compaction | ||
192 | config BALLOON_COMPACTION | ||
193 | bool "Allow for balloon memory compaction/migration" | ||
194 | def_bool y | ||
195 | depends on COMPACTION && VIRTIO_BALLOON | ||
196 | help | ||
197 | Memory fragmentation introduced by ballooning might significantly | ||
198 | reduce the number of 2MB contiguous memory blocks that can be | ||
199 | used within a guest, thus imposing performance penalties associated | ||
200 | with the reduced number of transparent huge pages that could be used | ||
201 | by the guest workload. Allowing compaction and migration of memory | ||
202 | pages enlisted as part of memory balloon devices avoids this | ||
203 | scenario and helps improve memory defragmentation. | ||
204 | |||
205 | # | ||
191 | # support for memory compaction | 206 | # support for memory compaction |
192 | config COMPACTION | 207 | config COMPACTION |
193 | bool "Allow for memory compaction" | 208 | bool "Allow for memory compaction" |
diff --git a/mm/Makefile b/mm/Makefile index 6b025f80af34..3a4628751f89 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -16,7 +16,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ | |||
16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
17 | util.o mmzone.o vmstat.o backing-dev.o \ | 17 | util.o mmzone.o vmstat.o backing-dev.o \ |
18 | mm_init.o mmu_context.o percpu.o slab_common.o \ | 18 | mm_init.o mmu_context.o percpu.o slab_common.o \ |
19 | compaction.o interval_tree.o $(mmu-y) | 19 | compaction.o balloon_compaction.o \ |
20 | interval_tree.o $(mmu-y) | ||
20 | 21 | ||
21 | obj-y += init-mm.o | 22 | obj-y += init-mm.o |
22 | 23 | ||
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c new file mode 100644 index 000000000000..07dbc8ec46cf --- /dev/null +++ b/mm/balloon_compaction.c | |||
@@ -0,0 +1,302 @@ | |||
1 | /* | ||
2 | * mm/balloon_compaction.c | ||
3 | * | ||
4 | * Common interface for making balloon pages movable by compaction. | ||
5 | * | ||
6 | * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com> | ||
7 | */ | ||
8 | #include <linux/mm.h> | ||
9 | #include <linux/slab.h> | ||
10 | #include <linux/export.h> | ||
11 | #include <linux/balloon_compaction.h> | ||
12 | |||
13 | /* | ||
14 | * balloon_devinfo_alloc - allocates a balloon device information descriptor. | ||
15 | * @balloon_dev_descriptor: pointer to reference the balloon device which | ||
16 | * this struct balloon_dev_info will be servicing. | ||
17 | * | ||
18 | * Driver must call it to properly allocate and initialize an instance of | ||
19 | * struct balloon_dev_info which will be used to reference a balloon device | ||
20 | * as well as to keep track of the balloon device page list. | ||
21 | */ | ||
22 | struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor) | ||
23 | { | ||
24 | struct balloon_dev_info *b_dev_info; | ||
25 | b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL); | ||
26 | if (!b_dev_info) | ||
27 | return ERR_PTR(-ENOMEM); | ||
28 | |||
29 | b_dev_info->balloon_device = balloon_dev_descriptor; | ||
30 | b_dev_info->mapping = NULL; | ||
31 | b_dev_info->isolated_pages = 0; | ||
32 | spin_lock_init(&b_dev_info->pages_lock); | ||
33 | INIT_LIST_HEAD(&b_dev_info->pages); | ||
34 | |||
35 | return b_dev_info; | ||
36 | } | ||
37 | EXPORT_SYMBOL_GPL(balloon_devinfo_alloc); | ||
38 | |||
39 | /* | ||
40 | * balloon_page_enqueue - allocates a new page and inserts it into the balloon | ||
41 | * page list. | ||
42 | * @b_dev_info: balloon device descriptor where we will insert a new page to | ||
43 | * | ||
44 | * Driver must call it to properly allocate a new enlisted balloon page | ||
45 | * before definitively removing it from the guest system. | ||
46 | * This function returns the page address for the recently enqueued page or | ||
47 | * NULL in the case we fail to allocate a new page this turn. | ||
48 | */ | ||
49 | struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) | ||
50 | { | ||
51 | unsigned long flags; | ||
52 | struct page *page = alloc_page(balloon_mapping_gfp_mask() | | ||
53 | __GFP_NOMEMALLOC | __GFP_NORETRY); | ||
54 | if (!page) | ||
55 | return NULL; | ||
56 | |||
57 | /* | ||
58 | * Block others from accessing the 'page' when we get around to | ||
59 | * establishing additional references. We should be the only one | ||
60 | * holding a reference to the 'page' at this point. | ||
61 | */ | ||
62 | BUG_ON(!trylock_page(page)); | ||
63 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | ||
64 | balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages); | ||
65 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | ||
66 | unlock_page(page); | ||
67 | return page; | ||
68 | } | ||
69 | EXPORT_SYMBOL_GPL(balloon_page_enqueue); | ||
70 | |||
71 | /* | ||
72 | * balloon_page_dequeue - removes a page from balloon's page list and returns | ||
73 | * its address to allow the driver to release the page. | ||
74 | * @b_dev_info: balloon device descriptor where we will grab a page from. | ||
75 | * | ||
76 | * Driver must call it to properly de-allocate a previous enlisted balloon page | ||
77 | * before definitively releasing it back to the guest system. | ||
78 | * This function returns the page address for the recently dequeued page or | ||
79 | * NULL in case we find the balloon's page list temporarily empty due to | ||
80 | * compaction having isolated its pages. | ||
81 | */ | ||
82 | struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) | ||
83 | { | ||
84 | struct page *page, *tmp; | ||
85 | unsigned long flags; | ||
86 | bool dequeued_page; | ||
87 | |||
88 | dequeued_page = false; | ||
89 | list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { | ||
90 | /* | ||
91 | * Block others from accessing the 'page' while we get around | ||
92 | * establishing additional references and preparing the 'page' | ||
93 | * to be released by the balloon driver. | ||
94 | */ | ||
95 | if (trylock_page(page)) { | ||
96 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | ||
97 | /* | ||
98 | * Raise the page refcount here to prevent any wrong | ||
99 | * attempt to isolate this page, in case of colliding | ||
100 | * with balloon_page_isolate() just after we release | ||
101 | * the page lock. | ||
102 | * | ||
103 | * balloon_page_free() will take care of dropping | ||
104 | * this extra refcount later. | ||
105 | */ | ||
106 | get_page(page); | ||
107 | balloon_page_delete(page); | ||
108 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | ||
109 | unlock_page(page); | ||
110 | dequeued_page = true; | ||
111 | break; | ||
112 | } | ||
113 | } | ||
114 | |||
115 | if (!dequeued_page) { | ||
116 | /* | ||
117 | * If we are unable to dequeue a balloon page because the page | ||
118 | * list is empty and there are no isolated pages, then something | ||
119 | * has gone wrong and some balloon pages are lost. | ||
120 | * BUG() here, otherwise the balloon driver may get stuck into | ||
121 | * an infinite loop while attempting to release all its pages. | ||
122 | */ | ||
123 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | ||
124 | if (unlikely(list_empty(&b_dev_info->pages) && | ||
125 | !b_dev_info->isolated_pages)) | ||
126 | BUG(); | ||
127 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | ||
128 | page = NULL; | ||
129 | } | ||
130 | return page; | ||
131 | } | ||
132 | EXPORT_SYMBOL_GPL(balloon_page_dequeue); | ||
133 | |||
134 | #ifdef CONFIG_BALLOON_COMPACTION | ||
135 | /* | ||
136 | * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages. | ||
137 | * @b_dev_info: holds the balloon device information descriptor. | ||
138 | * @a_ops: balloon_mapping address_space_operations descriptor. | ||
139 | * | ||
140 | * Driver must call it to properly allocate and initialize an instance of | ||
141 | * struct address_space which will be used as the special page->mapping for | ||
142 | * balloon device enlisted page instances. | ||
143 | */ | ||
144 | struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info, | ||
145 | const struct address_space_operations *a_ops) | ||
146 | { | ||
147 | struct address_space *mapping; | ||
148 | |||
149 | mapping = kmalloc(sizeof(*mapping), GFP_KERNEL); | ||
150 | if (!mapping) | ||
151 | return ERR_PTR(-ENOMEM); | ||
152 | |||
153 | /* | ||
154 | * Give a clean 'zeroed' status to all elements of this special | ||
155 | * balloon page->mapping struct address_space instance. | ||
156 | */ | ||
157 | address_space_init_once(mapping); | ||
158 | |||
159 | /* | ||
160 | * Set mapping->flags appropriately, to allow balloon pages | ||
161 | * ->mapping identification. | ||
162 | */ | ||
163 | mapping_set_balloon(mapping); | ||
164 | mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask()); | ||
165 | |||
166 | /* balloon's page->mapping->a_ops callback descriptor */ | ||
167 | mapping->a_ops = a_ops; | ||
168 | |||
169 | /* | ||
170 | * Establish a pointer reference back to the balloon device descriptor | ||
171 | * this particular page->mapping will be servicing. | ||
172 | * This is used by compaction / migration procedures to identify and | ||
173 | * access the balloon device pageset while isolating / migrating pages. | ||
174 | * | ||
175 | * As some balloon drivers can register multiple balloon devices | ||
176 | * for a single guest, this also helps compaction / migration to | ||
177 | * properly deal with multiple balloon pagesets, when required. | ||
178 | */ | ||
179 | mapping->private_data = b_dev_info; | ||
180 | b_dev_info->mapping = mapping; | ||
181 | |||
182 | return mapping; | ||
183 | } | ||
184 | EXPORT_SYMBOL_GPL(balloon_mapping_alloc); | ||
185 | |||
186 | static inline void __isolate_balloon_page(struct page *page) | ||
187 | { | ||
188 | struct balloon_dev_info *b_dev_info = page->mapping->private_data; | ||
189 | unsigned long flags; | ||
190 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | ||
191 | list_del(&page->lru); | ||
192 | b_dev_info->isolated_pages++; | ||
193 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | ||
194 | } | ||
195 | |||
196 | static inline void __putback_balloon_page(struct page *page) | ||
197 | { | ||
198 | struct balloon_dev_info *b_dev_info = page->mapping->private_data; | ||
199 | unsigned long flags; | ||
200 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | ||
201 | list_add(&page->lru, &b_dev_info->pages); | ||
202 | b_dev_info->isolated_pages--; | ||
203 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | ||
204 | } | ||
205 | |||
206 | static inline int __migrate_balloon_page(struct address_space *mapping, | ||
207 | struct page *newpage, struct page *page, enum migrate_mode mode) | ||
208 | { | ||
209 | return page->mapping->a_ops->migratepage(mapping, newpage, page, mode); | ||
210 | } | ||
211 | |||
212 | /* __isolate_lru_page() counterpart for a ballooned page */ | ||
213 | bool balloon_page_isolate(struct page *page) | ||
214 | { | ||
215 | /* | ||
216 | * Avoid burning cycles with pages that are yet under __free_pages(), | ||
217 | * or just got freed under us. | ||
218 | * | ||
219 | * In case we 'win' a race for a balloon page being freed under us and | ||
220 | * raise its refcount preventing __free_pages() from doing its job | ||
221 | * the put_page() at the end of this block will take care of | ||
222 | * releasing this page, thus avoiding a nasty leak. | ||
223 | */ | ||
224 | if (likely(get_page_unless_zero(page))) { | ||
225 | /* | ||
226 | * As balloon pages are not isolated from LRU lists, concurrent | ||
227 | * compaction threads can race against page migration functions | ||
228 | * as well as race against the balloon driver releasing a page. | ||
229 | * | ||
230 | * In order to avoid having an already isolated balloon page | ||
231 | * being (wrongly) re-isolated while it is under migration, | ||
232 | * or to avoid attempting to isolate pages being released by | ||
233 | * the balloon driver, lets be sure we have the page lock | ||
234 | * before proceeding with the balloon page isolation steps. | ||
235 | */ | ||
236 | if (likely(trylock_page(page))) { | ||
237 | /* | ||
238 | * A ballooned page, by default, has just one refcount. | ||
239 | * Prevent concurrent compaction threads from isolating | ||
240 | * an already isolated balloon page by refcount check. | ||
241 | */ | ||
242 | if (__is_movable_balloon_page(page) && | ||
243 | page_count(page) == 2) { | ||
244 | __isolate_balloon_page(page); | ||
245 | unlock_page(page); | ||
246 | return true; | ||
247 | } | ||
248 | unlock_page(page); | ||
249 | } | ||
250 | put_page(page); | ||
251 | } | ||
252 | return false; | ||
253 | } | ||
254 | |||
255 | /* putback_lru_page() counterpart for a ballooned page */ | ||
256 | void balloon_page_putback(struct page *page) | ||
257 | { | ||
258 | /* | ||
259 | * 'lock_page()' stabilizes the page and prevents races against | ||
260 | * concurrent isolation threads attempting to re-isolate it. | ||
261 | */ | ||
262 | lock_page(page); | ||
263 | |||
264 | if (__is_movable_balloon_page(page)) { | ||
265 | __putback_balloon_page(page); | ||
266 | /* drop the extra ref count taken for page isolation */ | ||
267 | put_page(page); | ||
268 | } else { | ||
269 | WARN_ON(1); | ||
270 | dump_page(page); | ||
271 | } | ||
272 | unlock_page(page); | ||
273 | } | ||
274 | |||
275 | /* move_to_new_page() counterpart for a ballooned page */ | ||
276 | int balloon_page_migrate(struct page *newpage, | ||
277 | struct page *page, enum migrate_mode mode) | ||
278 | { | ||
279 | struct address_space *mapping; | ||
280 | int rc = -EAGAIN; | ||
281 | |||
282 | /* | ||
283 | * Block others from accessing the 'newpage' when we get around to | ||
284 | * establishing additional references. We should be the only one | ||
285 | * holding a reference to the 'newpage' at this point. | ||
286 | */ | ||
287 | BUG_ON(!trylock_page(newpage)); | ||
288 | |||
289 | if (WARN_ON(!__is_movable_balloon_page(page))) { | ||
290 | dump_page(page); | ||
291 | unlock_page(newpage); | ||
292 | return rc; | ||
293 | } | ||
294 | |||
295 | mapping = page->mapping; | ||
296 | if (mapping) | ||
297 | rc = __migrate_balloon_page(mapping, newpage, page, mode); | ||
298 | |||
299 | unlock_page(newpage); | ||
300 | return rc; | ||
301 | } | ||
302 | #endif /* CONFIG_BALLOON_COMPACTION */ | ||
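Putting the new interface together, here is a hypothetical driver-side sketch of how the functions above are meant to be wired up (not part of the diff; the demo_* names and the empty a_ops are invented, error handling is elided, and the real consumer in this series is the virtio_balloon driver):

    #include <linux/err.h>
    #include <linux/mm.h>
    #include <linux/balloon_compaction.h>

    /* .migratepage would point at the driver's balloon page migration hook. */
    static const struct address_space_operations demo_balloon_aops;

    static struct balloon_dev_info *demo_balloon_setup(void *dev)
    {
            struct balloon_dev_info *info = balloon_devinfo_alloc(dev);

            if (IS_ERR(info))
                    return info;
            /* The special ->mapping is only useful with CONFIG_BALLOON_COMPACTION. */
            balloon_mapping_alloc(info, &demo_balloon_aops);
            return info;
    }

    static void demo_inflate_one(struct balloon_dev_info *info)
    {
            /* Steal a page from the guest and enlist it on the balloon list. */
            struct page *page = balloon_page_enqueue(info);

            if (page)
                    pr_info("ballooned pfn %lx\n", page_to_pfn(page));
    }

    static void demo_deflate_one(struct balloon_dev_info *info)
    {
            /* Pull a page off the balloon list and hand it back to the guest. */
            struct page *page = balloon_page_dequeue(info);

            if (page)
                    balloon_page_free(page); /* helper from the new header */
    }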
diff --git a/mm/bootmem.c b/mm/bootmem.c index f468185b3b28..ecc45958ac0c 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -147,21 +147,21 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) | |||
147 | 147 | ||
148 | /* | 148 | /* |
149 | * free_bootmem_late - free bootmem pages directly to page allocator | 149 | * free_bootmem_late - free bootmem pages directly to page allocator |
150 | * @addr: starting address of the range | 150 | * @addr: starting physical address of the range |
151 | * @size: size of the range in bytes | 151 | * @size: size of the range in bytes |
152 | * | 152 | * |
153 | * This is only useful when the bootmem allocator has already been torn | 153 | * This is only useful when the bootmem allocator has already been torn |
154 | * down, but we are still initializing the system. Pages are given directly | 154 | * down, but we are still initializing the system. Pages are given directly |
155 | * to the page allocator, no bootmem metadata is updated because it is gone. | 155 | * to the page allocator, no bootmem metadata is updated because it is gone. |
156 | */ | 156 | */ |
157 | void __init free_bootmem_late(unsigned long addr, unsigned long size) | 157 | void __init free_bootmem_late(unsigned long physaddr, unsigned long size) |
158 | { | 158 | { |
159 | unsigned long cursor, end; | 159 | unsigned long cursor, end; |
160 | 160 | ||
161 | kmemleak_free_part(__va(addr), size); | 161 | kmemleak_free_part(__va(physaddr), size); |
162 | 162 | ||
163 | cursor = PFN_UP(addr); | 163 | cursor = PFN_UP(physaddr); |
164 | end = PFN_DOWN(addr + size); | 164 | end = PFN_DOWN(physaddr + size); |
165 | 165 | ||
166 | for (; cursor < end; cursor++) { | 166 | for (; cursor < end; cursor++) { |
167 | __free_pages_bootmem(pfn_to_page(cursor), 0); | 167 | __free_pages_bootmem(pfn_to_page(cursor), 0); |
@@ -377,21 +377,21 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
377 | 377 | ||
378 | /** | 378 | /** |
379 | * free_bootmem - mark a page range as usable | 379 | * free_bootmem - mark a page range as usable |
380 | * @addr: starting address of the range | 380 | * @addr: starting physical address of the range |
381 | * @size: size of the range in bytes | 381 | * @size: size of the range in bytes |
382 | * | 382 | * |
383 | * Partial pages will be considered reserved and left as they are. | 383 | * Partial pages will be considered reserved and left as they are. |
384 | * | 384 | * |
385 | * The range must be contiguous but may span node boundaries. | 385 | * The range must be contiguous but may span node boundaries. |
386 | */ | 386 | */ |
387 | void __init free_bootmem(unsigned long addr, unsigned long size) | 387 | void __init free_bootmem(unsigned long physaddr, unsigned long size) |
388 | { | 388 | { |
389 | unsigned long start, end; | 389 | unsigned long start, end; |
390 | 390 | ||
391 | kmemleak_free_part(__va(addr), size); | 391 | kmemleak_free_part(__va(physaddr), size); |
392 | 392 | ||
393 | start = PFN_UP(addr); | 393 | start = PFN_UP(physaddr); |
394 | end = PFN_DOWN(addr + size); | 394 | end = PFN_DOWN(physaddr + size); |
395 | 395 | ||
396 | mark_bootmem(start, end, 0, 0); | 396 | mark_bootmem(start, end, 0, 0); |
397 | } | 397 | } |
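Both free_bootmem() and free_bootmem_late() take a physical address, which is what the kerneldoc fix above makes explicit and what the lib/cpumask.c change earlier in this diff now honours by passing __pa(mask). A minimal sketch of the calling convention (demo_* name invented):

    #include <linux/mm.h>
    #include <linux/bootmem.h>

    /* Return an early, virtually-addressed allocation to the page allocator. */
    static void __init demo_release_early_buffer(void *buf, unsigned long size)
    {
            free_bootmem(__pa(buf), size);
    }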
diff --git a/mm/compaction.c b/mm/compaction.c index 694eaabaaebd..d24dd2d7bad4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/backing-dev.h> | 14 | #include <linux/backing-dev.h> |
15 | #include <linux/sysctl.h> | 15 | #include <linux/sysctl.h> |
16 | #include <linux/sysfs.h> | 16 | #include <linux/sysfs.h> |
17 | #include <linux/balloon_compaction.h> | ||
17 | #include "internal.h" | 18 | #include "internal.h" |
18 | 19 | ||
19 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | 20 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA |
@@ -565,9 +566,24 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
565 | goto next_pageblock; | 566 | goto next_pageblock; |
566 | } | 567 | } |
567 | 568 | ||
568 | /* Check may be lockless but that's ok as we recheck later */ | 569 | /* |
569 | if (!PageLRU(page)) | 570 | * Check may be lockless but that's ok as we recheck later. |
571 | * It's possible to migrate LRU pages and balloon pages; | ||
572 | * skip any other type of page. | ||
573 | */ | ||
574 | if (!PageLRU(page)) { | ||
575 | if (unlikely(balloon_page_movable(page))) { | ||
576 | if (locked && balloon_page_isolate(page)) { | ||
577 | /* Successfully isolated */ | ||
578 | cc->finished_update_migrate = true; | ||
579 | list_add(&page->lru, migratelist); | ||
580 | cc->nr_migratepages++; | ||
581 | nr_isolated++; | ||
582 | goto check_compact_cluster; | ||
583 | } | ||
584 | } | ||
570 | continue; | 585 | continue; |
586 | } | ||
571 | 587 | ||
572 | /* | 588 | /* |
573 | * PageLRU is set. lru_lock normally excludes isolation | 589 | * PageLRU is set. lru_lock normally excludes isolation |
@@ -621,6 +637,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
621 | cc->nr_migratepages++; | 637 | cc->nr_migratepages++; |
622 | nr_isolated++; | 638 | nr_isolated++; |
623 | 639 | ||
640 | check_compact_cluster: | ||
624 | /* Avoid isolating too much */ | 641 | /* Avoid isolating too much */ |
625 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { | 642 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { |
626 | ++low_pfn; | 643 | ++low_pfn; |
@@ -986,7 +1003,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
986 | switch (isolate_migratepages(zone, cc)) { | 1003 | switch (isolate_migratepages(zone, cc)) { |
987 | case ISOLATE_ABORT: | 1004 | case ISOLATE_ABORT: |
988 | ret = COMPACT_PARTIAL; | 1005 | ret = COMPACT_PARTIAL; |
989 | putback_lru_pages(&cc->migratepages); | 1006 | putback_movable_pages(&cc->migratepages); |
990 | cc->nr_migratepages = 0; | 1007 | cc->nr_migratepages = 0; |
991 | goto out; | 1008 | goto out; |
992 | case ISOLATE_NONE: | 1009 | case ISOLATE_NONE: |
@@ -1009,9 +1026,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1009 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, | 1026 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, |
1010 | nr_remaining); | 1027 | nr_remaining); |
1011 | 1028 | ||
1012 | /* Release LRU pages not migrated */ | 1029 | /* Release isolated pages not migrated */ |
1013 | if (err) { | 1030 | if (err) { |
1014 | putback_lru_pages(&cc->migratepages); | 1031 | putback_movable_pages(&cc->migratepages); |
1015 | cc->nr_migratepages = 0; | 1032 | cc->nr_migratepages = 0; |
1016 | if (err == -ENOMEM) { | 1033 | if (err == -ENOMEM) { |
1017 | ret = COMPACT_PARTIAL; | 1034 | ret = COMPACT_PARTIAL; |
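The putback_lru_pages() -> putback_movable_pages() switch above matters because the migration list may now hold balloon pages that were never on any LRU. A rough, hypothetical simplification of what the mm/migrate.c counterpart does, written as if inside mm/ (that hunk is not shown here, and isolation statistics accounting is omitted):

    #include <linux/list.h>
    #include <linux/balloon_compaction.h>
    #include "internal.h"   /* putback_lru_page() */

    static void demo_putback_movable_pages(struct list_head *list)
    {
            struct page *page, *next;

            list_for_each_entry_safe(page, next, list, lru) {
                    list_del(&page->lru);
                    if (unlikely(balloon_page_movable(page)))
                            balloon_page_putback(page); /* back onto the balloon list */
                    else
                            putback_lru_page(page);     /* back onto its LRU list */
            }
    }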
diff --git a/mm/dmapool.c b/mm/dmapool.c index da1b0f0b8709..c69781e97cf9 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c | |||
@@ -332,6 +332,30 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, | |||
332 | retval = offset + page->vaddr; | 332 | retval = offset + page->vaddr; |
333 | *handle = offset + page->dma; | 333 | *handle = offset + page->dma; |
334 | #ifdef DMAPOOL_DEBUG | 334 | #ifdef DMAPOOL_DEBUG |
335 | { | ||
336 | int i; | ||
337 | u8 *data = retval; | ||
338 | /* page->offset is stored in first 4 bytes */ | ||
339 | for (i = sizeof(page->offset); i < pool->size; i++) { | ||
340 | if (data[i] == POOL_POISON_FREED) | ||
341 | continue; | ||
342 | if (pool->dev) | ||
343 | dev_err(pool->dev, | ||
344 | "dma_pool_alloc %s, %p (corruped)\n", | ||
345 | pool->name, retval); | ||
346 | else | ||
347 | pr_err("dma_pool_alloc %s, %p (corrupted)\n", | ||
348 | pool->name, retval); | ||
349 | |||
350 | /* | ||
351 | * Dump the first 4 bytes even if they are not | ||
352 | * POOL_POISON_FREED | ||
353 | */ | ||
354 | print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, | ||
355 | data, pool->size, 1); | ||
356 | break; | ||
357 | } | ||
358 | } | ||
335 | memset(retval, POOL_POISON_ALLOCATED, pool->size); | 359 | memset(retval, POOL_POISON_ALLOCATED, pool->size); |
336 | #endif | 360 | #endif |
337 | spin_unlock_irqrestore(&pool->lock, flags); | 361 | spin_unlock_irqrestore(&pool->lock, flags); |
diff --git a/mm/highmem.c b/mm/highmem.c index 2da13a5c50e2..d999077431df 100644 --- a/mm/highmem.c +++ b/mm/highmem.c | |||
@@ -99,7 +99,7 @@ struct page *kmap_to_page(void *vaddr) | |||
99 | unsigned long addr = (unsigned long)vaddr; | 99 | unsigned long addr = (unsigned long)vaddr; |
100 | 100 | ||
101 | if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { | 101 | if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { |
102 | int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; | 102 | int i = PKMAP_NR(addr); |
103 | return pte_page(pkmap_page_table[i]); | 103 | return pte_page(pkmap_page_table[i]); |
104 | } | 104 | } |
105 | 105 | ||
@@ -137,8 +137,7 @@ static void flush_all_zero_pkmaps(void) | |||
137 | * So no dangers, even with speculative execution. | 137 | * So no dangers, even with speculative execution. |
138 | */ | 138 | */ |
139 | page = pte_page(pkmap_page_table[i]); | 139 | page = pte_page(pkmap_page_table[i]); |
140 | pte_clear(&init_mm, (unsigned long)page_address(page), | 140 | pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]); |
141 | &pkmap_page_table[i]); | ||
142 | 141 | ||
143 | set_page_address(page, NULL); | 142 | set_page_address(page, NULL); |
144 | need_flush = 1; | 143 | need_flush = 1; |
@@ -324,11 +323,7 @@ struct page_address_map { | |||
324 | struct list_head list; | 323 | struct list_head list; |
325 | }; | 324 | }; |
326 | 325 | ||
327 | /* | 326 | static struct page_address_map page_address_maps[LAST_PKMAP]; |
328 | * page_address_map freelist, allocated from page_address_maps. | ||
329 | */ | ||
330 | static struct list_head page_address_pool; /* freelist */ | ||
331 | static spinlock_t pool_lock; /* protects page_address_pool */ | ||
332 | 327 | ||
333 | /* | 328 | /* |
334 | * Hash table bucket | 329 | * Hash table bucket |
@@ -393,14 +388,7 @@ void set_page_address(struct page *page, void *virtual) | |||
393 | 388 | ||
394 | pas = page_slot(page); | 389 | pas = page_slot(page); |
395 | if (virtual) { /* Add */ | 390 | if (virtual) { /* Add */ |
396 | BUG_ON(list_empty(&page_address_pool)); | 391 | pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)]; |
397 | |||
398 | spin_lock_irqsave(&pool_lock, flags); | ||
399 | pam = list_entry(page_address_pool.next, | ||
400 | struct page_address_map, list); | ||
401 | list_del(&pam->list); | ||
402 | spin_unlock_irqrestore(&pool_lock, flags); | ||
403 | |||
404 | pam->page = page; | 392 | pam->page = page; |
405 | pam->virtual = virtual; | 393 | pam->virtual = virtual; |
406 | 394 | ||
@@ -413,9 +401,6 @@ void set_page_address(struct page *page, void *virtual) | |||
413 | if (pam->page == page) { | 401 | if (pam->page == page) { |
414 | list_del(&pam->list); | 402 | list_del(&pam->list); |
415 | spin_unlock_irqrestore(&pas->lock, flags); | 403 | spin_unlock_irqrestore(&pas->lock, flags); |
416 | spin_lock_irqsave(&pool_lock, flags); | ||
417 | list_add_tail(&pam->list, &page_address_pool); | ||
418 | spin_unlock_irqrestore(&pool_lock, flags); | ||
419 | goto done; | 404 | goto done; |
420 | } | 405 | } |
421 | } | 406 | } |
@@ -425,20 +410,14 @@ done: | |||
425 | return; | 410 | return; |
426 | } | 411 | } |
427 | 412 | ||
428 | static struct page_address_map page_address_maps[LAST_PKMAP]; | ||
429 | |||
430 | void __init page_address_init(void) | 413 | void __init page_address_init(void) |
431 | { | 414 | { |
432 | int i; | 415 | int i; |
433 | 416 | ||
434 | INIT_LIST_HEAD(&page_address_pool); | ||
435 | for (i = 0; i < ARRAY_SIZE(page_address_maps); i++) | ||
436 | list_add(&page_address_maps[i].list, &page_address_pool); | ||
437 | for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { | 417 | for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { |
438 | INIT_LIST_HEAD(&page_address_htable[i].lh); | 418 | INIT_LIST_HEAD(&page_address_htable[i].lh); |
439 | spin_lock_init(&page_address_htable[i].lock); | 419 | spin_lock_init(&page_address_htable[i].lock); |
440 | } | 420 | } |
441 | spin_lock_init(&pool_lock); | ||
442 | } | 421 | } |
443 | 422 | ||
444 | #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ | 423 | #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ |
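The removal of page_address_pool and pool_lock above rests on a simple invariant: every pkmap virtual address corresponds to exactly one slot, so PKMAP_NR(vaddr) can index page_address_maps[] directly and no freelist is needed. A condensed sketch of that relation, as if written inside the patched mm/highmem.c (demo_* name invented):

    /* The slot for a pkmap address is a pure function of that address. */
    static struct page_address_map *demo_slot_for(void *virtual)
    {
            return &page_address_maps[PKMAP_NR((unsigned long)virtual)];
    }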
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 40f17c34b415..5f902e20e8c0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -606,6 +606,15 @@ static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | |||
606 | return pmd; | 606 | return pmd; |
607 | } | 607 | } |
608 | 608 | ||
609 | static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) | ||
610 | { | ||
611 | pmd_t entry; | ||
612 | entry = mk_pmd(page, vma->vm_page_prot); | ||
613 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
614 | entry = pmd_mkhuge(entry); | ||
615 | return entry; | ||
616 | } | ||
617 | |||
609 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | 618 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, |
610 | struct vm_area_struct *vma, | 619 | struct vm_area_struct *vma, |
611 | unsigned long haddr, pmd_t *pmd, | 620 | unsigned long haddr, pmd_t *pmd, |
@@ -629,9 +638,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
629 | pte_free(mm, pgtable); | 638 | pte_free(mm, pgtable); |
630 | } else { | 639 | } else { |
631 | pmd_t entry; | 640 | pmd_t entry; |
632 | entry = mk_pmd(page, vma->vm_page_prot); | 641 | entry = mk_huge_pmd(page, vma); |
633 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
634 | entry = pmd_mkhuge(entry); | ||
635 | /* | 642 | /* |
636 | * The spinlocking to take the lru_lock inside | 643 | * The spinlocking to take the lru_lock inside |
637 | * page_add_new_anon_rmap() acts as a full memory | 644 | * page_add_new_anon_rmap() acts as a full memory |
@@ -777,6 +784,28 @@ out: | |||
777 | return ret; | 784 | return ret; |
778 | } | 785 | } |
779 | 786 | ||
787 | void huge_pmd_set_accessed(struct mm_struct *mm, | ||
788 | struct vm_area_struct *vma, | ||
789 | unsigned long address, | ||
790 | pmd_t *pmd, pmd_t orig_pmd, | ||
791 | int dirty) | ||
792 | { | ||
793 | pmd_t entry; | ||
794 | unsigned long haddr; | ||
795 | |||
796 | spin_lock(&mm->page_table_lock); | ||
797 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | ||
798 | goto unlock; | ||
799 | |||
800 | entry = pmd_mkyoung(orig_pmd); | ||
801 | haddr = address & HPAGE_PMD_MASK; | ||
802 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) | ||
803 | update_mmu_cache_pmd(vma, address, pmd); | ||
804 | |||
805 | unlock: | ||
806 | spin_unlock(&mm->page_table_lock); | ||
807 | } | ||
808 | |||
780 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | 809 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, |
781 | struct vm_area_struct *vma, | 810 | struct vm_area_struct *vma, |
782 | unsigned long address, | 811 | unsigned long address, |
@@ -951,9 +980,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
951 | } else { | 980 | } else { |
952 | pmd_t entry; | 981 | pmd_t entry; |
953 | VM_BUG_ON(!PageHead(page)); | 982 | VM_BUG_ON(!PageHead(page)); |
954 | entry = mk_pmd(new_page, vma->vm_page_prot); | 983 | entry = mk_huge_pmd(new_page, vma); |
955 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
956 | entry = pmd_mkhuge(entry); | ||
957 | pmdp_clear_flush(vma, haddr, pmd); | 984 | pmdp_clear_flush(vma, haddr, pmd); |
958 | page_add_new_anon_rmap(new_page, vma, haddr); | 985 | page_add_new_anon_rmap(new_page, vma, haddr); |
959 | set_pmd_at(mm, haddr, pmd, entry); | 986 | set_pmd_at(mm, haddr, pmd, entry); |
@@ -1146,22 +1173,14 @@ pmd_t *page_check_address_pmd(struct page *page, | |||
1146 | unsigned long address, | 1173 | unsigned long address, |
1147 | enum page_check_address_pmd_flag flag) | 1174 | enum page_check_address_pmd_flag flag) |
1148 | { | 1175 | { |
1149 | pgd_t *pgd; | ||
1150 | pud_t *pud; | ||
1151 | pmd_t *pmd, *ret = NULL; | 1176 | pmd_t *pmd, *ret = NULL; |
1152 | 1177 | ||
1153 | if (address & ~HPAGE_PMD_MASK) | 1178 | if (address & ~HPAGE_PMD_MASK) |
1154 | goto out; | 1179 | goto out; |
1155 | 1180 | ||
1156 | pgd = pgd_offset(mm, address); | 1181 | pmd = mm_find_pmd(mm, address); |
1157 | if (!pgd_present(*pgd)) | 1182 | if (!pmd) |
1158 | goto out; | ||
1159 | |||
1160 | pud = pud_offset(pgd, address); | ||
1161 | if (!pud_present(*pud)) | ||
1162 | goto out; | 1183 | goto out; |
1163 | |||
1164 | pmd = pmd_offset(pud, address); | ||
1165 | if (pmd_none(*pmd)) | 1184 | if (pmd_none(*pmd)) |
1166 | goto out; | 1185 | goto out; |
1167 | if (pmd_page(*pmd) != page) | 1186 | if (pmd_page(*pmd) != page) |
@@ -1701,64 +1720,49 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte) | |||
1701 | } | 1720 | } |
1702 | } | 1721 | } |
1703 | 1722 | ||
1704 | static void release_all_pte_pages(pte_t *pte) | ||
1705 | { | ||
1706 | release_pte_pages(pte, pte + HPAGE_PMD_NR); | ||
1707 | } | ||
1708 | |||
1709 | static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | 1723 | static int __collapse_huge_page_isolate(struct vm_area_struct *vma, |
1710 | unsigned long address, | 1724 | unsigned long address, |
1711 | pte_t *pte) | 1725 | pte_t *pte) |
1712 | { | 1726 | { |
1713 | struct page *page; | 1727 | struct page *page; |
1714 | pte_t *_pte; | 1728 | pte_t *_pte; |
1715 | int referenced = 0, isolated = 0, none = 0; | 1729 | int referenced = 0, none = 0; |
1716 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | 1730 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; |
1717 | _pte++, address += PAGE_SIZE) { | 1731 | _pte++, address += PAGE_SIZE) { |
1718 | pte_t pteval = *_pte; | 1732 | pte_t pteval = *_pte; |
1719 | if (pte_none(pteval)) { | 1733 | if (pte_none(pteval)) { |
1720 | if (++none <= khugepaged_max_ptes_none) | 1734 | if (++none <= khugepaged_max_ptes_none) |
1721 | continue; | 1735 | continue; |
1722 | else { | 1736 | else |
1723 | release_pte_pages(pte, _pte); | ||
1724 | goto out; | 1737 | goto out; |
1725 | } | ||
1726 | } | 1738 | } |
1727 | if (!pte_present(pteval) || !pte_write(pteval)) { | 1739 | if (!pte_present(pteval) || !pte_write(pteval)) |
1728 | release_pte_pages(pte, _pte); | ||
1729 | goto out; | 1740 | goto out; |
1730 | } | ||
1731 | page = vm_normal_page(vma, address, pteval); | 1741 | page = vm_normal_page(vma, address, pteval); |
1732 | if (unlikely(!page)) { | 1742 | if (unlikely(!page)) |
1733 | release_pte_pages(pte, _pte); | ||
1734 | goto out; | 1743 | goto out; |
1735 | } | 1744 | |
1736 | VM_BUG_ON(PageCompound(page)); | 1745 | VM_BUG_ON(PageCompound(page)); |
1737 | BUG_ON(!PageAnon(page)); | 1746 | BUG_ON(!PageAnon(page)); |
1738 | VM_BUG_ON(!PageSwapBacked(page)); | 1747 | VM_BUG_ON(!PageSwapBacked(page)); |
1739 | 1748 | ||
1740 | /* cannot use mapcount: can't collapse if there's a gup pin */ | 1749 | /* cannot use mapcount: can't collapse if there's a gup pin */ |
1741 | if (page_count(page) != 1) { | 1750 | if (page_count(page) != 1) |
1742 | release_pte_pages(pte, _pte); | ||
1743 | goto out; | 1751 | goto out; |
1744 | } | ||
1745 | /* | 1752 | /* |
1746 | * We can do it before isolate_lru_page because the | 1753 | * We can do it before isolate_lru_page because the |
1747 | * page can't be freed from under us. NOTE: PG_lock | 1754 | * page can't be freed from under us. NOTE: PG_lock |
1748 | * is needed to serialize against split_huge_page | 1755 | * is needed to serialize against split_huge_page |
1749 | * when invoked from the VM. | 1756 | * when invoked from the VM. |
1750 | */ | 1757 | */ |
1751 | if (!trylock_page(page)) { | 1758 | if (!trylock_page(page)) |
1752 | release_pte_pages(pte, _pte); | ||
1753 | goto out; | 1759 | goto out; |
1754 | } | ||
1755 | /* | 1760 | /* |
1756 | * Isolate the page to avoid collapsing an hugepage | 1761 | * Isolate the page to avoid collapsing an hugepage |
1757 | * currently in use by the VM. | 1762 | * currently in use by the VM. |
1758 | */ | 1763 | */ |
1759 | if (isolate_lru_page(page)) { | 1764 | if (isolate_lru_page(page)) { |
1760 | unlock_page(page); | 1765 | unlock_page(page); |
1761 | release_pte_pages(pte, _pte); | ||
1762 | goto out; | 1766 | goto out; |
1763 | } | 1767 | } |
1764 | /* 0 stands for page_is_file_cache(page) == false */ | 1768 | /* 0 stands for page_is_file_cache(page) == false */ |
@@ -1771,12 +1775,11 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
1771 | mmu_notifier_test_young(vma->vm_mm, address)) | 1775 | mmu_notifier_test_young(vma->vm_mm, address)) |
1772 | referenced = 1; | 1776 | referenced = 1; |
1773 | } | 1777 | } |
1774 | if (unlikely(!referenced)) | 1778 | if (likely(referenced)) |
1775 | release_all_pte_pages(pte); | 1779 | return 1; |
1776 | else | ||
1777 | isolated = 1; | ||
1778 | out: | 1780 | out: |
1779 | return isolated; | 1781 | release_pte_pages(pte, _pte); |
1782 | return 0; | ||
1780 | } | 1783 | } |
1781 | 1784 | ||
1782 | static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | 1785 | static void __collapse_huge_page_copy(pte_t *pte, struct page *page, |
@@ -1918,14 +1921,26 @@ static struct page | |||
1918 | } | 1921 | } |
1919 | #endif | 1922 | #endif |
1920 | 1923 | ||
1924 | static bool hugepage_vma_check(struct vm_area_struct *vma) | ||
1925 | { | ||
1926 | if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || | ||
1927 | (vma->vm_flags & VM_NOHUGEPAGE)) | ||
1928 | return false; | ||
1929 | |||
1930 | if (!vma->anon_vma || vma->vm_ops) | ||
1931 | return false; | ||
1932 | if (is_vma_temporary_stack(vma)) | ||
1933 | return false; | ||
1934 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); | ||
1935 | return true; | ||
1936 | } | ||
1937 | |||
1921 | static void collapse_huge_page(struct mm_struct *mm, | 1938 | static void collapse_huge_page(struct mm_struct *mm, |
1922 | unsigned long address, | 1939 | unsigned long address, |
1923 | struct page **hpage, | 1940 | struct page **hpage, |
1924 | struct vm_area_struct *vma, | 1941 | struct vm_area_struct *vma, |
1925 | int node) | 1942 | int node) |
1926 | { | 1943 | { |
1927 | pgd_t *pgd; | ||
1928 | pud_t *pud; | ||
1929 | pmd_t *pmd, _pmd; | 1944 | pmd_t *pmd, _pmd; |
1930 | pte_t *pte; | 1945 | pte_t *pte; |
1931 | pgtable_t pgtable; | 1946 | pgtable_t pgtable; |
@@ -1960,28 +1975,12 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1960 | hend = vma->vm_end & HPAGE_PMD_MASK; | 1975 | hend = vma->vm_end & HPAGE_PMD_MASK; |
1961 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) | 1976 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) |
1962 | goto out; | 1977 | goto out; |
1963 | 1978 | if (!hugepage_vma_check(vma)) | |
1964 | if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || | ||
1965 | (vma->vm_flags & VM_NOHUGEPAGE)) | ||
1966 | goto out; | ||
1967 | |||
1968 | if (!vma->anon_vma || vma->vm_ops) | ||
1969 | goto out; | ||
1970 | if (is_vma_temporary_stack(vma)) | ||
1971 | goto out; | 1979 | goto out; |
1972 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); | 1980 | pmd = mm_find_pmd(mm, address); |
1973 | 1981 | if (!pmd) | |
1974 | pgd = pgd_offset(mm, address); | ||
1975 | if (!pgd_present(*pgd)) | ||
1976 | goto out; | 1982 | goto out; |
1977 | 1983 | if (pmd_trans_huge(*pmd)) | |
1978 | pud = pud_offset(pgd, address); | ||
1979 | if (!pud_present(*pud)) | ||
1980 | goto out; | ||
1981 | |||
1982 | pmd = pmd_offset(pud, address); | ||
1983 | /* pmd can't go away or become huge under us */ | ||
1984 | if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) | ||
1985 | goto out; | 1984 | goto out; |
1986 | 1985 | ||
1987 | anon_vma_lock(vma->anon_vma); | 1986 | anon_vma_lock(vma->anon_vma); |
@@ -2028,9 +2027,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2028 | __SetPageUptodate(new_page); | 2027 | __SetPageUptodate(new_page); |
2029 | pgtable = pmd_pgtable(_pmd); | 2028 | pgtable = pmd_pgtable(_pmd); |
2030 | 2029 | ||
2031 | _pmd = mk_pmd(new_page, vma->vm_page_prot); | 2030 | _pmd = mk_huge_pmd(new_page, vma); |
2032 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); | ||
2033 | _pmd = pmd_mkhuge(_pmd); | ||
2034 | 2031 | ||
2035 | /* | 2032 | /* |
2036 | * spin_lock() below is not the equivalent of smp_wmb(), so | 2033 | * spin_lock() below is not the equivalent of smp_wmb(), so |
@@ -2064,8 +2061,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2064 | unsigned long address, | 2061 | unsigned long address, |
2065 | struct page **hpage) | 2062 | struct page **hpage) |
2066 | { | 2063 | { |
2067 | pgd_t *pgd; | ||
2068 | pud_t *pud; | ||
2069 | pmd_t *pmd; | 2064 | pmd_t *pmd; |
2070 | pte_t *pte, *_pte; | 2065 | pte_t *pte, *_pte; |
2071 | int ret = 0, referenced = 0, none = 0; | 2066 | int ret = 0, referenced = 0, none = 0; |
@@ -2076,16 +2071,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2076 | 2071 | ||
2077 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 2072 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
2078 | 2073 | ||
2079 | pgd = pgd_offset(mm, address); | 2074 | pmd = mm_find_pmd(mm, address); |
2080 | if (!pgd_present(*pgd)) | 2075 | if (!pmd) |
2081 | goto out; | ||
2082 | |||
2083 | pud = pud_offset(pgd, address); | ||
2084 | if (!pud_present(*pud)) | ||
2085 | goto out; | 2076 | goto out; |
2086 | 2077 | if (pmd_trans_huge(*pmd)) | |
2087 | pmd = pmd_offset(pud, address); | ||
2088 | if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) | ||
2089 | goto out; | 2078 | goto out; |
2090 | 2079 | ||
2091 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2080 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
@@ -2193,20 +2182,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |||
2193 | progress++; | 2182 | progress++; |
2194 | break; | 2183 | break; |
2195 | } | 2184 | } |
2196 | 2185 | if (!hugepage_vma_check(vma)) { | |
2197 | if ((!(vma->vm_flags & VM_HUGEPAGE) && | 2186 | skip: |
2198 | !khugepaged_always()) || | ||
2199 | (vma->vm_flags & VM_NOHUGEPAGE)) { | ||
2200 | skip: | ||
2201 | progress++; | 2187 | progress++; |
2202 | continue; | 2188 | continue; |
2203 | } | 2189 | } |
2204 | if (!vma->anon_vma || vma->vm_ops) | ||
2205 | goto skip; | ||
2206 | if (is_vma_temporary_stack(vma)) | ||
2207 | goto skip; | ||
2208 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); | ||
2209 | |||
2210 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 2190 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
2211 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2191 | hend = vma->vm_end & HPAGE_PMD_MASK; |
2212 | if (hstart >= hend) | 2192 | if (hstart >= hend) |
@@ -2379,22 +2359,12 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) | |||
2379 | static void split_huge_page_address(struct mm_struct *mm, | 2359 | static void split_huge_page_address(struct mm_struct *mm, |
2380 | unsigned long address) | 2360 | unsigned long address) |
2381 | { | 2361 | { |
2382 | pgd_t *pgd; | ||
2383 | pud_t *pud; | ||
2384 | pmd_t *pmd; | 2362 | pmd_t *pmd; |
2385 | 2363 | ||
2386 | VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); | 2364 | VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); |
2387 | 2365 | ||
2388 | pgd = pgd_offset(mm, address); | 2366 | pmd = mm_find_pmd(mm, address); |
2389 | if (!pgd_present(*pgd)) | 2367 | if (!pmd) |
2390 | return; | ||
2391 | |||
2392 | pud = pud_offset(pgd, address); | ||
2393 | if (!pud_present(*pud)) | ||
2394 | return; | ||
2395 | |||
2396 | pmd = pmd_offset(pud, address); | ||
2397 | if (!pmd_present(*pmd)) | ||
2398 | return; | 2368 | return; |
2399 | /* | 2369 | /* |
2400 | * Caller holds the mmap_sem write mode, so a huge pmd cannot | 2370 | * Caller holds the mmap_sem write mode, so a huge pmd cannot |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 59a0059b39e2..1ef2cd4ae3c9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -1800,7 +1800,7 @@ static void hugetlb_unregister_all_nodes(void) | |||
1800 | * remove hstate attributes from any nodes that have them. | 1800 | * remove hstate attributes from any nodes that have them. |
1801 | */ | 1801 | */ |
1802 | for (nid = 0; nid < nr_node_ids; nid++) | 1802 | for (nid = 0; nid < nr_node_ids; nid++) |
1803 | hugetlb_unregister_node(&node_devices[nid]); | 1803 | hugetlb_unregister_node(node_devices[nid]); |
1804 | } | 1804 | } |
1805 | 1805 | ||
1806 | /* | 1806 | /* |
@@ -1845,7 +1845,7 @@ static void hugetlb_register_all_nodes(void) | |||
1845 | int nid; | 1845 | int nid; |
1846 | 1846 | ||
1847 | for_each_node_state(nid, N_HIGH_MEMORY) { | 1847 | for_each_node_state(nid, N_HIGH_MEMORY) { |
1848 | struct node *node = &node_devices[nid]; | 1848 | struct node *node = node_devices[nid]; |
1849 | if (node->dev.id == nid) | 1849 | if (node->dev.id == nid) |
1850 | hugetlb_register_node(node); | 1850 | hugetlb_register_node(node); |
1851 | } | 1851 | } |
diff --git a/mm/internal.h b/mm/internal.h index a4fa284f6bc2..52d1fa957194 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -92,6 +92,11 @@ extern int isolate_lru_page(struct page *page); | |||
92 | extern void putback_lru_page(struct page *page); | 92 | extern void putback_lru_page(struct page *page); |
93 | 93 | ||
94 | /* | 94 | /* |
95 | * in mm/rmap.c: | ||
96 | */ | ||
97 | extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); | ||
98 | |||
99 | /* | ||
95 | * in mm/page_alloc.c | 100 | * in mm/page_alloc.c |
96 | */ | 101 | */ |
97 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 102 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
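mm_find_pmd() itself is added to mm/rmap.c, which is outside the hunks shown here; it simply folds the pgd/pud/pmd walk that the huge_memory.c and ksm.c call sites above and below used to open-code. A hedged sketch reconstructed from those removed call sites (the real definition lives in mm/rmap.c and may differ in detail):

    /* Walk to the pmd covering @address, or return NULL if any level is absent. */
    pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
    {
            pgd_t *pgd;
            pud_t *pud;
            pmd_t *pmd = NULL;

            pgd = pgd_offset(mm, address);
            if (!pgd_present(*pgd))
                    goto out;

            pud = pud_offset(pgd, address);
            if (!pud_present(*pud))
                    goto out;

            pmd = pmd_offset(pud, address);
            if (!pmd_present(*pmd))
                    pmd = NULL;
    out:
            return pmd;
    }

Note that callers which cannot tolerate a huge pmd still check pmd_trans_huge(*pmd) themselves, as the huge_memory.c hunks above show.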
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -778,8 +778,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
778 | struct page *kpage, pte_t orig_pte) | 778 | struct page *kpage, pte_t orig_pte) |
779 | { | 779 | { |
780 | struct mm_struct *mm = vma->vm_mm; | 780 | struct mm_struct *mm = vma->vm_mm; |
781 | pgd_t *pgd; | ||
782 | pud_t *pud; | ||
783 | pmd_t *pmd; | 781 | pmd_t *pmd; |
784 | pte_t *ptep; | 782 | pte_t *ptep; |
785 | spinlock_t *ptl; | 783 | spinlock_t *ptl; |
@@ -792,18 +790,10 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
792 | if (addr == -EFAULT) | 790 | if (addr == -EFAULT) |
793 | goto out; | 791 | goto out; |
794 | 792 | ||
795 | pgd = pgd_offset(mm, addr); | 793 | pmd = mm_find_pmd(mm, addr); |
796 | if (!pgd_present(*pgd)) | 794 | if (!pmd) |
797 | goto out; | 795 | goto out; |
798 | |||
799 | pud = pud_offset(pgd, addr); | ||
800 | if (!pud_present(*pud)) | ||
801 | goto out; | ||
802 | |||
803 | pmd = pmd_offset(pud, addr); | ||
804 | BUG_ON(pmd_trans_huge(*pmd)); | 796 | BUG_ON(pmd_trans_huge(*pmd)); |
805 | if (!pmd_present(*pmd)) | ||
806 | goto out; | ||
807 | 797 | ||
808 | mmun_start = addr; | 798 | mmun_start = addr; |
809 | mmun_end = addr + PAGE_SIZE; | 799 | mmun_end = addr + PAGE_SIZE; |
@@ -1929,12 +1919,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1929 | if (ksm_run != flags) { | 1919 | if (ksm_run != flags) { |
1930 | ksm_run = flags; | 1920 | ksm_run = flags; |
1931 | if (flags & KSM_RUN_UNMERGE) { | 1921 | if (flags & KSM_RUN_UNMERGE) { |
1932 | int oom_score_adj; | 1922 | set_current_oom_origin(); |
1933 | |||
1934 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); | ||
1935 | err = unmerge_and_remove_all_rmap_items(); | 1923 | err = unmerge_and_remove_all_rmap_items(); |
1936 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, | 1924 | clear_current_oom_origin(); |
1937 | oom_score_adj); | ||
1938 | if (err) { | 1925 | if (err) { |
1939 | ksm_run = KSM_RUN_STOP; | 1926 | ksm_run = KSM_RUN_STOP; |
1940 | count = err; | 1927 | count = err; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dd39ba000b31..cf6d0df4849c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -1498,8 +1498,8 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | |||
1498 | return limit; | 1498 | return limit; |
1499 | } | 1499 | } |
1500 | 1500 | ||
1501 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | 1501 | static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, |
1502 | int order) | 1502 | int order) |
1503 | { | 1503 | { |
1504 | struct mem_cgroup *iter; | 1504 | struct mem_cgroup *iter; |
1505 | unsigned long chosen_points = 0; | 1505 | unsigned long chosen_points = 0; |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8b20278be6a6..108c52fa60f6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -781,16 +781,16 @@ static struct page_state { | |||
781 | { compound, compound, "huge", me_huge_page }, | 781 | { compound, compound, "huge", me_huge_page }, |
782 | #endif | 782 | #endif |
783 | 783 | ||
784 | { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, | 784 | { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, |
785 | { sc|dirty, sc, "swapcache", me_swapcache_clean }, | 785 | { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, |
786 | 786 | ||
787 | { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, | 787 | { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, |
788 | { unevict, unevict, "unevictable LRU", me_pagecache_clean}, | 788 | { unevict, unevict, "clean unevictable LRU", me_pagecache_clean }, |
789 | 789 | ||
790 | { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, | 790 | { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, |
791 | { mlock, mlock, "mlocked LRU", me_pagecache_clean }, | 791 | { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, |
792 | 792 | ||
793 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, | 793 | { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, |
794 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | 794 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, |
795 | 795 | ||
796 | /* | 796 | /* |
@@ -812,14 +812,14 @@ static struct page_state { | |||
812 | #undef slab | 812 | #undef slab |
813 | #undef reserved | 813 | #undef reserved |
814 | 814 | ||
815 | /* | ||
816 | * "Dirty/Clean" indication is not 100% accurate due to the possibility of | ||
817 | * setting PG_dirty outside page lock. See also comment above set_page_dirty(). | ||
818 | */ | ||
815 | static void action_result(unsigned long pfn, char *msg, int result) | 819 | static void action_result(unsigned long pfn, char *msg, int result) |
816 | { | 820 | { |
817 | struct page *page = pfn_to_page(pfn); | 821 | pr_err("MCE %#lx: %s page recovery: %s\n", |
818 | 822 | pfn, msg, action_name[result]); | |
819 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", | ||
820 | pfn, | ||
821 | PageDirty(page) ? "dirty " : "", | ||
822 | msg, action_name[result]); | ||
823 | } | 823 | } |
824 | 824 | ||
825 | static int page_action(struct page_state *ps, struct page *p, | 825 | static int page_action(struct page_state *ps, struct page *p, |
@@ -1385,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1385 | * Isolate the page, so that it doesn't get reallocated if it | 1385 | * Isolate the page, so that it doesn't get reallocated if it |
1386 | * was free. | 1386 | * was free. |
1387 | */ | 1387 | */ |
1388 | set_migratetype_isolate(p); | 1388 | set_migratetype_isolate(p, true); |
1389 | /* | 1389 | /* |
1390 | * When the target page is a free hugepage, just remove it | 1390 | * When the target page is a free hugepage, just remove it |
1391 | * from free hugepage list. | 1391 | * from free hugepage list. |
diff --git a/mm/memory.c b/mm/memory.c index 221fc9ffcab1..765377385632 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -3537,8 +3537,9 @@ retry: | |||
3537 | 3537 | ||
3538 | barrier(); | 3538 | barrier(); |
3539 | if (pmd_trans_huge(orig_pmd)) { | 3539 | if (pmd_trans_huge(orig_pmd)) { |
3540 | if (flags & FAULT_FLAG_WRITE && | 3540 | unsigned int dirty = flags & FAULT_FLAG_WRITE; |
3541 | !pmd_write(orig_pmd) && | 3541 | |
3542 | if (dirty && !pmd_write(orig_pmd) && | ||
3542 | !pmd_trans_splitting(orig_pmd)) { | 3543 | !pmd_trans_splitting(orig_pmd)) { |
3543 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, | 3544 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, |
3544 | orig_pmd); | 3545 | orig_pmd); |
@@ -3550,6 +3551,9 @@ retry: | |||
3550 | if (unlikely(ret & VM_FAULT_OOM)) | 3551 | if (unlikely(ret & VM_FAULT_OOM)) |
3551 | goto retry; | 3552 | goto retry; |
3552 | return ret; | 3553 | return ret; |
3554 | } else { | ||
3555 | huge_pmd_set_accessed(mm, vma, address, pmd, | ||
3556 | orig_pmd, dirty); | ||
3553 | } | 3557 | } |
3554 | return 0; | 3558 | return 0; |
3555 | } | 3559 | } |
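
The new else branch covers a present huge pmd that only needs its accessed (and, on a
write fault, dirty) bit set, rather than a write-protect copy or a split. The patch does
not show huge_pmd_set_accessed() itself; a plausible sketch, assuming it revalidates the
pmd under mm->page_table_lock and then relies on the usual access-flag helpers
(pmd_same, pmd_mkyoung, pmdp_set_access_flags, update_mmu_cache_pmd), is:

    /* Hedged sketch: mark a still-mapped huge pmd as accessed (and dirty). */
    void huge_pmd_set_accessed(struct mm_struct *mm, struct vm_area_struct *vma,
                               unsigned long address, pmd_t *pmd,
                               pmd_t orig_pmd, int dirty)
    {
        pmd_t entry;

        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_same(*pmd, orig_pmd)))
            goto unlock;    /* raced with a split or zap; nothing to do */

        entry = pmd_mkyoung(orig_pmd);
        if (pmdp_set_access_flags(vma, address & HPAGE_PMD_MASK,
                                  pmd, entry, dirty))
            update_mmu_cache_pmd(vma, address, pmd);
    unlock:
        spin_unlock(&mm->page_table_lock);
    }
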
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e4eeacae2b91..de9cb14ae753 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -205,7 +205,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, | |||
205 | zone_span_writelock(zone); | 205 | zone_span_writelock(zone); |
206 | 206 | ||
207 | old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | 207 | old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; |
208 | if (start_pfn < zone->zone_start_pfn) | 208 | if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn) |
209 | zone->zone_start_pfn = start_pfn; | 209 | zone->zone_start_pfn = start_pfn; |
210 | 210 | ||
211 | zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - | 211 | zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - |
@@ -214,13 +214,134 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, | |||
214 | zone_span_writeunlock(zone); | 214 | zone_span_writeunlock(zone); |
215 | } | 215 | } |
216 | 216 | ||
217 | static void resize_zone(struct zone *zone, unsigned long start_pfn, | ||
218 | unsigned long end_pfn) | ||
219 | { | ||
220 | zone_span_writelock(zone); | ||
221 | |||
222 | if (end_pfn - start_pfn) { | ||
223 | zone->zone_start_pfn = start_pfn; | ||
224 | zone->spanned_pages = end_pfn - start_pfn; | ||
225 | } else { | ||
226 | /* | ||
227 | * keep it consistent with free_area_init_core(): | ||
228 | * if spanned_pages == 0, then keep zone_start_pfn = 0 | ||
229 | */ | ||
230 | zone->zone_start_pfn = 0; | ||
231 | zone->spanned_pages = 0; | ||
232 | } | ||
233 | |||
234 | zone_span_writeunlock(zone); | ||
235 | } | ||
236 | |||
237 | static void fix_zone_id(struct zone *zone, unsigned long start_pfn, | ||
238 | unsigned long end_pfn) | ||
239 | { | ||
240 | enum zone_type zid = zone_idx(zone); | ||
241 | int nid = zone->zone_pgdat->node_id; | ||
242 | unsigned long pfn; | ||
243 | |||
244 | for (pfn = start_pfn; pfn < end_pfn; pfn++) | ||
245 | set_page_links(pfn_to_page(pfn), zid, nid, pfn); | ||
246 | } | ||
247 | |||
248 | static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, | ||
249 | unsigned long start_pfn, unsigned long end_pfn) | ||
250 | { | ||
251 | int ret; | ||
252 | unsigned long flags; | ||
253 | unsigned long z1_start_pfn; | ||
254 | |||
255 | if (!z1->wait_table) { | ||
256 | ret = init_currently_empty_zone(z1, start_pfn, | ||
257 | end_pfn - start_pfn, MEMMAP_HOTPLUG); | ||
258 | if (ret) | ||
259 | return ret; | ||
260 | } | ||
261 | |||
262 | pgdat_resize_lock(z1->zone_pgdat, &flags); | ||
263 | |||
264 | /* can't move pfns which are higher than @z2 */ | ||
265 | if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) | ||
266 | goto out_fail; | ||
267 | /* the moved-out part must be at the leftmost of @z2 */ | ||
268 | if (start_pfn > z2->zone_start_pfn) | ||
269 | goto out_fail; | ||
270 | /* must include/overlap */ | ||
271 | if (end_pfn <= z2->zone_start_pfn) | ||
272 | goto out_fail; | ||
273 | |||
274 | /* use start_pfn for z1's start_pfn if z1 is empty */ | ||
275 | if (z1->spanned_pages) | ||
276 | z1_start_pfn = z1->zone_start_pfn; | ||
277 | else | ||
278 | z1_start_pfn = start_pfn; | ||
279 | |||
280 | resize_zone(z1, z1_start_pfn, end_pfn); | ||
281 | resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); | ||
282 | |||
283 | pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||
284 | |||
285 | fix_zone_id(z1, start_pfn, end_pfn); | ||
286 | |||
287 | return 0; | ||
288 | out_fail: | ||
289 | pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||
290 | return -1; | ||
291 | } | ||
292 | |||
293 | static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, | ||
294 | unsigned long start_pfn, unsigned long end_pfn) | ||
295 | { | ||
296 | int ret; | ||
297 | unsigned long flags; | ||
298 | unsigned long z2_end_pfn; | ||
299 | |||
300 | if (!z2->wait_table) { | ||
301 | ret = init_currently_empty_zone(z2, start_pfn, | ||
302 | end_pfn - start_pfn, MEMMAP_HOTPLUG); | ||
303 | if (ret) | ||
304 | return ret; | ||
305 | } | ||
306 | |||
307 | pgdat_resize_lock(z1->zone_pgdat, &flags); | ||
308 | |||
309 | /* can't move pfns which are lower than @z1 */ | ||
310 | if (z1->zone_start_pfn > start_pfn) | ||
311 | goto out_fail; | ||
312 | /* the moved-out part must be at the rightmost of @z1 */ | ||
313 | if (z1->zone_start_pfn + z1->spanned_pages > end_pfn) | ||
314 | goto out_fail; | ||
315 | /* must include/overlap */ | ||
316 | if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) | ||
317 | goto out_fail; | ||
318 | |||
319 | /* use end_pfn for z2's end_pfn if z2 is empty */ | ||
320 | if (z2->spanned_pages) | ||
321 | z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages; | ||
322 | else | ||
323 | z2_end_pfn = end_pfn; | ||
324 | |||
325 | resize_zone(z1, z1->zone_start_pfn, start_pfn); | ||
326 | resize_zone(z2, start_pfn, z2_end_pfn); | ||
327 | |||
328 | pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||
329 | |||
330 | fix_zone_id(z2, start_pfn, end_pfn); | ||
331 | |||
332 | return 0; | ||
333 | out_fail: | ||
334 | pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||
335 | return -1; | ||
336 | } | ||
337 | |||
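
move_pfn_range_left() and move_pfn_range_right() only shuffle span bookkeeping between
two adjacent zones; fix_zone_id() then retags the struct pages. Because the boundary
rules are easy to lose among the locking, here is a self-contained user-space model of
the "move left" (online_kernel) case. struct zspan is a hypothetical stand-in for the
start/spanned fields of struct zone, and the three checks mirror the ones above:

    #include <assert.h>
    #include <stdio.h>

    /* Hypothetical stand-in for the span fields of struct zone. */
    struct zspan { unsigned long start, spanned; };

    static unsigned long zend(const struct zspan *z)
    {
        return z->start + z->spanned;
    }

    /* Model of move_pfn_range_left(): grow z1 up to end, shrink z2 to [end, ...). */
    static int move_left(struct zspan *z1, struct zspan *z2,
                         unsigned long start, unsigned long end)
    {
        if (end > zend(z2))        /* can't move pfns which are higher than z2 */
            return -1;
        if (start > z2->start)     /* moved-out part must be z2's leftmost part */
            return -1;
        if (end <= z2->start)      /* must actually overlap z2 */
            return -1;

        z1->start = z1->spanned ? z1->start : start;  /* empty z1 starts at start */
        z1->spanned = end - z1->start;
        z2->spanned = zend(z2) - end;
        z2->start = z2->spanned ? end : 0;            /* empty zone keeps start 0 */
        return 0;
    }

    int main(void)
    {
        struct zspan normal  = { 0x10000, 0x8000 };   /* [0x10000, 0x18000) */
        struct zspan movable = { 0x18000, 0x8000 };   /* [0x18000, 0x20000) */

        /* Retag the first 0x4000 pfns of MOVABLE as NORMAL (online_kernel). */
        assert(move_left(&normal, &movable, 0x18000, 0x1c000) == 0);
        printf("normal %lx+%lx, movable %lx+%lx\n",
               normal.start, normal.spanned, movable.start, movable.spanned);
        return 0;
    }
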
217 | static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, | 338 | static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, |
218 | unsigned long end_pfn) | 339 | unsigned long end_pfn) |
219 | { | 340 | { |
220 | unsigned long old_pgdat_end_pfn = | 341 | unsigned long old_pgdat_end_pfn = |
221 | pgdat->node_start_pfn + pgdat->node_spanned_pages; | 342 | pgdat->node_start_pfn + pgdat->node_spanned_pages; |
222 | 343 | ||
223 | if (start_pfn < pgdat->node_start_pfn) | 344 | if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) |
224 | pgdat->node_start_pfn = start_pfn; | 345 | pgdat->node_start_pfn = start_pfn; |
225 | 346 | ||
226 | pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - | 347 | pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - |
@@ -460,8 +581,61 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, | |||
460 | return 0; | 581 | return 0; |
461 | } | 582 | } |
462 | 583 | ||
584 | /* ensure every online node has NORMAL memory */ | ||
585 | static bool can_online_high_movable(struct zone *zone) | ||
586 | { | ||
587 | return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); | ||
588 | } | ||
589 | |||
590 | /* check which states in node_states will change when memory is onlined */ | ||
591 | static void node_states_check_changes_online(unsigned long nr_pages, | ||
592 | struct zone *zone, struct memory_notify *arg) | ||
593 | { | ||
594 | int nid = zone_to_nid(zone); | ||
595 | enum zone_type zone_last = ZONE_NORMAL; | ||
596 | |||
597 | /* | ||
598 | * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes | ||
599 | * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. | ||
600 | * | ||
601 | * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes | ||
602 | * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | ||
603 | */ | ||
604 | if (N_HIGH_MEMORY == N_NORMAL_MEMORY) | ||
605 | zone_last = ZONE_MOVABLE; | ||
606 | |||
607 | /* | ||
608 | * If the memory to be onlined is in a zone of 0...zone_last, and | ||
609 | * the zones of 0...zone_last have no memory before onlining, we will | ||
610 | * need to set the node in node_states[N_NORMAL_MEMORY] after | ||
611 | * the memory is onlined. | ||
612 | */ | ||
613 | if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) | ||
614 | arg->status_change_nid_normal = nid; | ||
615 | else | ||
616 | arg->status_change_nid_normal = -1; | ||
617 | |||
618 | /* | ||
619 | * If the node doesn't have memory before onlining, we will need to | ||
620 | * set the node in node_states[N_HIGH_MEMORY] after the memory | ||
621 | * is onlined. | ||
622 | */ | ||
623 | if (!node_state(nid, N_HIGH_MEMORY)) | ||
624 | arg->status_change_nid = nid; | ||
625 | else | ||
626 | arg->status_change_nid = -1; | ||
627 | } | ||
628 | |||
629 | static void node_states_set_node(int node, struct memory_notify *arg) | ||
630 | { | ||
631 | if (arg->status_change_nid_normal >= 0) | ||
632 | node_set_state(node, N_NORMAL_MEMORY); | ||
633 | |||
634 | node_set_state(node, N_HIGH_MEMORY); | ||
635 | } | ||
463 | 636 | ||
464 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | 637 | |
638 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) | ||
465 | { | 639 | { |
466 | unsigned long onlined_pages = 0; | 640 | unsigned long onlined_pages = 0; |
467 | struct zone *zone; | 641 | struct zone *zone; |
@@ -471,13 +645,40 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
471 | struct memory_notify arg; | 645 | struct memory_notify arg; |
472 | 646 | ||
473 | lock_memory_hotplug(); | 647 | lock_memory_hotplug(); |
648 | /* | ||
649 | * This doesn't need a lock to do pfn_to_page(). | ||
650 | * The section can't be removed here because of the | ||
651 | * memory_block->state_mutex. | ||
652 | */ | ||
653 | zone = page_zone(pfn_to_page(pfn)); | ||
654 | |||
655 | if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && | ||
656 | !can_online_high_movable(zone)) { | ||
657 | unlock_memory_hotplug(); | ||
658 | return -1; | ||
659 | } | ||
660 | |||
661 | if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { | ||
662 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { | ||
663 | unlock_memory_hotplug(); | ||
664 | return -1; | ||
665 | } | ||
666 | } | ||
667 | if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { | ||
668 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { | ||
669 | unlock_memory_hotplug(); | ||
670 | return -1; | ||
671 | } | ||
672 | } | ||
673 | |||
674 | /* The code above may have changed the zone of the pfn range */ | ||
675 | zone = page_zone(pfn_to_page(pfn)); | ||
676 | |||
474 | arg.start_pfn = pfn; | 677 | arg.start_pfn = pfn; |
475 | arg.nr_pages = nr_pages; | 678 | arg.nr_pages = nr_pages; |
476 | arg.status_change_nid = -1; | 679 | node_states_check_changes_online(nr_pages, zone, &arg); |
477 | 680 | ||
478 | nid = page_to_nid(pfn_to_page(pfn)); | 681 | nid = page_to_nid(pfn_to_page(pfn)); |
479 | if (node_present_pages(nid) == 0) | ||
480 | arg.status_change_nid = nid; | ||
481 | 682 | ||
482 | ret = memory_notify(MEM_GOING_ONLINE, &arg); | 683 | ret = memory_notify(MEM_GOING_ONLINE, &arg); |
483 | ret = notifier_to_errno(ret); | 684 | ret = notifier_to_errno(ret); |
@@ -487,23 +688,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
487 | return ret; | 688 | return ret; |
488 | } | 689 | } |
489 | /* | 690 | /* |
490 | * This doesn't need a lock to do pfn_to_page(). | ||
491 | * The section can't be removed here because of the | ||
492 | * memory_block->state_mutex. | ||
493 | */ | ||
494 | zone = page_zone(pfn_to_page(pfn)); | ||
495 | /* | ||
496 | * If this zone is not populated, then it is not in zonelist. | 691 | * If this zone is not populated, then it is not in zonelist. |
497 | * This means the page allocator ignores this zone. | 692 | * This means the page allocator ignores this zone. |
498 | * So, zonelist must be updated after online. | 693 | * So, zonelist must be updated after online. |
499 | */ | 694 | */ |
500 | mutex_lock(&zonelists_mutex); | 695 | mutex_lock(&zonelists_mutex); |
501 | if (!populated_zone(zone)) | 696 | if (!populated_zone(zone)) { |
502 | need_zonelists_rebuild = 1; | 697 | need_zonelists_rebuild = 1; |
698 | build_all_zonelists(NULL, zone); | ||
699 | } | ||
503 | 700 | ||
504 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, | 701 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, |
505 | online_pages_range); | 702 | online_pages_range); |
506 | if (ret) { | 703 | if (ret) { |
704 | if (need_zonelists_rebuild) | ||
705 | zone_pcp_reset(zone); | ||
507 | mutex_unlock(&zonelists_mutex); | 706 | mutex_unlock(&zonelists_mutex); |
508 | printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", | 707 | printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", |
509 | (unsigned long long) pfn << PAGE_SHIFT, | 708 | (unsigned long long) pfn << PAGE_SHIFT, |
@@ -517,9 +716,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
517 | zone->present_pages += onlined_pages; | 716 | zone->present_pages += onlined_pages; |
518 | zone->zone_pgdat->node_present_pages += onlined_pages; | 717 | zone->zone_pgdat->node_present_pages += onlined_pages; |
519 | if (onlined_pages) { | 718 | if (onlined_pages) { |
520 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | 719 | node_states_set_node(zone_to_nid(zone), &arg); |
521 | if (need_zonelists_rebuild) | 720 | if (need_zonelists_rebuild) |
522 | build_all_zonelists(NULL, zone); | 721 | build_all_zonelists(NULL, NULL); |
523 | else | 722 | else |
524 | zone_pcp_update(zone); | 723 | zone_pcp_update(zone); |
525 | } | 724 | } |
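
Taken together, the new online_pages() prologue is a small dispatch: "online_kernel"
re-homes a range that currently sits in ZONE_MOVABLE into the zone just below it,
"online_movable" pushes a range from the zone just below ZONE_MOVABLE up into it, and a
plain "online" leaves the zone untouched. A self-contained model of just that decision
(the enum values and the two-entry zone list are illustrative, not the kernel's):

    #include <stdio.h>

    /* Illustrative stand-ins for the kernel's online_type and zone indices. */
    enum online_type { ONLINE_KEEP, ONLINE_KERNEL, ONLINE_MOVABLE };
    enum zone_index  { ZONE_NORMAL, ZONE_MOVABLE };

    /*
     * -1: move the range into the next lower zone (move_pfn_range_left)
     * +1: move the range into ZONE_MOVABLE        (move_pfn_range_right)
     *  0: leave it in the zone it already occupies
     */
    static int zone_shift(enum zone_index zi, enum online_type type)
    {
        if (type == ONLINE_KERNEL && zi == ZONE_MOVABLE)
            return -1;
        if (type == ONLINE_MOVABLE && zi == ZONE_MOVABLE - 1)
            return +1;
        return 0;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               zone_shift(ZONE_MOVABLE, ONLINE_KERNEL),   /* -1 */
               zone_shift(ZONE_NORMAL,  ONLINE_MOVABLE),  /* +1 */
               zone_shift(ZONE_NORMAL,  ONLINE_KEEP));    /*  0 */
        return 0;
    }
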
@@ -847,7 +1046,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, | |||
847 | { | 1046 | { |
848 | int ret; | 1047 | int ret; |
849 | long offlined = *(long *)data; | 1048 | long offlined = *(long *)data; |
850 | ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); | 1049 | ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); |
851 | offlined = nr_pages; | 1050 | offlined = nr_pages; |
852 | if (!ret) | 1051 | if (!ret) |
853 | *(long *)data += offlined; | 1052 | *(long *)data += offlined; |
@@ -867,6 +1066,91 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
867 | return offlined; | 1066 | return offlined; |
868 | } | 1067 | } |
869 | 1068 | ||
1069 | /* ensure the node has NORMAL memory if it is still online */ | ||
1070 | static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) | ||
1071 | { | ||
1072 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
1073 | unsigned long present_pages = 0; | ||
1074 | enum zone_type zt; | ||
1075 | |||
1076 | for (zt = 0; zt <= ZONE_NORMAL; zt++) | ||
1077 | present_pages += pgdat->node_zones[zt].present_pages; | ||
1078 | |||
1079 | if (present_pages > nr_pages) | ||
1080 | return true; | ||
1081 | |||
1082 | present_pages = 0; | ||
1083 | for (; zt <= ZONE_MOVABLE; zt++) | ||
1084 | present_pages += pgdat->node_zones[zt].present_pages; | ||
1085 | |||
1086 | /* | ||
1087 | * we can't offline the last normal memory until all | ||
1088 | * higher memory is offlined. | ||
1089 | */ | ||
1090 | return present_pages == 0; | ||
1091 | } | ||
1092 | |||
1093 | /* check which states in node_states will change when memory is offlined */ | ||
1094 | static void node_states_check_changes_offline(unsigned long nr_pages, | ||
1095 | struct zone *zone, struct memory_notify *arg) | ||
1096 | { | ||
1097 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
1098 | unsigned long present_pages = 0; | ||
1099 | enum zone_type zt, zone_last = ZONE_NORMAL; | ||
1100 | |||
1101 | /* | ||
1102 | * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes | ||
1103 | * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. | ||
1104 | * | ||
1105 | * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes | ||
1106 | * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | ||
1107 | */ | ||
1108 | if (N_HIGH_MEMORY == N_NORMAL_MEMORY) | ||
1109 | zone_last = ZONE_MOVABLE; | ||
1110 | |||
1111 | /* | ||
1112 | * check whether node_states[N_NORMAL_MEMORY] will be changed. | ||
1113 | * If the memory to be offlined is in a zone of 0...zone_last, | ||
1114 | * and it is the last present memory, 0...zone_last will | ||
1115 | * become empty after offlining, thus we can determine that we will | ||
1116 | * need to clear the node from node_states[N_NORMAL_MEMORY]. | ||
1117 | */ | ||
1118 | for (zt = 0; zt <= zone_last; zt++) | ||
1119 | present_pages += pgdat->node_zones[zt].present_pages; | ||
1120 | if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) | ||
1121 | arg->status_change_nid_normal = zone_to_nid(zone); | ||
1122 | else | ||
1123 | arg->status_change_nid_normal = -1; | ||
1124 | |||
1125 | /* | ||
1126 | * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE | ||
1127 | */ | ||
1128 | zone_last = ZONE_MOVABLE; | ||
1129 | |||
1130 | /* | ||
1131 | * check whether node_states[N_HIGH_MEMORY] will be changed. | ||
1132 | * If we try to offline the last present @nr_pages from the node, | ||
1133 | * we can determine that we will need to clear the node from | ||
1134 | * node_states[N_HIGH_MEMORY]. | ||
1135 | */ | ||
1136 | for (; zt <= zone_last; zt++) | ||
1137 | present_pages += pgdat->node_zones[zt].present_pages; | ||
1138 | if (nr_pages >= present_pages) | ||
1139 | arg->status_change_nid = zone_to_nid(zone); | ||
1140 | else | ||
1141 | arg->status_change_nid = -1; | ||
1142 | } | ||
1143 | |||
1144 | static void node_states_clear_node(int node, struct memory_notify *arg) | ||
1145 | { | ||
1146 | if (arg->status_change_nid_normal >= 0) | ||
1147 | node_clear_state(node, N_NORMAL_MEMORY); | ||
1148 | |||
1149 | if ((N_HIGH_MEMORY != N_NORMAL_MEMORY) && | ||
1150 | (arg->status_change_nid >= 0)) | ||
1151 | node_clear_state(node, N_HIGH_MEMORY); | ||
1152 | } | ||
1153 | |||
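
node_states_check_changes_offline() decides, before the MEM_GOING_OFFLINE notifier runs,
whether the node is about to lose its last "normal" memory, its last memory altogether,
or neither. Below is a self-contained model of that accounting, with a plain array of
per-zone present-page counts standing in for pgdat->node_zones and a simplified
three-entry zone list; it corresponds to the HIGHMEM case described above, where
N_NORMAL_MEMORY covers zones 0...ZONE_NORMAL:

    #include <stdio.h>

    /* Simplified zone list; the real one has more entries. */
    enum { ZONE_DMA, ZONE_NORMAL, ZONE_MOVABLE, NR_ZONES };

    struct notify_arg { int status_change_nid_normal, status_change_nid; };

    static void check_changes_offline(const unsigned long present[NR_ZONES],
                                      int zone_idx, unsigned long nr_pages,
                                      int nid, struct notify_arg *arg)
    {
        unsigned long pages = 0;
        int zt;

        /* Will zones 0..ZONE_NORMAL become empty?  Then clear N_NORMAL_MEMORY. */
        for (zt = 0; zt <= ZONE_NORMAL; zt++)
            pages += present[zt];
        arg->status_change_nid_normal =
            (zone_idx <= ZONE_NORMAL && nr_pages >= pages) ? nid : -1;

        /* Will the whole node become empty?  Then clear N_HIGH_MEMORY as well. */
        for (; zt <= ZONE_MOVABLE; zt++)
            pages += present[zt];
        arg->status_change_nid = (nr_pages >= pages) ? nid : -1;
    }

    int main(void)
    {
        unsigned long present[NR_ZONES] = { 0, 4096, 8192 };
        struct notify_arg arg;

        /* Offlining all 4096 NORMAL pages: node keeps memory, loses "normal". */
        check_changes_offline(present, ZONE_NORMAL, 4096, 1, &arg);
        printf("normal=%d any=%d\n", arg.status_change_nid_normal,
               arg.status_change_nid);                    /* normal=1 any=-1 */
        return 0;
    }
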
870 | static int __ref __offline_pages(unsigned long start_pfn, | 1154 | static int __ref __offline_pages(unsigned long start_pfn, |
871 | unsigned long end_pfn, unsigned long timeout) | 1155 | unsigned long end_pfn, unsigned long timeout) |
872 | { | 1156 | { |
@@ -893,16 +1177,19 @@ static int __ref __offline_pages(unsigned long start_pfn, | |||
893 | node = zone_to_nid(zone); | 1177 | node = zone_to_nid(zone); |
894 | nr_pages = end_pfn - start_pfn; | 1178 | nr_pages = end_pfn - start_pfn; |
895 | 1179 | ||
1180 | ret = -EINVAL; | ||
1181 | if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) | ||
1182 | goto out; | ||
1183 | |||
896 | /* set above range as isolated */ | 1184 | /* set above range as isolated */ |
897 | ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | 1185 | ret = start_isolate_page_range(start_pfn, end_pfn, |
1186 | MIGRATE_MOVABLE, true); | ||
898 | if (ret) | 1187 | if (ret) |
899 | goto out; | 1188 | goto out; |
900 | 1189 | ||
901 | arg.start_pfn = start_pfn; | 1190 | arg.start_pfn = start_pfn; |
902 | arg.nr_pages = nr_pages; | 1191 | arg.nr_pages = nr_pages; |
903 | arg.status_change_nid = -1; | 1192 | node_states_check_changes_offline(nr_pages, zone, &arg); |
904 | if (nr_pages >= node_present_pages(node)) | ||
905 | arg.status_change_nid = node; | ||
906 | 1193 | ||
907 | ret = memory_notify(MEM_GOING_OFFLINE, &arg); | 1194 | ret = memory_notify(MEM_GOING_OFFLINE, &arg); |
908 | ret = notifier_to_errno(ret); | 1195 | ret = notifier_to_errno(ret); |
@@ -975,10 +1262,9 @@ repeat: | |||
975 | } else | 1262 | } else |
976 | zone_pcp_update(zone); | 1263 | zone_pcp_update(zone); |
977 | 1264 | ||
978 | if (!node_present_pages(node)) { | 1265 | node_states_clear_node(node, &arg); |
979 | node_clear_state(node, N_HIGH_MEMORY); | 1266 | if (arg.status_change_nid >= 0) |
980 | kswapd_stop(node); | 1267 | kswapd_stop(node); |
981 | } | ||
982 | 1268 | ||
983 | vm_total_pages = nr_free_pagecache_pages(); | 1269 | vm_total_pages = nr_free_pagecache_pages(); |
984 | writeback_set_ratelimit(); | 1270 | writeback_set_ratelimit(); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ea600da8940..05b28361a39b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -1907,7 +1907,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | |||
1907 | unsigned long addr, int node) | 1907 | unsigned long addr, int node) |
1908 | { | 1908 | { |
1909 | struct mempolicy *pol; | 1909 | struct mempolicy *pol; |
1910 | struct zonelist *zl; | ||
1911 | struct page *page; | 1910 | struct page *page; |
1912 | unsigned int cpuset_mems_cookie; | 1911 | unsigned int cpuset_mems_cookie; |
1913 | 1912 | ||
@@ -1926,23 +1925,11 @@ retry_cpuset: | |||
1926 | 1925 | ||
1927 | return page; | 1926 | return page; |
1928 | } | 1927 | } |
1929 | zl = policy_zonelist(gfp, pol, node); | 1928 | page = __alloc_pages_nodemask(gfp, order, |
1930 | if (unlikely(mpol_needs_cond_ref(pol))) { | 1929 | policy_zonelist(gfp, pol, node), |
1931 | /* | ||
1932 | * slow path: ref counted shared policy | ||
1933 | */ | ||
1934 | struct page *page = __alloc_pages_nodemask(gfp, order, | ||
1935 | zl, policy_nodemask(gfp, pol)); | ||
1936 | __mpol_put(pol); | ||
1937 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | ||
1938 | goto retry_cpuset; | ||
1939 | return page; | ||
1940 | } | ||
1941 | /* | ||
1942 | * fast path: default or task policy | ||
1943 | */ | ||
1944 | page = __alloc_pages_nodemask(gfp, order, zl, | ||
1945 | policy_nodemask(gfp, pol)); | 1930 | policy_nodemask(gfp, pol)); |
1931 | if (unlikely(mpol_needs_cond_ref(pol))) | ||
1932 | __mpol_put(pol); | ||
1946 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | 1933 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1947 | goto retry_cpuset; | 1934 | goto retry_cpuset; |
1948 | return page; | 1935 | return page; |
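
The alloc_pages_vma() change is a behavior-preserving simplification: instead of
duplicating the allocation call in a ref-counted "slow" path and a plain "fast" path, it
allocates once and drops the conditional policy reference right after the call. The
shape of that refactor on a hypothetical ref-counted object (the names below are
illustrative, not the mempolicy API):

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical ref-counted policy object standing in for struct mempolicy. */
    struct pol { int refcnt; int shared; };

    static int  needs_cond_ref(const struct pol *p) { return p->shared; }
    static void pol_put(struct pol *p)
    {
        if (--p->refcnt == 0)
            printf("policy freed\n");
    }
    static void *do_alloc(void) { return malloc(64); }

    /* Single allocation site; the reference taken by a shared-policy lookup
     * is dropped once, after the allocation it was protecting. */
    static void *alloc_with_policy(struct pol *p)
    {
        void *obj = do_alloc();     /* p is still pinned across the call */

        if (needs_cond_ref(p))
            pol_put(p);             /* drop the lookup's extra reference */
        return obj;
    }

    int main(void)
    {
        struct pol shared = { 1, 1 };
        free(alloc_with_policy(&shared));
        return 0;
    }
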
diff --git a/mm/migrate.c b/mm/migrate.c index 77ed2d773705..3f675ca08279 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/hugetlb.h> | 35 | #include <linux/hugetlb.h> |
36 | #include <linux/hugetlb_cgroup.h> | 36 | #include <linux/hugetlb_cgroup.h> |
37 | #include <linux/gfp.h> | 37 | #include <linux/gfp.h> |
38 | #include <linux/balloon_compaction.h> | ||
38 | 39 | ||
39 | #include <asm/tlbflush.h> | 40 | #include <asm/tlbflush.h> |
40 | 41 | ||
@@ -79,7 +80,30 @@ void putback_lru_pages(struct list_head *l) | |||
79 | list_del(&page->lru); | 80 | list_del(&page->lru); |
80 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 81 | dec_zone_page_state(page, NR_ISOLATED_ANON + |
81 | page_is_file_cache(page)); | 82 | page_is_file_cache(page)); |
82 | putback_lru_page(page); | 83 | putback_lru_page(page); |
84 | } | ||
85 | } | ||
86 | |||
87 | /* | ||
88 | * Put previously isolated pages back onto the appropriate lists | ||
89 | * from where they were once taken off for compaction/migration. | ||
90 | * | ||
91 | * This function shall be used instead of putback_lru_pages(), | ||
92 | * whenever the isolated pageset has been built by isolate_migratepages_range() | ||
93 | */ | ||
94 | void putback_movable_pages(struct list_head *l) | ||
95 | { | ||
96 | struct page *page; | ||
97 | struct page *page2; | ||
98 | |||
99 | list_for_each_entry_safe(page, page2, l, lru) { | ||
100 | list_del(&page->lru); | ||
101 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
102 | page_is_file_cache(page)); | ||
103 | if (unlikely(balloon_page_movable(page))) | ||
104 | balloon_page_putback(page); | ||
105 | else | ||
106 | putback_lru_page(page); | ||
83 | } | 107 | } |
84 | } | 108 | } |
85 | 109 | ||
@@ -91,8 +115,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
91 | { | 115 | { |
92 | struct mm_struct *mm = vma->vm_mm; | 116 | struct mm_struct *mm = vma->vm_mm; |
93 | swp_entry_t entry; | 117 | swp_entry_t entry; |
94 | pgd_t *pgd; | ||
95 | pud_t *pud; | ||
96 | pmd_t *pmd; | 118 | pmd_t *pmd; |
97 | pte_t *ptep, pte; | 119 | pte_t *ptep, pte; |
98 | spinlock_t *ptl; | 120 | spinlock_t *ptl; |
@@ -103,19 +125,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
103 | goto out; | 125 | goto out; |
104 | ptl = &mm->page_table_lock; | 126 | ptl = &mm->page_table_lock; |
105 | } else { | 127 | } else { |
106 | pgd = pgd_offset(mm, addr); | 128 | pmd = mm_find_pmd(mm, addr); |
107 | if (!pgd_present(*pgd)) | 129 | if (!pmd) |
108 | goto out; | ||
109 | |||
110 | pud = pud_offset(pgd, addr); | ||
111 | if (!pud_present(*pud)) | ||
112 | goto out; | 130 | goto out; |
113 | |||
114 | pmd = pmd_offset(pud, addr); | ||
115 | if (pmd_trans_huge(*pmd)) | 131 | if (pmd_trans_huge(*pmd)) |
116 | goto out; | 132 | goto out; |
117 | if (!pmd_present(*pmd)) | ||
118 | goto out; | ||
119 | 133 | ||
120 | ptep = pte_offset_map(pmd, addr); | 134 | ptep = pte_offset_map(pmd, addr); |
121 | 135 | ||
@@ -286,7 +300,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
286 | /* Anonymous page without mapping */ | 300 | /* Anonymous page without mapping */ |
287 | if (page_count(page) != 1) | 301 | if (page_count(page) != 1) |
288 | return -EAGAIN; | 302 | return -EAGAIN; |
289 | return 0; | 303 | return MIGRATEPAGE_SUCCESS; |
290 | } | 304 | } |
291 | 305 | ||
292 | spin_lock_irq(&mapping->tree_lock); | 306 | spin_lock_irq(&mapping->tree_lock); |
@@ -356,7 +370,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
356 | } | 370 | } |
357 | spin_unlock_irq(&mapping->tree_lock); | 371 | spin_unlock_irq(&mapping->tree_lock); |
358 | 372 | ||
359 | return 0; | 373 | return MIGRATEPAGE_SUCCESS; |
360 | } | 374 | } |
361 | 375 | ||
362 | /* | 376 | /* |
@@ -372,7 +386,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
372 | if (!mapping) { | 386 | if (!mapping) { |
373 | if (page_count(page) != 1) | 387 | if (page_count(page) != 1) |
374 | return -EAGAIN; | 388 | return -EAGAIN; |
375 | return 0; | 389 | return MIGRATEPAGE_SUCCESS; |
376 | } | 390 | } |
377 | 391 | ||
378 | spin_lock_irq(&mapping->tree_lock); | 392 | spin_lock_irq(&mapping->tree_lock); |
@@ -399,7 +413,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
399 | page_unfreeze_refs(page, expected_count - 1); | 413 | page_unfreeze_refs(page, expected_count - 1); |
400 | 414 | ||
401 | spin_unlock_irq(&mapping->tree_lock); | 415 | spin_unlock_irq(&mapping->tree_lock); |
402 | return 0; | 416 | return MIGRATEPAGE_SUCCESS; |
403 | } | 417 | } |
404 | 418 | ||
405 | /* | 419 | /* |
@@ -486,11 +500,11 @@ int migrate_page(struct address_space *mapping, | |||
486 | 500 | ||
487 | rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); | 501 | rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); |
488 | 502 | ||
489 | if (rc) | 503 | if (rc != MIGRATEPAGE_SUCCESS) |
490 | return rc; | 504 | return rc; |
491 | 505 | ||
492 | migrate_page_copy(newpage, page); | 506 | migrate_page_copy(newpage, page); |
493 | return 0; | 507 | return MIGRATEPAGE_SUCCESS; |
494 | } | 508 | } |
495 | EXPORT_SYMBOL(migrate_page); | 509 | EXPORT_SYMBOL(migrate_page); |
496 | 510 | ||
@@ -513,7 +527,7 @@ int buffer_migrate_page(struct address_space *mapping, | |||
513 | 527 | ||
514 | rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); | 528 | rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); |
515 | 529 | ||
516 | if (rc) | 530 | if (rc != MIGRATEPAGE_SUCCESS) |
517 | return rc; | 531 | return rc; |
518 | 532 | ||
519 | /* | 533 | /* |
@@ -549,7 +563,7 @@ int buffer_migrate_page(struct address_space *mapping, | |||
549 | 563 | ||
550 | } while (bh != head); | 564 | } while (bh != head); |
551 | 565 | ||
552 | return 0; | 566 | return MIGRATEPAGE_SUCCESS; |
553 | } | 567 | } |
554 | EXPORT_SYMBOL(buffer_migrate_page); | 568 | EXPORT_SYMBOL(buffer_migrate_page); |
555 | #endif | 569 | #endif |
@@ -628,7 +642,7 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
628 | * | 642 | * |
629 | * Return value: | 643 | * Return value: |
630 | * < 0 - error code | 644 | * < 0 - error code |
631 | * == 0 - success | 645 | * MIGRATEPAGE_SUCCESS - success |
632 | */ | 646 | */ |
633 | static int move_to_new_page(struct page *newpage, struct page *page, | 647 | static int move_to_new_page(struct page *newpage, struct page *page, |
634 | int remap_swapcache, enum migrate_mode mode) | 648 | int remap_swapcache, enum migrate_mode mode) |
@@ -665,7 +679,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
665 | else | 679 | else |
666 | rc = fallback_migrate_page(mapping, newpage, page, mode); | 680 | rc = fallback_migrate_page(mapping, newpage, page, mode); |
667 | 681 | ||
668 | if (rc) { | 682 | if (rc != MIGRATEPAGE_SUCCESS) { |
669 | newpage->mapping = NULL; | 683 | newpage->mapping = NULL; |
670 | } else { | 684 | } else { |
671 | if (remap_swapcache) | 685 | if (remap_swapcache) |
@@ -778,6 +792,18 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
778 | } | 792 | } |
779 | } | 793 | } |
780 | 794 | ||
795 | if (unlikely(balloon_page_movable(page))) { | ||
796 | /* | ||
797 | * A ballooned page does not need any special attention from | ||
798 | * physical to virtual reverse mapping procedures. | ||
799 | * Skip any attempt to unmap PTEs or to remap swap cache, | ||
800 | * in order to avoid burning cycles at rmap level, and perform | ||
801 | * the page migration right away (protected by page lock). | ||
802 | */ | ||
803 | rc = balloon_page_migrate(newpage, page, mode); | ||
804 | goto uncharge; | ||
805 | } | ||
806 | |||
781 | /* | 807 | /* |
782 | * Corner case handling: | 808 | * Corner case handling: |
783 | * 1. When a new swap-cache page is read into, it is added to the LRU | 809 | * 1. When a new swap-cache page is read into, it is added to the LRU |
@@ -814,7 +840,9 @@ skip_unmap: | |||
814 | put_anon_vma(anon_vma); | 840 | put_anon_vma(anon_vma); |
815 | 841 | ||
816 | uncharge: | 842 | uncharge: |
817 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); | 843 | mem_cgroup_end_migration(mem, page, newpage, |
844 | (rc == MIGRATEPAGE_SUCCESS || | ||
845 | rc == MIGRATEPAGE_BALLOON_SUCCESS)); | ||
818 | unlock: | 846 | unlock: |
819 | unlock_page(page); | 847 | unlock_page(page); |
820 | out: | 848 | out: |
@@ -846,6 +874,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
846 | goto out; | 874 | goto out; |
847 | 875 | ||
848 | rc = __unmap_and_move(page, newpage, force, offlining, mode); | 876 | rc = __unmap_and_move(page, newpage, force, offlining, mode); |
877 | |||
878 | if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { | ||
879 | /* | ||
880 | * A ballooned page has been migrated already. | ||
881 | * Now it is time to wrap up the counters, | ||
882 | * hand the page back to Buddy and return. | ||
883 | */ | ||
884 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
885 | page_is_file_cache(page)); | ||
886 | balloon_page_free(page); | ||
887 | return MIGRATEPAGE_SUCCESS; | ||
888 | } | ||
849 | out: | 889 | out: |
850 | if (rc != -EAGAIN) { | 890 | if (rc != -EAGAIN) { |
851 | /* | 891 | /* |
@@ -987,7 +1027,7 @@ int migrate_pages(struct list_head *from, | |||
987 | case -EAGAIN: | 1027 | case -EAGAIN: |
988 | retry++; | 1028 | retry++; |
989 | break; | 1029 | break; |
990 | case 0: | 1030 | case MIGRATEPAGE_SUCCESS: |
991 | break; | 1031 | break; |
992 | default: | 1032 | default: |
993 | /* Permanent failure */ | 1033 | /* Permanent failure */ |
@@ -996,15 +1036,12 @@ int migrate_pages(struct list_head *from, | |||
996 | } | 1036 | } |
997 | } | 1037 | } |
998 | } | 1038 | } |
999 | rc = 0; | 1039 | rc = nr_failed + retry; |
1000 | out: | 1040 | out: |
1001 | if (!swapwrite) | 1041 | if (!swapwrite) |
1002 | current->flags &= ~PF_SWAPWRITE; | 1042 | current->flags &= ~PF_SWAPWRITE; |
1003 | 1043 | ||
1004 | if (rc) | 1044 | return rc; |
1005 | return rc; | ||
1006 | |||
1007 | return nr_failed + retry; | ||
1008 | } | 1045 | } |
1009 | 1046 | ||
1010 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, | 1047 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, |
@@ -1024,7 +1061,7 @@ int migrate_huge_page(struct page *hpage, new_page_t get_new_page, | |||
1024 | /* try again */ | 1061 | /* try again */ |
1025 | cond_resched(); | 1062 | cond_resched(); |
1026 | break; | 1063 | break; |
1027 | case 0: | 1064 | case MIGRATEPAGE_SUCCESS: |
1028 | goto out; | 1065 | goto out; |
1029 | default: | 1066 | default: |
1030 | rc = -EIO; | 1067 | rc = -EIO; |
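
Several hunks above switch the migration paths from returning a bare 0 to the symbolic
MIGRATEPAGE_SUCCESS, and migrate_pages() itself now always reports how many pages were
left behind instead of sometimes returning 0 early. A self-contained model of the
caller-visible convention (the enum value is illustrative; only the name comes from the
patch, and the real function makes several retry passes that are omitted here):

    #include <errno.h>
    #include <stdio.h>

    enum { MIGRATEPAGE_SUCCESS = 0 };   /* illustrative value */

    /* Model of the per-page result handling at the tail of migrate_pages(). */
    static int tally(const int *rc, int n)
    {
        int nr_failed = 0, retry = 0, i;

        for (i = 0; i < n; i++) {
            switch (rc[i]) {
            case -EAGAIN:
                retry++;            /* would be retried on a later pass */
                break;
            case MIGRATEPAGE_SUCCESS:
                break;              /* page moved, nothing to count */
            default:
                nr_failed++;        /* permanent failure */
                break;
            }
        }
        return nr_failed + retry;   /* what migrate_pages() now returns */
    }

    int main(void)
    {
        int rc[] = { MIGRATEPAGE_SUCCESS, -EAGAIN, -ENOMEM, MIGRATEPAGE_SUCCESS };

        printf("pages not migrated: %d\n", tally(rc, 4));  /* 2 */
        return 0;
    }
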
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/audit.h> | 31 | #include <linux/audit.h> |
32 | #include <linux/khugepaged.h> | 32 | #include <linux/khugepaged.h> |
33 | #include <linux/uprobes.h> | 33 | #include <linux/uprobes.h> |
34 | #include <linux/rbtree_augmented.h> | ||
34 | 35 | ||
35 | #include <asm/uaccess.h> | 36 | #include <asm/uaccess.h> |
36 | #include <asm/cacheflush.h> | 37 | #include <asm/cacheflush.h> |
@@ -311,40 +312,88 @@ out: | |||
311 | return retval; | 312 | return retval; |
312 | } | 313 | } |
313 | 314 | ||
315 | static long vma_compute_subtree_gap(struct vm_area_struct *vma) | ||
316 | { | ||
317 | unsigned long max, subtree_gap; | ||
318 | max = vma->vm_start; | ||
319 | if (vma->vm_prev) | ||
320 | max -= vma->vm_prev->vm_end; | ||
321 | if (vma->vm_rb.rb_left) { | ||
322 | subtree_gap = rb_entry(vma->vm_rb.rb_left, | ||
323 | struct vm_area_struct, vm_rb)->rb_subtree_gap; | ||
324 | if (subtree_gap > max) | ||
325 | max = subtree_gap; | ||
326 | } | ||
327 | if (vma->vm_rb.rb_right) { | ||
328 | subtree_gap = rb_entry(vma->vm_rb.rb_right, | ||
329 | struct vm_area_struct, vm_rb)->rb_subtree_gap; | ||
330 | if (subtree_gap > max) | ||
331 | max = subtree_gap; | ||
332 | } | ||
333 | return max; | ||
334 | } | ||
335 | |||
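
vma_compute_subtree_gap() defines the augmented value that the rest of this series
relies on: for each vma, the largest free gap found either immediately before that vma
or anywhere in its rbtree subtree. A self-contained model on a plain binary tree; the
node struct below is hypothetical, whereas in the kernel the children live in
vma->vm_rb and the predecessor in vma->vm_prev:

    #include <stdio.h>

    /* Hypothetical node: [start, end) mapping, address predecessor, children. */
    struct node {
        unsigned long start, end;
        struct node *prev;             /* previous mapping by address */
        struct node *left, *right;     /* rbtree children (colors omitted) */
        unsigned long subtree_gap;     /* the augmented value being modelled */
    };

    /* Same computation as vma_compute_subtree_gap(): the gap just before this
     * node, maximized with the cached gaps of both subtrees. */
    static unsigned long compute_subtree_gap(const struct node *n)
    {
        unsigned long max = n->start - (n->prev ? n->prev->end : 0);

        if (n->left && n->left->subtree_gap > max)
            max = n->left->subtree_gap;
        if (n->right && n->right->subtree_gap > max)
            max = n->right->subtree_gap;
        return max;
    }

    int main(void)
    {
        /* Two mappings, [0x1000,0x2000) and [0x5000,0x6000): a 0x3000 gap. */
        struct node a = { 0x1000, 0x2000, NULL, NULL, NULL, 0 };
        struct node b = { 0x5000, 0x6000, &a,   NULL, NULL, 0 };

        a.subtree_gap = compute_subtree_gap(&a);   /* 0x1000 below a */
        b.subtree_gap = compute_subtree_gap(&b);   /* 0x3000 between a and b */
        printf("%lx %lx\n", a.subtree_gap, b.subtree_gap);
        return 0;
    }
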
314 | #ifdef CONFIG_DEBUG_VM_RB | 336 | #ifdef CONFIG_DEBUG_VM_RB |
315 | static int browse_rb(struct rb_root *root) | 337 | static int browse_rb(struct rb_root *root) |
316 | { | 338 | { |
317 | int i = 0, j; | 339 | int i = 0, j, bug = 0; |
318 | struct rb_node *nd, *pn = NULL; | 340 | struct rb_node *nd, *pn = NULL; |
319 | unsigned long prev = 0, pend = 0; | 341 | unsigned long prev = 0, pend = 0; |
320 | 342 | ||
321 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { | 343 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { |
322 | struct vm_area_struct *vma; | 344 | struct vm_area_struct *vma; |
323 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); | 345 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); |
324 | if (vma->vm_start < prev) | 346 | if (vma->vm_start < prev) { |
325 | printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; | 347 | printk("vm_start %lx prev %lx\n", vma->vm_start, prev); |
326 | if (vma->vm_start < pend) | 348 | bug = 1; |
349 | } | ||
350 | if (vma->vm_start < pend) { | ||
327 | printk("vm_start %lx pend %lx\n", vma->vm_start, pend); | 351 | printk("vm_start %lx pend %lx\n", vma->vm_start, pend); |
328 | if (vma->vm_start > vma->vm_end) | 352 | bug = 1; |
329 | printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); | 353 | } |
354 | if (vma->vm_start > vma->vm_end) { | ||
355 | printk("vm_end %lx < vm_start %lx\n", | ||
356 | vma->vm_end, vma->vm_start); | ||
357 | bug = 1; | ||
358 | } | ||
359 | if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { | ||
360 | printk("free gap %lx, correct %lx\n", | ||
361 | vma->rb_subtree_gap, | ||
362 | vma_compute_subtree_gap(vma)); | ||
363 | bug = 1; | ||
364 | } | ||
330 | i++; | 365 | i++; |
331 | pn = nd; | 366 | pn = nd; |
332 | prev = vma->vm_start; | 367 | prev = vma->vm_start; |
333 | pend = vma->vm_end; | 368 | pend = vma->vm_end; |
334 | } | 369 | } |
335 | j = 0; | 370 | j = 0; |
336 | for (nd = pn; nd; nd = rb_prev(nd)) { | 371 | for (nd = pn; nd; nd = rb_prev(nd)) |
337 | j++; | 372 | j++; |
373 | if (i != j) { | ||
374 | printk("backwards %d, forwards %d\n", j, i); | ||
375 | bug = 1; | ||
376 | } | ||
377 | return bug ? -1 : i; | ||
378 | } | ||
379 | |||
380 | static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) | ||
381 | { | ||
382 | struct rb_node *nd; | ||
383 | |||
384 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { | ||
385 | struct vm_area_struct *vma; | ||
386 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); | ||
387 | BUG_ON(vma != ignore && | ||
388 | vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); | ||
338 | } | 389 | } |
339 | if (i != j) | ||
340 | printk("backwards %d, forwards %d\n", j, i), i = 0; | ||
341 | return i; | ||
342 | } | 390 | } |
343 | 391 | ||
344 | void validate_mm(struct mm_struct *mm) | 392 | void validate_mm(struct mm_struct *mm) |
345 | { | 393 | { |
346 | int bug = 0; | 394 | int bug = 0; |
347 | int i = 0; | 395 | int i = 0; |
396 | unsigned long highest_address = 0; | ||
348 | struct vm_area_struct *vma = mm->mmap; | 397 | struct vm_area_struct *vma = mm->mmap; |
349 | while (vma) { | 398 | while (vma) { |
350 | struct anon_vma_chain *avc; | 399 | struct anon_vma_chain *avc; |
@@ -352,20 +401,73 @@ void validate_mm(struct mm_struct *mm) | |||
352 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | 401 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
353 | anon_vma_interval_tree_verify(avc); | 402 | anon_vma_interval_tree_verify(avc); |
354 | vma_unlock_anon_vma(vma); | 403 | vma_unlock_anon_vma(vma); |
404 | highest_address = vma->vm_end; | ||
355 | vma = vma->vm_next; | 405 | vma = vma->vm_next; |
356 | i++; | 406 | i++; |
357 | } | 407 | } |
358 | if (i != mm->map_count) | 408 | if (i != mm->map_count) { |
359 | printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; | 409 | printk("map_count %d vm_next %d\n", mm->map_count, i); |
410 | bug = 1; | ||
411 | } | ||
412 | if (highest_address != mm->highest_vm_end) { | ||
413 | printk("mm->highest_vm_end %lx, found %lx\n", | ||
414 | mm->highest_vm_end, highest_address); | ||
415 | bug = 1; | ||
416 | } | ||
360 | i = browse_rb(&mm->mm_rb); | 417 | i = browse_rb(&mm->mm_rb); |
361 | if (i != mm->map_count) | 418 | if (i != mm->map_count) { |
362 | printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; | 419 | printk("map_count %d rb %d\n", mm->map_count, i); |
420 | bug = 1; | ||
421 | } | ||
363 | BUG_ON(bug); | 422 | BUG_ON(bug); |
364 | } | 423 | } |
365 | #else | 424 | #else |
425 | #define validate_mm_rb(root, ignore) do { } while (0) | ||
366 | #define validate_mm(mm) do { } while (0) | 426 | #define validate_mm(mm) do { } while (0) |
367 | #endif | 427 | #endif |
368 | 428 | ||
429 | RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, | ||
430 | unsigned long, rb_subtree_gap, vma_compute_subtree_gap) | ||
431 | |||
432 | /* | ||
433 | * Update augmented rbtree rb_subtree_gap values after vma->vm_start or | ||
434 | * vma->vm_prev->vm_end values changed, without modifying the vma's position | ||
435 | * in the rbtree. | ||
436 | */ | ||
437 | static void vma_gap_update(struct vm_area_struct *vma) | ||
438 | { | ||
439 | /* | ||
440 | * As it turns out, RB_DECLARE_CALLBACKS() already created a callback | ||
441 | * function that does exactly what we want. | ||
442 | */ | ||
443 | vma_gap_callbacks_propagate(&vma->vm_rb, NULL); | ||
444 | } | ||
445 | |||
446 | static inline void vma_rb_insert(struct vm_area_struct *vma, | ||
447 | struct rb_root *root) | ||
448 | { | ||
449 | /* All rb_subtree_gap values must be consistent prior to insertion */ | ||
450 | validate_mm_rb(root, NULL); | ||
451 | |||
452 | rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); | ||
453 | } | ||
454 | |||
455 | static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) | ||
456 | { | ||
457 | /* | ||
458 | * All rb_subtree_gap values must be consistent prior to erase, | ||
459 | * with the possible exception of the vma being erased. | ||
460 | */ | ||
461 | validate_mm_rb(root, vma); | ||
462 | |||
463 | /* | ||
464 | * Note rb_erase_augmented is a fairly large inline function, | ||
465 | * so make sure we instantiate it only once with our desired | ||
466 | * augmented rbtree callbacks. | ||
467 | */ | ||
468 | rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); | ||
469 | } | ||
470 | |||
369 | /* | 471 | /* |
370 | * vma has some anon_vma assigned, and is already inserted on that | 472 | * vma has some anon_vma assigned, and is already inserted on that |
371 | * anon_vma's interval trees. | 473 | * anon_vma's interval trees. |
@@ -435,8 +537,25 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr, | |||
435 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, | 537 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, |
436 | struct rb_node **rb_link, struct rb_node *rb_parent) | 538 | struct rb_node **rb_link, struct rb_node *rb_parent) |
437 | { | 539 | { |
540 | /* Update tracking information for the gap following the new vma. */ | ||
541 | if (vma->vm_next) | ||
542 | vma_gap_update(vma->vm_next); | ||
543 | else | ||
544 | mm->highest_vm_end = vma->vm_end; | ||
545 | |||
546 | /* | ||
547 | * vma->vm_prev wasn't known when we followed the rbtree to find the | ||
548 | * correct insertion point for that vma. As a result, we could not | ||
549 | * update the vma vm_rb parents rb_subtree_gap values on the way down. | ||
550 | * So, we first insert the vma with a zero rb_subtree_gap value | ||
551 | * (to be consistent with what we did on the way down), and then | ||
552 | * immediately update the gap to the correct value. Finally we | ||
553 | * rebalance the rbtree after all augmented values have been set. | ||
554 | */ | ||
438 | rb_link_node(&vma->vm_rb, rb_parent, rb_link); | 555 | rb_link_node(&vma->vm_rb, rb_parent, rb_link); |
439 | rb_insert_color(&vma->vm_rb, &mm->mm_rb); | 556 | vma->rb_subtree_gap = 0; |
557 | vma_gap_update(vma); | ||
558 | vma_rb_insert(vma, &mm->mm_rb); | ||
440 | } | 559 | } |
441 | 560 | ||
442 | static void __vma_link_file(struct vm_area_struct *vma) | 561 | static void __vma_link_file(struct vm_area_struct *vma) |
@@ -512,12 +631,12 @@ static inline void | |||
512 | __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, | 631 | __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, |
513 | struct vm_area_struct *prev) | 632 | struct vm_area_struct *prev) |
514 | { | 633 | { |
515 | struct vm_area_struct *next = vma->vm_next; | 634 | struct vm_area_struct *next; |
516 | 635 | ||
517 | prev->vm_next = next; | 636 | vma_rb_erase(vma, &mm->mm_rb); |
637 | prev->vm_next = next = vma->vm_next; | ||
518 | if (next) | 638 | if (next) |
519 | next->vm_prev = prev; | 639 | next->vm_prev = prev; |
520 | rb_erase(&vma->vm_rb, &mm->mm_rb); | ||
521 | if (mm->mmap_cache == vma) | 640 | if (mm->mmap_cache == vma) |
522 | mm->mmap_cache = prev; | 641 | mm->mmap_cache = prev; |
523 | } | 642 | } |
@@ -539,6 +658,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
539 | struct rb_root *root = NULL; | 658 | struct rb_root *root = NULL; |
540 | struct anon_vma *anon_vma = NULL; | 659 | struct anon_vma *anon_vma = NULL; |
541 | struct file *file = vma->vm_file; | 660 | struct file *file = vma->vm_file; |
661 | bool start_changed = false, end_changed = false; | ||
542 | long adjust_next = 0; | 662 | long adjust_next = 0; |
543 | int remove_next = 0; | 663 | int remove_next = 0; |
544 | 664 | ||
@@ -629,8 +749,14 @@ again: remove_next = 1 + (end > next->vm_end); | |||
629 | vma_interval_tree_remove(next, root); | 749 | vma_interval_tree_remove(next, root); |
630 | } | 750 | } |
631 | 751 | ||
632 | vma->vm_start = start; | 752 | if (start != vma->vm_start) { |
633 | vma->vm_end = end; | 753 | vma->vm_start = start; |
754 | start_changed = true; | ||
755 | } | ||
756 | if (end != vma->vm_end) { | ||
757 | vma->vm_end = end; | ||
758 | end_changed = true; | ||
759 | } | ||
634 | vma->vm_pgoff = pgoff; | 760 | vma->vm_pgoff = pgoff; |
635 | if (adjust_next) { | 761 | if (adjust_next) { |
636 | next->vm_start += adjust_next << PAGE_SHIFT; | 762 | next->vm_start += adjust_next << PAGE_SHIFT; |
@@ -659,6 +785,15 @@ again: remove_next = 1 + (end > next->vm_end); | |||
659 | * (it may either follow vma or precede it). | 785 | * (it may either follow vma or precede it). |
660 | */ | 786 | */ |
661 | __insert_vm_struct(mm, insert); | 787 | __insert_vm_struct(mm, insert); |
788 | } else { | ||
789 | if (start_changed) | ||
790 | vma_gap_update(vma); | ||
791 | if (end_changed) { | ||
792 | if (!next) | ||
793 | mm->highest_vm_end = end; | ||
794 | else if (!adjust_next) | ||
795 | vma_gap_update(next); | ||
796 | } | ||
662 | } | 797 | } |
663 | 798 | ||
664 | if (anon_vma) { | 799 | if (anon_vma) { |
@@ -692,10 +827,13 @@ again: remove_next = 1 + (end > next->vm_end); | |||
692 | * we must remove another next too. It would clutter | 827 | * we must remove another next too. It would clutter |
693 | * up the code too much to do both in one go. | 828 | * up the code too much to do both in one go. |
694 | */ | 829 | */ |
695 | if (remove_next == 2) { | 830 | next = vma->vm_next; |
696 | next = vma->vm_next; | 831 | if (remove_next == 2) |
697 | goto again; | 832 | goto again; |
698 | } | 833 | else if (next) |
834 | vma_gap_update(next); | ||
835 | else | ||
836 | mm->highest_vm_end = end; | ||
699 | } | 837 | } |
700 | if (insert && file) | 838 | if (insert && file) |
701 | uprobe_mmap(insert); | 839 | uprobe_mmap(insert); |
@@ -1167,8 +1305,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1167 | * memory so no accounting is necessary | 1305 | * memory so no accounting is necessary |
1168 | */ | 1306 | */ |
1169 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, | 1307 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, |
1170 | VM_NORESERVE, &user, | 1308 | VM_NORESERVE, |
1171 | HUGETLB_ANONHUGE_INODE); | 1309 | &user, HUGETLB_ANONHUGE_INODE, |
1310 | (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); | ||
1172 | if (IS_ERR(file)) | 1311 | if (IS_ERR(file)) |
1173 | return PTR_ERR(file); | 1312 | return PTR_ERR(file); |
1174 | } | 1313 | } |
@@ -1414,6 +1553,206 @@ unacct_error: | |||
1414 | return error; | 1553 | return error; |
1415 | } | 1554 | } |
1416 | 1555 | ||
1556 | unsigned long unmapped_area(struct vm_unmapped_area_info *info) | ||
1557 | { | ||
1558 | /* | ||
1559 | * We implement the search by looking for an rbtree node that | ||
1560 | * immediately follows a suitable gap. That is, | ||
1561 | * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; | ||
1562 | * - gap_end = vma->vm_start >= info->low_limit + length; | ||
1563 | * - gap_end - gap_start >= length | ||
1564 | */ | ||
1565 | |||
1566 | struct mm_struct *mm = current->mm; | ||
1567 | struct vm_area_struct *vma; | ||
1568 | unsigned long length, low_limit, high_limit, gap_start, gap_end; | ||
1569 | |||
1570 | /* Adjust search length to account for worst case alignment overhead */ | ||
1571 | length = info->length + info->align_mask; | ||
1572 | if (length < info->length) | ||
1573 | return -ENOMEM; | ||
1574 | |||
1575 | /* Adjust search limits by the desired length */ | ||
1576 | if (info->high_limit < length) | ||
1577 | return -ENOMEM; | ||
1578 | high_limit = info->high_limit - length; | ||
1579 | |||
1580 | if (info->low_limit > high_limit) | ||
1581 | return -ENOMEM; | ||
1582 | low_limit = info->low_limit + length; | ||
1583 | |||
1584 | /* Check if rbtree root looks promising */ | ||
1585 | if (RB_EMPTY_ROOT(&mm->mm_rb)) | ||
1586 | goto check_highest; | ||
1587 | vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); | ||
1588 | if (vma->rb_subtree_gap < length) | ||
1589 | goto check_highest; | ||
1590 | |||
1591 | while (true) { | ||
1592 | /* Visit left subtree if it looks promising */ | ||
1593 | gap_end = vma->vm_start; | ||
1594 | if (gap_end >= low_limit && vma->vm_rb.rb_left) { | ||
1595 | struct vm_area_struct *left = | ||
1596 | rb_entry(vma->vm_rb.rb_left, | ||
1597 | struct vm_area_struct, vm_rb); | ||
1598 | if (left->rb_subtree_gap >= length) { | ||
1599 | vma = left; | ||
1600 | continue; | ||
1601 | } | ||
1602 | } | ||
1603 | |||
1604 | gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; | ||
1605 | check_current: | ||
1606 | /* Check if current node has a suitable gap */ | ||
1607 | if (gap_start > high_limit) | ||
1608 | return -ENOMEM; | ||
1609 | if (gap_end >= low_limit && gap_end - gap_start >= length) | ||
1610 | goto found; | ||
1611 | |||
1612 | /* Visit right subtree if it looks promising */ | ||
1613 | if (vma->vm_rb.rb_right) { | ||
1614 | struct vm_area_struct *right = | ||
1615 | rb_entry(vma->vm_rb.rb_right, | ||
1616 | struct vm_area_struct, vm_rb); | ||
1617 | if (right->rb_subtree_gap >= length) { | ||
1618 | vma = right; | ||
1619 | continue; | ||
1620 | } | ||
1621 | } | ||
1622 | |||
1623 | /* Go back up the rbtree to find next candidate node */ | ||
1624 | while (true) { | ||
1625 | struct rb_node *prev = &vma->vm_rb; | ||
1626 | if (!rb_parent(prev)) | ||
1627 | goto check_highest; | ||
1628 | vma = rb_entry(rb_parent(prev), | ||
1629 | struct vm_area_struct, vm_rb); | ||
1630 | if (prev == vma->vm_rb.rb_left) { | ||
1631 | gap_start = vma->vm_prev->vm_end; | ||
1632 | gap_end = vma->vm_start; | ||
1633 | goto check_current; | ||
1634 | } | ||
1635 | } | ||
1636 | } | ||
1637 | |||
1638 | check_highest: | ||
1639 | /* Check highest gap, which does not precede any rbtree node */ | ||
1640 | gap_start = mm->highest_vm_end; | ||
1641 | gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ | ||
1642 | if (gap_start > high_limit) | ||
1643 | return -ENOMEM; | ||
1644 | |||
1645 | found: | ||
1646 | /* We found a suitable gap. Clip it with the original low_limit. */ | ||
1647 | if (gap_start < info->low_limit) | ||
1648 | gap_start = info->low_limit; | ||
1649 | |||
1650 | /* Adjust gap address to the desired alignment */ | ||
1651 | gap_start += (info->align_offset - gap_start) & info->align_mask; | ||
1652 | |||
1653 | VM_BUG_ON(gap_start + info->length > info->high_limit); | ||
1654 | VM_BUG_ON(gap_start + info->length > gap_end); | ||
1655 | return gap_start; | ||
1656 | } | ||
1657 | |||
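
unmapped_area() walks the augmented rbtree, but the specification it implements is
simple: return the lowest address at which a free range of info->length fits between
info->low_limit and info->high_limit (plus an alignment adjustment that the model below
ignores). Here is a self-contained O(n) reference model over a sorted array of existing
mappings, handy for checking the tree walk against something obviously correct; struct
mapping and find_gap() are illustrative names, and the model returns 0 where the kernel
returns -ENOMEM:

    #include <stdio.h>

    struct mapping { unsigned long start, end; };  /* existing VMAs, sorted */

    /* Lowest addr where [addr, addr+length) avoids all mappings and ends at
     * or below high_limit, starting no lower than low_limit; 0 if none fits. */
    static unsigned long find_gap(const struct mapping *m, int n,
                                  unsigned long length,
                                  unsigned long low_limit,
                                  unsigned long high_limit)
    {
        unsigned long gap_start = 0, gap_end;
        int i;

        for (i = 0; i <= n; i++) {
            gap_end = (i < n) ? m[i].start : high_limit;
            if (gap_start < low_limit)
                gap_start = low_limit;
            if (gap_end >= gap_start && gap_end - gap_start >= length &&
                gap_start + length <= high_limit)
                return gap_start;
            if (i < n)
                gap_start = m[i].end;   /* next gap begins after this mapping */
        }
        return 0;
    }

    int main(void)
    {
        struct mapping m[] = { { 0x10000, 0x20000 }, { 0x21000, 0x40000 } };

        /* First hole big enough for 0x2000 bytes lies above the second mapping. */
        printf("%lx\n", find_gap(m, 2, 0x2000, 0x10000, 0x80000));  /* 40000 */
        return 0;
    }
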
1658 | unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) | ||
1659 | { | ||
1660 | struct mm_struct *mm = current->mm; | ||
1661 | struct vm_area_struct *vma; | ||
1662 | unsigned long length, low_limit, high_limit, gap_start, gap_end; | ||
1663 | |||
1664 | /* Adjust search length to account for worst case alignment overhead */ | ||
1665 | length = info->length + info->align_mask; | ||
1666 | if (length < info->length) | ||
1667 | return -ENOMEM; | ||
1668 | |||
1669 | /* | ||
1670 | * Adjust search limits by the desired length. | ||
1671 | * See implementation comment at top of unmapped_area(). | ||
1672 | */ | ||
1673 | gap_end = info->high_limit; | ||
1674 | if (gap_end < length) | ||
1675 | return -ENOMEM; | ||
1676 | high_limit = gap_end - length; | ||
1677 | |||
1678 | if (info->low_limit > high_limit) | ||
1679 | return -ENOMEM; | ||
1680 | low_limit = info->low_limit + length; | ||
1681 | |||
1682 | /* Check highest gap, which does not precede any rbtree node */ | ||
1683 | gap_start = mm->highest_vm_end; | ||
1684 | if (gap_start <= high_limit) | ||
1685 | goto found_highest; | ||
1686 | |||
1687 | /* Check if rbtree root looks promising */ | ||
1688 | if (RB_EMPTY_ROOT(&mm->mm_rb)) | ||
1689 | return -ENOMEM; | ||
1690 | vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); | ||
1691 | if (vma->rb_subtree_gap < length) | ||
1692 | return -ENOMEM; | ||
1693 | |||
1694 | while (true) { | ||
1695 | /* Visit right subtree if it looks promising */ | ||
1696 | gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; | ||
1697 | if (gap_start <= high_limit && vma->vm_rb.rb_right) { | ||
1698 | struct vm_area_struct *right = | ||
1699 | rb_entry(vma->vm_rb.rb_right, | ||
1700 | struct vm_area_struct, vm_rb); | ||
1701 | if (right->rb_subtree_gap >= length) { | ||
1702 | vma = right; | ||
1703 | continue; | ||
1704 | } | ||
1705 | } | ||
1706 | |||
1707 | check_current: | ||
1708 | /* Check if current node has a suitable gap */ | ||
1709 | gap_end = vma->vm_start; | ||
1710 | if (gap_end < low_limit) | ||
1711 | return -ENOMEM; | ||
1712 | if (gap_start <= high_limit && gap_end - gap_start >= length) | ||
1713 | goto found; | ||
1714 | |||
1715 | /* Visit left subtree if it looks promising */ | ||
1716 | if (vma->vm_rb.rb_left) { | ||
1717 | struct vm_area_struct *left = | ||
1718 | rb_entry(vma->vm_rb.rb_left, | ||
1719 | struct vm_area_struct, vm_rb); | ||
1720 | if (left->rb_subtree_gap >= length) { | ||
1721 | vma = left; | ||
1722 | continue; | ||
1723 | } | ||
1724 | } | ||
1725 | |||
1726 | /* Go back up the rbtree to find next candidate node */ | ||
1727 | while (true) { | ||
1728 | struct rb_node *prev = &vma->vm_rb; | ||
1729 | if (!rb_parent(prev)) | ||
1730 | return -ENOMEM; | ||
1731 | vma = rb_entry(rb_parent(prev), | ||
1732 | struct vm_area_struct, vm_rb); | ||
1733 | if (prev == vma->vm_rb.rb_right) { | ||
1734 | gap_start = vma->vm_prev ? | ||
1735 | vma->vm_prev->vm_end : 0; | ||
1736 | goto check_current; | ||
1737 | } | ||
1738 | } | ||
1739 | } | ||
1740 | |||
1741 | found: | ||
1742 | /* We found a suitable gap. Clip it with the original high_limit. */ | ||
1743 | if (gap_end > info->high_limit) | ||
1744 | gap_end = info->high_limit; | ||
1745 | |||
1746 | found_highest: | ||
1747 | /* Compute highest gap address at the desired alignment */ | ||
1748 | gap_end -= info->length; | ||
1749 | gap_end -= (gap_end - info->align_offset) & info->align_mask; | ||
1750 | |||
1751 | VM_BUG_ON(gap_end < info->low_limit); | ||
1752 | VM_BUG_ON(gap_end < gap_start); | ||
1753 | return gap_end; | ||
1754 | } | ||
1755 | |||
1417 | /* Get an address range which is currently unmapped. | 1756 | /* Get an address range which is currently unmapped. |
1418 | * For shmat() with addr=0. | 1757 | * For shmat() with addr=0. |
1419 | * | 1758 | * |
@@ -1432,7 +1771,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
1432 | { | 1771 | { |
1433 | struct mm_struct *mm = current->mm; | 1772 | struct mm_struct *mm = current->mm; |
1434 | struct vm_area_struct *vma; | 1773 | struct vm_area_struct *vma; |
1435 | unsigned long start_addr; | 1774 | struct vm_unmapped_area_info info; |
1436 | 1775 | ||
1437 | if (len > TASK_SIZE) | 1776 | if (len > TASK_SIZE) |
1438 | return -ENOMEM; | 1777 | return -ENOMEM; |
@@ -1447,40 +1786,13 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
1447 | (!vma || addr + len <= vma->vm_start)) | 1786 | (!vma || addr + len <= vma->vm_start)) |
1448 | return addr; | 1787 | return addr; |
1449 | } | 1788 | } |
1450 | if (len > mm->cached_hole_size) { | ||
1451 | start_addr = addr = mm->free_area_cache; | ||
1452 | } else { | ||
1453 | start_addr = addr = TASK_UNMAPPED_BASE; | ||
1454 | mm->cached_hole_size = 0; | ||
1455 | } | ||
1456 | 1789 | ||
1457 | full_search: | 1790 | info.flags = 0; |
1458 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | 1791 | info.length = len; |
1459 | /* At this point: (!vma || addr < vma->vm_end). */ | 1792 | info.low_limit = TASK_UNMAPPED_BASE; |
1460 | if (TASK_SIZE - len < addr) { | 1793 | info.high_limit = TASK_SIZE; |
1461 | /* | 1794 | info.align_mask = 0; |
1462 | * Start a new search - just in case we missed | 1795 | return vm_unmapped_area(&info); |
1463 | * some holes. | ||
1464 | */ | ||
1465 | if (start_addr != TASK_UNMAPPED_BASE) { | ||
1466 | addr = TASK_UNMAPPED_BASE; | ||
1467 | start_addr = addr; | ||
1468 | mm->cached_hole_size = 0; | ||
1469 | goto full_search; | ||
1470 | } | ||
1471 | return -ENOMEM; | ||
1472 | } | ||
1473 | if (!vma || addr + len <= vma->vm_start) { | ||
1474 | /* | ||
1475 | * Remember the place where we stopped the search: | ||
1476 | */ | ||
1477 | mm->free_area_cache = addr + len; | ||
1478 | return addr; | ||
1479 | } | ||
1480 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
1481 | mm->cached_hole_size = vma->vm_start - addr; | ||
1482 | addr = vma->vm_end; | ||
1483 | } | ||
1484 | } | 1796 | } |
1485 | #endif | 1797 | #endif |
1486 | 1798 | ||
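The generic bottom-up path above only needs align_mask = 0, but the same vm_unmapped_area_info structure carries an align_mask/align_offset pair for callers that do want alignment. A hedged sketch of what an architecture helper might look like; the function name and the SHMLBA-based cache colouring are illustrative assumptions, not taken from this patch:

static unsigned long
example_arch_get_unmapped_area(struct file *filp, unsigned long addr,
			       unsigned long len, unsigned long pgoff,
			       unsigned long flags)
{
	struct vm_unmapped_area_info info;

	info.flags = 0;				/* bottom-up search */
	info.length = len;
	info.low_limit = TASK_UNMAPPED_BASE;
	info.high_limit = TASK_SIZE;
	/* assumed shared-mapping colouring; 0 means "no alignment" */
	info.align_mask = filp ? (PAGE_MASK & (SHMLBA - 1)) : 0;
	info.align_offset = pgoff << PAGE_SHIFT;
	return vm_unmapped_area(&info);
}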
@@ -1505,7 +1817,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1505 | { | 1817 | { |
1506 | struct vm_area_struct *vma; | 1818 | struct vm_area_struct *vma; |
1507 | struct mm_struct *mm = current->mm; | 1819 | struct mm_struct *mm = current->mm; |
1508 | unsigned long addr = addr0, start_addr; | 1820 | unsigned long addr = addr0; |
1821 | struct vm_unmapped_area_info info; | ||
1509 | 1822 | ||
1510 | /* requested length too big for entire address space */ | 1823 | /* requested length too big for entire address space */ |
1511 | if (len > TASK_SIZE) | 1824 | if (len > TASK_SIZE) |
@@ -1523,53 +1836,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1523 | return addr; | 1836 | return addr; |
1524 | } | 1837 | } |
1525 | 1838 | ||
1526 | /* check if free_area_cache is useful for us */ | 1839 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
1527 | if (len <= mm->cached_hole_size) { | 1840 | info.length = len; |
1528 | mm->cached_hole_size = 0; | 1841 | info.low_limit = PAGE_SIZE; |
1529 | mm->free_area_cache = mm->mmap_base; | 1842 | info.high_limit = mm->mmap_base; |
1530 | } | 1843 | info.align_mask = 0; |
1531 | 1844 | addr = vm_unmapped_area(&info); | |
1532 | try_again: | ||
1533 | /* either no address requested or can't fit in requested address hole */ | ||
1534 | start_addr = addr = mm->free_area_cache; | ||
1535 | |||
1536 | if (addr < len) | ||
1537 | goto fail; | ||
1538 | |||
1539 | addr -= len; | ||
1540 | do { | ||
1541 | /* | ||
1542 | * Lookup failure means no vma is above this address, | ||
1543 | * else if new region fits below vma->vm_start, | ||
1544 | * return with success: | ||
1545 | */ | ||
1546 | vma = find_vma(mm, addr); | ||
1547 | if (!vma || addr+len <= vma->vm_start) | ||
1548 | /* remember the address as a hint for next time */ | ||
1549 | return (mm->free_area_cache = addr); | ||
1550 | |||
1551 | /* remember the largest hole we saw so far */ | ||
1552 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
1553 | mm->cached_hole_size = vma->vm_start - addr; | ||
1554 | |||
1555 | /* try just below the current vma->vm_start */ | ||
1556 | addr = vma->vm_start-len; | ||
1557 | } while (len < vma->vm_start); | ||
1558 | |||
1559 | fail: | ||
1560 | /* | ||
1561 | * if hint left us with no space for the requested | ||
1562 | * mapping then try again: | ||
1563 | * | ||
1564 | * Note: this is different with the case of bottomup | ||
1565 | * which does the fully line-search, but we use find_vma | ||
1566 | * here that causes some holes skipped. | ||
1567 | */ | ||
1568 | if (start_addr != mm->mmap_base) { | ||
1569 | mm->free_area_cache = mm->mmap_base; | ||
1570 | mm->cached_hole_size = 0; | ||
1571 | goto try_again; | ||
1572 | } | ||
1573 | 1845 | ||
1574 | /* | 1846 | /* |
1575 | * A failed mmap() very likely causes application failure, | 1847 | * A failed mmap() very likely causes application failure, |
@@ -1577,14 +1849,13 @@ fail: | |||
1577 | * can happen with large stack limits and large mmap() | 1849 | * can happen with large stack limits and large mmap() |
1578 | * allocations. | 1850 | * allocations. |
1579 | */ | 1851 | */ |
1580 | mm->cached_hole_size = ~0UL; | 1852 | if (addr & ~PAGE_MASK) { |
1581 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 1853 | VM_BUG_ON(addr != -ENOMEM); |
1582 | addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); | 1854 | info.flags = 0; |
1583 | /* | 1855 | info.low_limit = TASK_UNMAPPED_BASE; |
1584 | * Restore the topdown base: | 1856 | info.high_limit = TASK_SIZE; |
1585 | */ | 1857 | addr = vm_unmapped_area(&info); |
1586 | mm->free_area_cache = mm->mmap_base; | 1858 | } |
1587 | mm->cached_hole_size = ~0UL; | ||
1588 | 1859 | ||
1589 | return addr; | 1860 | return addr; |
1590 | } | 1861 | } |
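The "addr & ~PAGE_MASK" retry above works because vm_unmapped_area() reports failure as a negative errno cast to unsigned long, and such values are never page aligned, while a successful result always is. Tiny standalone illustration (4 KiB pages assumed):

#include <stdio.h>

#define EXAMPLE_PAGE_MASK (~0xfffUL)	/* 4 KiB pages assumed */

int main(void)
{
	unsigned long err  = (unsigned long)-12;	/* -ENOMEM          */
	unsigned long addr = 0x7f0000000000UL;		/* a real map address */

	printf("error looks unaligned: %d\n", (err & ~EXAMPLE_PAGE_MASK) != 0);
	printf("address looks aligned: %d\n", (addr & ~EXAMPLE_PAGE_MASK) == 0);
	return 0;
}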
@@ -1797,6 +2068,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
1797 | anon_vma_interval_tree_pre_update_vma(vma); | 2068 | anon_vma_interval_tree_pre_update_vma(vma); |
1798 | vma->vm_end = address; | 2069 | vma->vm_end = address; |
1799 | anon_vma_interval_tree_post_update_vma(vma); | 2070 | anon_vma_interval_tree_post_update_vma(vma); |
2071 | if (vma->vm_next) | ||
2072 | vma_gap_update(vma->vm_next); | ||
2073 | else | ||
2074 | vma->vm_mm->highest_vm_end = address; | ||
1800 | perf_event_mmap(vma); | 2075 | perf_event_mmap(vma); |
1801 | } | 2076 | } |
1802 | } | 2077 | } |
@@ -1851,6 +2126,7 @@ int expand_downwards(struct vm_area_struct *vma, | |||
1851 | vma->vm_start = address; | 2126 | vma->vm_start = address; |
1852 | vma->vm_pgoff -= grow; | 2127 | vma->vm_pgoff -= grow; |
1853 | anon_vma_interval_tree_post_update_vma(vma); | 2128 | anon_vma_interval_tree_post_update_vma(vma); |
2129 | vma_gap_update(vma); | ||
1854 | perf_event_mmap(vma); | 2130 | perf_event_mmap(vma); |
1855 | } | 2131 | } |
1856 | } | 2132 | } |
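The reason the expand_upwards()/expand_downwards() hunks above must call vma_gap_update() is that the cached rbtree gap of a VMA is measured back to the end of the VMA before it, so growing one VMA changes its neighbour's gap (or, with no next VMA, mm->highest_vm_end). A toy calculation with hypothetical addresses:

#include <stdio.h>

int main(void)
{
	unsigned long prev_end   = 0x400000;	/* our vm_end before growing */
	unsigned long next_start = 0x800000;	/* vma->vm_next->vm_start    */

	printf("gap before: %#lx\n", next_start - prev_end);	/* 0x400000 */
	prev_end = 0x500000;		/* expand_upwards() raised vm_end */
	printf("gap after:  %#lx\n", next_start - prev_end);	/* 0x300000 */
	return 0;
}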
@@ -1973,14 +2249,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1973 | insertion_point = (prev ? &prev->vm_next : &mm->mmap); | 2249 | insertion_point = (prev ? &prev->vm_next : &mm->mmap); |
1974 | vma->vm_prev = NULL; | 2250 | vma->vm_prev = NULL; |
1975 | do { | 2251 | do { |
1976 | rb_erase(&vma->vm_rb, &mm->mm_rb); | 2252 | vma_rb_erase(vma, &mm->mm_rb); |
1977 | mm->map_count--; | 2253 | mm->map_count--; |
1978 | tail_vma = vma; | 2254 | tail_vma = vma; |
1979 | vma = vma->vm_next; | 2255 | vma = vma->vm_next; |
1980 | } while (vma && vma->vm_start < end); | 2256 | } while (vma && vma->vm_start < end); |
1981 | *insertion_point = vma; | 2257 | *insertion_point = vma; |
1982 | if (vma) | 2258 | if (vma) { |
1983 | vma->vm_prev = prev; | 2259 | vma->vm_prev = prev; |
2260 | vma_gap_update(vma); | ||
2261 | } else | ||
2262 | mm->highest_vm_end = prev ? prev->vm_end : 0; | ||
1984 | tail_vma->vm_next = NULL; | 2263 | tail_vma->vm_next = NULL; |
1985 | if (mm->unmap_area == arch_unmap_area) | 2264 | if (mm->unmap_area == arch_unmap_area) |
1986 | addr = prev ? prev->vm_end : mm->mmap_base; | 2265 | addr = prev ? prev->vm_end : mm->mmap_base; |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 79e0f3e24831..18f1ae2b45de 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task; | |||
44 | int sysctl_oom_dump_tasks = 1; | 44 | int sysctl_oom_dump_tasks = 1; |
45 | static DEFINE_SPINLOCK(zone_scan_lock); | 45 | static DEFINE_SPINLOCK(zone_scan_lock); |
46 | 46 | ||
47 | /* | ||
48 | * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj | ||
49 | * @old_val: old oom_score_adj for compare | ||
50 | * @new_val: new oom_score_adj for swap | ||
51 | * | ||
52 | * Sets the oom_score_adj value for current to @new_val iff its present value is | ||
53 | * @old_val. Usually used to reinstate a previous value to prevent racing with | ||
54 | * userspacing tuning the value in the interim. | ||
55 | */ | ||
56 | void compare_swap_oom_score_adj(int old_val, int new_val) | ||
57 | { | ||
58 | struct sighand_struct *sighand = current->sighand; | ||
59 | |||
60 | spin_lock_irq(&sighand->siglock); | ||
61 | if (current->signal->oom_score_adj == old_val) | ||
62 | current->signal->oom_score_adj = new_val; | ||
63 | trace_oom_score_adj_update(current); | ||
64 | spin_unlock_irq(&sighand->siglock); | ||
65 | } | ||
66 | |||
67 | /** | ||
68 | * test_set_oom_score_adj() - set current's oom_score_adj and return old value | ||
69 | * @new_val: new oom_score_adj value | ||
70 | * | ||
71 | * Sets the oom_score_adj value for current to @new_val with proper | ||
72 | * synchronization and returns the old value. Usually used to temporarily | ||
73 | * set a value, save the old value in the caller, and then reinstate it later. | ||
74 | */ | ||
75 | int test_set_oom_score_adj(int new_val) | ||
76 | { | ||
77 | struct sighand_struct *sighand = current->sighand; | ||
78 | int old_val; | ||
79 | |||
80 | spin_lock_irq(&sighand->siglock); | ||
81 | old_val = current->signal->oom_score_adj; | ||
82 | current->signal->oom_score_adj = new_val; | ||
83 | trace_oom_score_adj_update(current); | ||
84 | spin_unlock_irq(&sighand->siglock); | ||
85 | |||
86 | return old_val; | ||
87 | } | ||
88 | |||
89 | #ifdef CONFIG_NUMA | 47 | #ifdef CONFIG_NUMA |
90 | /** | 48 | /** |
91 | * has_intersects_mems_allowed() - check task eligibility for kill | 49 | * has_intersects_mems_allowed() - check task eligibility for kill |
@@ -193,7 +151,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
193 | if (!p) | 151 | if (!p) |
194 | return 0; | 152 | return 0; |
195 | 153 | ||
196 | adj = p->signal->oom_score_adj; | 154 | adj = (long)p->signal->oom_score_adj; |
197 | if (adj == OOM_SCORE_ADJ_MIN) { | 155 | if (adj == OOM_SCORE_ADJ_MIN) { |
198 | task_unlock(p); | 156 | task_unlock(p); |
199 | return 0; | 157 | return 0; |
@@ -310,26 +268,20 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | |||
310 | if (!task->mm) | 268 | if (!task->mm) |
311 | return OOM_SCAN_CONTINUE; | 269 | return OOM_SCAN_CONTINUE; |
312 | 270 | ||
313 | if (task->flags & PF_EXITING) { | 271 | /* |
272 | * If task is allocating a lot of memory and has been marked to be | ||
273 | * killed first if it triggers an oom, then select it. | ||
274 | */ | ||
275 | if (oom_task_origin(task)) | ||
276 | return OOM_SCAN_SELECT; | ||
277 | |||
278 | if (task->flags & PF_EXITING && !force_kill) { | ||
314 | /* | 279 | /* |
315 | * If task is current and is in the process of releasing memory, | 280 | * If this task is not being ptraced on exit, then wait for it |
316 | * allow the "kill" to set TIF_MEMDIE, which will allow it to | 281 | * to finish before killing some other task unnecessarily. |
317 | * access memory reserves. Otherwise, it may stall forever. | ||
318 | * | ||
319 | * The iteration isn't broken here, however, in case other | ||
320 | * threads are found to have already been oom killed. | ||
321 | */ | 282 | */ |
322 | if (task == current) | 283 | if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) |
323 | return OOM_SCAN_SELECT; | 284 | return OOM_SCAN_ABORT; |
324 | else if (!force_kill) { | ||
325 | /* | ||
326 | * If this task is not being ptraced on exit, then wait | ||
327 | * for it to finish before killing some other task | ||
328 | * unnecessarily. | ||
329 | */ | ||
330 | if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) | ||
331 | return OOM_SCAN_ABORT; | ||
332 | } | ||
333 | } | 285 | } |
334 | return OOM_SCAN_OK; | 286 | return OOM_SCAN_OK; |
335 | } | 287 | } |
@@ -412,7 +364,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas | |||
412 | continue; | 364 | continue; |
413 | } | 365 | } |
414 | 366 | ||
415 | pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", | 367 | pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n", |
416 | task->pid, from_kuid(&init_user_ns, task_uid(task)), | 368 | task->pid, from_kuid(&init_user_ns, task_uid(task)), |
417 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), | 369 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), |
418 | task->mm->nr_ptes, | 370 | task->mm->nr_ptes, |
@@ -428,7 +380,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
428 | { | 380 | { |
429 | task_lock(current); | 381 | task_lock(current); |
430 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " | 382 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " |
431 | "oom_score_adj=%d\n", | 383 | "oom_score_adj=%hd\n", |
432 | current->comm, gfp_mask, order, | 384 | current->comm, gfp_mask, order, |
433 | current->signal->oom_score_adj); | 385 | current->signal->oom_score_adj); |
434 | cpuset_print_task_mems_allowed(current); | 386 | cpuset_print_task_mems_allowed(current); |
@@ -706,11 +658,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
706 | return; | 658 | return; |
707 | 659 | ||
708 | /* | 660 | /* |
709 | * If current has a pending SIGKILL, then automatically select it. The | 661 | * If current has a pending SIGKILL or is exiting, then automatically |
710 | * goal is to allow it to allocate so that it may quickly exit and free | 662 | * select it. The goal is to allow it to allocate so that it may |
711 | * its memory. | 663 | * quickly exit and free its memory. |
712 | */ | 664 | */ |
713 | if (fatal_signal_pending(current)) { | 665 | if (fatal_signal_pending(current) || current->flags & PF_EXITING) { |
714 | set_thread_flag(TIF_MEMDIE); | 666 | set_thread_flag(TIF_MEMDIE); |
715 | return; | 667 | return; |
716 | } | 668 | } |
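The helpers removed at the top of this file are superseded by the oom_task_origin() test added to oom_scan_process_thread(): a task about to do something memory hungry marks itself instead of temporarily rewriting its oom_score_adj. A hedged sketch of the caller-side pattern; do_heavy_work() and the wrapper are hypothetical, the real user in this series is sys_swapoff() in the mm/swapfile.c hunk further down:

static int example_heavy_operation(void)
{
	int err;

	set_current_oom_origin();	/* prefer killing this task on OOM  */
	err = do_heavy_work();		/* hypothetical memory-hungry step  */
	clear_current_oom_origin();	/* back to normal victim selection  */

	return err;
}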
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 830893b2b3c7..6f4271224493 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1069,7 +1069,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi, | |||
1069 | } | 1069 | } |
1070 | 1070 | ||
1071 | /* | 1071 | /* |
1072 | * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() | 1072 | * After a task dirtied this many pages, balance_dirty_pages_ratelimited() |
1073 | * will look to see if it needs to start dirty throttling. | 1073 | * will look to see if it needs to start dirty throttling. |
1074 | * | 1074 | * |
1075 | * If dirty_poll_interval is too low, big NUMA machines will call the expensive | 1075 | * If dirty_poll_interval is too low, big NUMA machines will call the expensive |
@@ -1436,9 +1436,8 @@ static DEFINE_PER_CPU(int, bdp_ratelimits); | |||
1436 | DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; | 1436 | DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; |
1437 | 1437 | ||
1438 | /** | 1438 | /** |
1439 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state | 1439 | * balance_dirty_pages_ratelimited - balance dirty memory state |
1440 | * @mapping: address_space which was dirtied | 1440 | * @mapping: address_space which was dirtied |
1441 | * @nr_pages_dirtied: number of pages which the caller has just dirtied | ||
1442 | * | 1441 | * |
1443 | * Processes which are dirtying memory should call in here once for each page | 1442 | * Processes which are dirtying memory should call in here once for each page |
1444 | * which was newly dirtied. The function will periodically check the system's | 1443 | * which was newly dirtied. The function will periodically check the system's |
@@ -1449,8 +1448,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; | |||
1449 | * limit we decrease the ratelimiting by a lot, to prevent individual processes | 1448 | * limit we decrease the ratelimiting by a lot, to prevent individual processes |
1450 | * from overshooting the limit by (ratelimit_pages) each. | 1449 | * from overshooting the limit by (ratelimit_pages) each. |
1451 | */ | 1450 | */ |
1452 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | 1451 | void balance_dirty_pages_ratelimited(struct address_space *mapping) |
1453 | unsigned long nr_pages_dirtied) | ||
1454 | { | 1452 | { |
1455 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1453 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
1456 | int ratelimit; | 1454 | int ratelimit; |
@@ -1484,6 +1482,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | |||
1484 | */ | 1482 | */ |
1485 | p = &__get_cpu_var(dirty_throttle_leaks); | 1483 | p = &__get_cpu_var(dirty_throttle_leaks); |
1486 | if (*p > 0 && current->nr_dirtied < ratelimit) { | 1484 | if (*p > 0 && current->nr_dirtied < ratelimit) { |
1485 | unsigned long nr_pages_dirtied; | ||
1487 | nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); | 1486 | nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); |
1488 | *p -= nr_pages_dirtied; | 1487 | *p -= nr_pages_dirtied; |
1489 | current->nr_dirtied += nr_pages_dirtied; | 1488 | current->nr_dirtied += nr_pages_dirtied; |
@@ -1493,7 +1492,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | |||
1493 | if (unlikely(current->nr_dirtied >= ratelimit)) | 1492 | if (unlikely(current->nr_dirtied >= ratelimit)) |
1494 | balance_dirty_pages(mapping, current->nr_dirtied); | 1493 | balance_dirty_pages(mapping, current->nr_dirtied); |
1495 | } | 1494 | } |
1496 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); | 1495 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited); |
1497 | 1496 | ||
1498 | void throttle_vm_writeout(gfp_t gfp_mask) | 1497 | void throttle_vm_writeout(gfp_t gfp_mask) |
1499 | { | 1498 | { |
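With the nr_pages argument gone, a writer simply calls the renamed helper once for every page it dirties; the internal batching via current->nr_dirtied takes care of rate limiting. A hedged caller-side sketch (example_dirty_range() is hypothetical):

static void example_dirty_range(struct address_space *mapping,
				struct page **pages, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		set_page_dirty(pages[i]);
		balance_dirty_pages_ratelimited(mapping);
	}
}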
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7e208f0ad68c..5a8d339d282a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -667,11 +667,13 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
667 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ | 667 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
668 | __free_one_page(page, zone, 0, mt); | 668 | __free_one_page(page, zone, 0, mt); |
669 | trace_mm_page_pcpu_drain(page, 0, mt); | 669 | trace_mm_page_pcpu_drain(page, 0, mt); |
670 | if (is_migrate_cma(mt)) | 670 | if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { |
671 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); | 671 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1); |
672 | if (is_migrate_cma(mt)) | ||
673 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); | ||
674 | } | ||
672 | } while (--to_free && --batch_free && !list_empty(list)); | 675 | } while (--to_free && --batch_free && !list_empty(list)); |
673 | } | 676 | } |
674 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); | ||
675 | spin_unlock(&zone->lock); | 677 | spin_unlock(&zone->lock); |
676 | } | 678 | } |
677 | 679 | ||
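The single "__mod_zone_page_state(zone, NR_FREE_PAGES, count)" after the loop is replaced by a per-page update that skips isolated pageblocks, so pages freed while their block is MIGRATE_ISOLATE no longer inflate the free counter. A toy model of the difference, standalone and illustrative only:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	bool isolated[4] = { false, true, false, false };
	long nr_free = 0;
	int i;

	for (i = 0; i < 4; i++)
		if (!isolated[i])	/* MIGRATE_ISOLATE pages are skipped */
			nr_free++;

	printf("NR_FREE_PAGES += %ld (the old bulk update added 4)\n", nr_free);
	return 0;
}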
@@ -1392,21 +1394,22 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) | |||
1392 | 1394 | ||
1393 | zone = page_zone(page); | 1395 | zone = page_zone(page); |
1394 | order = page_order(page); | 1396 | order = page_order(page); |
1397 | mt = get_pageblock_migratetype(page); | ||
1395 | 1398 | ||
1396 | /* Obey watermarks as if the page was being allocated */ | 1399 | if (mt != MIGRATE_ISOLATE) { |
1397 | watermark = low_wmark_pages(zone) + (1 << order); | 1400 | /* Obey watermarks as if the page was being allocated */ |
1398 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | 1401 | watermark = low_wmark_pages(zone) + (1 << order); |
1399 | return 0; | 1402 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) |
1403 | return 0; | ||
1404 | |||
1405 | __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); | ||
1406 | } | ||
1400 | 1407 | ||
1401 | /* Remove page from free list */ | 1408 | /* Remove page from free list */ |
1402 | list_del(&page->lru); | 1409 | list_del(&page->lru); |
1403 | zone->free_area[order].nr_free--; | 1410 | zone->free_area[order].nr_free--; |
1404 | rmv_page_order(page); | 1411 | rmv_page_order(page); |
1405 | 1412 | ||
1406 | mt = get_pageblock_migratetype(page); | ||
1407 | if (unlikely(mt != MIGRATE_ISOLATE)) | ||
1408 | __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); | ||
1409 | |||
1410 | if (alloc_order != order) | 1413 | if (alloc_order != order) |
1411 | expand(zone, page, alloc_order, order, | 1414 | expand(zone, page, alloc_order, order, |
1412 | &zone->free_area[order], migratetype); | 1415 | &zone->free_area[order], migratetype); |
@@ -1871,7 +1874,7 @@ zonelist_scan: | |||
1871 | */ | 1874 | */ |
1872 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 1875 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
1873 | high_zoneidx, nodemask) { | 1876 | high_zoneidx, nodemask) { |
1874 | if (NUMA_BUILD && zlc_active && | 1877 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && |
1875 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1878 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1876 | continue; | 1879 | continue; |
1877 | if ((alloc_flags & ALLOC_CPUSET) && | 1880 | if ((alloc_flags & ALLOC_CPUSET) && |
@@ -1917,7 +1920,8 @@ zonelist_scan: | |||
1917 | classzone_idx, alloc_flags)) | 1920 | classzone_idx, alloc_flags)) |
1918 | goto try_this_zone; | 1921 | goto try_this_zone; |
1919 | 1922 | ||
1920 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { | 1923 | if (IS_ENABLED(CONFIG_NUMA) && |
1924 | !did_zlc_setup && nr_online_nodes > 1) { | ||
1921 | /* | 1925 | /* |
1922 | * we do zlc_setup if there are multiple nodes | 1926 | * we do zlc_setup if there are multiple nodes |
1923 | * and before considering the first zone allowed | 1927 | * and before considering the first zone allowed |
@@ -1936,7 +1940,7 @@ zonelist_scan: | |||
1936 | * As we may have just activated ZLC, check if the first | 1940 | * As we may have just activated ZLC, check if the first |
1937 | * eligible zone has failed zone_reclaim recently. | 1941 | * eligible zone has failed zone_reclaim recently. |
1938 | */ | 1942 | */ |
1939 | if (NUMA_BUILD && zlc_active && | 1943 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && |
1940 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1944 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1941 | continue; | 1945 | continue; |
1942 | 1946 | ||
@@ -1962,11 +1966,11 @@ try_this_zone: | |||
1962 | if (page) | 1966 | if (page) |
1963 | break; | 1967 | break; |
1964 | this_zone_full: | 1968 | this_zone_full: |
1965 | if (NUMA_BUILD) | 1969 | if (IS_ENABLED(CONFIG_NUMA)) |
1966 | zlc_mark_zone_full(zonelist, z); | 1970 | zlc_mark_zone_full(zonelist, z); |
1967 | } | 1971 | } |
1968 | 1972 | ||
1969 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | 1973 | if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { |
1970 | /* Disable zlc cache for second zonelist scan */ | 1974 | /* Disable zlc cache for second zonelist scan */ |
1971 | zlc_active = 0; | 1975 | zlc_active = 0; |
1972 | goto zonelist_scan; | 1976 | goto zonelist_scan; |
@@ -2266,7 +2270,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
2266 | return NULL; | 2270 | return NULL; |
2267 | 2271 | ||
2268 | /* After successful reclaim, reconsider all zones for allocation */ | 2272 | /* After successful reclaim, reconsider all zones for allocation */ |
2269 | if (NUMA_BUILD) | 2273 | if (IS_ENABLED(CONFIG_NUMA)) |
2270 | zlc_clear_zones_full(zonelist); | 2274 | zlc_clear_zones_full(zonelist); |
2271 | 2275 | ||
2272 | retry: | 2276 | retry: |
@@ -2412,7 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2412 | * allowed per node queues are empty and that nodes are | 2416 | * allowed per node queues are empty and that nodes are |
2413 | * over allocated. | 2417 | * over allocated. |
2414 | */ | 2418 | */ |
2415 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 2419 | if (IS_ENABLED(CONFIG_NUMA) && |
2420 | (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | ||
2416 | goto nopage; | 2421 | goto nopage; |
2417 | 2422 | ||
2418 | restart: | 2423 | restart: |
@@ -2819,7 +2824,7 @@ unsigned int nr_free_pagecache_pages(void) | |||
2819 | 2824 | ||
2820 | static inline void show_node(struct zone *zone) | 2825 | static inline void show_node(struct zone *zone) |
2821 | { | 2826 | { |
2822 | if (NUMA_BUILD) | 2827 | if (IS_ENABLED(CONFIG_NUMA)) |
2823 | printk("Node %d ", zone_to_nid(zone)); | 2828 | printk("Node %d ", zone_to_nid(zone)); |
2824 | } | 2829 | } |
2825 | 2830 | ||
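The repeated NUMA_BUILD conversions in this file (and below in mm/vmalloc.c and mm/vmscan.c) switch to the generic IS_ENABLED() macro, which evaluates to a compile-time 1 or 0, so disabled branches are optimized away while the code inside them is still parsed and type checked. A simplified userspace reconstruction of the kind of preprocessor trick it relies on; the in-kernel definition differs in detail and also covers =m options:

#include <stdio.h>

#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(__ignored, val, ...) val
#define __is_defined(x) ___is_defined(x)
#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)

#define CONFIG_NUMA 1		/* pretend the option is =y */

int main(void)
{
	printf("IS_ENABLED-style test: %d\n", __is_defined(CONFIG_NUMA));
	return 0;
}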
@@ -2877,6 +2882,31 @@ out: | |||
2877 | 2882 | ||
2878 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 2883 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
2879 | 2884 | ||
2885 | static void show_migration_types(unsigned char type) | ||
2886 | { | ||
2887 | static const char types[MIGRATE_TYPES] = { | ||
2888 | [MIGRATE_UNMOVABLE] = 'U', | ||
2889 | [MIGRATE_RECLAIMABLE] = 'E', | ||
2890 | [MIGRATE_MOVABLE] = 'M', | ||
2891 | [MIGRATE_RESERVE] = 'R', | ||
2892 | #ifdef CONFIG_CMA | ||
2893 | [MIGRATE_CMA] = 'C', | ||
2894 | #endif | ||
2895 | [MIGRATE_ISOLATE] = 'I', | ||
2896 | }; | ||
2897 | char tmp[MIGRATE_TYPES + 1]; | ||
2898 | char *p = tmp; | ||
2899 | int i; | ||
2900 | |||
2901 | for (i = 0; i < MIGRATE_TYPES; i++) { | ||
2902 | if (type & (1 << i)) | ||
2903 | *p++ = types[i]; | ||
2904 | } | ||
2905 | |||
2906 | *p = '\0'; | ||
2907 | printk("(%s) ", tmp); | ||
2908 | } | ||
2909 | |||
2880 | /* | 2910 | /* |
2881 | * Show free area list (used inside shift_scroll-lock stuff) | 2911 | * Show free area list (used inside shift_scroll-lock stuff) |
2882 | * We also calculate the percentage fragmentation. We do this by counting the | 2912 | * We also calculate the percentage fragmentation. We do this by counting the |
@@ -3005,6 +3035,7 @@ void show_free_areas(unsigned int filter) | |||
3005 | 3035 | ||
3006 | for_each_populated_zone(zone) { | 3036 | for_each_populated_zone(zone) { |
3007 | unsigned long nr[MAX_ORDER], flags, order, total = 0; | 3037 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
3038 | unsigned char types[MAX_ORDER]; | ||
3008 | 3039 | ||
3009 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | 3040 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
3010 | continue; | 3041 | continue; |
@@ -3013,12 +3044,24 @@ void show_free_areas(unsigned int filter) | |||
3013 | 3044 | ||
3014 | spin_lock_irqsave(&zone->lock, flags); | 3045 | spin_lock_irqsave(&zone->lock, flags); |
3015 | for (order = 0; order < MAX_ORDER; order++) { | 3046 | for (order = 0; order < MAX_ORDER; order++) { |
3016 | nr[order] = zone->free_area[order].nr_free; | 3047 | struct free_area *area = &zone->free_area[order]; |
3048 | int type; | ||
3049 | |||
3050 | nr[order] = area->nr_free; | ||
3017 | total += nr[order] << order; | 3051 | total += nr[order] << order; |
3052 | |||
3053 | types[order] = 0; | ||
3054 | for (type = 0; type < MIGRATE_TYPES; type++) { | ||
3055 | if (!list_empty(&area->free_list[type])) | ||
3056 | types[order] |= 1 << type; | ||
3057 | } | ||
3018 | } | 3058 | } |
3019 | spin_unlock_irqrestore(&zone->lock, flags); | 3059 | spin_unlock_irqrestore(&zone->lock, flags); |
3020 | for (order = 0; order < MAX_ORDER; order++) | 3060 | for (order = 0; order < MAX_ORDER; order++) { |
3021 | printk("%lu*%lukB ", nr[order], K(1UL) << order); | 3061 | printk("%lu*%lukB ", nr[order], K(1UL) << order); |
3062 | if (nr[order]) | ||
3063 | show_migration_types(types[order]); | ||
3064 | } | ||
3022 | printk("= %lukB\n", K(total)); | 3065 | printk("= %lukB\n", K(total)); |
3023 | } | 3066 | } |
3024 | 3067 | ||
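The visible effect on the buddy summary printed by show_free_areas() is one letter per migratetype that still has free pages at that order, e.g. "96*4kB (UEM)". A small userspace rendering of the same bitmap-to-letters step; the letter table mirrors show_migration_types() above (CONFIG_CMA assumed) and the sample numbers are made up:

#include <stdio.h>

int main(void)
{
	static const char types[] = { 'U', 'E', 'M', 'R', 'C', 'I' };
	unsigned char mask = (1 << 0) | (1 << 2);	/* UNMOVABLE | MOVABLE */
	char buf[sizeof(types) + 1], *p = buf;
	int i;

	for (i = 0; i < (int)sizeof(types); i++)
		if (mask & (1 << i))
			*p++ = types[i];
	*p = '\0';

	printf("96*4kB (%s) ...\n", buf);	/* prints "96*4kB (UM) ..." */
	return 0;
}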
@@ -5175,10 +5218,6 @@ static void __setup_per_zone_wmarks(void) | |||
5175 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); | 5218 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); |
5176 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); | 5219 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); |
5177 | 5220 | ||
5178 | zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); | ||
5179 | zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); | ||
5180 | zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); | ||
5181 | |||
5182 | setup_zone_migrate_reserve(zone); | 5221 | setup_zone_migrate_reserve(zone); |
5183 | spin_unlock_irqrestore(&zone->lock, flags); | 5222 | spin_unlock_irqrestore(&zone->lock, flags); |
5184 | } | 5223 | } |
@@ -5576,7 +5615,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
5576 | * MIGRATE_MOVABLE block might include unmovable pages. It means you can't | 5615 | * MIGRATE_MOVABLE block might include unmovable pages. It means you can't |
5577 | * expect this function should be exact. | 5616 | * expect this function should be exact. |
5578 | */ | 5617 | */ |
5579 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count) | 5618 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count, |
5619 | bool skip_hwpoisoned_pages) | ||
5580 | { | 5620 | { |
5581 | unsigned long pfn, iter, found; | 5621 | unsigned long pfn, iter, found; |
5582 | int mt; | 5622 | int mt; |
@@ -5611,6 +5651,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count) | |||
5611 | continue; | 5651 | continue; |
5612 | } | 5652 | } |
5613 | 5653 | ||
5654 | /* | ||
5655 | * The HWPoisoned page may not be in the buddy system, and | ||
5656 | * page_count() is not 0. | ||
5657 | */ | ||
5658 | if (skip_hwpoisoned_pages && PageHWPoison(page)) | ||
5659 | continue; | ||
5660 | |||
5614 | if (!PageLRU(page)) | 5661 | if (!PageLRU(page)) |
5615 | found++; | 5662 | found++; |
5616 | /* | 5663 | /* |
@@ -5653,7 +5700,7 @@ bool is_pageblock_removable_nolock(struct page *page) | |||
5653 | zone->zone_start_pfn + zone->spanned_pages <= pfn) | 5700 | zone->zone_start_pfn + zone->spanned_pages <= pfn) |
5654 | return false; | 5701 | return false; |
5655 | 5702 | ||
5656 | return !has_unmovable_pages(zone, page, 0); | 5703 | return !has_unmovable_pages(zone, page, 0, true); |
5657 | } | 5704 | } |
5658 | 5705 | ||
5659 | #ifdef CONFIG_CMA | 5706 | #ifdef CONFIG_CMA |
@@ -5711,58 +5758,10 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
5711 | 0, false, MIGRATE_SYNC); | 5758 | 0, false, MIGRATE_SYNC); |
5712 | } | 5759 | } |
5713 | 5760 | ||
5714 | putback_lru_pages(&cc->migratepages); | 5761 | putback_movable_pages(&cc->migratepages); |
5715 | return ret > 0 ? 0 : ret; | 5762 | return ret > 0 ? 0 : ret; |
5716 | } | 5763 | } |
5717 | 5764 | ||
5718 | /* | ||
5719 | * Update zone's cma pages counter used for watermark level calculation. | ||
5720 | */ | ||
5721 | static inline void __update_cma_watermarks(struct zone *zone, int count) | ||
5722 | { | ||
5723 | unsigned long flags; | ||
5724 | spin_lock_irqsave(&zone->lock, flags); | ||
5725 | zone->min_cma_pages += count; | ||
5726 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5727 | setup_per_zone_wmarks(); | ||
5728 | } | ||
5729 | |||
5730 | /* | ||
5731 | * Trigger memory pressure bump to reclaim some pages in order to be able to | ||
5732 | * allocate 'count' pages in single page units. Does similar work as | ||
5733 | *__alloc_pages_slowpath() function. | ||
5734 | */ | ||
5735 | static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) | ||
5736 | { | ||
5737 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
5738 | struct zonelist *zonelist = node_zonelist(0, gfp_mask); | ||
5739 | int did_some_progress = 0; | ||
5740 | int order = 1; | ||
5741 | |||
5742 | /* | ||
5743 | * Increase level of watermarks to force kswapd do his job | ||
5744 | * to stabilise at new watermark level. | ||
5745 | */ | ||
5746 | __update_cma_watermarks(zone, count); | ||
5747 | |||
5748 | /* Obey watermarks as if the page was being allocated */ | ||
5749 | while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { | ||
5750 | wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); | ||
5751 | |||
5752 | did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | ||
5753 | NULL); | ||
5754 | if (!did_some_progress) { | ||
5755 | /* Exhausted what can be done so it's blamo time */ | ||
5756 | out_of_memory(zonelist, gfp_mask, order, NULL, false); | ||
5757 | } | ||
5758 | } | ||
5759 | |||
5760 | /* Restore original watermark levels. */ | ||
5761 | __update_cma_watermarks(zone, -count); | ||
5762 | |||
5763 | return count; | ||
5764 | } | ||
5765 | |||
5766 | /** | 5765 | /** |
5767 | * alloc_contig_range() -- tries to allocate given range of pages | 5766 | * alloc_contig_range() -- tries to allocate given range of pages |
5768 | * @start: start PFN to allocate | 5767 | * @start: start PFN to allocate |
@@ -5786,7 +5785,6 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) | |||
5786 | int alloc_contig_range(unsigned long start, unsigned long end, | 5785 | int alloc_contig_range(unsigned long start, unsigned long end, |
5787 | unsigned migratetype) | 5786 | unsigned migratetype) |
5788 | { | 5787 | { |
5789 | struct zone *zone = page_zone(pfn_to_page(start)); | ||
5790 | unsigned long outer_start, outer_end; | 5788 | unsigned long outer_start, outer_end; |
5791 | int ret = 0, order; | 5789 | int ret = 0, order; |
5792 | 5790 | ||
@@ -5824,7 +5822,8 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5824 | */ | 5822 | */ |
5825 | 5823 | ||
5826 | ret = start_isolate_page_range(pfn_max_align_down(start), | 5824 | ret = start_isolate_page_range(pfn_max_align_down(start), |
5827 | pfn_max_align_up(end), migratetype); | 5825 | pfn_max_align_up(end), migratetype, |
5826 | false); | ||
5828 | if (ret) | 5827 | if (ret) |
5829 | return ret; | 5828 | return ret; |
5830 | 5829 | ||
@@ -5863,18 +5862,13 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5863 | } | 5862 | } |
5864 | 5863 | ||
5865 | /* Make sure the range is really isolated. */ | 5864 | /* Make sure the range is really isolated. */ |
5866 | if (test_pages_isolated(outer_start, end)) { | 5865 | if (test_pages_isolated(outer_start, end, false)) { |
5867 | pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", | 5866 | pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", |
5868 | outer_start, end); | 5867 | outer_start, end); |
5869 | ret = -EBUSY; | 5868 | ret = -EBUSY; |
5870 | goto done; | 5869 | goto done; |
5871 | } | 5870 | } |
5872 | 5871 | ||
5873 | /* | ||
5874 | * Reclaim enough pages to make sure that contiguous allocation | ||
5875 | * will not starve the system. | ||
5876 | */ | ||
5877 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); | ||
5878 | 5872 | ||
5879 | /* Grab isolated pages from freelists. */ | 5873 | /* Grab isolated pages from freelists. */ |
5880 | outer_end = isolate_freepages_range(&cc, outer_start, end); | 5874 | outer_end = isolate_freepages_range(&cc, outer_start, end); |
@@ -5932,7 +5926,6 @@ void __meminit zone_pcp_update(struct zone *zone) | |||
5932 | } | 5926 | } |
5933 | #endif | 5927 | #endif |
5934 | 5928 | ||
5935 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
5936 | void zone_pcp_reset(struct zone *zone) | 5929 | void zone_pcp_reset(struct zone *zone) |
5937 | { | 5930 | { |
5938 | unsigned long flags; | 5931 | unsigned long flags; |
@@ -5952,6 +5945,7 @@ void zone_pcp_reset(struct zone *zone) | |||
5952 | local_irq_restore(flags); | 5945 | local_irq_restore(flags); |
5953 | } | 5946 | } |
5954 | 5947 | ||
5948 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
5955 | /* | 5949 | /* |
5956 | * All pages in the range must be isolated before calling this. | 5950 | * All pages in the range must be isolated before calling this. |
5957 | */ | 5951 | */ |
@@ -5978,6 +5972,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
5978 | continue; | 5972 | continue; |
5979 | } | 5973 | } |
5980 | page = pfn_to_page(pfn); | 5974 | page = pfn_to_page(pfn); |
5975 | /* | ||
5976 | * The HWPoisoned page may not be in the buddy system, and | ||
5977 | * page_count() is not 0. | ||
5978 | */ | ||
5979 | if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { | ||
5980 | pfn++; | ||
5981 | SetPageReserved(page); | ||
5982 | continue; | ||
5983 | } | ||
5984 | |||
5981 | BUG_ON(page_count(page)); | 5985 | BUG_ON(page_count(page)); |
5982 | BUG_ON(!PageBuddy(page)); | 5986 | BUG_ON(!PageBuddy(page)); |
5983 | order = page_order(page); | 5987 | order = page_order(page); |
@@ -5988,8 +5992,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
5988 | list_del(&page->lru); | 5992 | list_del(&page->lru); |
5989 | rmv_page_order(page); | 5993 | rmv_page_order(page); |
5990 | zone->free_area[order].nr_free--; | 5994 | zone->free_area[order].nr_free--; |
5991 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
5992 | - (1UL << order)); | ||
5993 | for (i = 0; i < (1 << order); i++) | 5995 | for (i = 0; i < (1 << order); i++) |
5994 | SetPageReserved((page+i)); | 5996 | SetPageReserved((page+i)); |
5995 | pfn += (1 << order); | 5997 | pfn += (1 << order); |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 5ddad0c6daa6..44db00e253ed 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct notifier_block *self, | |||
251 | mn->nr_pages, mn->status_change_nid); | 251 | mn->nr_pages, mn->status_change_nid); |
252 | break; | 252 | break; |
253 | case MEM_CANCEL_ONLINE: | 253 | case MEM_CANCEL_ONLINE: |
254 | offline_page_cgroup(mn->start_pfn, | ||
255 | mn->nr_pages, mn->status_change_nid); | ||
256 | break; | ||
254 | case MEM_GOING_OFFLINE: | 257 | case MEM_GOING_OFFLINE: |
255 | break; | 258 | break; |
256 | case MEM_ONLINE: | 259 | case MEM_ONLINE: |
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f2f5b4818e94..9d2264ea4606 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -30,7 +30,7 @@ static void restore_pageblock_isolate(struct page *page, int migratetype) | |||
30 | zone->nr_pageblock_isolate--; | 30 | zone->nr_pageblock_isolate--; |
31 | } | 31 | } |
32 | 32 | ||
33 | int set_migratetype_isolate(struct page *page) | 33 | int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) |
34 | { | 34 | { |
35 | struct zone *zone; | 35 | struct zone *zone; |
36 | unsigned long flags, pfn; | 36 | unsigned long flags, pfn; |
@@ -66,7 +66,8 @@ int set_migratetype_isolate(struct page *page) | |||
66 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. | 66 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. |
67 | * We just check MOVABLE pages. | 67 | * We just check MOVABLE pages. |
68 | */ | 68 | */ |
69 | if (!has_unmovable_pages(zone, page, arg.pages_found)) | 69 | if (!has_unmovable_pages(zone, page, arg.pages_found, |
70 | skip_hwpoisoned_pages)) | ||
70 | ret = 0; | 71 | ret = 0; |
71 | 72 | ||
72 | /* | 73 | /* |
@@ -134,7 +135,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) | |||
134 | * Returns 0 on success and -EBUSY if any part of range cannot be isolated. | 135 | * Returns 0 on success and -EBUSY if any part of range cannot be isolated. |
135 | */ | 136 | */ |
136 | int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | 137 | int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, |
137 | unsigned migratetype) | 138 | unsigned migratetype, bool skip_hwpoisoned_pages) |
138 | { | 139 | { |
139 | unsigned long pfn; | 140 | unsigned long pfn; |
140 | unsigned long undo_pfn; | 141 | unsigned long undo_pfn; |
@@ -147,7 +148,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | |||
147 | pfn < end_pfn; | 148 | pfn < end_pfn; |
148 | pfn += pageblock_nr_pages) { | 149 | pfn += pageblock_nr_pages) { |
149 | page = __first_valid_page(pfn, pageblock_nr_pages); | 150 | page = __first_valid_page(pfn, pageblock_nr_pages); |
150 | if (page && set_migratetype_isolate(page)) { | 151 | if (page && |
152 | set_migratetype_isolate(page, skip_hwpoisoned_pages)) { | ||
151 | undo_pfn = pfn; | 153 | undo_pfn = pfn; |
152 | goto undo; | 154 | goto undo; |
153 | } | 155 | } |
@@ -190,7 +192,8 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | |||
190 | * Returns 1 if all pages in the range are isolated. | 192 | * Returns 1 if all pages in the range are isolated. |
191 | */ | 193 | */ |
192 | static int | 194 | static int |
193 | __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | 195 | __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, |
196 | bool skip_hwpoisoned_pages) | ||
194 | { | 197 | { |
195 | struct page *page; | 198 | struct page *page; |
196 | 199 | ||
@@ -220,6 +223,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | |||
220 | else if (page_count(page) == 0 && | 223 | else if (page_count(page) == 0 && |
221 | get_freepage_migratetype(page) == MIGRATE_ISOLATE) | 224 | get_freepage_migratetype(page) == MIGRATE_ISOLATE) |
222 | pfn += 1; | 225 | pfn += 1; |
226 | else if (skip_hwpoisoned_pages && PageHWPoison(page)) { | ||
227 | /* | ||
228 | * The HWPoisoned page may not be in the buddy | ||
229 | * system, and page_count() is not 0. | ||
230 | */ | ||
231 | pfn++; | ||
232 | continue; | ||
233 | } | ||
223 | else | 234 | else |
224 | break; | 235 | break; |
225 | } | 236 | } |
@@ -228,7 +239,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | |||
228 | return 1; | 239 | return 1; |
229 | } | 240 | } |
230 | 241 | ||
231 | int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | 242 | int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, |
243 | bool skip_hwpoisoned_pages) | ||
232 | { | 244 | { |
233 | unsigned long pfn, flags; | 245 | unsigned long pfn, flags; |
234 | struct page *page; | 246 | struct page *page; |
@@ -251,7 +263,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
251 | /* Check all pages are free or Marked as ISOLATED */ | 263 | /* Check all pages are free or Marked as ISOLATED */ |
252 | zone = page_zone(page); | 264 | zone = page_zone(page); |
253 | spin_lock_irqsave(&zone->lock, flags); | 265 | spin_lock_irqsave(&zone->lock, flags); |
254 | ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); | 266 | ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn, |
267 | skip_hwpoisoned_pages); | ||
255 | spin_unlock_irqrestore(&zone->lock, flags); | 268 | spin_unlock_irqrestore(&zone->lock, flags); |
256 | return ret ? 0 : -EBUSY; | 269 | return ret ? 0 : -EBUSY; |
257 | } | 270 | } |
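The new skip_hwpoisoned_pages flag threads from both entry points down to the pageblock checks: the hot-remove side tolerates hardware-poisoned pages that sit outside the buddy allocator (is_pageblock_removable_nolock() in the mm/page_alloc.c hunk above now passes true), while CMA keeps the strict behaviour by passing false from alloc_contig_range(). A hedged sketch of an offline-style caller; the wrapper itself is hypothetical:

static int example_isolate_for_offline(unsigned long start_pfn,
					unsigned long end_pfn)
{
	int ret;

	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE, true);
	if (ret)
		return ret;

	/* ... migrate the remaining in-use pages out of the range ... */

	/* returns 0 only if every page is free or MIGRATE_ISOLATE */
	return test_pages_isolated(start_pfn, end_pfn, true);
}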
@@ -562,6 +562,27 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
562 | return address; | 562 | return address; |
563 | } | 563 | } |
564 | 564 | ||
565 | pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) | ||
566 | { | ||
567 | pgd_t *pgd; | ||
568 | pud_t *pud; | ||
569 | pmd_t *pmd = NULL; | ||
570 | |||
571 | pgd = pgd_offset(mm, address); | ||
572 | if (!pgd_present(*pgd)) | ||
573 | goto out; | ||
574 | |||
575 | pud = pud_offset(pgd, address); | ||
576 | if (!pud_present(*pud)) | ||
577 | goto out; | ||
578 | |||
579 | pmd = pmd_offset(pud, address); | ||
580 | if (!pmd_present(*pmd)) | ||
581 | pmd = NULL; | ||
582 | out: | ||
583 | return pmd; | ||
584 | } | ||
585 | |||
565 | /* | 586 | /* |
566 | * Check that @page is mapped at @address into @mm. | 587 | * Check that @page is mapped at @address into @mm. |
567 | * | 588 | * |
@@ -574,8 +595,6 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
574 | pte_t *__page_check_address(struct page *page, struct mm_struct *mm, | 595 | pte_t *__page_check_address(struct page *page, struct mm_struct *mm, |
575 | unsigned long address, spinlock_t **ptlp, int sync) | 596 | unsigned long address, spinlock_t **ptlp, int sync) |
576 | { | 597 | { |
577 | pgd_t *pgd; | ||
578 | pud_t *pud; | ||
579 | pmd_t *pmd; | 598 | pmd_t *pmd; |
580 | pte_t *pte; | 599 | pte_t *pte; |
581 | spinlock_t *ptl; | 600 | spinlock_t *ptl; |
@@ -586,17 +605,10 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, | |||
586 | goto check; | 605 | goto check; |
587 | } | 606 | } |
588 | 607 | ||
589 | pgd = pgd_offset(mm, address); | 608 | pmd = mm_find_pmd(mm, address); |
590 | if (!pgd_present(*pgd)) | 609 | if (!pmd) |
591 | return NULL; | ||
592 | |||
593 | pud = pud_offset(pgd, address); | ||
594 | if (!pud_present(*pud)) | ||
595 | return NULL; | 610 | return NULL; |
596 | 611 | ||
597 | pmd = pmd_offset(pud, address); | ||
598 | if (!pmd_present(*pmd)) | ||
599 | return NULL; | ||
600 | if (pmd_trans_huge(*pmd)) | 612 | if (pmd_trans_huge(*pmd)) |
601 | return NULL; | 613 | return NULL; |
602 | 614 | ||
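mm_find_pmd() factors out the pgd/pud/pmd descent that __page_check_address() above and try_to_unmap_cluster() in the hunk below used to open-code; any walker that only needs a present, non-huge pmd can follow the same shape. An illustrative sketch (example_map_pte() is hypothetical):

static pte_t *example_map_pte(struct mm_struct *mm, unsigned long address,
			      spinlock_t **ptlp)
{
	pmd_t *pmd = mm_find_pmd(mm, address);

	if (!pmd || pmd_trans_huge(*pmd))	/* no pmd, or a huge page */
		return NULL;

	return pte_offset_map_lock(mm, pmd, address, ptlp);
}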
@@ -1139,9 +1151,11 @@ void page_remove_rmap(struct page *page) | |||
1139 | * containing the swap entry, but page not yet written to swap. | 1151 | * containing the swap entry, but page not yet written to swap. |
1140 | * | 1152 | * |
1141 | * And we can skip it on file pages, so long as the filesystem | 1153 | * And we can skip it on file pages, so long as the filesystem |
1142 | * participates in dirty tracking; but need to catch shm and tmpfs | 1154 | * participates in dirty tracking (note that this is not only an |
1143 | * and ramfs pages which have been modified since creation by read | 1155 | * optimization but also solves problems caused by dirty flag in |
1144 | * fault. | 1156 | * storage key getting set by a write from inside kernel); but need to |
1157 | * catch shm and tmpfs and ramfs pages which have been modified since | ||
1158 | * creation by read fault. | ||
1145 | * | 1159 | * |
1146 | * Note that mapping must be decided above, before decrementing | 1160 | * Note that mapping must be decided above, before decrementing |
1147 | * mapcount (which luckily provides a barrier): once page is unmapped, | 1161 | * mapcount (which luckily provides a barrier): once page is unmapped, |
@@ -1345,8 +1359,6 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1345 | struct vm_area_struct *vma, struct page *check_page) | 1359 | struct vm_area_struct *vma, struct page *check_page) |
1346 | { | 1360 | { |
1347 | struct mm_struct *mm = vma->vm_mm; | 1361 | struct mm_struct *mm = vma->vm_mm; |
1348 | pgd_t *pgd; | ||
1349 | pud_t *pud; | ||
1350 | pmd_t *pmd; | 1362 | pmd_t *pmd; |
1351 | pte_t *pte; | 1363 | pte_t *pte; |
1352 | pte_t pteval; | 1364 | pte_t pteval; |
@@ -1366,16 +1378,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1366 | if (end > vma->vm_end) | 1378 | if (end > vma->vm_end) |
1367 | end = vma->vm_end; | 1379 | end = vma->vm_end; |
1368 | 1380 | ||
1369 | pgd = pgd_offset(mm, address); | 1381 | pmd = mm_find_pmd(mm, address); |
1370 | if (!pgd_present(*pgd)) | 1382 | if (!pmd) |
1371 | return ret; | ||
1372 | |||
1373 | pud = pud_offset(pgd, address); | ||
1374 | if (!pud_present(*pud)) | ||
1375 | return ret; | ||
1376 | |||
1377 | pmd = pmd_offset(pud, address); | ||
1378 | if (!pmd_present(*pmd)) | ||
1379 | return ret; | 1383 | return ret; |
1380 | 1384 | ||
1381 | mmun_start = address; | 1385 | mmun_start = address; |
@@ -3573,7 +3573,7 @@ static void slab_mem_offline_callback(void *arg) | |||
3573 | struct memory_notify *marg = arg; | 3573 | struct memory_notify *marg = arg; |
3574 | int offline_node; | 3574 | int offline_node; |
3575 | 3575 | ||
3576 | offline_node = marg->status_change_nid; | 3576 | offline_node = marg->status_change_nid_normal; |
3577 | 3577 | ||
3578 | /* | 3578 | /* |
3579 | * If the node still has available memory, we need kmem_cache_node | 3579 | * If the node still has available memory, we need kmem_cache_node |
@@ -3606,7 +3606,7 @@ static int slab_mem_going_online_callback(void *arg) | |||
3606 | struct kmem_cache_node *n; | 3606 | struct kmem_cache_node *n; |
3607 | struct kmem_cache *s; | 3607 | struct kmem_cache *s; |
3608 | struct memory_notify *marg = arg; | 3608 | struct memory_notify *marg = arg; |
3609 | int nid = marg->status_change_nid; | 3609 | int nid = marg->status_change_nid_normal; |
3610 | int ret = 0; | 3610 | int ret = 0; |
3611 | 3611 | ||
3612 | /* | 3612 | /* |
diff --git a/mm/sparse.c b/mm/sparse.c index a83de2f72b30..6b5fb762e2ca 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -638,7 +638,6 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | |||
638 | got_map_page: | 638 | got_map_page: |
639 | ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); | 639 | ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); |
640 | got_map_ptr: | 640 | got_map_ptr: |
641 | memset(ret, 0, memmap_size); | ||
642 | 641 | ||
643 | return ret; | 642 | return ret; |
644 | } | 643 | } |
@@ -758,6 +757,8 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
758 | goto out; | 757 | goto out; |
759 | } | 758 | } |
760 | 759 | ||
760 | memset(memmap, 0, sizeof(struct page) * nr_pages); | ||
761 | |||
761 | ms->section_mem_map |= SECTION_MARKED_PRESENT; | 762 | ms->section_mem_map |= SECTION_MARKED_PRESENT; |
762 | 763 | ||
763 | ret = sparse_init_one_section(ms, section_nr, memmap, usemap); | 764 | ret = sparse_init_one_section(ms, section_nr, memmap, usemap); |
@@ -771,6 +772,27 @@ out: | |||
771 | return ret; | 772 | return ret; |
772 | } | 773 | } |
773 | 774 | ||
775 | #ifdef CONFIG_MEMORY_FAILURE | ||
776 | static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | ||
777 | { | ||
778 | int i; | ||
779 | |||
780 | if (!memmap) | ||
781 | return; | ||
782 | |||
783 | for (i = 0; i < PAGES_PER_SECTION; i++) { | ||
784 | if (PageHWPoison(&memmap[i])) { | ||
785 | atomic_long_sub(1, &mce_bad_pages); | ||
786 | ClearPageHWPoison(&memmap[i]); | ||
787 | } | ||
788 | } | ||
789 | } | ||
790 | #else | ||
791 | static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | ||
792 | { | ||
793 | } | ||
794 | #endif | ||
795 | |||
774 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | 796 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) |
775 | { | 797 | { |
776 | struct page *memmap = NULL; | 798 | struct page *memmap = NULL; |
@@ -784,6 +806,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | |||
784 | ms->pageblock_flags = NULL; | 806 | ms->pageblock_flags = NULL; |
785 | } | 807 | } |
786 | 808 | ||
809 | clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); | ||
787 | free_section_usemap(memmap, usemap); | 810 | free_section_usemap(memmap, usemap); |
788 | } | 811 | } |
789 | #endif | 812 | #endif |
diff --git a/mm/swapfile.c b/mm/swapfile.c index f91a25547ffe..e97a0e5aea91 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -1443,13 +1443,12 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | |||
1443 | return generic_swapfile_activate(sis, swap_file, span); | 1443 | return generic_swapfile_activate(sis, swap_file, span); |
1444 | } | 1444 | } |
1445 | 1445 | ||
1446 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1446 | static void _enable_swap_info(struct swap_info_struct *p, int prio, |
1447 | unsigned char *swap_map, | 1447 | unsigned char *swap_map, |
1448 | unsigned long *frontswap_map) | 1448 | unsigned long *frontswap_map) |
1449 | { | 1449 | { |
1450 | int i, prev; | 1450 | int i, prev; |
1451 | 1451 | ||
1452 | spin_lock(&swap_lock); | ||
1453 | if (prio >= 0) | 1452 | if (prio >= 0) |
1454 | p->prio = prio; | 1453 | p->prio = prio; |
1455 | else | 1454 | else |
@@ -1472,10 +1471,25 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
1472 | swap_list.head = swap_list.next = p->type; | 1471 | swap_list.head = swap_list.next = p->type; |
1473 | else | 1472 | else |
1474 | swap_info[prev]->next = p->type; | 1473 | swap_info[prev]->next = p->type; |
1474 | } | ||
1475 | |||
1476 | static void enable_swap_info(struct swap_info_struct *p, int prio, | ||
1477 | unsigned char *swap_map, | ||
1478 | unsigned long *frontswap_map) | ||
1479 | { | ||
1480 | spin_lock(&swap_lock); | ||
1481 | _enable_swap_info(p, prio, swap_map, frontswap_map); | ||
1475 | frontswap_init(p->type); | 1482 | frontswap_init(p->type); |
1476 | spin_unlock(&swap_lock); | 1483 | spin_unlock(&swap_lock); |
1477 | } | 1484 | } |
1478 | 1485 | ||
1486 | static void reinsert_swap_info(struct swap_info_struct *p) | ||
1487 | { | ||
1488 | spin_lock(&swap_lock); | ||
1489 | _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); | ||
1490 | spin_unlock(&swap_lock); | ||
1491 | } | ||
1492 | |||
1479 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | 1493 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
1480 | { | 1494 | { |
1481 | struct swap_info_struct *p = NULL; | 1495 | struct swap_info_struct *p = NULL; |
@@ -1484,7 +1498,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1484 | struct address_space *mapping; | 1498 | struct address_space *mapping; |
1485 | struct inode *inode; | 1499 | struct inode *inode; |
1486 | struct filename *pathname; | 1500 | struct filename *pathname; |
1487 | int oom_score_adj; | ||
1488 | int i, type, prev; | 1501 | int i, type, prev; |
1489 | int err; | 1502 | int err; |
1490 | 1503 | ||
@@ -1543,19 +1556,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1543 | p->flags &= ~SWP_WRITEOK; | 1556 | p->flags &= ~SWP_WRITEOK; |
1544 | spin_unlock(&swap_lock); | 1557 | spin_unlock(&swap_lock); |
1545 | 1558 | ||
1546 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); | 1559 | set_current_oom_origin(); |
1547 | err = try_to_unuse(type, false, 0); /* force all pages to be unused */ | 1560 | err = try_to_unuse(type, false, 0); /* force all pages to be unused */ |
1548 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); | 1561 | clear_current_oom_origin(); |
1549 | 1562 | ||
1550 | if (err) { | 1563 | if (err) { |
1551 | /* | ||
1552 | * reading p->prio and p->swap_map outside the lock is | ||
1553 | * safe here because only sys_swapon and sys_swapoff | ||
1554 | * change them, and there can be no other sys_swapon or | ||
1555 | * sys_swapoff for this swap_info_struct at this point. | ||
1556 | */ | ||
1557 | /* re-insert swap space back into swap_list */ | 1564 | /* re-insert swap space back into swap_list */ |
1558 | enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); | 1565 | reinsert_swap_info(p); |
1559 | goto out_dput; | 1566 | goto out_dput; |
1560 | } | 1567 | } |
1561 | 1568 | ||
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 78e08300db21..5123a169ab7b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -2550,7 +2550,7 @@ static void s_stop(struct seq_file *m, void *p) | |||
2550 | 2550 | ||
2551 | static void show_numa_info(struct seq_file *m, struct vm_struct *v) | 2551 | static void show_numa_info(struct seq_file *m, struct vm_struct *v) |
2552 | { | 2552 | { |
2553 | if (NUMA_BUILD) { | 2553 | if (IS_ENABLED(CONFIG_NUMA)) { |
2554 | unsigned int nr, *counters = m->private; | 2554 | unsigned int nr, *counters = m->private; |
2555 | 2555 | ||
2556 | if (!counters) | 2556 | if (!counters) |
@@ -2615,7 +2615,7 @@ static int vmalloc_open(struct inode *inode, struct file *file) | |||
2615 | unsigned int *ptr = NULL; | 2615 | unsigned int *ptr = NULL; |
2616 | int ret; | 2616 | int ret; |
2617 | 2617 | ||
2618 | if (NUMA_BUILD) { | 2618 | if (IS_ENABLED(CONFIG_NUMA)) { |
2619 | ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); | 2619 | ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); |
2620 | if (ptr == NULL) | 2620 | if (ptr == NULL) |
2621 | return -ENOMEM; | 2621 | return -ENOMEM; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index b7ed37675644..157bb116dec8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1679,13 +1679,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1679 | 1679 | ||
1680 | if (global_reclaim(sc)) { | 1680 | if (global_reclaim(sc)) { |
1681 | free = zone_page_state(zone, NR_FREE_PAGES); | 1681 | free = zone_page_state(zone, NR_FREE_PAGES); |
1682 | /* If we have very few page cache pages, | ||
1683 | force-scan anon pages. */ | ||
1684 | if (unlikely(file + free <= high_wmark_pages(zone))) { | 1682 | if (unlikely(file + free <= high_wmark_pages(zone))) { |
1683 | /* | ||
1684 | * If we have very few page cache pages, force-scan | ||
1685 | * anon pages. | ||
1686 | */ | ||
1685 | fraction[0] = 1; | 1687 | fraction[0] = 1; |
1686 | fraction[1] = 0; | 1688 | fraction[1] = 0; |
1687 | denominator = 1; | 1689 | denominator = 1; |
1688 | goto out; | 1690 | goto out; |
1691 | } else if (!inactive_file_is_low_global(zone)) { | ||
1692 | /* | ||
1693 | * There is enough inactive page cache, do not | ||
1694 | * reclaim anything from the working set right now. | ||
1695 | */ | ||
1696 | fraction[0] = 0; | ||
1697 | fraction[1] = 1; | ||
1698 | denominator = 1; | ||
1699 | goto out; | ||
1689 | } | 1700 | } |
1690 | } | 1701 | } |
1691 | 1702 | ||
@@ -1752,7 +1763,7 @@ out: | |||
1752 | /* Use reclaim/compaction for costly allocs or under memory pressure */ | 1763 | /* Use reclaim/compaction for costly allocs or under memory pressure */ |
1753 | static bool in_reclaim_compaction(struct scan_control *sc) | 1764 | static bool in_reclaim_compaction(struct scan_control *sc) |
1754 | { | 1765 | { |
1755 | if (COMPACTION_BUILD && sc->order && | 1766 | if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && |
1756 | (sc->order > PAGE_ALLOC_COSTLY_ORDER || | 1767 | (sc->order > PAGE_ALLOC_COSTLY_ORDER || |
1757 | sc->priority < DEF_PRIORITY - 2)) | 1768 | sc->priority < DEF_PRIORITY - 2)) |
1758 | return true; | 1769 | return true; |
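Spelled out, in_reclaim_compaction() now reads: use reclaim/compaction only if compaction is built in, the allocation has a non-zero order, and either that order is costly or reclaim has already dropped a few priority levels. A standalone restatement under those assumptions (the helper name is a stand-in; the two constants match the kernel's values):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER	3	/* as in the kernel */
#define DEF_PRIORITY		12	/* as in the kernel */

static bool in_reclaim_compaction_sketch(bool compaction_built_in,
					 int order, int priority)
{
	return compaction_built_in && order &&
	       (order > PAGE_ALLOC_COSTLY_ORDER ||
		priority < DEF_PRIORITY - 2);
}

int main(void)
{
	/* order-9 (huge page) allocation, fresh reclaim pass: yes */
	printf("%d\n", in_reclaim_compaction_sketch(true, 9, DEF_PRIORITY));
	/* order-1 allocation, but reclaim is already struggling: yes */
	printf("%d\n", in_reclaim_compaction_sketch(true, 1, DEF_PRIORITY - 3));
	/* order-0 allocation never uses reclaim/compaction: no */
	printf("%d\n", in_reclaim_compaction_sketch(true, 0, 0));
	return 0;
}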
@@ -2005,7 +2016,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2005 | if (zone->all_unreclaimable && | 2016 | if (zone->all_unreclaimable && |
2006 | sc->priority != DEF_PRIORITY) | 2017 | sc->priority != DEF_PRIORITY) |
2007 | continue; /* Let kswapd poll it */ | 2018 | continue; /* Let kswapd poll it */ |
2008 | if (COMPACTION_BUILD) { | 2019 | if (IS_ENABLED(CONFIG_COMPACTION)) { |
2009 | /* | 2020 | /* |
2010 | * If we already have plenty of memory free for | 2021 | * If we already have plenty of memory free for |
2011 | * compaction in this zone, don't free any more. | 2022 | * compaction in this zone, don't free any more. |
@@ -2421,7 +2432,8 @@ static bool zone_balanced(struct zone *zone, int order, | |||
2421 | balance_gap, classzone_idx, 0)) | 2432 | balance_gap, classzone_idx, 0)) |
2422 | return false; | 2433 | return false; |
2423 | 2434 | ||
2424 | if (COMPACTION_BUILD && order && !compaction_suitable(zone, order)) | 2435 | if (IS_ENABLED(CONFIG_COMPACTION) && order && |
2436 | !compaction_suitable(zone, order)) | ||
2425 | return false; | 2437 | return false; |
2426 | 2438 | ||
2427 | return true; | 2439 | return true; |
@@ -2684,7 +2696,7 @@ loop_again: | |||
2684 | * Do not reclaim more than needed for compaction. | 2696 | * Do not reclaim more than needed for compaction. |
2685 | */ | 2697 | */ |
2686 | testorder = order; | 2698 | testorder = order; |
2687 | if (COMPACTION_BUILD && order && | 2699 | if (IS_ENABLED(CONFIG_COMPACTION) && order && |
2688 | compaction_suitable(zone, order) != | 2700 | compaction_suitable(zone, order) != |
2689 | COMPACT_SKIPPED) | 2701 | COMPACT_SKIPPED) |
2690 | testorder = 0; | 2702 | testorder = 0; |
@@ -2951,7 +2963,7 @@ static int kswapd(void *p) | |||
2951 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; | 2963 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; |
2952 | balanced_classzone_idx = classzone_idx; | 2964 | balanced_classzone_idx = classzone_idx; |
2953 | for ( ; ; ) { | 2965 | for ( ; ; ) { |
2954 | int ret; | 2966 | bool ret; |
2955 | 2967 | ||
2956 | /* | 2968 | /* |
2957 | * If the last balance_pgdat was unsuccessful it's unlikely a | 2969 | * If the last balance_pgdat was unsuccessful it's unlikely a |
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index b336b24aa6c0..7300d0702efe 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile | |||
@@ -1,9 +1,9 @@ | |||
1 | # Makefile for vm selftests | 1 | # Makefile for vm selftests |
2 | 2 | ||
3 | CC = $(CROSS_COMPILE)gcc | 3 | CC = $(CROSS_COMPILE)gcc |
4 | CFLAGS = -Wall -Wextra | 4 | CFLAGS = -Wall |
5 | 5 | ||
6 | all: hugepage-mmap hugepage-shm map_hugetlb | 6 | all: hugepage-mmap hugepage-shm map_hugetlb thuge-gen |
7 | %: %.c | 7 | %: %.c |
8 | $(CC) $(CFLAGS) -o $@ $^ | 8 | $(CC) $(CFLAGS) -o $@ $^ |
9 | 9 | ||
diff --git a/tools/testing/selftests/vm/thuge-gen.c b/tools/testing/selftests/vm/thuge-gen.c new file mode 100644 index 000000000000..c87957295f74 --- /dev/null +++ b/tools/testing/selftests/vm/thuge-gen.c | |||
@@ -0,0 +1,254 @@ | |||
1 | /* Test selecting other page sizes for mmap/shmget. | ||
2 | |||
3 | Before running this test, huge pages must have been reserved for each | ||
4 | huge page size to be tested. | ||
5 | For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used. | ||
6 | shmmax must also be increased to cover the test segments. | ||
7 | You need to run as root to work around some weird permissions in shm. | ||
8 | Nothing else using huge pages should run in parallel. | ||
9 | If the program aborts you may need to clean up the shm segments by hand | ||
10 | with ipcrm -m, like this: | ||
11 | sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m | ||
12 | (warning: this will remove all such segments, including other users') */ | ||
13 | |||
14 | #define _GNU_SOURCE 1 | ||
15 | #include <sys/mman.h> | ||
16 | #include <stdlib.h> | ||
17 | #include <stdio.h> | ||
18 | #include <sys/ipc.h> | ||
19 | #include <sys/shm.h> | ||
20 | #include <sys/stat.h> | ||
21 | #include <glob.h> | ||
22 | #include <assert.h> | ||
23 | #include <unistd.h> | ||
24 | #include <stdarg.h> | ||
25 | #include <string.h> | ||
26 | |||
27 | #define err(x) perror(x), exit(1) | ||
28 | |||
29 | #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) | ||
30 | #define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) | ||
31 | #define MAP_HUGE_SHIFT 26 | ||
32 | #define MAP_HUGE_MASK 0x3f | ||
33 | #define MAP_HUGETLB 0x40000 | ||
34 | |||
35 | #define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ | ||
36 | #define SHM_HUGE_SHIFT 26 | ||
37 | #define SHM_HUGE_MASK 0x3f | ||
38 | #define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT) | ||
39 | #define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) | ||
40 | |||
41 | #define NUM_PAGESIZES 5 | ||
42 | |||
43 | #define NUM_PAGES 4 | ||
44 | |||
45 | #define Dprintf(fmt...) // printf(fmt) | ||
46 | |||
47 | unsigned long page_sizes[NUM_PAGESIZES]; | ||
48 | int num_page_sizes; | ||
49 | |||
50 | int ilog2(unsigned long v) | ||
51 | { | ||
52 | int l = 0; | ||
53 | while ((1UL << l) < v) | ||
54 | l++; | ||
55 | return l; | ||
56 | } | ||
57 | |||
58 | void find_pagesizes(void) | ||
59 | { | ||
60 | glob_t g; | ||
61 | int i; | ||
62 | glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g); | ||
63 | assert(g.gl_pathc <= NUM_PAGESIZES); | ||
64 | for (i = 0; i < g.gl_pathc; i++) { | ||
65 | sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB", | ||
66 | &page_sizes[i]); | ||
67 | page_sizes[i] <<= 10; | ||
68 | printf("Found %luMB\n", page_sizes[i] >> 20); | ||
69 | } | ||
70 | num_page_sizes = g.gl_pathc; | ||
71 | globfree(&g); | ||
72 | } | ||
73 | |||
74 | unsigned long default_huge_page_size(void) | ||
75 | { | ||
76 | unsigned long hps = 0; | ||
77 | char *line = NULL; | ||
78 | size_t linelen = 0; | ||
79 | FILE *f = fopen("/proc/meminfo", "r"); | ||
80 | if (!f) | ||
81 | return 0; | ||
82 | while (getline(&line, &linelen, f) > 0) { | ||
83 | if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { | ||
84 | hps <<= 10; | ||
85 | break; | ||
86 | } | ||
87 | } | ||
88 | free(line); | ||
89 | return hps; | ||
90 | } | ||
91 | |||
92 | void show(unsigned long ps) | ||
93 | { | ||
94 | char buf[100]; | ||
95 | if (ps == getpagesize()) | ||
96 | return; | ||
97 | printf("%luMB: ", ps >> 20); | ||
98 | fflush(stdout); | ||
99 | snprintf(buf, sizeof buf, | ||
100 | "cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", | ||
101 | ps >> 10); | ||
102 | system(buf); | ||
103 | } | ||
104 | |||
105 | unsigned long read_sysfs(int warn, char *fmt, ...) | ||
106 | { | ||
107 | char *line = NULL; | ||
108 | size_t linelen = 0; | ||
109 | char buf[100]; | ||
110 | FILE *f; | ||
111 | va_list ap; | ||
112 | unsigned long val = 0; | ||
113 | |||
114 | va_start(ap, fmt); | ||
115 | vsnprintf(buf, sizeof buf, fmt, ap); | ||
116 | va_end(ap); | ||
117 | |||
118 | f = fopen(buf, "r"); | ||
119 | if (!f) { | ||
120 | if (warn) | ||
121 | printf("missing %s\n", buf); | ||
122 | return 0; | ||
123 | } | ||
124 | if (getline(&line, &linelen, f) > 0) { | ||
125 | sscanf(line, "%lu", &val); | ||
126 | } | ||
127 | fclose(f); | ||
128 | free(line); | ||
129 | return val; | ||
130 | } | ||
131 | |||
132 | unsigned long read_free(unsigned long ps) | ||
133 | { | ||
134 | return read_sysfs(ps != getpagesize(), | ||
135 | "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", | ||
136 | ps >> 10); | ||
137 | } | ||
138 | |||
139 | void test_mmap(unsigned long size, unsigned flags) | ||
140 | { | ||
141 | char *map; | ||
142 | unsigned long before, after; | ||
143 | int err; | ||
144 | |||
145 | before = read_free(size); | ||
146 | map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE, | ||
147 | MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, 0, 0); | ||
148 | |||
149 | if (map == (char *)-1) err("mmap"); | ||
150 | memset(map, 0xff, size*NUM_PAGES); | ||
151 | after = read_free(size); | ||
152 | Dprintf("before %lu after %lu diff %ld size %lu\n", | ||
153 | before, after, before - after, size); | ||
154 | assert(size == getpagesize() || (before - after) == NUM_PAGES); | ||
155 | show(size); | ||
157 | err = munmap(map, size * NUM_PAGES); | ||
157 | assert(!err); | ||
158 | } | ||
159 | |||
160 | void test_shmget(unsigned long size, unsigned flags) | ||
161 | { | ||
162 | int id; | ||
163 | unsigned long before, after; | ||
164 | int err; | ||
165 | |||
166 | before = read_free(size); | ||
167 | id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags); | ||
168 | if (id < 0) err("shmget"); | ||
169 | |||
170 | struct shm_info i; | ||
171 | if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl"); | ||
172 | Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss); | ||
173 | |||
174 | |||
175 | Dprintf("id %d\n", id); | ||
176 | char *map = shmat(id, NULL, 0600); | ||
177 | if (map == (char*)-1) err("shmat"); | ||
178 | |||
179 | shmctl(id, IPC_RMID, NULL); | ||
180 | |||
181 | memset(map, 0xff, size*NUM_PAGES); | ||
182 | after = read_free(size); | ||
183 | |||
184 | Dprintf("before %lu after %lu diff %ld size %lu\n", | ||
185 | before, after, before - after, size); | ||
186 | assert(size == getpagesize() || (before - after) == NUM_PAGES); | ||
187 | show(size); | ||
188 | err = shmdt(map); | ||
189 | assert(!err); | ||
190 | } | ||
191 | |||
192 | void sanity_checks(void) | ||
193 | { | ||
194 | int i; | ||
195 | unsigned long largest = getpagesize(); | ||
196 | |||
197 | for (i = 0; i < num_page_sizes; i++) { | ||
198 | if (page_sizes[i] > largest) | ||
199 | largest = page_sizes[i]; | ||
200 | |||
201 | if (read_free(page_sizes[i]) < NUM_PAGES) { | ||
202 | printf("Not enough huge pages for page size %lu MB, need %u\n", | ||
203 | page_sizes[i] >> 20, | ||
204 | NUM_PAGES); | ||
205 | exit(0); | ||
206 | } | ||
207 | } | ||
208 | |||
209 | if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) { | ||
210 | printf("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES); | ||
211 | exit(0); | ||
212 | } | ||
213 | |||
214 | #if defined(__x86_64__) | ||
215 | if (largest != 1U<<30) { | ||
216 | printf("No GB pages available on x86-64\n" | ||
217 | "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES); | ||
218 | exit(0); | ||
219 | } | ||
220 | #endif | ||
221 | } | ||
222 | |||
223 | int main(void) | ||
224 | { | ||
225 | int i; | ||
226 | unsigned default_hps = default_huge_page_size(); | ||
227 | |||
228 | find_pagesizes(); | ||
229 | |||
230 | sanity_checks(); | ||
231 | |||
232 | for (i = 0; i < num_page_sizes; i++) { | ||
233 | unsigned long ps = page_sizes[i]; | ||
234 | int arg = ilog2(ps) << MAP_HUGE_SHIFT; | ||
235 | printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg); | ||
236 | test_mmap(ps, MAP_HUGETLB | arg); | ||
237 | } | ||
238 | printf("Testing default huge mmap\n"); | ||
239 | test_mmap(default_hps, MAP_HUGETLB); | ||
240 | |||
241 | puts("Testing non-huge shmget"); | ||
242 | test_shmget(getpagesize(), 0); | ||
243 | |||
244 | for (i = 0; i < num_page_sizes; i++) { | ||
245 | unsigned long ps = page_sizes[i]; | ||
246 | int arg = ilog2(ps) << SHM_HUGE_SHIFT; | ||
247 | printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg); | ||
248 | test_shmget(ps, SHM_HUGETLB | arg); | ||
249 | } | ||
250 | puts("default huge shmget"); | ||
251 | test_shmget(default_hps, SHM_HUGETLB); | ||
252 | |||
253 | return 0; | ||
254 | } | ||
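The MAP_HUGE_*/SHM_HUGE_* constants used by the test encode the page size as log2(size) in the 6-bit field starting at bit 26 of the flags word, which is exactly what ilog2(ps) << MAP_HUGE_SHIFT computes in main(). A small standalone check of that encoding, reusing the test's constants:

/* Standalone check of the huge-page-size flag encoding used by thuge-gen:
 * 2MB = 2^21 encodes as 21 << 26, 1GB = 2^30 as 30 << 26.
 */
#include <assert.h>
#include <stdio.h>

#define MAP_HUGE_SHIFT	26
#define MAP_HUGE_MASK	0x3f
#define MAP_HUGE_2MB	(21 << MAP_HUGE_SHIFT)
#define MAP_HUGE_1GB	(30 << MAP_HUGE_SHIFT)

static int ilog2(unsigned long v)
{
	int l = 0;
	while ((1UL << l) < v)
		l++;
	return l;
}

int main(void)
{
	assert((ilog2(2UL << 20) << MAP_HUGE_SHIFT) == MAP_HUGE_2MB);
	assert((ilog2(1UL << 30) << MAP_HUGE_SHIFT) == MAP_HUGE_1GB);
	printf("2MB -> shift %d, 1GB -> shift %d\n",
	       (MAP_HUGE_2MB >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK,
	       (MAP_HUGE_1GB >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
	return 0;
}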