Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 2
-rw-r--r-- | mm/Makefile | 1
-rw-r--r-- | mm/ashmem.c | 748
-rw-r--r-- | mm/backing-dev.c | 123
-rw-r--r-- | mm/dmapool.c | 2
-rw-r--r-- | mm/failslab.c | 39
-rw-r--r-- | mm/filemap.c | 160
-rw-r--r-- | mm/highmem.c | 4
-rw-r--r-- | mm/huge_memory.c | 43
-rw-r--r-- | mm/hugetlb.c | 46
-rw-r--r-- | mm/init-mm.c | 2
-rw-r--r-- | mm/internal.h | 46
-rw-r--r-- | mm/kmemleak.c | 2
-rw-r--r-- | mm/madvise.c | 2
-rw-r--r-- | mm/memblock.c | 8
-rw-r--r-- | mm/memcontrol.c | 500
-rw-r--r-- | mm/memory-failure.c | 92
-rw-r--r-- | mm/memory.c | 127
-rw-r--r-- | mm/memory_hotplug.c | 68
-rw-r--r-- | mm/mempolicy.c | 32
-rw-r--r-- | mm/migrate.c | 8
-rw-r--r-- | mm/mincore.c | 11
-rw-r--r-- | mm/mmap.c | 34
-rw-r--r-- | mm/nommu.c | 37
-rw-r--r-- | mm/oom_kill.c | 11
-rw-r--r-- | mm/page-writeback.c | 280
-rw-r--r-- | mm/page_alloc.c | 207
-rw-r--r-- | mm/page_cgroup.c | 10
-rw-r--r-- | mm/pagewalk.c | 49
-rw-r--r-- | mm/percpu-vm.c | 12
-rw-r--r-- | mm/percpu.c | 40
-rw-r--r-- | mm/rmap.c | 11
-rw-r--r-- | mm/shmem.c | 1828
-rw-r--r-- | mm/slab.c | 121
-rw-r--r-- | mm/slob.c | 8
-rw-r--r-- | mm/slub.c | 882
-rw-r--r-- | mm/sparse.c | 2
-rw-r--r-- | mm/swap.c | 83
-rw-r--r-- | mm/swapfile.c | 49
-rw-r--r-- | mm/thrash.c | 17
-rw-r--r-- | mm/truncate.c | 154
-rw-r--r-- | mm/vmalloc.c | 102
-rw-r--r-- | mm/vmscan.c | 145
-rw-r--r-- | mm/vmstat.c | 4
44 files changed, 3794 insertions, 2358 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 8ca47a5ee9c..f2f1ca19ed5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -356,7 +356,7 @@ config CLEANCACHE | |||
356 | for clean pages that the kernel's pageframe replacement algorithm | 356 | for clean pages that the kernel's pageframe replacement algorithm |
357 | (PFRA) would like to keep around, but can't since there isn't enough | 357 | (PFRA) would like to keep around, but can't since there isn't enough |
358 | memory. So when the PFRA "evicts" a page, it first attempts to use | 358 | memory. So when the PFRA "evicts" a page, it first attempts to use |
359 | cleancacne code to put the data contained in that page into | 359 | cleancache code to put the data contained in that page into |
360 | "transcendent memory", memory that is not directly accessible or | 360 | "transcendent memory", memory that is not directly accessible or |
361 | addressable by the kernel and is of unknown and possibly | 361 | addressable by the kernel and is of unknown and possibly |
362 | time-varying size. And when a cleancache-enabled | 362 | time-varying size. And when a cleancache-enabled |
diff --git a/mm/Makefile b/mm/Makefile
index 836e4163c1b..2d00bf57ca4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o | |||
30 | obj-$(CONFIG_NUMA) += mempolicy.o | 30 | obj-$(CONFIG_NUMA) += mempolicy.o |
31 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 31 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
32 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | 32 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o |
33 | obj-$(CONFIG_ASHMEM) += ashmem.o | ||
33 | obj-$(CONFIG_SLOB) += slob.o | 34 | obj-$(CONFIG_SLOB) += slob.o |
34 | obj-$(CONFIG_COMPACTION) += compaction.o | 35 | obj-$(CONFIG_COMPACTION) += compaction.o |
35 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | 36 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o |
diff --git a/mm/ashmem.c b/mm/ashmem.c
new file mode 100644
index 00000000000..66e3f23ee33
--- /dev/null
+++ b/mm/ashmem.c
@@ -0,0 +1,748 @@ | |||
1 | /* mm/ashmem.c | ||
2 | ** | ||
3 | ** Anonymous Shared Memory Subsystem, ashmem | ||
4 | ** | ||
5 | ** Copyright (C) 2008 Google, Inc. | ||
6 | ** | ||
7 | ** Robert Love <rlove@google.com> | ||
8 | ** | ||
9 | ** This software is licensed under the terms of the GNU General Public | ||
10 | ** License version 2, as published by the Free Software Foundation, and | ||
11 | ** may be copied, distributed, and modified under those terms. | ||
12 | ** | ||
13 | ** This program is distributed in the hope that it will be useful, | ||
14 | ** but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | ** GNU General Public License for more details. | ||
17 | */ | ||
18 | |||
19 | #include <linux/module.h> | ||
20 | #include <linux/file.h> | ||
21 | #include <linux/fs.h> | ||
22 | #include <linux/miscdevice.h> | ||
23 | #include <linux/security.h> | ||
24 | #include <linux/mm.h> | ||
25 | #include <linux/mman.h> | ||
26 | #include <linux/uaccess.h> | ||
27 | #include <linux/personality.h> | ||
28 | #include <linux/bitops.h> | ||
29 | #include <linux/mutex.h> | ||
30 | #include <linux/shmem_fs.h> | ||
31 | #include <linux/ashmem.h> | ||
32 | |||
33 | #define ASHMEM_NAME_PREFIX "dev/ashmem/" | ||
34 | #define ASHMEM_NAME_PREFIX_LEN (sizeof(ASHMEM_NAME_PREFIX) - 1) | ||
35 | #define ASHMEM_FULL_NAME_LEN (ASHMEM_NAME_LEN + ASHMEM_NAME_PREFIX_LEN) | ||
36 | |||
37 | /* | ||
38 | * ashmem_area - anonymous shared memory area | ||
39 | * Lifecycle: From our parent file's open() until its release() | ||
40 | * Locking: Protected by `ashmem_mutex' | ||
41 | * Big Note: Mappings do NOT pin this structure; it dies on close() | ||
42 | */ | ||
43 | struct ashmem_area { | ||
44 | char name[ASHMEM_FULL_NAME_LEN];/* optional name for /proc/pid/maps */ | ||
45 | struct list_head unpinned_list; /* list of all ashmem areas */ | ||
46 | struct file *file; /* the shmem-based backing file */ | ||
47 | size_t size; /* size of the mapping, in bytes */ | ||
48 | unsigned long prot_mask; /* allowed prot bits, as vm_flags */ | ||
49 | }; | ||
50 | |||
51 | /* | ||
52 | * ashmem_range - represents an interval of unpinned (evictable) pages | ||
53 | * Lifecycle: From unpin to pin | ||
54 | * Locking: Protected by `ashmem_mutex' | ||
55 | */ | ||
56 | struct ashmem_range { | ||
57 | struct list_head lru; /* entry in LRU list */ | ||
58 | struct list_head unpinned; /* entry in its area's unpinned list */ | ||
59 | struct ashmem_area *asma; /* associated area */ | ||
60 | size_t pgstart; /* starting page, inclusive */ | ||
61 | size_t pgend; /* ending page, inclusive */ | ||
62 | unsigned int purged; /* ASHMEM_NOT or ASHMEM_WAS_PURGED */ | ||
63 | }; | ||
64 | |||
65 | /* LRU list of unpinned pages, protected by ashmem_mutex */ | ||
66 | static LIST_HEAD(ashmem_lru_list); | ||
67 | |||
68 | /* Count of pages on our LRU list, protected by ashmem_mutex */ | ||
69 | static unsigned long lru_count; | ||
70 | |||
71 | /* | ||
72 | * ashmem_mutex - protects the list of and each individual ashmem_area | ||
73 | * | ||
74 | * Lock Ordering: ashmem_mutex -> i_mutex -> i_alloc_sem | ||
75 | */ | ||
76 | static DEFINE_MUTEX(ashmem_mutex); | ||
77 | |||
78 | static struct kmem_cache *ashmem_area_cachep __read_mostly; | ||
79 | static struct kmem_cache *ashmem_range_cachep __read_mostly; | ||
80 | |||
81 | #define range_size(range) \ | ||
82 | ((range)->pgend - (range)->pgstart + 1) | ||
83 | |||
84 | #define range_on_lru(range) \ | ||
85 | ((range)->purged == ASHMEM_NOT_PURGED) | ||
86 | |||
87 | #define page_range_subsumes_range(range, start, end) \ | ||
88 | (((range)->pgstart >= (start)) && ((range)->pgend <= (end))) | ||
89 | |||
90 | #define page_range_subsumed_by_range(range, start, end) \ | ||
91 | (((range)->pgstart <= (start)) && ((range)->pgend >= (end))) | ||
92 | |||
93 | #define page_in_range(range, page) \ | ||
94 | (((range)->pgstart <= (page)) && ((range)->pgend >= (page))) | ||
95 | |||
96 | #define page_range_in_range(range, start, end) \ | ||
97 | (page_in_range(range, start) || page_in_range(range, end) || \ | ||
98 | page_range_subsumes_range(range, start, end)) | ||
99 | |||
100 | #define range_before_page(range, page) \ | ||
101 | ((range)->pgend < (page)) | ||
102 | |||
103 | #define PROT_MASK (PROT_EXEC | PROT_READ | PROT_WRITE) | ||
104 | |||
105 | static inline void lru_add(struct ashmem_range *range) | ||
106 | { | ||
107 | list_add_tail(&range->lru, &ashmem_lru_list); | ||
108 | lru_count += range_size(range); | ||
109 | } | ||
110 | |||
111 | static inline void lru_del(struct ashmem_range *range) | ||
112 | { | ||
113 | list_del(&range->lru); | ||
114 | lru_count -= range_size(range); | ||
115 | } | ||
116 | |||
117 | /* | ||
118 | * range_alloc - allocate and initialize a new ashmem_range structure | ||
119 | * | ||
120 | * 'asma' - associated ashmem_area | ||
121 | * 'prev_range' - the previous ashmem_range in the sorted asma->unpinned list | ||
122 | * 'purged' - initial purge value (ASHMEM_NOT_PURGED or ASHMEM_WAS_PURGED) | ||
123 | * 'start' - starting page, inclusive | ||
124 | * 'end' - ending page, inclusive | ||
125 | * | ||
126 | * Caller must hold ashmem_mutex. | ||
127 | */ | ||
128 | static int range_alloc(struct ashmem_area *asma, | ||
129 | struct ashmem_range *prev_range, unsigned int purged, | ||
130 | size_t start, size_t end) | ||
131 | { | ||
132 | struct ashmem_range *range; | ||
133 | |||
134 | range = kmem_cache_zalloc(ashmem_range_cachep, GFP_KERNEL); | ||
135 | if (unlikely(!range)) | ||
136 | return -ENOMEM; | ||
137 | |||
138 | range->asma = asma; | ||
139 | range->pgstart = start; | ||
140 | range->pgend = end; | ||
141 | range->purged = purged; | ||
142 | |||
143 | list_add_tail(&range->unpinned, &prev_range->unpinned); | ||
144 | |||
145 | if (range_on_lru(range)) | ||
146 | lru_add(range); | ||
147 | |||
148 | return 0; | ||
149 | } | ||
150 | |||
151 | static void range_del(struct ashmem_range *range) | ||
152 | { | ||
153 | list_del(&range->unpinned); | ||
154 | if (range_on_lru(range)) | ||
155 | lru_del(range); | ||
156 | kmem_cache_free(ashmem_range_cachep, range); | ||
157 | } | ||
158 | |||
159 | /* | ||
160 | * range_shrink - shrinks a range | ||
161 | * | ||
162 | * Caller must hold ashmem_mutex. | ||
163 | */ | ||
164 | static inline void range_shrink(struct ashmem_range *range, | ||
165 | size_t start, size_t end) | ||
166 | { | ||
167 | size_t pre = range_size(range); | ||
168 | |||
169 | range->pgstart = start; | ||
170 | range->pgend = end; | ||
171 | |||
172 | if (range_on_lru(range)) | ||
173 | lru_count -= pre - range_size(range); | ||
174 | } | ||
175 | |||
176 | static int ashmem_open(struct inode *inode, struct file *file) | ||
177 | { | ||
178 | struct ashmem_area *asma; | ||
179 | int ret; | ||
180 | |||
181 | ret = generic_file_open(inode, file); | ||
182 | if (unlikely(ret)) | ||
183 | return ret; | ||
184 | |||
185 | asma = kmem_cache_zalloc(ashmem_area_cachep, GFP_KERNEL); | ||
186 | if (unlikely(!asma)) | ||
187 | return -ENOMEM; | ||
188 | |||
189 | INIT_LIST_HEAD(&asma->unpinned_list); | ||
190 | memcpy(asma->name, ASHMEM_NAME_PREFIX, ASHMEM_NAME_PREFIX_LEN); | ||
191 | asma->prot_mask = PROT_MASK; | ||
192 | file->private_data = asma; | ||
193 | |||
194 | return 0; | ||
195 | } | ||
196 | |||
197 | static int ashmem_release(struct inode *ignored, struct file *file) | ||
198 | { | ||
199 | struct ashmem_area *asma = file->private_data; | ||
200 | struct ashmem_range *range, *next; | ||
201 | |||
202 | mutex_lock(&ashmem_mutex); | ||
203 | list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned) | ||
204 | range_del(range); | ||
205 | mutex_unlock(&ashmem_mutex); | ||
206 | |||
207 | if (asma->file) | ||
208 | fput(asma->file); | ||
209 | kmem_cache_free(ashmem_area_cachep, asma); | ||
210 | |||
211 | return 0; | ||
212 | } | ||
213 | |||
214 | static ssize_t ashmem_read(struct file *file, char __user *buf, | ||
215 | size_t len, loff_t *pos) | ||
216 | { | ||
217 | struct ashmem_area *asma = file->private_data; | ||
218 | int ret = 0; | ||
219 | |||
220 | mutex_lock(&ashmem_mutex); | ||
221 | |||
222 | /* If size is not set, or set to 0, always return EOF. */ | ||
223 | if (asma->size == 0) { | ||
224 | goto out; | ||
225 | } | ||
226 | |||
227 | if (!asma->file) { | ||
228 | ret = -EBADF; | ||
229 | goto out; | ||
230 | } | ||
231 | |||
232 | ret = asma->file->f_op->read(asma->file, buf, len, pos); | ||
233 | if (ret < 0) { | ||
234 | goto out; | ||
235 | } | ||
236 | |||
237 | /** Update backing file pos, since f_ops->read() doesn't */ | ||
238 | asma->file->f_pos = *pos; | ||
239 | |||
240 | out: | ||
241 | mutex_unlock(&ashmem_mutex); | ||
242 | return ret; | ||
243 | } | ||
244 | |||
245 | static loff_t ashmem_llseek(struct file *file, loff_t offset, int origin) | ||
246 | { | ||
247 | struct ashmem_area *asma = file->private_data; | ||
248 | int ret; | ||
249 | |||
250 | mutex_lock(&ashmem_mutex); | ||
251 | |||
252 | if (asma->size == 0) { | ||
253 | ret = -EINVAL; | ||
254 | goto out; | ||
255 | } | ||
256 | |||
257 | if (!asma->file) { | ||
258 | ret = -EBADF; | ||
259 | goto out; | ||
260 | } | ||
261 | |||
262 | ret = asma->file->f_op->llseek(asma->file, offset, origin); | ||
263 | if (ret < 0) { | ||
264 | goto out; | ||
265 | } | ||
266 | |||
267 | /** Copy f_pos from backing file, since f_ops->llseek() sets it */ | ||
268 | file->f_pos = asma->file->f_pos; | ||
269 | |||
270 | out: | ||
271 | mutex_unlock(&ashmem_mutex); | ||
272 | return ret; | ||
273 | } | ||
274 | |||
275 | static inline unsigned long | ||
276 | calc_vm_may_flags(unsigned long prot) | ||
277 | { | ||
278 | return _calc_vm_trans(prot, PROT_READ, VM_MAYREAD ) | | ||
279 | _calc_vm_trans(prot, PROT_WRITE, VM_MAYWRITE) | | ||
280 | _calc_vm_trans(prot, PROT_EXEC, VM_MAYEXEC); | ||
281 | } | ||
282 | |||
283 | static int ashmem_mmap(struct file *file, struct vm_area_struct *vma) | ||
284 | { | ||
285 | struct ashmem_area *asma = file->private_data; | ||
286 | int ret = 0; | ||
287 | |||
288 | mutex_lock(&ashmem_mutex); | ||
289 | |||
290 | /* user needs to SET_SIZE before mapping */ | ||
291 | if (unlikely(!asma->size)) { | ||
292 | ret = -EINVAL; | ||
293 | goto out; | ||
294 | } | ||
295 | |||
296 | /* requested protection bits must match our allowed protection mask */ | ||
297 | if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) & | ||
298 | calc_vm_prot_bits(PROT_MASK))) { | ||
299 | ret = -EPERM; | ||
300 | goto out; | ||
301 | } | ||
302 | vma->vm_flags &= ~calc_vm_may_flags(~asma->prot_mask); | ||
303 | |||
304 | if (!asma->file) { | ||
305 | char *name = ASHMEM_NAME_DEF; | ||
306 | struct file *vmfile; | ||
307 | |||
308 | if (asma->name[ASHMEM_NAME_PREFIX_LEN] != '\0') | ||
309 | name = asma->name; | ||
310 | |||
311 | /* ... and allocate the backing shmem file */ | ||
312 | vmfile = shmem_file_setup(name, asma->size, vma->vm_flags); | ||
313 | if (unlikely(IS_ERR(vmfile))) { | ||
314 | ret = PTR_ERR(vmfile); | ||
315 | goto out; | ||
316 | } | ||
317 | asma->file = vmfile; | ||
318 | } | ||
319 | get_file(asma->file); | ||
320 | |||
321 | if (vma->vm_flags & VM_SHARED) | ||
322 | shmem_set_file(vma, asma->file); | ||
323 | else { | ||
324 | if (vma->vm_file) | ||
325 | fput(vma->vm_file); | ||
326 | vma->vm_file = asma->file; | ||
327 | } | ||
328 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
329 | |||
330 | out: | ||
331 | mutex_unlock(&ashmem_mutex); | ||
332 | return ret; | ||
333 | } | ||
334 | |||
335 | /* | ||
336 | * ashmem_shrink - our cache shrinker, called from mm/vmscan.c :: shrink_slab | ||
337 | * | ||
338 | * 'nr_to_scan' is the number of objects (pages) to prune, or 0 to query how | ||
339 | * many objects (pages) we have in total. | ||
340 | * | ||
341 | * 'gfp_mask' is the mask of the allocation that got us into this mess. | ||
342 | * | ||
343 | * Return value is the number of objects (pages) remaining, or -1 if we cannot | ||
344 | * proceed without risk of deadlock (due to gfp_mask). | ||
345 | * | ||
346 | * We approximate LRU via least-recently-unpinned, jettisoning unpinned partial | ||
347 | * chunks of ashmem regions LRU-wise one-at-a-time until we hit 'nr_to_scan' | ||
348 | * pages freed. | ||
349 | */ | ||
350 | static int ashmem_shrink(struct shrinker *s, struct shrink_control *sc) | ||
351 | { | ||
352 | struct ashmem_range *range, *next; | ||
353 | |||
354 | /* We might recurse into filesystem code, so bail out if necessary */ | ||
355 | if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS)) | ||
356 | return -1; | ||
357 | if (!sc->nr_to_scan) | ||
358 | return lru_count; | ||
359 | |||
360 | mutex_lock(&ashmem_mutex); | ||
361 | list_for_each_entry_safe(range, next, &ashmem_lru_list, lru) { | ||
362 | struct inode *inode = range->asma->file->f_dentry->d_inode; | ||
363 | loff_t start = range->pgstart * PAGE_SIZE; | ||
364 | loff_t end = (range->pgend + 1) * PAGE_SIZE - 1; | ||
365 | |||
366 | vmtruncate_range(inode, start, end); | ||
367 | range->purged = ASHMEM_WAS_PURGED; | ||
368 | lru_del(range); | ||
369 | |||
370 | sc->nr_to_scan -= range_size(range); | ||
371 | if (sc->nr_to_scan <= 0) | ||
372 | break; | ||
373 | } | ||
374 | mutex_unlock(&ashmem_mutex); | ||
375 | |||
376 | return lru_count; | ||
377 | } | ||
378 | |||
379 | static struct shrinker ashmem_shrinker = { | ||
380 | .shrink = ashmem_shrink, | ||
381 | .seeks = DEFAULT_SEEKS * 4, | ||
382 | }; | ||
383 | |||
384 | static int set_prot_mask(struct ashmem_area *asma, unsigned long prot) | ||
385 | { | ||
386 | int ret = 0; | ||
387 | |||
388 | mutex_lock(&ashmem_mutex); | ||
389 | |||
390 | /* the user can only remove, not add, protection bits */ | ||
391 | if (unlikely((asma->prot_mask & prot) != prot)) { | ||
392 | ret = -EINVAL; | ||
393 | goto out; | ||
394 | } | ||
395 | |||
396 | /* does the application expect PROT_READ to imply PROT_EXEC? */ | ||
397 | if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) | ||
398 | prot |= PROT_EXEC; | ||
399 | |||
400 | asma->prot_mask = prot; | ||
401 | |||
402 | out: | ||
403 | mutex_unlock(&ashmem_mutex); | ||
404 | return ret; | ||
405 | } | ||
406 | |||
407 | static int set_name(struct ashmem_area *asma, void __user *name) | ||
408 | { | ||
409 | int ret = 0; | ||
410 | |||
411 | mutex_lock(&ashmem_mutex); | ||
412 | |||
413 | /* cannot change an existing mapping's name */ | ||
414 | if (unlikely(asma->file)) { | ||
415 | ret = -EINVAL; | ||
416 | goto out; | ||
417 | } | ||
418 | |||
419 | if (unlikely(copy_from_user(asma->name + ASHMEM_NAME_PREFIX_LEN, | ||
420 | name, ASHMEM_NAME_LEN))) | ||
421 | ret = -EFAULT; | ||
422 | asma->name[ASHMEM_FULL_NAME_LEN-1] = '\0'; | ||
423 | |||
424 | out: | ||
425 | mutex_unlock(&ashmem_mutex); | ||
426 | |||
427 | return ret; | ||
428 | } | ||
429 | |||
430 | static int get_name(struct ashmem_area *asma, void __user *name) | ||
431 | { | ||
432 | int ret = 0; | ||
433 | |||
434 | mutex_lock(&ashmem_mutex); | ||
435 | if (asma->name[ASHMEM_NAME_PREFIX_LEN] != '\0') { | ||
436 | size_t len; | ||
437 | |||
438 | /* | ||
439 | * Copying only `len', instead of ASHMEM_NAME_LEN, bytes | ||
440 | * prevents us from revealing one user's stack to another. | ||
441 | */ | ||
442 | len = strlen(asma->name + ASHMEM_NAME_PREFIX_LEN) + 1; | ||
443 | if (unlikely(copy_to_user(name, | ||
444 | asma->name + ASHMEM_NAME_PREFIX_LEN, len))) | ||
445 | ret = -EFAULT; | ||
446 | } else { | ||
447 | if (unlikely(copy_to_user(name, ASHMEM_NAME_DEF, | ||
448 | sizeof(ASHMEM_NAME_DEF)))) | ||
449 | ret = -EFAULT; | ||
450 | } | ||
451 | mutex_unlock(&ashmem_mutex); | ||
452 | |||
453 | return ret; | ||
454 | } | ||
455 | |||
456 | /* | ||
457 | * ashmem_pin - pin the given ashmem region, returning whether it was | ||
458 | * previously purged (ASHMEM_WAS_PURGED) or not (ASHMEM_NOT_PURGED). | ||
459 | * | ||
460 | * Caller must hold ashmem_mutex. | ||
461 | */ | ||
462 | static int ashmem_pin(struct ashmem_area *asma, size_t pgstart, size_t pgend) | ||
463 | { | ||
464 | struct ashmem_range *range, *next; | ||
465 | int ret = ASHMEM_NOT_PURGED; | ||
466 | |||
467 | list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned) { | ||
468 | /* moved past last applicable page; we can short circuit */ | ||
469 | if (range_before_page(range, pgstart)) | ||
470 | break; | ||
471 | |||
472 | /* | ||
473 | * The user can ask us to pin pages that span multiple ranges, | ||
474 | * or to pin pages that aren't even unpinned, so this is messy. | ||
475 | * | ||
476 | * Four cases: | ||
477 | * 1. The requested range subsumes an existing range, so we | ||
478 | * just remove the entire matching range. | ||
479 | * 2. The requested range overlaps the start of an existing | ||
480 | * range, so we just update that range. | ||
481 | * 3. The requested range overlaps the end of an existing | ||
482 | * range, so we just update that range. | ||
483 | * 4. The requested range punches a hole in an existing range, | ||
484 | * so we have to update one side of the range and then | ||
485 | * create a new range for the other side. | ||
486 | */ | ||
487 | if (page_range_in_range(range, pgstart, pgend)) { | ||
488 | ret |= range->purged; | ||
489 | |||
490 | /* Case #1: Easy. Just nuke the whole thing. */ | ||
491 | if (page_range_subsumes_range(range, pgstart, pgend)) { | ||
492 | range_del(range); | ||
493 | continue; | ||
494 | } | ||
495 | |||
496 | /* Case #2: We overlap from the start, so adjust it */ | ||
497 | if (range->pgstart >= pgstart) { | ||
498 | range_shrink(range, pgend + 1, range->pgend); | ||
499 | continue; | ||
500 | } | ||
501 | |||
502 | /* Case #3: We overlap from the rear, so adjust it */ | ||
503 | if (range->pgend <= pgend) { | ||
504 | range_shrink(range, range->pgstart, pgstart-1); | ||
505 | continue; | ||
506 | } | ||
507 | |||
508 | /* | ||
509 | * Case #4: We eat a chunk out of the middle. A bit | ||
510 | * more complicated, we allocate a new range for the | ||
511 | * second half and adjust the first chunk's endpoint. | ||
512 | */ | ||
513 | range_alloc(asma, range, range->purged, | ||
514 | pgend + 1, range->pgend); | ||
515 | range_shrink(range, range->pgstart, pgstart - 1); | ||
516 | break; | ||
517 | } | ||
518 | } | ||
519 | |||
520 | return ret; | ||
521 | } | ||
522 | |||
523 | /* | ||
524 | * ashmem_unpin - unpin the given range of pages. Returns zero on success. | ||
525 | * | ||
526 | * Caller must hold ashmem_mutex. | ||
527 | */ | ||
528 | static int ashmem_unpin(struct ashmem_area *asma, size_t pgstart, size_t pgend) | ||
529 | { | ||
530 | struct ashmem_range *range, *next; | ||
531 | unsigned int purged = ASHMEM_NOT_PURGED; | ||
532 | |||
533 | restart: | ||
534 | list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned) { | ||
535 | /* short circuit: this is our insertion point */ | ||
536 | if (range_before_page(range, pgstart)) | ||
537 | break; | ||
538 | |||
539 | /* | ||
540 | * The user can ask us to unpin pages that are already entirely | ||
541 | * or partially pinned. We handle those two cases here. | ||
542 | */ | ||
543 | if (page_range_subsumed_by_range(range, pgstart, pgend)) | ||
544 | return 0; | ||
545 | if (page_range_in_range(range, pgstart, pgend)) { | ||
546 | pgstart = min_t(size_t, range->pgstart, pgstart), | ||
547 | pgend = max_t(size_t, range->pgend, pgend); | ||
548 | purged |= range->purged; | ||
549 | range_del(range); | ||
550 | goto restart; | ||
551 | } | ||
552 | } | ||
553 | |||
554 | return range_alloc(asma, range, purged, pgstart, pgend); | ||
555 | } | ||
556 | |||
557 | /* | ||
558 | * ashmem_get_pin_status - Returns ASHMEM_IS_UNPINNED if _any_ pages in the | ||
559 | * given interval are unpinned and ASHMEM_IS_PINNED otherwise. | ||
560 | * | ||
561 | * Caller must hold ashmem_mutex. | ||
562 | */ | ||
563 | static int ashmem_get_pin_status(struct ashmem_area *asma, size_t pgstart, | ||
564 | size_t pgend) | ||
565 | { | ||
566 | struct ashmem_range *range; | ||
567 | int ret = ASHMEM_IS_PINNED; | ||
568 | |||
569 | list_for_each_entry(range, &asma->unpinned_list, unpinned) { | ||
570 | if (range_before_page(range, pgstart)) | ||
571 | break; | ||
572 | if (page_range_in_range(range, pgstart, pgend)) { | ||
573 | ret = ASHMEM_IS_UNPINNED; | ||
574 | break; | ||
575 | } | ||
576 | } | ||
577 | |||
578 | return ret; | ||
579 | } | ||
580 | |||
581 | static int ashmem_pin_unpin(struct ashmem_area *asma, unsigned long cmd, | ||
582 | void __user *p) | ||
583 | { | ||
584 | struct ashmem_pin pin; | ||
585 | size_t pgstart, pgend; | ||
586 | int ret = -EINVAL; | ||
587 | |||
588 | if (unlikely(!asma->file)) | ||
589 | return -EINVAL; | ||
590 | |||
591 | if (unlikely(copy_from_user(&pin, p, sizeof(pin)))) | ||
592 | return -EFAULT; | ||
593 | |||
594 | /* per custom, you can pass zero for len to mean "everything onward" */ | ||
595 | if (!pin.len) | ||
596 | pin.len = PAGE_ALIGN(asma->size) - pin.offset; | ||
597 | |||
598 | if (unlikely((pin.offset | pin.len) & ~PAGE_MASK)) | ||
599 | return -EINVAL; | ||
600 | |||
601 | if (unlikely(((__u32) -1) - pin.offset < pin.len)) | ||
602 | return -EINVAL; | ||
603 | |||
604 | if (unlikely(PAGE_ALIGN(asma->size) < pin.offset + pin.len)) | ||
605 | return -EINVAL; | ||
606 | |||
607 | pgstart = pin.offset / PAGE_SIZE; | ||
608 | pgend = pgstart + (pin.len / PAGE_SIZE) - 1; | ||
609 | |||
610 | mutex_lock(&ashmem_mutex); | ||
611 | |||
612 | switch (cmd) { | ||
613 | case ASHMEM_PIN: | ||
614 | ret = ashmem_pin(asma, pgstart, pgend); | ||
615 | break; | ||
616 | case ASHMEM_UNPIN: | ||
617 | ret = ashmem_unpin(asma, pgstart, pgend); | ||
618 | break; | ||
619 | case ASHMEM_GET_PIN_STATUS: | ||
620 | ret = ashmem_get_pin_status(asma, pgstart, pgend); | ||
621 | break; | ||
622 | } | ||
623 | |||
624 | mutex_unlock(&ashmem_mutex); | ||
625 | |||
626 | return ret; | ||
627 | } | ||
628 | |||
629 | static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | ||
630 | { | ||
631 | struct ashmem_area *asma = file->private_data; | ||
632 | long ret = -ENOTTY; | ||
633 | |||
634 | switch (cmd) { | ||
635 | case ASHMEM_SET_NAME: | ||
636 | ret = set_name(asma, (void __user *) arg); | ||
637 | break; | ||
638 | case ASHMEM_GET_NAME: | ||
639 | ret = get_name(asma, (void __user *) arg); | ||
640 | break; | ||
641 | case ASHMEM_SET_SIZE: | ||
642 | ret = -EINVAL; | ||
643 | if (!asma->file) { | ||
644 | ret = 0; | ||
645 | asma->size = (size_t) arg; | ||
646 | } | ||
647 | break; | ||
648 | case ASHMEM_GET_SIZE: | ||
649 | ret = asma->size; | ||
650 | break; | ||
651 | case ASHMEM_SET_PROT_MASK: | ||
652 | ret = set_prot_mask(asma, arg); | ||
653 | break; | ||
654 | case ASHMEM_GET_PROT_MASK: | ||
655 | ret = asma->prot_mask; | ||
656 | break; | ||
657 | case ASHMEM_PIN: | ||
658 | case ASHMEM_UNPIN: | ||
659 | case ASHMEM_GET_PIN_STATUS: | ||
660 | ret = ashmem_pin_unpin(asma, cmd, (void __user *) arg); | ||
661 | break; | ||
662 | case ASHMEM_PURGE_ALL_CACHES: | ||
663 | ret = -EPERM; | ||
664 | if (capable(CAP_SYS_ADMIN)) { | ||
665 | struct shrink_control sc = { | ||
666 | .gfp_mask = GFP_KERNEL, | ||
667 | .nr_to_scan = 0, | ||
668 | }; | ||
669 | ret = ashmem_shrink(&ashmem_shrinker, &sc); | ||
670 | sc.nr_to_scan = ret; | ||
671 | ashmem_shrink(&ashmem_shrinker, &sc); | ||
672 | } | ||
673 | break; | ||
674 | } | ||
675 | |||
676 | return ret; | ||
677 | } | ||
678 | |||
679 | static struct file_operations ashmem_fops = { | ||
680 | .owner = THIS_MODULE, | ||
681 | .open = ashmem_open, | ||
682 | .release = ashmem_release, | ||
683 | .read = ashmem_read, | ||
684 | .llseek = ashmem_llseek, | ||
685 | .mmap = ashmem_mmap, | ||
686 | .unlocked_ioctl = ashmem_ioctl, | ||
687 | .compat_ioctl = ashmem_ioctl, | ||
688 | }; | ||
689 | |||
690 | static struct miscdevice ashmem_misc = { | ||
691 | .minor = MISC_DYNAMIC_MINOR, | ||
692 | .name = "ashmem", | ||
693 | .fops = &ashmem_fops, | ||
694 | }; | ||
695 | |||
696 | static int __init ashmem_init(void) | ||
697 | { | ||
698 | int ret; | ||
699 | |||
700 | ashmem_area_cachep = kmem_cache_create("ashmem_area_cache", | ||
701 | sizeof(struct ashmem_area), | ||
702 | 0, 0, NULL); | ||
703 | if (unlikely(!ashmem_area_cachep)) { | ||
704 | printk(KERN_ERR "ashmem: failed to create slab cache\n"); | ||
705 | return -ENOMEM; | ||
706 | } | ||
707 | |||
708 | ashmem_range_cachep = kmem_cache_create("ashmem_range_cache", | ||
709 | sizeof(struct ashmem_range), | ||
710 | 0, 0, NULL); | ||
711 | if (unlikely(!ashmem_range_cachep)) { | ||
712 | printk(KERN_ERR "ashmem: failed to create slab cache\n"); | ||
713 | return -ENOMEM; | ||
714 | } | ||
715 | |||
716 | ret = misc_register(&ashmem_misc); | ||
717 | if (unlikely(ret)) { | ||
718 | printk(KERN_ERR "ashmem: failed to register misc device!\n"); | ||
719 | return ret; | ||
720 | } | ||
721 | |||
722 | register_shrinker(&ashmem_shrinker); | ||
723 | |||
724 | printk(KERN_INFO "ashmem: initialized\n"); | ||
725 | |||
726 | return 0; | ||
727 | } | ||
728 | |||
729 | static void __exit ashmem_exit(void) | ||
730 | { | ||
731 | int ret; | ||
732 | |||
733 | unregister_shrinker(&ashmem_shrinker); | ||
734 | |||
735 | ret = misc_deregister(&ashmem_misc); | ||
736 | if (unlikely(ret)) | ||
737 | printk(KERN_ERR "ashmem: failed to unregister misc device!\n"); | ||
738 | |||
739 | kmem_cache_destroy(ashmem_range_cachep); | ||
740 | kmem_cache_destroy(ashmem_area_cachep); | ||
741 | |||
742 | printk(KERN_INFO "ashmem: unloaded\n"); | ||
743 | } | ||
744 | |||
745 | module_init(ashmem_init); | ||
746 | module_exit(ashmem_exit); | ||
747 | |||
748 | MODULE_LICENSE("GPL"); | ||
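
Usage sketch (illustrative only, not part of this patch): the pin/unpin protocol implemented above is easiest to see from the consumer side. The program below relies only on names the patch itself uses -- the ASHMEM_* ioctls, ASHMEM_NAME_LEN and struct ashmem_pin from the <linux/ashmem.h> header included by this file, plus a /dev/ashmem node from the misc device registered in ashmem_init() -- and assumes 4 KiB pages; error checking is trimmed for brevity.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/ashmem.h>

#define PG 4096		/* assume 4 KiB pages for brevity */

int main(void)
{
	int fd = open("/dev/ashmem", O_RDWR);
	if (fd < 0) {
		perror("open /dev/ashmem");
		return 1;
	}

	/* Name and size must be set before the first mmap() creates the
	 * backing shmem file. */
	char name[ASHMEM_NAME_LEN] = "example-region";
	ioctl(fd, ASHMEM_SET_NAME, name);
	ioctl(fd, ASHMEM_SET_SIZE, 8 * PG);

	char *p = mmap(NULL, 8 * PG, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Let the kernel reclaim pages 2..3 under memory pressure. */
	struct ashmem_pin pin = { .offset = 2 * PG, .len = 2 * PG };
	ioctl(fd, ASHMEM_UNPIN, &pin);

	/* Re-pin before using the data again; ASHMEM_WAS_PURGED means the
	 * shrinker discarded it and it must be regenerated. */
	if (ioctl(fd, ASHMEM_PIN, &pin) == ASHMEM_WAS_PURGED)
		memset(p + 2 * PG, 0, 2 * PG);

	munmap(p, 8 * PG);
	close(fd);
	return 0;
}

The ASHMEM_PIN return value (ASHMEM_NOT_PURGED vs ASHMEM_WAS_PURGED) is what lets a user-space cache rebuild only the chunks the shrinker actually threw away.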
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f032e6e1e09..253b071b7d9 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer; | |||
45 | static int bdi_sync_supers(void *); | 45 | static int bdi_sync_supers(void *); |
46 | static void sync_supers_timer_fn(unsigned long); | 46 | static void sync_supers_timer_fn(unsigned long); |
47 | 47 | ||
48 | void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) | ||
49 | { | ||
50 | if (wb1 < wb2) { | ||
51 | spin_lock(&wb1->list_lock); | ||
52 | spin_lock_nested(&wb2->list_lock, 1); | ||
53 | } else { | ||
54 | spin_lock(&wb2->list_lock); | ||
55 | spin_lock_nested(&wb1->list_lock, 1); | ||
56 | } | ||
57 | } | ||
58 | |||
48 | #ifdef CONFIG_DEBUG_FS | 59 | #ifdef CONFIG_DEBUG_FS |
49 | #include <linux/debugfs.h> | 60 | #include <linux/debugfs.h> |
50 | #include <linux/seq_file.h> | 61 | #include <linux/seq_file.h> |
@@ -67,34 +78,42 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
67 | struct inode *inode; | 78 | struct inode *inode; |
68 | 79 | ||
69 | nr_dirty = nr_io = nr_more_io = 0; | 80 | nr_dirty = nr_io = nr_more_io = 0; |
70 | spin_lock(&inode_wb_list_lock); | 81 | spin_lock(&wb->list_lock); |
71 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) | 82 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) |
72 | nr_dirty++; | 83 | nr_dirty++; |
73 | list_for_each_entry(inode, &wb->b_io, i_wb_list) | 84 | list_for_each_entry(inode, &wb->b_io, i_wb_list) |
74 | nr_io++; | 85 | nr_io++; |
75 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) | 86 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) |
76 | nr_more_io++; | 87 | nr_more_io++; |
77 | spin_unlock(&inode_wb_list_lock); | 88 | spin_unlock(&wb->list_lock); |
78 | 89 | ||
79 | global_dirty_limits(&background_thresh, &dirty_thresh); | 90 | global_dirty_limits(&background_thresh, &dirty_thresh); |
80 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | 91 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
81 | 92 | ||
82 | #define K(x) ((x) << (PAGE_SHIFT - 10)) | 93 | #define K(x) ((x) << (PAGE_SHIFT - 10)) |
83 | seq_printf(m, | 94 | seq_printf(m, |
84 | "BdiWriteback: %8lu kB\n" | 95 | "BdiWriteback: %10lu kB\n" |
85 | "BdiReclaimable: %8lu kB\n" | 96 | "BdiReclaimable: %10lu kB\n" |
86 | "BdiDirtyThresh: %8lu kB\n" | 97 | "BdiDirtyThresh: %10lu kB\n" |
87 | "DirtyThresh: %8lu kB\n" | 98 | "DirtyThresh: %10lu kB\n" |
88 | "BackgroundThresh: %8lu kB\n" | 99 | "BackgroundThresh: %10lu kB\n" |
89 | "b_dirty: %8lu\n" | 100 | "BdiWritten: %10lu kB\n" |
90 | "b_io: %8lu\n" | 101 | "BdiWriteBandwidth: %10lu kBps\n" |
91 | "b_more_io: %8lu\n" | 102 | "b_dirty: %10lu\n" |
92 | "bdi_list: %8u\n" | 103 | "b_io: %10lu\n" |
93 | "state: %8lx\n", | 104 | "b_more_io: %10lu\n" |
105 | "bdi_list: %10u\n" | ||
106 | "state: %10lx\n", | ||
94 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), | 107 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), |
95 | (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), | 108 | (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), |
96 | K(bdi_thresh), K(dirty_thresh), | 109 | K(bdi_thresh), |
97 | K(background_thresh), nr_dirty, nr_io, nr_more_io, | 110 | K(dirty_thresh), |
111 | K(background_thresh), | ||
112 | (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), | ||
113 | (unsigned long) K(bdi->write_bandwidth), | ||
114 | nr_dirty, | ||
115 | nr_io, | ||
116 | nr_more_io, | ||
98 | !list_empty(&bdi->bdi_list), bdi->state); | 117 | !list_empty(&bdi->bdi_list), bdi->state); |
99 | #undef K | 118 | #undef K |
100 | 119 | ||
@@ -249,18 +268,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) | |||
249 | return wb_has_dirty_io(&bdi->wb); | 268 | return wb_has_dirty_io(&bdi->wb); |
250 | } | 269 | } |
251 | 270 | ||
252 | static void bdi_flush_io(struct backing_dev_info *bdi) | ||
253 | { | ||
254 | struct writeback_control wbc = { | ||
255 | .sync_mode = WB_SYNC_NONE, | ||
256 | .older_than_this = NULL, | ||
257 | .range_cyclic = 1, | ||
258 | .nr_to_write = 1024, | ||
259 | }; | ||
260 | |||
261 | writeback_inodes_wb(&bdi->wb, &wbc); | ||
262 | } | ||
263 | |||
264 | /* | 271 | /* |
265 | * kupdated() used to do this. We cannot do it from the bdi_forker_thread() | 272 | * kupdated() used to do this. We cannot do it from the bdi_forker_thread() |
266 | * or we risk deadlocking on ->s_umount. The longer term solution would be | 273 | * or we risk deadlocking on ->s_umount. The longer term solution would be |
@@ -352,6 +359,17 @@ static unsigned long bdi_longest_inactive(void) | |||
352 | return max(5UL * 60 * HZ, interval); | 359 | return max(5UL * 60 * HZ, interval); |
353 | } | 360 | } |
354 | 361 | ||
362 | /* | ||
363 | * Clear pending bit and wakeup anybody waiting for flusher thread creation or | ||
364 | * shutdown | ||
365 | */ | ||
366 | static void bdi_clear_pending(struct backing_dev_info *bdi) | ||
367 | { | ||
368 | clear_bit(BDI_pending, &bdi->state); | ||
369 | smp_mb__after_clear_bit(); | ||
370 | wake_up_bit(&bdi->state, BDI_pending); | ||
371 | } | ||
372 | |||
355 | static int bdi_forker_thread(void *ptr) | 373 | static int bdi_forker_thread(void *ptr) |
356 | { | 374 | { |
357 | struct bdi_writeback *me = ptr; | 375 | struct bdi_writeback *me = ptr; |
@@ -383,6 +401,13 @@ static int bdi_forker_thread(void *ptr) | |||
383 | } | 401 | } |
384 | 402 | ||
385 | spin_lock_bh(&bdi_lock); | 403 | spin_lock_bh(&bdi_lock); |
404 | /* | ||
405 | * In the following loop we are going to check whether we have | ||
406 | * some work to do without any synchronization with tasks | ||
407 | * waking us up to do work for them. So we have to set task | ||
408 | * state already here so that we don't miss wakeups coming | ||
409 | * after we verify some condition. | ||
410 | */ | ||
386 | set_current_state(TASK_INTERRUPTIBLE); | 411 | set_current_state(TASK_INTERRUPTIBLE); |
387 | 412 | ||
388 | list_for_each_entry(bdi, &bdi_list, bdi_list) { | 413 | list_for_each_entry(bdi, &bdi_list, bdi_list) { |
@@ -446,9 +471,10 @@ static int bdi_forker_thread(void *ptr) | |||
446 | if (IS_ERR(task)) { | 471 | if (IS_ERR(task)) { |
447 | /* | 472 | /* |
448 | * If thread creation fails, force writeout of | 473 | * If thread creation fails, force writeout of |
449 | * the bdi from the thread. | 474 | * the bdi from the thread. Hopefully 1024 is |
475 | * large enough for efficient IO. | ||
450 | */ | 476 | */ |
451 | bdi_flush_io(bdi); | 477 | writeback_inodes_wb(&bdi->wb, 1024); |
452 | } else { | 478 | } else { |
453 | /* | 479 | /* |
454 | * The spinlock makes sure we do not lose | 480 | * The spinlock makes sure we do not lose |
@@ -461,11 +487,13 @@ static int bdi_forker_thread(void *ptr) | |||
461 | spin_unlock_bh(&bdi->wb_lock); | 487 | spin_unlock_bh(&bdi->wb_lock); |
462 | wake_up_process(task); | 488 | wake_up_process(task); |
463 | } | 489 | } |
490 | bdi_clear_pending(bdi); | ||
464 | break; | 491 | break; |
465 | 492 | ||
466 | case KILL_THREAD: | 493 | case KILL_THREAD: |
467 | __set_current_state(TASK_RUNNING); | 494 | __set_current_state(TASK_RUNNING); |
468 | kthread_stop(task); | 495 | kthread_stop(task); |
496 | bdi_clear_pending(bdi); | ||
469 | break; | 497 | break; |
470 | 498 | ||
471 | case NO_ACTION: | 499 | case NO_ACTION: |
@@ -481,16 +509,8 @@ static int bdi_forker_thread(void *ptr) | |||
481 | else | 509 | else |
482 | schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); | 510 | schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); |
483 | try_to_freeze(); | 511 | try_to_freeze(); |
484 | /* Back to the main loop */ | 512 | break; |
485 | continue; | ||
486 | } | 513 | } |
487 | |||
488 | /* | ||
489 | * Clear pending bit and wakeup anybody waiting to tear us down. | ||
490 | */ | ||
491 | clear_bit(BDI_pending, &bdi->state); | ||
492 | smp_mb__after_clear_bit(); | ||
493 | wake_up_bit(&bdi->state, BDI_pending); | ||
494 | } | 514 | } |
495 | 515 | ||
496 | return 0; | 516 | return 0; |
@@ -505,7 +525,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) | |||
505 | list_del_rcu(&bdi->bdi_list); | 525 | list_del_rcu(&bdi->bdi_list); |
506 | spin_unlock_bh(&bdi_lock); | 526 | spin_unlock_bh(&bdi_lock); |
507 | 527 | ||
508 | synchronize_rcu(); | 528 | synchronize_rcu_expedited(); |
509 | } | 529 | } |
510 | 530 | ||
511 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, | 531 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, |
@@ -606,6 +626,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) | |||
606 | void bdi_unregister(struct backing_dev_info *bdi) | 626 | void bdi_unregister(struct backing_dev_info *bdi) |
607 | { | 627 | { |
608 | if (bdi->dev) { | 628 | if (bdi->dev) { |
629 | bdi_set_min_ratio(bdi, 0); | ||
609 | trace_writeback_bdi_unregister(bdi); | 630 | trace_writeback_bdi_unregister(bdi); |
610 | bdi_prune_sb(bdi); | 631 | bdi_prune_sb(bdi); |
611 | del_timer_sync(&bdi->wb.wakeup_timer); | 632 | del_timer_sync(&bdi->wb.wakeup_timer); |
@@ -628,9 +649,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) | |||
628 | INIT_LIST_HEAD(&wb->b_dirty); | 649 | INIT_LIST_HEAD(&wb->b_dirty); |
629 | INIT_LIST_HEAD(&wb->b_io); | 650 | INIT_LIST_HEAD(&wb->b_io); |
630 | INIT_LIST_HEAD(&wb->b_more_io); | 651 | INIT_LIST_HEAD(&wb->b_more_io); |
652 | spin_lock_init(&wb->list_lock); | ||
631 | setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); | 653 | setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); |
632 | } | 654 | } |
633 | 655 | ||
656 | /* | ||
657 | * Initial write bandwidth: 100 MB/s | ||
658 | */ | ||
659 | #define INIT_BW (100 << (20 - PAGE_SHIFT)) | ||
660 | |||
634 | int bdi_init(struct backing_dev_info *bdi) | 661 | int bdi_init(struct backing_dev_info *bdi) |
635 | { | 662 | { |
636 | int i, err; | 663 | int i, err; |
@@ -653,6 +680,13 @@ int bdi_init(struct backing_dev_info *bdi) | |||
653 | } | 680 | } |
654 | 681 | ||
655 | bdi->dirty_exceeded = 0; | 682 | bdi->dirty_exceeded = 0; |
683 | |||
684 | bdi->bw_time_stamp = jiffies; | ||
685 | bdi->written_stamp = 0; | ||
686 | |||
687 | bdi->write_bandwidth = INIT_BW; | ||
688 | bdi->avg_write_bandwidth = INIT_BW; | ||
689 | |||
656 | err = prop_local_init_percpu(&bdi->completions); | 690 | err = prop_local_init_percpu(&bdi->completions); |
657 | 691 | ||
658 | if (err) { | 692 | if (err) { |
@@ -676,15 +710,24 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
676 | if (bdi_has_dirty_io(bdi)) { | 710 | if (bdi_has_dirty_io(bdi)) { |
677 | struct bdi_writeback *dst = &default_backing_dev_info.wb; | 711 | struct bdi_writeback *dst = &default_backing_dev_info.wb; |
678 | 712 | ||
679 | spin_lock(&inode_wb_list_lock); | 713 | bdi_lock_two(&bdi->wb, dst); |
680 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); | 714 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); |
681 | list_splice(&bdi->wb.b_io, &dst->b_io); | 715 | list_splice(&bdi->wb.b_io, &dst->b_io); |
682 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); | 716 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); |
683 | spin_unlock(&inode_wb_list_lock); | 717 | spin_unlock(&bdi->wb.list_lock); |
718 | spin_unlock(&dst->list_lock); | ||
684 | } | 719 | } |
685 | 720 | ||
686 | bdi_unregister(bdi); | 721 | bdi_unregister(bdi); |
687 | 722 | ||
723 | /* | ||
724 | * If bdi_unregister() had already been called earlier, the | ||
725 | * wakeup_timer could still be armed because bdi_prune_sb() | ||
726 | * can race with the bdi_wakeup_thread_delayed() calls from | ||
727 | * __mark_inode_dirty(). | ||
728 | */ | ||
729 | del_timer_sync(&bdi->wb.wakeup_timer); | ||
730 | |||
688 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) | 731 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) |
689 | percpu_counter_destroy(&bdi->bdi_stat[i]); | 732 | percpu_counter_destroy(&bdi->bdi_stat[i]); |
690 | 733 | ||
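
Worked example (editorial, not from the patch) for the INIT_BW constant added above: bdi->write_bandwidth is kept in pages per second (the debugfs K() macro converts it to kB for the "BdiWriteBandwidth" line), so the initial 100 MB/s estimate has to be expressed in pages.

/*
 * 100 MB/s = 100 << 20 bytes/s
 * pages/s  = bytes/s / page size = (100 << 20) >> PAGE_SHIFT
 *          = 100 << (20 - PAGE_SHIFT)
 * With PAGE_SHIFT == 12 (4 KiB pages): 100 << 8 == 25600 pages/s.
 */
#define INIT_BW	(100 << (20 - PAGE_SHIFT))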
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 03bf3bb4519..fbb58e34688 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -500,7 +500,7 @@ void dmam_pool_destroy(struct dma_pool *pool) | |||
500 | { | 500 | { |
501 | struct device *dev = pool->dev; | 501 | struct device *dev = pool->dev; |
502 | 502 | ||
503 | dma_pool_destroy(pool); | ||
504 | WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); | 503 | WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); |
504 | dma_pool_destroy(pool); | ||
505 | } | 505 | } |
506 | EXPORT_SYMBOL(dmam_pool_destroy); | 506 | EXPORT_SYMBOL(dmam_pool_destroy); |
diff --git a/mm/failslab.c b/mm/failslab.c
index c5f88f240dd..0dd7b8fec71 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -5,10 +5,6 @@ static struct { | |||
5 | struct fault_attr attr; | 5 | struct fault_attr attr; |
6 | u32 ignore_gfp_wait; | 6 | u32 ignore_gfp_wait; |
7 | int cache_filter; | 7 | int cache_filter; |
8 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
9 | struct dentry *ignore_gfp_wait_file; | ||
10 | struct dentry *cache_filter_file; | ||
11 | #endif | ||
12 | } failslab = { | 8 | } failslab = { |
13 | .attr = FAULT_ATTR_INITIALIZER, | 9 | .attr = FAULT_ATTR_INITIALIZER, |
14 | .ignore_gfp_wait = 1, | 10 | .ignore_gfp_wait = 1, |
@@ -38,32 +34,25 @@ __setup("failslab=", setup_failslab); | |||
38 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 34 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
39 | static int __init failslab_debugfs_init(void) | 35 | static int __init failslab_debugfs_init(void) |
40 | { | 36 | { |
41 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | ||
42 | struct dentry *dir; | 37 | struct dentry *dir; |
43 | int err; | 38 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
44 | |||
45 | err = init_fault_attr_dentries(&failslab.attr, "failslab"); | ||
46 | if (err) | ||
47 | return err; | ||
48 | dir = failslab.attr.dentries.dir; | ||
49 | 39 | ||
50 | failslab.ignore_gfp_wait_file = | 40 | dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr); |
51 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | 41 | if (IS_ERR(dir)) |
52 | &failslab.ignore_gfp_wait); | 42 | return PTR_ERR(dir); |
53 | 43 | ||
54 | failslab.cache_filter_file = | 44 | if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, |
55 | debugfs_create_bool("cache-filter", mode, dir, | 45 | &failslab.ignore_gfp_wait)) |
56 | &failslab.cache_filter); | 46 | goto fail; |
47 | if (!debugfs_create_bool("cache-filter", mode, dir, | ||
48 | &failslab.cache_filter)) | ||
49 | goto fail; | ||
57 | 50 | ||
58 | if (!failslab.ignore_gfp_wait_file || | 51 | return 0; |
59 | !failslab.cache_filter_file) { | 52 | fail: |
60 | err = -ENOMEM; | 53 | debugfs_remove_recursive(dir); |
61 | debugfs_remove(failslab.cache_filter_file); | ||
62 | debugfs_remove(failslab.ignore_gfp_wait_file); | ||
63 | cleanup_fault_attr_dentries(&failslab.attr); | ||
64 | } | ||
65 | 54 | ||
66 | return err; | 55 | return -ENOMEM; |
67 | } | 56 | } |
68 | 57 | ||
69 | late_initcall(failslab_debugfs_init); | 58 | late_initcall(failslab_debugfs_init); |
diff --git a/mm/filemap.c b/mm/filemap.c
index a8251a8d345..0eedbf85062 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,7 +33,6 @@ | |||
33 | #include <linux/cpuset.h> | 33 | #include <linux/cpuset.h> |
34 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ | 34 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ |
35 | #include <linux/memcontrol.h> | 35 | #include <linux/memcontrol.h> |
36 | #include <linux/mm_inline.h> /* for page_is_file_cache() */ | ||
37 | #include <linux/cleancache.h> | 36 | #include <linux/cleancache.h> |
38 | #include "internal.h" | 37 | #include "internal.h" |
39 | 38 | ||
@@ -78,10 +77,7 @@ | |||
78 | * ->i_mutex (generic_file_buffered_write) | 77 | * ->i_mutex (generic_file_buffered_write) |
79 | * ->mmap_sem (fault_in_pages_readable->do_page_fault) | 78 | * ->mmap_sem (fault_in_pages_readable->do_page_fault) |
80 | * | 79 | * |
81 | * ->i_mutex | 80 | * bdi->wb.list_lock |
82 | * ->i_alloc_sem (various) | ||
83 | * | ||
84 | * inode_wb_list_lock | ||
85 | * sb_lock (fs/fs-writeback.c) | 81 | * sb_lock (fs/fs-writeback.c) |
86 | * ->mapping->tree_lock (__sync_single_inode) | 82 | * ->mapping->tree_lock (__sync_single_inode) |
87 | * | 83 | * |
@@ -99,9 +95,9 @@ | |||
99 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) | 95 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) |
100 | * ->private_lock (page_remove_rmap->set_page_dirty) | 96 | * ->private_lock (page_remove_rmap->set_page_dirty) |
101 | * ->tree_lock (page_remove_rmap->set_page_dirty) | 97 | * ->tree_lock (page_remove_rmap->set_page_dirty) |
102 | * inode_wb_list_lock (page_remove_rmap->set_page_dirty) | 98 | * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) |
103 | * ->inode->i_lock (page_remove_rmap->set_page_dirty) | 99 | * ->inode->i_lock (page_remove_rmap->set_page_dirty) |
104 | * inode_wb_list_lock (zap_pte_range->set_page_dirty) | 100 | * bdi.wb->list_lock (zap_pte_range->set_page_dirty) |
105 | * ->inode->i_lock (zap_pte_range->set_page_dirty) | 101 | * ->inode->i_lock (zap_pte_range->set_page_dirty) |
106 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) | 102 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
107 | * | 103 | * |
@@ -131,6 +127,7 @@ void __delete_from_page_cache(struct page *page) | |||
131 | 127 | ||
132 | radix_tree_delete(&mapping->page_tree, page->index); | 128 | radix_tree_delete(&mapping->page_tree, page->index); |
133 | page->mapping = NULL; | 129 | page->mapping = NULL; |
130 | /* Leave page->index set: truncation lookup relies upon it */ | ||
134 | mapping->nrpages--; | 131 | mapping->nrpages--; |
135 | __dec_zone_page_state(page, NR_FILE_PAGES); | 132 | __dec_zone_page_state(page, NR_FILE_PAGES); |
136 | if (PageSwapBacked(page)) | 133 | if (PageSwapBacked(page)) |
@@ -396,24 +393,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_range); | |||
396 | int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) | 393 | int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) |
397 | { | 394 | { |
398 | int error; | 395 | int error; |
399 | struct mem_cgroup *memcg = NULL; | ||
400 | 396 | ||
401 | VM_BUG_ON(!PageLocked(old)); | 397 | VM_BUG_ON(!PageLocked(old)); |
402 | VM_BUG_ON(!PageLocked(new)); | 398 | VM_BUG_ON(!PageLocked(new)); |
403 | VM_BUG_ON(new->mapping); | 399 | VM_BUG_ON(new->mapping); |
404 | 400 | ||
405 | /* | ||
406 | * This is not page migration, but prepare_migration and | ||
407 | * end_migration does enough work for charge replacement. | ||
408 | * | ||
409 | * In the longer term we probably want a specialized function | ||
410 | * for moving the charge from old to new in a more efficient | ||
411 | * manner. | ||
412 | */ | ||
413 | error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask); | ||
414 | if (error) | ||
415 | return error; | ||
416 | |||
417 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); | 401 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); |
418 | if (!error) { | 402 | if (!error) { |
419 | struct address_space *mapping = old->mapping; | 403 | struct address_space *mapping = old->mapping; |
@@ -435,13 +419,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) | |||
435 | if (PageSwapBacked(new)) | 419 | if (PageSwapBacked(new)) |
436 | __inc_zone_page_state(new, NR_SHMEM); | 420 | __inc_zone_page_state(new, NR_SHMEM); |
437 | spin_unlock_irq(&mapping->tree_lock); | 421 | spin_unlock_irq(&mapping->tree_lock); |
422 | /* mem_cgroup codes must not be called under tree_lock */ | ||
423 | mem_cgroup_replace_page_cache(old, new); | ||
438 | radix_tree_preload_end(); | 424 | radix_tree_preload_end(); |
439 | if (freepage) | 425 | if (freepage) |
440 | freepage(old); | 426 | freepage(old); |
441 | page_cache_release(old); | 427 | page_cache_release(old); |
442 | mem_cgroup_end_migration(memcg, old, new, true); | ||
443 | } else { | ||
444 | mem_cgroup_end_migration(memcg, old, new, false); | ||
445 | } | 428 | } |
446 | 429 | ||
447 | return error; | 430 | return error; |
@@ -464,6 +447,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
464 | int error; | 447 | int error; |
465 | 448 | ||
466 | VM_BUG_ON(!PageLocked(page)); | 449 | VM_BUG_ON(!PageLocked(page)); |
450 | VM_BUG_ON(PageSwapBacked(page)); | ||
467 | 451 | ||
468 | error = mem_cgroup_cache_charge(page, current->mm, | 452 | error = mem_cgroup_cache_charge(page, current->mm, |
469 | gfp_mask & GFP_RECLAIM_MASK); | 453 | gfp_mask & GFP_RECLAIM_MASK); |
@@ -481,11 +465,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
481 | if (likely(!error)) { | 465 | if (likely(!error)) { |
482 | mapping->nrpages++; | 466 | mapping->nrpages++; |
483 | __inc_zone_page_state(page, NR_FILE_PAGES); | 467 | __inc_zone_page_state(page, NR_FILE_PAGES); |
484 | if (PageSwapBacked(page)) | ||
485 | __inc_zone_page_state(page, NR_SHMEM); | ||
486 | spin_unlock_irq(&mapping->tree_lock); | 468 | spin_unlock_irq(&mapping->tree_lock); |
487 | } else { | 469 | } else { |
488 | page->mapping = NULL; | 470 | page->mapping = NULL; |
471 | /* Leave page->index set: truncation relies upon it */ | ||
489 | spin_unlock_irq(&mapping->tree_lock); | 472 | spin_unlock_irq(&mapping->tree_lock); |
490 | mem_cgroup_uncharge_cache_page(page); | 473 | mem_cgroup_uncharge_cache_page(page); |
491 | page_cache_release(page); | 474 | page_cache_release(page); |
@@ -503,22 +486,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | |||
503 | { | 486 | { |
504 | int ret; | 487 | int ret; |
505 | 488 | ||
506 | /* | ||
507 | * Splice_read and readahead add shmem/tmpfs pages into the page cache | ||
508 | * before shmem_readpage has a chance to mark them as SwapBacked: they | ||
509 | * need to go on the anon lru below, and mem_cgroup_cache_charge | ||
510 | * (called in add_to_page_cache) needs to know where they're going too. | ||
511 | */ | ||
512 | if (mapping_cap_swap_backed(mapping)) | ||
513 | SetPageSwapBacked(page); | ||
514 | |||
515 | ret = add_to_page_cache(page, mapping, offset, gfp_mask); | 489 | ret = add_to_page_cache(page, mapping, offset, gfp_mask); |
516 | if (ret == 0) { | 490 | if (ret == 0) |
517 | if (page_is_file_cache(page)) | 491 | lru_cache_add_file(page); |
518 | lru_cache_add_file(page); | ||
519 | else | ||
520 | lru_cache_add_anon(page); | ||
521 | } | ||
522 | return ret; | 492 | return ret; |
523 | } | 493 | } |
524 | EXPORT_SYMBOL_GPL(add_to_page_cache_lru); | 494 | EXPORT_SYMBOL_GPL(add_to_page_cache_lru); |
@@ -715,9 +685,16 @@ repeat: | |||
715 | page = radix_tree_deref_slot(pagep); | 685 | page = radix_tree_deref_slot(pagep); |
716 | if (unlikely(!page)) | 686 | if (unlikely(!page)) |
717 | goto out; | 687 | goto out; |
718 | if (radix_tree_deref_retry(page)) | 688 | if (radix_tree_exception(page)) { |
719 | goto repeat; | 689 | if (radix_tree_deref_retry(page)) |
720 | 690 | goto repeat; | |
691 | /* | ||
692 | * Otherwise, shmem/tmpfs must be storing a swap entry | ||
693 | * here as an exceptional entry: so return it without | ||
694 | * attempting to raise page count. | ||
695 | */ | ||
696 | goto out; | ||
697 | } | ||
721 | if (!page_cache_get_speculative(page)) | 698 | if (!page_cache_get_speculative(page)) |
722 | goto repeat; | 699 | goto repeat; |
723 | 700 | ||
@@ -754,7 +731,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) | |||
754 | 731 | ||
755 | repeat: | 732 | repeat: |
756 | page = find_get_page(mapping, offset); | 733 | page = find_get_page(mapping, offset); |
757 | if (page) { | 734 | if (page && !radix_tree_exception(page)) { |
758 | lock_page(page); | 735 | lock_page(page); |
759 | /* Has the page been truncated? */ | 736 | /* Has the page been truncated? */ |
760 | if (unlikely(page->mapping != mapping)) { | 737 | if (unlikely(page->mapping != mapping)) { |
@@ -836,13 +813,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, | |||
836 | { | 813 | { |
837 | unsigned int i; | 814 | unsigned int i; |
838 | unsigned int ret; | 815 | unsigned int ret; |
839 | unsigned int nr_found; | 816 | unsigned int nr_found, nr_skip; |
840 | 817 | ||
841 | rcu_read_lock(); | 818 | rcu_read_lock(); |
842 | restart: | 819 | restart: |
843 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | 820 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, |
844 | (void ***)pages, start, nr_pages); | 821 | (void ***)pages, NULL, start, nr_pages); |
845 | ret = 0; | 822 | ret = 0; |
823 | nr_skip = 0; | ||
846 | for (i = 0; i < nr_found; i++) { | 824 | for (i = 0; i < nr_found; i++) { |
847 | struct page *page; | 825 | struct page *page; |
848 | repeat: | 826 | repeat: |
@@ -850,13 +828,23 @@ repeat: | |||
850 | if (unlikely(!page)) | 828 | if (unlikely(!page)) |
851 | continue; | 829 | continue; |
852 | 830 | ||
853 | /* | 831 | if (radix_tree_exception(page)) { |
854 | * This can only trigger when the entry at index 0 moves out | 832 | if (radix_tree_deref_retry(page)) { |
855 | * of or back to the root: none yet gotten, safe to restart. | 833 | /* |
856 | */ | 834 | * Transient condition which can only trigger |
857 | if (radix_tree_deref_retry(page)) { | 835 | * when entry at index 0 moves out of or back |
858 | WARN_ON(start | i); | 836 | * to root: none yet gotten, safe to restart. |
859 | goto restart; | 837 | */ |
838 | WARN_ON(start | i); | ||
839 | goto restart; | ||
840 | } | ||
841 | /* | ||
842 | * Otherwise, shmem/tmpfs must be storing a swap entry | ||
843 | * here as an exceptional entry: so skip over it - | ||
844 | * we only reach this from invalidate_mapping_pages(). | ||
845 | */ | ||
846 | nr_skip++; | ||
847 | continue; | ||
860 | } | 848 | } |
861 | 849 | ||
862 | if (!page_cache_get_speculative(page)) | 850 | if (!page_cache_get_speculative(page)) |
@@ -876,7 +864,7 @@ repeat: | |||
876 | * If all entries were removed before we could secure them, | 864 | * If all entries were removed before we could secure them, |
877 | * try again, because callers stop trying once 0 is returned. | 865 | * try again, because callers stop trying once 0 is returned. |
878 | */ | 866 | */ |
879 | if (unlikely(!ret && nr_found)) | 867 | if (unlikely(!ret && nr_found > nr_skip)) |
880 | goto restart; | 868 | goto restart; |
881 | rcu_read_unlock(); | 869 | rcu_read_unlock(); |
882 | return ret; | 870 | return ret; |
@@ -904,7 +892,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | |||
904 | rcu_read_lock(); | 892 | rcu_read_lock(); |
905 | restart: | 893 | restart: |
906 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | 894 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, |
907 | (void ***)pages, index, nr_pages); | 895 | (void ***)pages, NULL, index, nr_pages); |
908 | ret = 0; | 896 | ret = 0; |
909 | for (i = 0; i < nr_found; i++) { | 897 | for (i = 0; i < nr_found; i++) { |
910 | struct page *page; | 898 | struct page *page; |
@@ -913,12 +901,22 @@ repeat: | |||
913 | if (unlikely(!page)) | 901 | if (unlikely(!page)) |
914 | continue; | 902 | continue; |
915 | 903 | ||
916 | /* | 904 | if (radix_tree_exception(page)) { |
917 | * This can only trigger when the entry at index 0 moves out | 905 | if (radix_tree_deref_retry(page)) { |
918 | * of or back to the root: none yet gotten, safe to restart. | 906 | /* |
919 | */ | 907 | * Transient condition which can only trigger |
920 | if (radix_tree_deref_retry(page)) | 908 | * when entry at index 0 moves out of or back |
921 | goto restart; | 909 | * to root: none yet gotten, safe to restart. |
910 | */ | ||
911 | goto restart; | ||
912 | } | ||
913 | /* | ||
914 | * Otherwise, shmem/tmpfs must be storing a swap entry | ||
915 | * here as an exceptional entry: so stop looking for | ||
916 | * contiguous pages. | ||
917 | */ | ||
918 | break; | ||
919 | } | ||
922 | 920 | ||
923 | if (!page_cache_get_speculative(page)) | 921 | if (!page_cache_get_speculative(page)) |
924 | goto repeat; | 922 | goto repeat; |
@@ -978,12 +976,21 @@ repeat: | |||
978 | if (unlikely(!page)) | 976 | if (unlikely(!page)) |
979 | continue; | 977 | continue; |
980 | 978 | ||
981 | /* | 979 | if (radix_tree_exception(page)) { |
982 | * This can only trigger when the entry at index 0 moves out | 980 | if (radix_tree_deref_retry(page)) { |
983 | * of or back to the root: none yet gotten, safe to restart. | 981 | /* |
984 | */ | 982 | * Transient condition which can only trigger |
985 | if (radix_tree_deref_retry(page)) | 983 | * when entry at index 0 moves out of or back |
986 | goto restart; | 984 | * to root: none yet gotten, safe to restart. |
985 | */ | ||
986 | goto restart; | ||
987 | } | ||
988 | /* | ||
989 | * This function is never used on a shmem/tmpfs | ||
990 | * mapping, so a swap entry won't be found here. | ||
991 | */ | ||
992 | BUG(); | ||
993 | } | ||
987 | 994 | ||
988 | if (!page_cache_get_speculative(page)) | 995 | if (!page_cache_get_speculative(page)) |
989 | goto repeat; | 996 | goto repeat; |
@@ -1795,7 +1802,7 @@ EXPORT_SYMBOL(generic_file_readonly_mmap); | |||
1795 | 1802 | ||
1796 | static struct page *__read_cache_page(struct address_space *mapping, | 1803 | static struct page *__read_cache_page(struct address_space *mapping, |
1797 | pgoff_t index, | 1804 | pgoff_t index, |
1798 | int (*filler)(void *,struct page*), | 1805 | int (*filler)(void *, struct page *), |
1799 | void *data, | 1806 | void *data, |
1800 | gfp_t gfp) | 1807 | gfp_t gfp) |
1801 | { | 1808 | { |
@@ -1807,7 +1814,7 @@ repeat: | |||
1807 | page = __page_cache_alloc(gfp | __GFP_COLD); | 1814 | page = __page_cache_alloc(gfp | __GFP_COLD); |
1808 | if (!page) | 1815 | if (!page) |
1809 | return ERR_PTR(-ENOMEM); | 1816 | return ERR_PTR(-ENOMEM); |
1810 | err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); | 1817 | err = add_to_page_cache_lru(page, mapping, index, gfp); |
1811 | if (unlikely(err)) { | 1818 | if (unlikely(err)) { |
1812 | page_cache_release(page); | 1819 | page_cache_release(page); |
1813 | if (err == -EEXIST) | 1820 | if (err == -EEXIST) |
@@ -1826,7 +1833,7 @@ repeat: | |||
1826 | 1833 | ||
1827 | static struct page *do_read_cache_page(struct address_space *mapping, | 1834 | static struct page *do_read_cache_page(struct address_space *mapping, |
1828 | pgoff_t index, | 1835 | pgoff_t index, |
1829 | int (*filler)(void *,struct page*), | 1836 | int (*filler)(void *, struct page *), |
1830 | void *data, | 1837 | void *data, |
1831 | gfp_t gfp) | 1838 | gfp_t gfp) |
1832 | 1839 | ||
@@ -1866,7 +1873,7 @@ out: | |||
1866 | * @mapping: the page's address_space | 1873 | * @mapping: the page's address_space |
1867 | * @index: the page index | 1874 | * @index: the page index |
1868 | * @filler: function to perform the read | 1875 | * @filler: function to perform the read |
1869 | * @data: destination for read data | 1876 | * @data: first arg to filler(data, page) function, often left as NULL |
1870 | * | 1877 | * |
1871 | * Same as read_cache_page, but don't wait for page to become unlocked | 1878 | * Same as read_cache_page, but don't wait for page to become unlocked |
1872 | * after submitting it to the filler. | 1879 | * after submitting it to the filler. |
@@ -1878,7 +1885,7 @@ out: | |||
1878 | */ | 1885 | */ |
1879 | struct page *read_cache_page_async(struct address_space *mapping, | 1886 | struct page *read_cache_page_async(struct address_space *mapping, |
1880 | pgoff_t index, | 1887 | pgoff_t index, |
1881 | int (*filler)(void *,struct page*), | 1888 | int (*filler)(void *, struct page *), |
1882 | void *data) | 1889 | void *data) |
1883 | { | 1890 | { |
1884 | return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); | 1891 | return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); |
@@ -1904,10 +1911,7 @@ static struct page *wait_on_page_read(struct page *page) | |||
1904 | * @gfp: the page allocator flags to use if allocating | 1911 | * @gfp: the page allocator flags to use if allocating |
1905 | * | 1912 | * |
1906 | * This is the same as "read_mapping_page(mapping, index, NULL)", but with | 1913 | * This is the same as "read_mapping_page(mapping, index, NULL)", but with |
1907 | * any new page allocations done using the specified allocation flags. Note | 1914 | * any new page allocations done using the specified allocation flags. |
1908 | * that the Radix tree operations will still use GFP_KERNEL, so you can't | ||
1909 | * expect to do this atomically or anything like that - but you can pass in | ||
1910 | * other page requirements. | ||
1911 | * | 1915 | * |
1912 | * If the page does not get brought uptodate, return -EIO. | 1916 | * If the page does not get brought uptodate, return -EIO. |
1913 | */ | 1917 | */ |
@@ -1926,7 +1930,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); | |||
1926 | * @mapping: the page's address_space | 1930 | * @mapping: the page's address_space |
1927 | * @index: the page index | 1931 | * @index: the page index |
1928 | * @filler: function to perform the read | 1932 | * @filler: function to perform the read |
1929 | * @data: destination for read data | 1933 | * @data: first arg to filler(data, page) function, often left as NULL |
1930 | * | 1934 | * |
1931 | * Read into the page cache. If a page already exists, and PageUptodate() is | 1935 | * Read into the page cache. If a page already exists, and PageUptodate() is |
1932 | * not set, try to fill the page then wait for it to become unlocked. | 1936 | * not set, try to fill the page then wait for it to become unlocked. |
@@ -1935,7 +1939,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); | |||
1935 | */ | 1939 | */ |
1936 | struct page *read_cache_page(struct address_space *mapping, | 1940 | struct page *read_cache_page(struct address_space *mapping, |
1937 | pgoff_t index, | 1941 | pgoff_t index, |
1938 | int (*filler)(void *,struct page*), | 1942 | int (*filler)(void *, struct page *), |
1939 | void *data) | 1943 | void *data) |
1940 | { | 1944 | { |
1941 | return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); | 1945 | return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); |
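The read_cache_page_gfp() change above, passing the caller's gfp mask down to add_to_page_cache_lru() instead of the hard-coded GFP_KERNEL, is what makes the function's restricted-allocation promise real. A hedged sketch of the kind of caller that relies on it, with illustrative names (read_meta_block, meta_mapping and blkno are not from this patch):

static int read_meta_block(struct address_space *meta_mapping, pgoff_t blkno)
{
	struct page *page;

	/*
	 * Allocate and insert the page-cache page without __GFP_FS so the
	 * lookup cannot recurse into this filesystem's own reclaim paths.
	 */
	page = read_cache_page_gfp(meta_mapping, blkno,
				   mapping_gfp_mask(meta_mapping) & ~__GFP_FS);
	if (IS_ERR(page))
		return PTR_ERR(page);

	/* ... parse the now-uptodate page ... */
	page_cache_release(page);
	return 0;
}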
diff --git a/mm/highmem.c b/mm/highmem.c index 693394daa2e..5ef672c07f7 100644 --- a/mm/highmem.c +++ b/mm/highmem.c | |||
@@ -326,7 +326,7 @@ static struct page_address_slot { | |||
326 | spinlock_t lock; /* Protect this bucket's list */ | 326 | spinlock_t lock; /* Protect this bucket's list */ |
327 | } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER]; | 327 | } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER]; |
328 | 328 | ||
329 | static struct page_address_slot *page_slot(struct page *page) | 329 | static struct page_address_slot *page_slot(const struct page *page) |
330 | { | 330 | { |
331 | return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; | 331 | return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; |
332 | } | 332 | } |
@@ -337,7 +337,7 @@ static struct page_address_slot *page_slot(struct page *page) | |||
337 | * | 337 | * |
338 | * Returns the page's virtual address. | 338 | * Returns the page's virtual address. |
339 | */ | 339 | */ |
340 | void *page_address(struct page *page) | 340 | void *page_address(const struct page *page) |
341 | { | 341 | { |
342 | unsigned long flags; | 342 | unsigned long flags; |
343 | void *ret; | 343 | void *ret; |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 81532f297fd..d819d938288 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -989,7 +989,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, | |||
989 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; | 989 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; |
990 | VM_BUG_ON(!PageCompound(page)); | 990 | VM_BUG_ON(!PageCompound(page)); |
991 | if (flags & FOLL_GET) | 991 | if (flags & FOLL_GET) |
992 | get_page(page); | 992 | get_page_foll(page); |
993 | 993 | ||
994 | out: | 994 | out: |
995 | return page; | 995 | return page; |
@@ -1156,6 +1156,7 @@ static void __split_huge_page_refcount(struct page *page) | |||
1156 | unsigned long head_index = page->index; | 1156 | unsigned long head_index = page->index; |
1157 | struct zone *zone = page_zone(page); | 1157 | struct zone *zone = page_zone(page); |
1158 | int zonestat; | 1158 | int zonestat; |
1159 | int tail_count = 0; | ||
1159 | 1160 | ||
1160 | /* prevent PageLRU to go away from under us, and freeze lru stats */ | 1161 | /* prevent PageLRU to go away from under us, and freeze lru stats */ |
1161 | spin_lock_irq(&zone->lru_lock); | 1162 | spin_lock_irq(&zone->lru_lock); |
@@ -1164,11 +1165,27 @@ static void __split_huge_page_refcount(struct page *page) | |||
1164 | for (i = 1; i < HPAGE_PMD_NR; i++) { | 1165 | for (i = 1; i < HPAGE_PMD_NR; i++) { |
1165 | struct page *page_tail = page + i; | 1166 | struct page *page_tail = page + i; |
1166 | 1167 | ||
1167 | /* tail_page->_count cannot change */ | 1168 | /* tail_page->_mapcount cannot change */ |
1168 | atomic_sub(atomic_read(&page_tail->_count), &page->_count); | 1169 | BUG_ON(page_mapcount(page_tail) < 0); |
1169 | BUG_ON(page_count(page) <= 0); | 1170 | tail_count += page_mapcount(page_tail); |
1170 | atomic_add(page_mapcount(page) + 1, &page_tail->_count); | 1171 | /* check for overflow */ |
1171 | BUG_ON(atomic_read(&page_tail->_count) <= 0); | 1172 | BUG_ON(tail_count < 0); |
1173 | BUG_ON(atomic_read(&page_tail->_count) != 0); | ||
1174 | /* | ||
1175 | * tail_page->_count is zero and not changing from | ||
1176 | * under us. But get_page_unless_zero() may be running | ||
1177 | * from under us on the tail_page. If we used | ||
1178 | * atomic_set() below instead of atomic_add(), we | ||
1179 | * would then run atomic_set() concurrently with | ||
1180 | * get_page_unless_zero(), and atomic_set() is | ||
1181 | * implemented in C not using locked ops. spin_unlock | ||
1182 | * on x86 sometime uses locked ops because of PPro | ||
1183 | * errata 66, 92, so unless somebody can guarantee | ||
1184 | * atomic_set() here would be safe on all archs (and | ||
1185 | * not only on x86), it's safer to use atomic_add(). | ||
1186 | */ | ||
1187 | atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, | ||
1188 | &page_tail->_count); | ||
1172 | 1189 | ||
1173 | /* after clearing PageTail the gup refcount can be released */ | 1190 | /* after clearing PageTail the gup refcount can be released */ |
1174 | smp_mb(); | 1191 | smp_mb(); |
@@ -1186,10 +1203,7 @@ static void __split_huge_page_refcount(struct page *page) | |||
1186 | (1L << PG_uptodate))); | 1203 | (1L << PG_uptodate))); |
1187 | page_tail->flags |= (1L << PG_dirty); | 1204 | page_tail->flags |= (1L << PG_dirty); |
1188 | 1205 | ||
1189 | /* | 1206 | /* clear PageTail before overwriting first_page */ |
1190 | * 1) clear PageTail before overwriting first_page | ||
1191 | * 2) clear PageTail before clearing PageHead for VM_BUG_ON | ||
1192 | */ | ||
1193 | smp_wmb(); | 1207 | smp_wmb(); |
1194 | 1208 | ||
1195 | /* | 1209 | /* |
@@ -1206,7 +1220,6 @@ static void __split_huge_page_refcount(struct page *page) | |||
1206 | * status is achieved setting a reserved bit in the | 1220 | * status is achieved setting a reserved bit in the |
1207 | * pmd, not by clearing the present bit. | 1221 | * pmd, not by clearing the present bit. |
1208 | */ | 1222 | */ |
1209 | BUG_ON(page_mapcount(page_tail)); | ||
1210 | page_tail->_mapcount = page->_mapcount; | 1223 | page_tail->_mapcount = page->_mapcount; |
1211 | 1224 | ||
1212 | BUG_ON(page_tail->mapping); | 1225 | BUG_ON(page_tail->mapping); |
@@ -1223,6 +1236,8 @@ static void __split_huge_page_refcount(struct page *page) | |||
1223 | 1236 | ||
1224 | lru_add_page_tail(zone, page, page_tail); | 1237 | lru_add_page_tail(zone, page, page_tail); |
1225 | } | 1238 | } |
1239 | atomic_sub(tail_count, &page->_count); | ||
1240 | BUG_ON(atomic_read(&page->_count) <= 0); | ||
1226 | 1241 | ||
1227 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1242 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); |
1228 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); | 1243 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); |
@@ -1596,14 +1611,13 @@ void __khugepaged_exit(struct mm_struct *mm) | |||
1596 | list_del(&mm_slot->mm_node); | 1611 | list_del(&mm_slot->mm_node); |
1597 | free = 1; | 1612 | free = 1; |
1598 | } | 1613 | } |
1614 | spin_unlock(&khugepaged_mm_lock); | ||
1599 | 1615 | ||
1600 | if (free) { | 1616 | if (free) { |
1601 | spin_unlock(&khugepaged_mm_lock); | ||
1602 | clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | 1617 | clear_bit(MMF_VM_HUGEPAGE, &mm->flags); |
1603 | free_mm_slot(mm_slot); | 1618 | free_mm_slot(mm_slot); |
1604 | mmdrop(mm); | 1619 | mmdrop(mm); |
1605 | } else if (mm_slot) { | 1620 | } else if (mm_slot) { |
1606 | spin_unlock(&khugepaged_mm_lock); | ||
1607 | /* | 1621 | /* |
1608 | * This is required to serialize against | 1622 | * This is required to serialize against |
1609 | * khugepaged_test_exit() (which is guaranteed to run | 1623 | * khugepaged_test_exit() (which is guaranteed to run |
@@ -1614,8 +1628,7 @@ void __khugepaged_exit(struct mm_struct *mm) | |||
1614 | */ | 1628 | */ |
1615 | down_write(&mm->mmap_sem); | 1629 | down_write(&mm->mmap_sem); |
1616 | up_write(&mm->mmap_sem); | 1630 | up_write(&mm->mmap_sem); |
1617 | } else | 1631 | } |
1618 | spin_unlock(&khugepaged_mm_lock); | ||
1619 | } | 1632 | } |
1620 | 1633 | ||
1621 | static void release_pte_page(struct page *page) | 1634 | static void release_pte_page(struct page *page) |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bfcf153bc82..2316840b337 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -24,7 +24,7 @@ | |||
24 | 24 | ||
25 | #include <asm/page.h> | 25 | #include <asm/page.h> |
26 | #include <asm/pgtable.h> | 26 | #include <asm/pgtable.h> |
27 | #include <asm/io.h> | 27 | #include <linux/io.h> |
28 | 28 | ||
29 | #include <linux/hugetlb.h> | 29 | #include <linux/hugetlb.h> |
30 | #include <linux/node.h> | 30 | #include <linux/node.h> |
@@ -62,10 +62,10 @@ static DEFINE_SPINLOCK(hugetlb_lock); | |||
62 | * must either hold the mmap_sem for write, or the mmap_sem for read and | 62 | * must either hold the mmap_sem for write, or the mmap_sem for read and |
63 | * the hugetlb_instantiation mutex: | 63 | * the hugetlb_instantiation mutex: |
64 | * | 64 | * |
65 | * down_write(&mm->mmap_sem); | 65 | * down_write(&mm->mmap_sem); |
66 | * or | 66 | * or |
67 | * down_read(&mm->mmap_sem); | 67 | * down_read(&mm->mmap_sem); |
68 | * mutex_lock(&hugetlb_instantiation_mutex); | 68 | * mutex_lock(&hugetlb_instantiation_mutex); |
69 | */ | 69 | */ |
70 | struct file_region { | 70 | struct file_region { |
71 | struct list_head link; | 71 | struct list_head link; |
@@ -503,9 +503,10 @@ static void update_and_free_page(struct hstate *h, struct page *page) | |||
503 | h->nr_huge_pages--; | 503 | h->nr_huge_pages--; |
504 | h->nr_huge_pages_node[page_to_nid(page)]--; | 504 | h->nr_huge_pages_node[page_to_nid(page)]--; |
505 | for (i = 0; i < pages_per_huge_page(h); i++) { | 505 | for (i = 0; i < pages_per_huge_page(h); i++) { |
506 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | 506 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | |
507 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | 507 | 1 << PG_referenced | 1 << PG_dirty | |
508 | 1 << PG_private | 1<< PG_writeback); | 508 | 1 << PG_active | 1 << PG_reserved | |
509 | 1 << PG_private | 1 << PG_writeback); | ||
509 | } | 510 | } |
510 | set_compound_page_dtor(page, NULL); | 511 | set_compound_page_dtor(page, NULL); |
511 | set_page_refcounted(page); | 512 | set_page_refcounted(page); |
@@ -575,6 +576,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) | |||
575 | __SetPageHead(page); | 576 | __SetPageHead(page); |
576 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | 577 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { |
577 | __SetPageTail(p); | 578 | __SetPageTail(p); |
579 | set_page_count(p, 0); | ||
578 | p->first_page = page; | 580 | p->first_page = page; |
579 | } | 581 | } |
580 | } | 582 | } |
@@ -591,7 +593,6 @@ int PageHuge(struct page *page) | |||
591 | 593 | ||
592 | return dtor == free_huge_page; | 594 | return dtor == free_huge_page; |
593 | } | 595 | } |
594 | |||
595 | EXPORT_SYMBOL_GPL(PageHuge); | 596 | EXPORT_SYMBOL_GPL(PageHuge); |
596 | 597 | ||
597 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | 598 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) |
@@ -900,7 +901,6 @@ retry: | |||
900 | h->resv_huge_pages += delta; | 901 | h->resv_huge_pages += delta; |
901 | ret = 0; | 902 | ret = 0; |
902 | 903 | ||
903 | spin_unlock(&hugetlb_lock); | ||
904 | /* Free the needed pages to the hugetlb pool */ | 904 | /* Free the needed pages to the hugetlb pool */ |
905 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 905 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
906 | if ((--needed) < 0) | 906 | if ((--needed) < 0) |
@@ -914,6 +914,7 @@ retry: | |||
914 | VM_BUG_ON(page_count(page)); | 914 | VM_BUG_ON(page_count(page)); |
915 | enqueue_huge_page(h, page); | 915 | enqueue_huge_page(h, page); |
916 | } | 916 | } |
917 | spin_unlock(&hugetlb_lock); | ||
917 | 918 | ||
918 | /* Free unnecessary surplus pages to the buddy allocator */ | 919 | /* Free unnecessary surplus pages to the buddy allocator */ |
919 | free: | 920 | free: |
@@ -1105,8 +1106,16 @@ static void __init gather_bootmem_prealloc(void) | |||
1105 | struct huge_bootmem_page *m; | 1106 | struct huge_bootmem_page *m; |
1106 | 1107 | ||
1107 | list_for_each_entry(m, &huge_boot_pages, list) { | 1108 | list_for_each_entry(m, &huge_boot_pages, list) { |
1108 | struct page *page = virt_to_page(m); | ||
1109 | struct hstate *h = m->hstate; | 1109 | struct hstate *h = m->hstate; |
1110 | struct page *page; | ||
1111 | |||
1112 | #ifdef CONFIG_HIGHMEM | ||
1113 | page = pfn_to_page(m->phys >> PAGE_SHIFT); | ||
1114 | free_bootmem_late((unsigned long)m, | ||
1115 | sizeof(struct huge_bootmem_page)); | ||
1116 | #else | ||
1117 | page = virt_to_page(m); | ||
1118 | #endif | ||
1110 | __ClearPageReserved(page); | 1119 | __ClearPageReserved(page); |
1111 | WARN_ON(page_count(page) != 1); | 1120 | WARN_ON(page_count(page) != 1); |
1112 | prep_compound_huge_page(page, h->order); | 1121 | prep_compound_huge_page(page, h->order); |
@@ -2124,9 +2133,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, | |||
2124 | pte_t entry; | 2133 | pte_t entry; |
2125 | 2134 | ||
2126 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); | 2135 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); |
2127 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { | 2136 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) |
2128 | update_mmu_cache(vma, address, ptep); | 2137 | update_mmu_cache(vma, address, ptep); |
2129 | } | ||
2130 | } | 2138 | } |
2131 | 2139 | ||
2132 | 2140 | ||
@@ -2181,9 +2189,9 @@ static int is_hugetlb_entry_migration(pte_t pte) | |||
2181 | if (huge_pte_none(pte) || pte_present(pte)) | 2189 | if (huge_pte_none(pte) || pte_present(pte)) |
2182 | return 0; | 2190 | return 0; |
2183 | swp = pte_to_swp_entry(pte); | 2191 | swp = pte_to_swp_entry(pte); |
2184 | if (non_swap_entry(swp) && is_migration_entry(swp)) { | 2192 | if (non_swap_entry(swp) && is_migration_entry(swp)) |
2185 | return 1; | 2193 | return 1; |
2186 | } else | 2194 | else |
2187 | return 0; | 2195 | return 0; |
2188 | } | 2196 | } |
2189 | 2197 | ||
@@ -2194,9 +2202,9 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) | |||
2194 | if (huge_pte_none(pte) || pte_present(pte)) | 2202 | if (huge_pte_none(pte) || pte_present(pte)) |
2195 | return 0; | 2203 | return 0; |
2196 | swp = pte_to_swp_entry(pte); | 2204 | swp = pte_to_swp_entry(pte); |
2197 | if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { | 2205 | if (non_swap_entry(swp) && is_hwpoison_entry(swp)) |
2198 | return 1; | 2206 | return 1; |
2199 | } else | 2207 | else |
2200 | return 0; | 2208 | return 0; |
2201 | } | 2209 | } |
2202 | 2210 | ||
@@ -2415,6 +2423,8 @@ retry_avoidcopy: | |||
2415 | * anon_vma prepared. | 2423 | * anon_vma prepared. |
2416 | */ | 2424 | */ |
2417 | if (unlikely(anon_vma_prepare(vma))) { | 2425 | if (unlikely(anon_vma_prepare(vma))) { |
2426 | page_cache_release(new_page); | ||
2427 | page_cache_release(old_page); | ||
2418 | /* Caller expects lock to be held */ | 2428 | /* Caller expects lock to be held */ |
2419 | spin_lock(&mm->page_table_lock); | 2429 | spin_lock(&mm->page_table_lock); |
2420 | return VM_FAULT_OOM; | 2430 | return VM_FAULT_OOM; |
@@ -2559,7 +2569,7 @@ retry: | |||
2559 | * So we need to block hugepage fault by PG_hwpoison bit check. | 2569 | * So we need to block hugepage fault by PG_hwpoison bit check. |
2560 | */ | 2570 | */ |
2561 | if (unlikely(PageHWPoison(page))) { | 2571 | if (unlikely(PageHWPoison(page))) { |
2562 | ret = VM_FAULT_HWPOISON | | 2572 | ret = VM_FAULT_HWPOISON | |
2563 | VM_FAULT_SET_HINDEX(h - hstates); | 2573 | VM_FAULT_SET_HINDEX(h - hstates); |
2564 | goto backout_unlocked; | 2574 | goto backout_unlocked; |
2565 | } | 2575 | } |
@@ -2627,7 +2637,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2627 | migration_entry_wait(mm, (pmd_t *)ptep, address); | 2637 | migration_entry_wait(mm, (pmd_t *)ptep, address); |
2628 | return 0; | 2638 | return 0; |
2629 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | 2639 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) |
2630 | return VM_FAULT_HWPOISON_LARGE | | 2640 | return VM_FAULT_HWPOISON_LARGE | |
2631 | VM_FAULT_SET_HINDEX(h - hstates); | 2641 | VM_FAULT_SET_HINDEX(h - hstates); |
2632 | } | 2642 | } |
2633 | 2643 | ||
diff --git a/mm/init-mm.c b/mm/init-mm.c index 4019979b263..a56a851908d 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c | |||
@@ -5,7 +5,7 @@ | |||
5 | #include <linux/list.h> | 5 | #include <linux/list.h> |
6 | #include <linux/cpumask.h> | 6 | #include <linux/cpumask.h> |
7 | 7 | ||
8 | #include <asm/atomic.h> | 8 | #include <linux/atomic.h> |
9 | #include <asm/pgtable.h> | 9 | #include <asm/pgtable.h> |
10 | #include <asm/mmu.h> | 10 | #include <asm/mmu.h> |
11 | 11 | ||
diff --git a/mm/internal.h b/mm/internal.h index d071d380fb4..2189af49178 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -37,6 +37,52 @@ static inline void __put_page(struct page *page) | |||
37 | atomic_dec(&page->_count); | 37 | atomic_dec(&page->_count); |
38 | } | 38 | } |
39 | 39 | ||
40 | static inline void __get_page_tail_foll(struct page *page, | ||
41 | bool get_page_head) | ||
42 | { | ||
43 | /* | ||
44 | * If we're getting a tail page, the elevated page->_count is | ||
45 | * required only in the head page and we will elevate the head | ||
46 | * page->_count and tail page->_mapcount. | ||
47 | * | ||
48 | * We elevate page_tail->_mapcount for tail pages to force | ||
49 | * page_tail->_count to be zero at all times to avoid getting | ||
50 | * false positives from get_page_unless_zero() with | ||
51 | * speculative page access (like in | ||
52 | * page_cache_get_speculative()) on tail pages. | ||
53 | */ | ||
54 | VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); | ||
55 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
56 | VM_BUG_ON(page_mapcount(page) < 0); | ||
57 | if (get_page_head) | ||
58 | atomic_inc(&page->first_page->_count); | ||
59 | atomic_inc(&page->_mapcount); | ||
60 | } | ||
61 | |||
62 | /* | ||
63 | * This is meant to be called as the FOLL_GET operation of | ||
64 | * follow_page() and it must be called while holding the proper PT | ||
65 | * lock while the pte (or pmd_trans_huge) is still mapping the page. | ||
66 | */ | ||
67 | static inline void get_page_foll(struct page *page) | ||
68 | { | ||
69 | if (unlikely(PageTail(page))) | ||
70 | /* | ||
71 | * This is safe only because | ||
72 | * __split_huge_page_refcount() can't run under | ||
73 | * get_page_foll() because we hold the proper PT lock. | ||
74 | */ | ||
75 | __get_page_tail_foll(page, true); | ||
76 | else { | ||
77 | /* | ||
78 | * Getting a normal page or the head of a compound page | ||
79 | * requires to already have an elevated page->_count. | ||
80 | */ | ||
81 | VM_BUG_ON(atomic_read(&page->_count) <= 0); | ||
82 | atomic_inc(&page->_count); | ||
83 | } | ||
84 | } | ||
85 | |||
40 | extern unsigned long highest_memmap_pfn; | 86 | extern unsigned long highest_memmap_pfn; |
41 | 87 | ||
42 | /* | 88 | /* |
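The comments in __get_page_tail_foll() describe the invariant these helpers maintain: a FOLL_GET pin on a tail page raises the head's _count and the tail's _mapcount, so the tail's _count can stay at zero and speculative lookups on it always fail. A minimal userspace model of that invariant (plain C11 atomics, not kernel code; the struct mirrors struct page only loosely):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct page {
	atomic_int _count;
	atomic_int _mapcount;		/* -1 when unused, as in the kernel */
	struct page *first_page;
};

/* speculative get: only succeeds if _count is already non-zero */
static bool get_page_unless_zero(struct page *p)
{
	int c = atomic_load(&p->_count);

	while (c > 0)
		if (atomic_compare_exchange_weak(&p->_count, &c, c + 1))
			return true;
	return false;
}

/* FOLL_GET on a tail page: pin lands on head->_count and tail->_mapcount */
static void get_page_tail_foll(struct page *tail)
{
	atomic_fetch_add(&tail->first_page->_count, 1);
	atomic_fetch_add(&tail->_mapcount, 1);
}

int main(void)
{
	struct page head, tail;

	atomic_init(&head._count, 1);	/* head holds the compound reference */
	atomic_init(&head._mapcount, -1);
	head.first_page = &head;
	atomic_init(&tail._count, 0);	/* tail _count must never leave zero */
	atomic_init(&tail._mapcount, -1);
	tail.first_page = &head;

	get_page_tail_foll(&tail);
	printf("tail._count = %d, speculative get %s\n",
	       atomic_load(&tail._count),
	       get_page_unless_zero(&tail) ? "succeeds" : "fails");
	printf("head._count = %d (carries the pin)\n",
	       atomic_load(&head._count));
	return 0;
}

Because the tail's _count never becomes non-zero, page_cache_get_speculative() cannot take a false reference on a tail page while __split_huge_page_refcount() is redistributing the counts.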
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index aacee45616f..d6880f542f9 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -96,7 +96,7 @@ | |||
96 | 96 | ||
97 | #include <asm/sections.h> | 97 | #include <asm/sections.h> |
98 | #include <asm/processor.h> | 98 | #include <asm/processor.h> |
99 | #include <asm/atomic.h> | 99 | #include <linux/atomic.h> |
100 | 100 | ||
101 | #include <linux/kmemcheck.h> | 101 | #include <linux/kmemcheck.h> |
102 | #include <linux/kmemleak.h> | 102 | #include <linux/kmemleak.h> |
diff --git a/mm/madvise.c b/mm/madvise.c index 2221491ed50..74bf193eff0 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
218 | endoff = (loff_t)(end - vma->vm_start - 1) | 218 | endoff = (loff_t)(end - vma->vm_start - 1) |
219 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | 219 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); |
220 | 220 | ||
221 | /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ | 221 | /* vmtruncate_range needs to take i_mutex */ |
222 | up_read(¤t->mm->mmap_sem); | 222 | up_read(¤t->mm->mmap_sem); |
223 | error = vmtruncate_range(mapping->host, offset, endoff); | 223 | error = vmtruncate_range(mapping->host, offset, endoff); |
224 | down_read(¤t->mm->mmap_sem); | 224 | down_read(¤t->mm->mmap_sem); |
diff --git a/mm/memblock.c b/mm/memblock.c index a0562d1a6ad..ccbf9733959 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -758,9 +758,9 @@ void __init memblock_analyze(void) | |||
758 | 758 | ||
759 | /* Check marker in the unused last array entry */ | 759 | /* Check marker in the unused last array entry */ |
760 | WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base | 760 | WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base |
761 | != (phys_addr_t)RED_INACTIVE); | 761 | != MEMBLOCK_INACTIVE); |
762 | WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base | 762 | WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base |
763 | != (phys_addr_t)RED_INACTIVE); | 763 | != MEMBLOCK_INACTIVE); |
764 | 764 | ||
765 | memblock.memory_size = 0; | 765 | memblock.memory_size = 0; |
766 | 766 | ||
@@ -786,8 +786,8 @@ void __init memblock_init(void) | |||
786 | memblock.reserved.max = INIT_MEMBLOCK_REGIONS; | 786 | memblock.reserved.max = INIT_MEMBLOCK_REGIONS; |
787 | 787 | ||
788 | /* Write a marker in the unused last array entry */ | 788 | /* Write a marker in the unused last array entry */ |
789 | memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; | 789 | memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; |
790 | memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; | 790 | memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; |
791 | 791 | ||
792 | /* Create a dummy zero size MEMBLOCK which will get coalesced away later. | 792 | /* Create a dummy zero size MEMBLOCK which will get coalesced away later. |
793 | * This simplifies the memblock_add() code below... | 793 | * This simplifies the memblock_add() code below... |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e013b8e57d2..dd81ddc64b4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -35,7 +35,6 @@ | |||
35 | #include <linux/limits.h> | 35 | #include <linux/limits.h> |
36 | #include <linux/mutex.h> | 36 | #include <linux/mutex.h> |
37 | #include <linux/rbtree.h> | 37 | #include <linux/rbtree.h> |
38 | #include <linux/shmem_fs.h> | ||
39 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
40 | #include <linux/swap.h> | 39 | #include <linux/swap.h> |
41 | #include <linux/swapops.h> | 40 | #include <linux/swapops.h> |
@@ -246,10 +245,13 @@ struct mem_cgroup { | |||
246 | * Should the accounting and control be hierarchical, per subtree? | 245 | * Should the accounting and control be hierarchical, per subtree? |
247 | */ | 246 | */ |
248 | bool use_hierarchy; | 247 | bool use_hierarchy; |
249 | atomic_t oom_lock; | 248 | |
249 | bool oom_lock; | ||
250 | atomic_t under_oom; | ||
251 | |||
250 | atomic_t refcnt; | 252 | atomic_t refcnt; |
251 | 253 | ||
252 | unsigned int swappiness; | 254 | int swappiness; |
253 | /* OOM-Killer disable */ | 255 | /* OOM-Killer disable */ |
254 | int oom_kill_disable; | 256 | int oom_kill_disable; |
255 | 257 | ||
@@ -636,27 +638,44 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
636 | preempt_enable(); | 638 | preempt_enable(); |
637 | } | 639 | } |
638 | 640 | ||
639 | static unsigned long | 641 | unsigned long |
640 | mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx) | 642 | mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, |
643 | unsigned int lru_mask) | ||
641 | { | 644 | { |
642 | struct mem_cgroup_per_zone *mz; | 645 | struct mem_cgroup_per_zone *mz; |
646 | enum lru_list l; | ||
647 | unsigned long ret = 0; | ||
648 | |||
649 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | ||
650 | |||
651 | for_each_lru(l) { | ||
652 | if (BIT(l) & lru_mask) | ||
653 | ret += MEM_CGROUP_ZSTAT(mz, l); | ||
654 | } | ||
655 | return ret; | ||
656 | } | ||
657 | |||
658 | static unsigned long | ||
659 | mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem, | ||
660 | int nid, unsigned int lru_mask) | ||
661 | { | ||
643 | u64 total = 0; | 662 | u64 total = 0; |
644 | int zid; | 663 | int zid; |
645 | 664 | ||
646 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 665 | for (zid = 0; zid < MAX_NR_ZONES; zid++) |
647 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | 666 | total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask); |
648 | total += MEM_CGROUP_ZSTAT(mz, idx); | 667 | |
649 | } | ||
650 | return total; | 668 | return total; |
651 | } | 669 | } |
652 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | 670 | |
653 | enum lru_list idx) | 671 | static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem, |
672 | unsigned int lru_mask) | ||
654 | { | 673 | { |
655 | int nid; | 674 | int nid; |
656 | u64 total = 0; | 675 | u64 total = 0; |
657 | 676 | ||
658 | for_each_online_node(nid) | 677 | for_each_node_state(nid, N_HIGH_MEMORY) |
659 | total += mem_cgroup_get_zonestat_node(mem, nid, idx); | 678 | total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask); |
660 | return total; | 679 | return total; |
661 | } | 680 | } |
662 | 681 | ||
@@ -1043,6 +1062,21 @@ void mem_cgroup_move_lists(struct page *page, | |||
1043 | mem_cgroup_add_lru_list(page, to); | 1062 | mem_cgroup_add_lru_list(page, to); |
1044 | } | 1063 | } |
1045 | 1064 | ||
1065 | /* | ||
1066 | * Checks whether given mem is same or in the root_mem's | ||
1067 | * hierarchy subtree | ||
1068 | */ | ||
1069 | static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem, | ||
1070 | struct mem_cgroup *mem) | ||
1071 | { | ||
1072 | if (root_mem != mem) { | ||
1073 | return (root_mem->use_hierarchy && | ||
1074 | css_is_ancestor(&mem->css, &root_mem->css)); | ||
1075 | } | ||
1076 | |||
1077 | return true; | ||
1078 | } | ||
1079 | |||
1046 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | 1080 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) |
1047 | { | 1081 | { |
1048 | int ret; | 1082 | int ret; |
@@ -1062,10 +1096,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
1062 | * enabled in "curr" and "curr" is a child of "mem" in *cgroup* | 1096 | * enabled in "curr" and "curr" is a child of "mem" in *cgroup* |
1063 | * hierarchy(even if use_hierarchy is disabled in "mem"). | 1097 | * hierarchy(even if use_hierarchy is disabled in "mem"). |
1064 | */ | 1098 | */ |
1065 | if (mem->use_hierarchy) | 1099 | ret = mem_cgroup_same_or_subtree(mem, curr); |
1066 | ret = css_is_ancestor(&curr->css, &mem->css); | ||
1067 | else | ||
1068 | ret = (curr == mem); | ||
1069 | css_put(&curr->css); | 1100 | css_put(&curr->css); |
1070 | return ret; | 1101 | return ret; |
1071 | } | 1102 | } |
@@ -1077,8 +1108,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_ | |||
1077 | unsigned long gb; | 1108 | unsigned long gb; |
1078 | unsigned long inactive_ratio; | 1109 | unsigned long inactive_ratio; |
1079 | 1110 | ||
1080 | inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); | 1111 | inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); |
1081 | active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); | 1112 | active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); |
1082 | 1113 | ||
1083 | gb = (inactive + active) >> (30 - PAGE_SHIFT); | 1114 | gb = (inactive + active) >> (30 - PAGE_SHIFT); |
1084 | if (gb) | 1115 | if (gb) |
@@ -1117,109 +1148,12 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) | |||
1117 | unsigned long active; | 1148 | unsigned long active; |
1118 | unsigned long inactive; | 1149 | unsigned long inactive; |
1119 | 1150 | ||
1120 | inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); | 1151 | inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); |
1121 | active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); | 1152 | active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); |
1122 | 1153 | ||
1123 | return (active > inactive); | 1154 | return (active > inactive); |
1124 | } | 1155 | } |
1125 | 1156 | ||
1126 | unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, | ||
1127 | struct zone *zone, | ||
1128 | enum lru_list lru) | ||
1129 | { | ||
1130 | int nid = zone_to_nid(zone); | ||
1131 | int zid = zone_idx(zone); | ||
1132 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); | ||
1133 | |||
1134 | return MEM_CGROUP_ZSTAT(mz, lru); | ||
1135 | } | ||
1136 | |||
1137 | static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, | ||
1138 | int nid) | ||
1139 | { | ||
1140 | unsigned long ret; | ||
1141 | |||
1142 | ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) + | ||
1143 | mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE); | ||
1144 | |||
1145 | return ret; | ||
1146 | } | ||
1147 | |||
1148 | static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg, | ||
1149 | int nid) | ||
1150 | { | ||
1151 | unsigned long ret; | ||
1152 | |||
1153 | ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) + | ||
1154 | mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON); | ||
1155 | return ret; | ||
1156 | } | ||
1157 | |||
1158 | #if MAX_NUMNODES > 1 | ||
1159 | static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) | ||
1160 | { | ||
1161 | u64 total = 0; | ||
1162 | int nid; | ||
1163 | |||
1164 | for_each_node_state(nid, N_HIGH_MEMORY) | ||
1165 | total += mem_cgroup_node_nr_file_lru_pages(memcg, nid); | ||
1166 | |||
1167 | return total; | ||
1168 | } | ||
1169 | |||
1170 | static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg) | ||
1171 | { | ||
1172 | u64 total = 0; | ||
1173 | int nid; | ||
1174 | |||
1175 | for_each_node_state(nid, N_HIGH_MEMORY) | ||
1176 | total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid); | ||
1177 | |||
1178 | return total; | ||
1179 | } | ||
1180 | |||
1181 | static unsigned long | ||
1182 | mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid) | ||
1183 | { | ||
1184 | return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE); | ||
1185 | } | ||
1186 | |||
1187 | static unsigned long | ||
1188 | mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg) | ||
1189 | { | ||
1190 | u64 total = 0; | ||
1191 | int nid; | ||
1192 | |||
1193 | for_each_node_state(nid, N_HIGH_MEMORY) | ||
1194 | total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid); | ||
1195 | |||
1196 | return total; | ||
1197 | } | ||
1198 | |||
1199 | static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, | ||
1200 | int nid) | ||
1201 | { | ||
1202 | enum lru_list l; | ||
1203 | u64 total = 0; | ||
1204 | |||
1205 | for_each_lru(l) | ||
1206 | total += mem_cgroup_get_zonestat_node(memcg, nid, l); | ||
1207 | |||
1208 | return total; | ||
1209 | } | ||
1210 | |||
1211 | static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg) | ||
1212 | { | ||
1213 | u64 total = 0; | ||
1214 | int nid; | ||
1215 | |||
1216 | for_each_node_state(nid, N_HIGH_MEMORY) | ||
1217 | total += mem_cgroup_node_nr_lru_pages(memcg, nid); | ||
1218 | |||
1219 | return total; | ||
1220 | } | ||
1221 | #endif /* CONFIG_NUMA */ | ||
1222 | |||
1223 | struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, | 1157 | struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, |
1224 | struct zone *zone) | 1158 | struct zone *zone) |
1225 | { | 1159 | { |
@@ -1329,7 +1263,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) | |||
1329 | return margin >> PAGE_SHIFT; | 1263 | return margin >> PAGE_SHIFT; |
1330 | } | 1264 | } |
1331 | 1265 | ||
1332 | static unsigned int get_swappiness(struct mem_cgroup *memcg) | 1266 | int mem_cgroup_swappiness(struct mem_cgroup *memcg) |
1333 | { | 1267 | { |
1334 | struct cgroup *cgrp = memcg->css.cgroup; | 1268 | struct cgroup *cgrp = memcg->css.cgroup; |
1335 | 1269 | ||
@@ -1401,10 +1335,9 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem) | |||
1401 | to = mc.to; | 1335 | to = mc.to; |
1402 | if (!from) | 1336 | if (!from) |
1403 | goto unlock; | 1337 | goto unlock; |
1404 | if (from == mem || to == mem | 1338 | |
1405 | || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) | 1339 | ret = mem_cgroup_same_or_subtree(mem, from) |
1406 | || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) | 1340 | || mem_cgroup_same_or_subtree(mem, to); |
1407 | ret = true; | ||
1408 | unlock: | 1341 | unlock: |
1409 | spin_unlock(&mc.lock); | 1342 | spin_unlock(&mc.lock); |
1410 | return ret; | 1343 | return ret; |
@@ -1576,11 +1509,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
1576 | static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, | 1509 | static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, |
1577 | int nid, bool noswap) | 1510 | int nid, bool noswap) |
1578 | { | 1511 | { |
1579 | if (mem_cgroup_node_nr_file_lru_pages(mem, nid)) | 1512 | if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE)) |
1580 | return true; | 1513 | return true; |
1581 | if (noswap || !total_swap_pages) | 1514 | if (noswap || !total_swap_pages) |
1582 | return false; | 1515 | return false; |
1583 | if (mem_cgroup_node_nr_anon_lru_pages(mem, nid)) | 1516 | if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON)) |
1584 | return true; | 1517 | return true; |
1585 | return false; | 1518 | return false; |
1586 | 1519 | ||
@@ -1730,7 +1663,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1730 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; | 1663 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; |
1731 | 1664 | ||
1732 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ | 1665 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ |
1733 | if (!check_soft && root_mem->memsw_is_minimum) | 1666 | if (!check_soft && !shrink && root_mem->memsw_is_minimum) |
1734 | noswap = true; | 1667 | noswap = true; |
1735 | 1668 | ||
1736 | while (1) { | 1669 | while (1) { |
@@ -1776,12 +1709,11 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1776 | /* we use swappiness of local cgroup */ | 1709 | /* we use swappiness of local cgroup */ |
1777 | if (check_soft) { | 1710 | if (check_soft) { |
1778 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, | 1711 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, |
1779 | noswap, get_swappiness(victim), zone, | 1712 | noswap, zone, &nr_scanned); |
1780 | &nr_scanned); | ||
1781 | *total_scanned += nr_scanned; | 1713 | *total_scanned += nr_scanned; |
1782 | } else | 1714 | } else |
1783 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, | 1715 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, |
1784 | noswap, get_swappiness(victim)); | 1716 | noswap); |
1785 | css_put(&victim->css); | 1717 | css_put(&victim->css); |
1786 | /* | 1718 | /* |
1787 | * At shrinking usage, we can't check we should stop here or | 1719 | * At shrinking usage, we can't check we should stop here or |
@@ -1803,38 +1735,77 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1803 | /* | 1735 | /* |
1804 | * Check OOM-Killer is already running under our hierarchy. | 1736 | * Check OOM-Killer is already running under our hierarchy. |
1805 | * If someone is running, return false. | 1737 | * If someone is running, return false. |
1738 | * Has to be called with memcg_oom_lock | ||
1806 | */ | 1739 | */ |
1807 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) | 1740 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) |
1808 | { | 1741 | { |
1809 | int x, lock_count = 0; | 1742 | struct mem_cgroup *iter, *failed = NULL; |
1810 | struct mem_cgroup *iter; | 1743 | bool cond = true; |
1811 | 1744 | ||
1812 | for_each_mem_cgroup_tree(iter, mem) { | 1745 | for_each_mem_cgroup_tree_cond(iter, mem, cond) { |
1813 | x = atomic_inc_return(&iter->oom_lock); | 1746 | if (iter->oom_lock) { |
1814 | lock_count = max(x, lock_count); | 1747 | /* |
1748 | * this subtree of our hierarchy is already locked | ||
1749 | * so we cannot give a lock. | ||
1750 | */ | ||
1751 | failed = iter; | ||
1752 | cond = false; | ||
1753 | } else | ||
1754 | iter->oom_lock = true; | ||
1815 | } | 1755 | } |
1816 | 1756 | ||
1817 | if (lock_count == 1) | 1757 | if (!failed) |
1818 | return true; | 1758 | return true; |
1759 | |||
1760 | /* | ||
1761 | * OK, we failed to lock the whole subtree so we have to clean up | ||
1762 | * what we set up to the failing subtree | ||
1763 | */ | ||
1764 | cond = true; | ||
1765 | for_each_mem_cgroup_tree_cond(iter, mem, cond) { | ||
1766 | if (iter == failed) { | ||
1767 | cond = false; | ||
1768 | continue; | ||
1769 | } | ||
1770 | iter->oom_lock = false; | ||
1771 | } | ||
1819 | return false; | 1772 | return false; |
1820 | } | 1773 | } |
1821 | 1774 | ||
1775 | /* | ||
1776 | * Has to be called with memcg_oom_lock | ||
1777 | */ | ||
1822 | static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) | 1778 | static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) |
1823 | { | 1779 | { |
1824 | struct mem_cgroup *iter; | 1780 | struct mem_cgroup *iter; |
1825 | 1781 | ||
1782 | for_each_mem_cgroup_tree(iter, mem) | ||
1783 | iter->oom_lock = false; | ||
1784 | return 0; | ||
1785 | } | ||
1786 | |||
1787 | static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem) | ||
1788 | { | ||
1789 | struct mem_cgroup *iter; | ||
1790 | |||
1791 | for_each_mem_cgroup_tree(iter, mem) | ||
1792 | atomic_inc(&iter->under_oom); | ||
1793 | } | ||
1794 | |||
1795 | static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) | ||
1796 | { | ||
1797 | struct mem_cgroup *iter; | ||
1798 | |||
1826 | /* | 1799 | /* |
1827 | * When a new child is created while the hierarchy is under oom, | 1800 | * When a new child is created while the hierarchy is under oom, |
1828 | * mem_cgroup_oom_lock() may not be called. We have to use | 1801 | * mem_cgroup_oom_lock() may not be called. We have to use |
1829 | * atomic_add_unless() here. | 1802 | * atomic_add_unless() here. |
1830 | */ | 1803 | */ |
1831 | for_each_mem_cgroup_tree(iter, mem) | 1804 | for_each_mem_cgroup_tree(iter, mem) |
1832 | atomic_add_unless(&iter->oom_lock, -1, 0); | 1805 | atomic_add_unless(&iter->under_oom, -1, 0); |
1833 | return 0; | ||
1834 | } | 1806 | } |
1835 | 1807 | ||
1836 | 1808 | static DEFINE_SPINLOCK(memcg_oom_lock); | |
1837 | static DEFINE_MUTEX(memcg_oom_mutex); | ||
1838 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 1809 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
1839 | 1810 | ||
1840 | struct oom_wait_info { | 1811 | struct oom_wait_info { |
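mem_cgroup_oom_lock() above follows a lock-the-whole-subtree-or-undo pattern: the first walk marks cgroups until it meets one that is already locked, and a second walk, bounded at the failing cgroup, clears only the marks the first walk set. A generic sketch of the same pattern (illustrative types and names, assuming the whole operation is serialized by an outer lock, as the patch does with memcg_oom_lock):

struct node {
	bool oom_lock;
	/* ... */
};

/* nodes[] stands in for the hierarchy walk; caller holds the outer lock */
static bool subtree_oom_trylock(struct node **nodes, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		if (nodes[i]->oom_lock)
			break;			/* this part is already owned */
		nodes[i]->oom_lock = true;
	}
	if (i == n)
		return true;			/* locked every node */

	while (i-- > 0)				/* undo only what we marked */
		nodes[i]->oom_lock = false;
	return false;
}

The unlock side simply clears the flag on every node in the subtree, which is why mem_cgroup_handle_oom() has to remember whether its trylock succeeded before calling mem_cgroup_oom_unlock().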
@@ -1845,25 +1816,20 @@ struct oom_wait_info { | |||
1845 | static int memcg_oom_wake_function(wait_queue_t *wait, | 1816 | static int memcg_oom_wake_function(wait_queue_t *wait, |
1846 | unsigned mode, int sync, void *arg) | 1817 | unsigned mode, int sync, void *arg) |
1847 | { | 1818 | { |
1848 | struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; | 1819 | struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg, |
1820 | *oom_wait_mem; | ||
1849 | struct oom_wait_info *oom_wait_info; | 1821 | struct oom_wait_info *oom_wait_info; |
1850 | 1822 | ||
1851 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); | 1823 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); |
1824 | oom_wait_mem = oom_wait_info->mem; | ||
1852 | 1825 | ||
1853 | if (oom_wait_info->mem == wake_mem) | ||
1854 | goto wakeup; | ||
1855 | /* if no hierarchy, no match */ | ||
1856 | if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) | ||
1857 | return 0; | ||
1858 | /* | 1826 | /* |
1859 | * Both of oom_wait_info->mem and wake_mem are stable under us. | 1827 | * Both of oom_wait_info->mem and wake_mem are stable under us. |
1860 | * Then we can use css_is_ancestor without taking care of RCU. | 1828 | * Then we can use css_is_ancestor without taking care of RCU. |
1861 | */ | 1829 | */ |
1862 | if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && | 1830 | if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem) |
1863 | !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) | 1831 | && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem)) |
1864 | return 0; | 1832 | return 0; |
1865 | |||
1866 | wakeup: | ||
1867 | return autoremove_wake_function(wait, mode, sync, arg); | 1833 | return autoremove_wake_function(wait, mode, sync, arg); |
1868 | } | 1834 | } |
1869 | 1835 | ||
@@ -1875,7 +1841,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem) | |||
1875 | 1841 | ||
1876 | static void memcg_oom_recover(struct mem_cgroup *mem) | 1842 | static void memcg_oom_recover(struct mem_cgroup *mem) |
1877 | { | 1843 | { |
1878 | if (mem && atomic_read(&mem->oom_lock)) | 1844 | if (mem && atomic_read(&mem->under_oom)) |
1879 | memcg_wakeup_oom(mem); | 1845 | memcg_wakeup_oom(mem); |
1880 | } | 1846 | } |
1881 | 1847 | ||
@@ -1893,8 +1859,10 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
1893 | owait.wait.private = current; | 1859 | owait.wait.private = current; |
1894 | INIT_LIST_HEAD(&owait.wait.task_list); | 1860 | INIT_LIST_HEAD(&owait.wait.task_list); |
1895 | need_to_kill = true; | 1861 | need_to_kill = true; |
1862 | mem_cgroup_mark_under_oom(mem); | ||
1863 | |||
1896 | /* At first, try to OOM lock hierarchy under mem.*/ | 1864 | /* At first, try to OOM lock hierarchy under mem.*/ |
1897 | mutex_lock(&memcg_oom_mutex); | 1865 | spin_lock(&memcg_oom_lock); |
1898 | locked = mem_cgroup_oom_lock(mem); | 1866 | locked = mem_cgroup_oom_lock(mem); |
1899 | /* | 1867 | /* |
1900 | * Even if signal_pending(), we can't quit charge() loop without | 1868 | * Even if signal_pending(), we can't quit charge() loop without |
@@ -1906,7 +1874,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
1906 | need_to_kill = false; | 1874 | need_to_kill = false; |
1907 | if (locked) | 1875 | if (locked) |
1908 | mem_cgroup_oom_notify(mem); | 1876 | mem_cgroup_oom_notify(mem); |
1909 | mutex_unlock(&memcg_oom_mutex); | 1877 | spin_unlock(&memcg_oom_lock); |
1910 | 1878 | ||
1911 | if (need_to_kill) { | 1879 | if (need_to_kill) { |
1912 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1880 | finish_wait(&memcg_oom_waitq, &owait.wait); |
@@ -1915,10 +1883,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
1915 | schedule(); | 1883 | schedule(); |
1916 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1884 | finish_wait(&memcg_oom_waitq, &owait.wait); |
1917 | } | 1885 | } |
1918 | mutex_lock(&memcg_oom_mutex); | 1886 | spin_lock(&memcg_oom_lock); |
1919 | mem_cgroup_oom_unlock(mem); | 1887 | if (locked) |
1888 | mem_cgroup_oom_unlock(mem); | ||
1920 | memcg_wakeup_oom(mem); | 1889 | memcg_wakeup_oom(mem); |
1921 | mutex_unlock(&memcg_oom_mutex); | 1890 | spin_unlock(&memcg_oom_lock); |
1891 | |||
1892 | mem_cgroup_unmark_under_oom(mem); | ||
1922 | 1893 | ||
1923 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | 1894 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) |
1924 | return false; | 1895 | return false; |
@@ -2079,59 +2050,70 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) | |||
2079 | } | 2050 | } |
2080 | 2051 | ||
2081 | /* | 2052 | /* |
2082 | * Tries to drain stocked charges in other cpus. This function is asynchronous | 2053 | * Drains all per-CPU charge caches for given root_mem resp. subtree |
2083 | * and just put a work per cpu for draining localy on each cpu. Caller can | 2054 | * of the hierarchy under it. sync flag says whether we should block |
2084 | * expects some charges will be back to res_counter later but cannot wait for | 2055 | * until the work is done. |
2085 | * it. | ||
2086 | */ | 2056 | */ |
2087 | static void drain_all_stock_async(struct mem_cgroup *root_mem) | 2057 | static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) |
2088 | { | 2058 | { |
2089 | int cpu, curcpu; | 2059 | int cpu, curcpu; |
2090 | /* | 2060 | |
2091 | * If someone calls draining, avoid adding more kworker runs. | ||
2092 | */ | ||
2093 | if (!mutex_trylock(&percpu_charge_mutex)) | ||
2094 | return; | ||
2095 | /* Notify other cpus that system-wide "drain" is running */ | 2061 | /* Notify other cpus that system-wide "drain" is running */ |
2096 | get_online_cpus(); | 2062 | get_online_cpus(); |
2097 | /* | 2063 | curcpu = get_cpu(); |
2098 | * Get a hint for avoiding draining charges on the current cpu, | ||
2099 | * which must be exhausted by our charging. It is not required that | ||
2100 | * this be a precise check, so we use raw_smp_processor_id() instead of | ||
2101 | * getcpu()/putcpu(). | ||
2102 | */ | ||
2103 | curcpu = raw_smp_processor_id(); | ||
2104 | for_each_online_cpu(cpu) { | 2064 | for_each_online_cpu(cpu) { |
2105 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | 2065 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); |
2106 | struct mem_cgroup *mem; | 2066 | struct mem_cgroup *mem; |
2107 | 2067 | ||
2108 | if (cpu == curcpu) | ||
2109 | continue; | ||
2110 | |||
2111 | mem = stock->cached; | 2068 | mem = stock->cached; |
2112 | if (!mem) | 2069 | if (!mem || !stock->nr_pages) |
2070 | continue; | ||
2071 | if (!mem_cgroup_same_or_subtree(root_mem, mem)) | ||
2113 | continue; | 2072 | continue; |
2114 | if (mem != root_mem) { | 2073 | if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { |
2115 | if (!root_mem->use_hierarchy) | 2074 | if (cpu == curcpu) |
2116 | continue; | 2075 | drain_local_stock(&stock->work); |
2117 | /* check whether "mem" is under tree of "root_mem" */ | 2076 | else |
2118 | if (!css_is_ancestor(&mem->css, &root_mem->css)) | 2077 | schedule_work_on(cpu, &stock->work); |
2119 | continue; | ||
2120 | } | 2078 | } |
2121 | if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) | ||
2122 | schedule_work_on(cpu, &stock->work); | ||
2123 | } | 2079 | } |
2080 | put_cpu(); | ||
2081 | |||
2082 | if (!sync) | ||
2083 | goto out; | ||
2084 | |||
2085 | for_each_online_cpu(cpu) { | ||
2086 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | ||
2087 | if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) | ||
2088 | flush_work(&stock->work); | ||
2089 | } | ||
2090 | out: | ||
2124 | put_online_cpus(); | 2091 | put_online_cpus(); |
2092 | } | ||
2093 | |||
2094 | /* | ||
2095 | * Tries to drain stocked charges in other cpus. This function is asynchronous | ||
2096 | * and just put a work per cpu for draining localy on each cpu. Caller can | ||
2097 | * expects some charges will be back to res_counter later but cannot wait for | ||
2098 | * it. | ||
2099 | */ | ||
2100 | static void drain_all_stock_async(struct mem_cgroup *root_mem) | ||
2101 | { | ||
2102 | /* | ||
2103 | * If someone calls draining, avoid adding more kworker runs. | ||
2104 | */ | ||
2105 | if (!mutex_trylock(&percpu_charge_mutex)) | ||
2106 | return; | ||
2107 | drain_all_stock(root_mem, false); | ||
2125 | mutex_unlock(&percpu_charge_mutex); | 2108 | mutex_unlock(&percpu_charge_mutex); |
2126 | /* We don't wait for flush_work */ | ||
2127 | } | 2109 | } |
2128 | 2110 | ||
2129 | /* This is a synchronous drain interface. */ | 2111 | /* This is a synchronous drain interface. */ |
2130 | static void drain_all_stock_sync(void) | 2112 | static void drain_all_stock_sync(struct mem_cgroup *root_mem) |
2131 | { | 2113 | { |
2132 | /* called when force_empty is called */ | 2114 | /* called when force_empty is called */ |
2133 | mutex_lock(&percpu_charge_mutex); | 2115 | mutex_lock(&percpu_charge_mutex); |
2134 | schedule_on_each_cpu(drain_local_stock); | 2116 | drain_all_stock(root_mem, true); |
2135 | mutex_unlock(&percpu_charge_mutex); | 2117 | mutex_unlock(&percpu_charge_mutex); |
2136 | } | 2118 | } |
2137 | 2119 | ||
@@ -2784,30 +2766,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2784 | return 0; | 2766 | return 0; |
2785 | if (PageCompound(page)) | 2767 | if (PageCompound(page)) |
2786 | return 0; | 2768 | return 0; |
2787 | /* | ||
2788 | * Corner case handling. This is called from add_to_page_cache() | ||
2789 | * in usual. But some FS (shmem) precharges this page before calling it | ||
2790 | * and call add_to_page_cache() with GFP_NOWAIT. | ||
2791 | * | ||
2792 | * For GFP_NOWAIT case, the page may be pre-charged before calling | ||
2793 | * add_to_page_cache(). (See shmem.c) check it here and avoid to call | ||
2794 | * charge twice. (It works but has to pay a bit larger cost.) | ||
2795 | * And when the page is SwapCache, it should take swap information | ||
2796 | * into account. This is under lock_page() now. | ||
2797 | */ | ||
2798 | if (!(gfp_mask & __GFP_WAIT)) { | ||
2799 | struct page_cgroup *pc; | ||
2800 | |||
2801 | pc = lookup_page_cgroup(page); | ||
2802 | if (!pc) | ||
2803 | return 0; | ||
2804 | lock_page_cgroup(pc); | ||
2805 | if (PageCgroupUsed(pc)) { | ||
2806 | unlock_page_cgroup(pc); | ||
2807 | return 0; | ||
2808 | } | ||
2809 | unlock_page_cgroup(pc); | ||
2810 | } | ||
2811 | 2769 | ||
2812 | if (unlikely(!mm)) | 2770 | if (unlikely(!mm)) |
2813 | mm = &init_mm; | 2771 | mm = &init_mm; |
@@ -3398,28 +3356,47 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, | |||
3398 | } | 3356 | } |
3399 | 3357 | ||
3400 | /* | 3358 | /* |
3401 | * A call to try to shrink memory usage on charge failure at shmem's swapin. | 3359 | * At replace page cache, newpage is not under any memcg but it's on |
3402 | * Calling hierarchical_reclaim is not enough because we should update | 3360 | * LRU. So, this function doesn't touch res_counter but handles LRU |
3403 | * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. | 3361 | * in correct way. Both pages are locked so we cannot race with uncharge. |
3404 | * Moreover considering hierarchy, we should reclaim from the mem_over_limit, | ||
3405 | * not from the memcg which this page would be charged to. | ||
3406 | * try_charge_swapin does all of these works properly. | ||
3407 | */ | 3362 | */ |
3408 | int mem_cgroup_shmem_charge_fallback(struct page *page, | 3363 | void mem_cgroup_replace_page_cache(struct page *oldpage, |
3409 | struct mm_struct *mm, | 3364 | struct page *newpage) |
3410 | gfp_t gfp_mask) | ||
3411 | { | 3365 | { |
3412 | struct mem_cgroup *mem; | 3366 | struct mem_cgroup *memcg; |
3413 | int ret; | 3367 | struct page_cgroup *pc; |
3368 | struct zone *zone; | ||
3369 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
3370 | unsigned long flags; | ||
3414 | 3371 | ||
3415 | if (mem_cgroup_disabled()) | 3372 | if (mem_cgroup_disabled()) |
3416 | return 0; | 3373 | return; |
3417 | 3374 | ||
3418 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); | 3375 | pc = lookup_page_cgroup(oldpage); |
3419 | if (!ret) | 3376 | /* fix accounting on old pages */ |
3420 | mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ | 3377 | lock_page_cgroup(pc); |
3378 | memcg = pc->mem_cgroup; | ||
3379 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); | ||
3380 | ClearPageCgroupUsed(pc); | ||
3381 | unlock_page_cgroup(pc); | ||
3421 | 3382 | ||
3422 | return ret; | 3383 | if (PageSwapBacked(oldpage)) |
3384 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
3385 | |||
3386 | zone = page_zone(newpage); | ||
3387 | pc = lookup_page_cgroup(newpage); | ||
3388 | /* | ||
3389 | * Even if newpage->mapping was NULL before starting replacement, | ||
3390 | * the newpage may already be on the LRU (or an LRU pagevec). We lock the | ||
3391 | * LRU while we overwrite pc->mem_cgroup. | ||
3392 | */ | ||
3393 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
3394 | if (PageLRU(newpage)) | ||
3395 | del_page_from_lru_list(zone, newpage, page_lru(newpage)); | ||
3396 | __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type); | ||
3397 | if (PageLRU(newpage)) | ||
3398 | add_page_to_lru_list(zone, newpage, page_lru(newpage)); | ||
3399 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
3423 | } | 3400 | } |
3424 | 3401 | ||
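A hedged sketch of the expected caller of mem_cgroup_replace_page_cache(): a page-cache replacement helper (likely the replace_page_cache_page() path in mm/filemap.c touched elsewhere in this diff) swaps the pages in the radix tree and then hands over the memcg accounting while both pages are still locked. The swap_radix_tree_slot() helper below is hypothetical.

    /* sketch: caller holds lock_page() on both old and new */
    error = swap_radix_tree_slot(mapping, offset, old, new);  /* hypothetical helper */
    if (!error) {
        old->mapping = NULL;
        mem_cgroup_replace_page_cache(old, new);
        page_cache_release(old);
    }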
3425 | #ifdef CONFIG_DEBUG_VM | 3402 | #ifdef CONFIG_DEBUG_VM |
@@ -3780,7 +3757,7 @@ move_account: | |||
3780 | goto out; | 3757 | goto out; |
3781 | /* This is for making all *used* pages to be on LRU. */ | 3758 | /* This is for making all *used* pages to be on LRU. */ |
3782 | lru_add_drain_all(); | 3759 | lru_add_drain_all(); |
3783 | drain_all_stock_sync(); | 3760 | drain_all_stock_sync(mem); |
3784 | ret = 0; | 3761 | ret = 0; |
3785 | mem_cgroup_start_move(mem); | 3762 | mem_cgroup_start_move(mem); |
3786 | for_each_node_state(node, N_HIGH_MEMORY) { | 3763 | for_each_node_state(node, N_HIGH_MEMORY) { |
@@ -3826,7 +3803,7 @@ try_to_free: | |||
3826 | goto out; | 3803 | goto out; |
3827 | } | 3804 | } |
3828 | progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, | 3805 | progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, |
3829 | false, get_swappiness(mem)); | 3806 | false); |
3830 | if (!progress) { | 3807 | if (!progress) { |
3831 | nr_retries--; | 3808 | nr_retries--; |
3832 | /* maybe some writeback is necessary */ | 3809 | /* maybe some writeback is necessary */ |
@@ -4152,15 +4129,15 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) | |||
4152 | s->stat[MCS_PGMAJFAULT] += val; | 4129 | s->stat[MCS_PGMAJFAULT] += val; |
4153 | 4130 | ||
4154 | /* per zone stat */ | 4131 | /* per zone stat */ |
4155 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); | 4132 | val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON)); |
4156 | s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; | 4133 | s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; |
4157 | val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); | 4134 | val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON)); |
4158 | s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; | 4135 | s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; |
4159 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); | 4136 | val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE)); |
4160 | s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; | 4137 | s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; |
4161 | val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); | 4138 | val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE)); |
4162 | s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; | 4139 | s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; |
4163 | val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); | 4140 | val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE)); |
4164 | s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; | 4141 | s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; |
4165 | } | 4142 | } |
4166 | 4143 | ||
@@ -4182,35 +4159,37 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg) | |||
4182 | struct cgroup *cont = m->private; | 4159 | struct cgroup *cont = m->private; |
4183 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); | 4160 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); |
4184 | 4161 | ||
4185 | total_nr = mem_cgroup_nr_lru_pages(mem_cont); | 4162 | total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); |
4186 | seq_printf(m, "total=%lu", total_nr); | 4163 | seq_printf(m, "total=%lu", total_nr); |
4187 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4164 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4188 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid); | 4165 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); |
4189 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4166 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4190 | } | 4167 | } |
4191 | seq_putc(m, '\n'); | 4168 | seq_putc(m, '\n'); |
4192 | 4169 | ||
4193 | file_nr = mem_cgroup_nr_file_lru_pages(mem_cont); | 4170 | file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); |
4194 | seq_printf(m, "file=%lu", file_nr); | 4171 | seq_printf(m, "file=%lu", file_nr); |
4195 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4172 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4196 | node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid); | 4173 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, |
4174 | LRU_ALL_FILE); | ||
4197 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4175 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4198 | } | 4176 | } |
4199 | seq_putc(m, '\n'); | 4177 | seq_putc(m, '\n'); |
4200 | 4178 | ||
4201 | anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont); | 4179 | anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); |
4202 | seq_printf(m, "anon=%lu", anon_nr); | 4180 | seq_printf(m, "anon=%lu", anon_nr); |
4203 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4181 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4204 | node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid); | 4182 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, |
4183 | LRU_ALL_ANON); | ||
4205 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4184 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4206 | } | 4185 | } |
4207 | seq_putc(m, '\n'); | 4186 | seq_putc(m, '\n'); |
4208 | 4187 | ||
4209 | unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont); | 4188 | unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); |
4210 | seq_printf(m, "unevictable=%lu", unevictable_nr); | 4189 | seq_printf(m, "unevictable=%lu", unevictable_nr); |
4211 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4190 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4212 | node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont, | 4191 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, |
4213 | nid); | 4192 | BIT(LRU_UNEVICTABLE)); |
4214 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4193 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4215 | } | 4194 | } |
4216 | seq_putc(m, '\n'); | 4195 | seq_putc(m, '\n'); |
@@ -4288,7 +4267,7 @@ static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) | |||
4288 | { | 4267 | { |
4289 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 4268 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4290 | 4269 | ||
4291 | return get_swappiness(memcg); | 4270 | return mem_cgroup_swappiness(memcg); |
4292 | } | 4271 | } |
4293 | 4272 | ||
4294 | static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | 4273 | static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, |
@@ -4578,15 +4557,15 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, | |||
4578 | if (!event) | 4557 | if (!event) |
4579 | return -ENOMEM; | 4558 | return -ENOMEM; |
4580 | 4559 | ||
4581 | mutex_lock(&memcg_oom_mutex); | 4560 | spin_lock(&memcg_oom_lock); |
4582 | 4561 | ||
4583 | event->eventfd = eventfd; | 4562 | event->eventfd = eventfd; |
4584 | list_add(&event->list, &memcg->oom_notify); | 4563 | list_add(&event->list, &memcg->oom_notify); |
4585 | 4564 | ||
4586 | /* already in OOM ? */ | 4565 | /* already in OOM ? */ |
4587 | if (atomic_read(&memcg->oom_lock)) | 4566 | if (atomic_read(&memcg->under_oom)) |
4588 | eventfd_signal(eventfd, 1); | 4567 | eventfd_signal(eventfd, 1); |
4589 | mutex_unlock(&memcg_oom_mutex); | 4568 | spin_unlock(&memcg_oom_lock); |
4590 | 4569 | ||
4591 | return 0; | 4570 | return 0; |
4592 | } | 4571 | } |
@@ -4600,7 +4579,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | |||
4600 | 4579 | ||
4601 | BUG_ON(type != _OOM_TYPE); | 4580 | BUG_ON(type != _OOM_TYPE); |
4602 | 4581 | ||
4603 | mutex_lock(&memcg_oom_mutex); | 4582 | spin_lock(&memcg_oom_lock); |
4604 | 4583 | ||
4605 | list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { | 4584 | list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { |
4606 | if (ev->eventfd == eventfd) { | 4585 | if (ev->eventfd == eventfd) { |
@@ -4609,7 +4588,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | |||
4609 | } | 4588 | } |
4610 | } | 4589 | } |
4611 | 4590 | ||
4612 | mutex_unlock(&memcg_oom_mutex); | 4591 | spin_unlock(&memcg_oom_lock); |
4613 | } | 4592 | } |
4614 | 4593 | ||
4615 | static int mem_cgroup_oom_control_read(struct cgroup *cgrp, | 4594 | static int mem_cgroup_oom_control_read(struct cgroup *cgrp, |
@@ -4619,7 +4598,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp, | |||
4619 | 4598 | ||
4620 | cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); | 4599 | cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); |
4621 | 4600 | ||
4622 | if (atomic_read(&mem->oom_lock)) | 4601 | if (atomic_read(&mem->under_oom)) |
4623 | cb->fill(cb, "under_oom", 1); | 4602 | cb->fill(cb, "under_oom", 1); |
4624 | else | 4603 | else |
4625 | cb->fill(cb, "under_oom", 0); | 4604 | cb->fill(cb, "under_oom", 0); |
@@ -4963,9 +4942,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4963 | int cpu; | 4942 | int cpu; |
4964 | enable_swap_cgroup(); | 4943 | enable_swap_cgroup(); |
4965 | parent = NULL; | 4944 | parent = NULL; |
4966 | root_mem_cgroup = mem; | ||
4967 | if (mem_cgroup_soft_limit_tree_init()) | 4945 | if (mem_cgroup_soft_limit_tree_init()) |
4968 | goto free_out; | 4946 | goto free_out; |
4947 | root_mem_cgroup = mem; | ||
4969 | for_each_possible_cpu(cpu) { | 4948 | for_each_possible_cpu(cpu) { |
4970 | struct memcg_stock_pcp *stock = | 4949 | struct memcg_stock_pcp *stock = |
4971 | &per_cpu(memcg_stock, cpu); | 4950 | &per_cpu(memcg_stock, cpu); |
@@ -4997,14 +4976,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4997 | INIT_LIST_HEAD(&mem->oom_notify); | 4976 | INIT_LIST_HEAD(&mem->oom_notify); |
4998 | 4977 | ||
4999 | if (parent) | 4978 | if (parent) |
5000 | mem->swappiness = get_swappiness(parent); | 4979 | mem->swappiness = mem_cgroup_swappiness(parent); |
5001 | atomic_set(&mem->refcnt, 1); | 4980 | atomic_set(&mem->refcnt, 1); |
5002 | mem->move_charge_at_immigrate = 0; | 4981 | mem->move_charge_at_immigrate = 0; |
5003 | mutex_init(&mem->thresholds_lock); | 4982 | mutex_init(&mem->thresholds_lock); |
5004 | return &mem->css; | 4983 | return &mem->css; |
5005 | free_out: | 4984 | free_out: |
5006 | __mem_cgroup_free(mem); | 4985 | __mem_cgroup_free(mem); |
5007 | root_mem_cgroup = NULL; | ||
5008 | return ERR_PTR(error); | 4986 | return ERR_PTR(error); |
5009 | } | 4987 | } |
5010 | 4988 | ||
@@ -5181,15 +5159,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
5181 | pgoff = pte_to_pgoff(ptent); | 5159 | pgoff = pte_to_pgoff(ptent); |
5182 | 5160 | ||
5183 | /* page is moved even if it's not RSS of this task(page-faulted). */ | 5161 | /* page is moved even if it's not RSS of this task(page-faulted). */ |
5184 | if (!mapping_cap_swap_backed(mapping)) { /* normal file */ | 5162 | page = find_get_page(mapping, pgoff); |
5185 | page = find_get_page(mapping, pgoff); | 5163 | |
5186 | } else { /* shmem/tmpfs file. we should take account of swap too. */ | 5164 | #ifdef CONFIG_SWAP |
5187 | swp_entry_t ent; | 5165 | /* shmem/tmpfs may report page out on swap: account for that too. */ |
5188 | mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); | 5166 | if (radix_tree_exceptional_entry(page)) { |
5167 | swp_entry_t swap = radix_to_swp_entry(page); | ||
5189 | if (do_swap_account) | 5168 | if (do_swap_account) |
5190 | entry->val = ent.val; | 5169 | *entry = swap; |
5170 | page = find_get_page(&swapper_space, swap.val); | ||
5191 | } | 5171 | } |
5192 | 5172 | #endif | |
5193 | return page; | 5173 | return page; |
5194 | } | 5174 | } |
5195 | 5175 | ||
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 740c4f52059..2b43ba051ac 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -53,6 +53,7 @@ | |||
53 | #include <linux/hugetlb.h> | 53 | #include <linux/hugetlb.h> |
54 | #include <linux/memory_hotplug.h> | 54 | #include <linux/memory_hotplug.h> |
55 | #include <linux/mm_inline.h> | 55 | #include <linux/mm_inline.h> |
56 | #include <linux/kfifo.h> | ||
56 | #include "internal.h" | 57 | #include "internal.h" |
57 | 58 | ||
58 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 59 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
@@ -1178,6 +1179,97 @@ void memory_failure(unsigned long pfn, int trapno) | |||
1178 | __memory_failure(pfn, trapno, 0); | 1179 | __memory_failure(pfn, trapno, 0); |
1179 | } | 1180 | } |
1180 | 1181 | ||
1182 | #define MEMORY_FAILURE_FIFO_ORDER 4 | ||
1183 | #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) | ||
1184 | |||
1185 | struct memory_failure_entry { | ||
1186 | unsigned long pfn; | ||
1187 | int trapno; | ||
1188 | int flags; | ||
1189 | }; | ||
1190 | |||
1191 | struct memory_failure_cpu { | ||
1192 | DECLARE_KFIFO(fifo, struct memory_failure_entry, | ||
1193 | MEMORY_FAILURE_FIFO_SIZE); | ||
1194 | spinlock_t lock; | ||
1195 | struct work_struct work; | ||
1196 | }; | ||
1197 | |||
1198 | static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu); | ||
1199 | |||
1200 | /** | ||
1201 | * memory_failure_queue - Schedule handling memory failure of a page. | ||
1202 | * @pfn: Page Number of the corrupted page | ||
1203 | * @trapno: Trap number reported in the signal to user space. | ||
1204 | * @flags: Flags for memory failure handling | ||
1205 | * | ||
1206 | * This function is called by the low level hardware error handler | ||
1207 | * when it detects hardware memory corruption of a page. It schedules | ||
1208 | * the recovery of the error page, including dropping pages, killing | ||
1209 | * processes etc. | ||
1210 | * | ||
1211 | * The function is primarily of use for corruptions that | ||
1212 | * happen outside the current execution context (e.g. when | ||
1213 | * detected by a background scrubber) | ||
1214 | * | ||
1215 | * Can run in IRQ context. | ||
1216 | */ | ||
1217 | void memory_failure_queue(unsigned long pfn, int trapno, int flags) | ||
1218 | { | ||
1219 | struct memory_failure_cpu *mf_cpu; | ||
1220 | unsigned long proc_flags; | ||
1221 | struct memory_failure_entry entry = { | ||
1222 | .pfn = pfn, | ||
1223 | .trapno = trapno, | ||
1224 | .flags = flags, | ||
1225 | }; | ||
1226 | |||
1227 | mf_cpu = &get_cpu_var(memory_failure_cpu); | ||
1228 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); | ||
1229 | if (kfifo_put(&mf_cpu->fifo, &entry)) | ||
1230 | schedule_work_on(smp_processor_id(), &mf_cpu->work); | ||
1231 | else | ||
1232 | pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", | ||
1233 | pfn); | ||
1234 | spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); | ||
1235 | put_cpu_var(memory_failure_cpu); | ||
1236 | } | ||
1237 | EXPORT_SYMBOL_GPL(memory_failure_queue); | ||
1238 | |||
1239 | static void memory_failure_work_func(struct work_struct *work) | ||
1240 | { | ||
1241 | struct memory_failure_cpu *mf_cpu; | ||
1242 | struct memory_failure_entry entry = { 0, }; | ||
1243 | unsigned long proc_flags; | ||
1244 | int gotten; | ||
1245 | |||
1246 | mf_cpu = &__get_cpu_var(memory_failure_cpu); | ||
1247 | for (;;) { | ||
1248 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); | ||
1249 | gotten = kfifo_get(&mf_cpu->fifo, &entry); | ||
1250 | spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); | ||
1251 | if (!gotten) | ||
1252 | break; | ||
1253 | __memory_failure(entry.pfn, entry.trapno, entry.flags); | ||
1254 | } | ||
1255 | } | ||
1256 | |||
1257 | static int __init memory_failure_init(void) | ||
1258 | { | ||
1259 | struct memory_failure_cpu *mf_cpu; | ||
1260 | int cpu; | ||
1261 | |||
1262 | for_each_possible_cpu(cpu) { | ||
1263 | mf_cpu = &per_cpu(memory_failure_cpu, cpu); | ||
1264 | spin_lock_init(&mf_cpu->lock); | ||
1265 | INIT_KFIFO(mf_cpu->fifo); | ||
1266 | INIT_WORK(&mf_cpu->work, memory_failure_work_func); | ||
1267 | } | ||
1268 | |||
1269 | return 0; | ||
1270 | } | ||
1271 | core_initcall(memory_failure_init); | ||
1272 | |||
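A minimal sketch of the intended caller of memory_failure_queue(), e.g. a hardware error handler that may run in IRQ context; paddr and the zero trapno/flags are assumptions for illustration only.

    /* sketch: report a corrupted page found by a background scrubber */
    unsigned long pfn = paddr >> PAGE_SHIFT;    /* paddr: assumed physical address */

    memory_failure_queue(pfn, 0 /* trapno */, 0 /* flags */);
    /* the actual recovery runs later from memory_failure_work_func() */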
1181 | /** | 1273 | /** |
1182 | * unpoison_memory - Unpoison a previously poisoned page | 1274 | * unpoison_memory - Unpoison a previously poisoned page |
1183 | * @pfn: Page number of the to be unpoisoned page | 1275 | * @pfn: Page number of the to be unpoisoned page |
diff --git a/mm/memory.c b/mm/memory.c index 9b8a01d941c..b2b87315cdc 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1290,13 +1290,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
1290 | return addr; | 1290 | return addr; |
1291 | } | 1291 | } |
1292 | 1292 | ||
1293 | #ifdef CONFIG_PREEMPT | ||
1294 | # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) | ||
1295 | #else | ||
1296 | /* No preempt: go for improved straight-line efficiency */ | ||
1297 | # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) | ||
1298 | #endif | ||
1299 | |||
1300 | /** | 1293 | /** |
1301 | * unmap_vmas - unmap a range of memory covered by a list of vma's | 1294 | * unmap_vmas - unmap a range of memory covered by a list of vma's |
1302 | * @tlb: address of the caller's struct mmu_gather | 1295 | * @tlb: address of the caller's struct mmu_gather |
@@ -1310,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
1310 | * | 1303 | * |
1311 | * Unmap all pages in the vma list. | 1304 | * Unmap all pages in the vma list. |
1312 | * | 1305 | * |
1313 | * We aim to not hold locks for too long (for scheduling latency reasons). | ||
1314 | * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to | ||
1315 | * return the ending mmu_gather to the caller. | ||
1316 | * | ||
1317 | * Only addresses between `start' and `end' will be unmapped. | 1306 | * Only addresses between `start' and `end' will be unmapped. |
1318 | * | 1307 | * |
1319 | * The VMA list must be sorted in ascending virtual address order. | 1308 | * The VMA list must be sorted in ascending virtual address order. |
@@ -1514,7 +1503,7 @@ split_fallthrough: | |||
1514 | } | 1503 | } |
1515 | 1504 | ||
1516 | if (flags & FOLL_GET) | 1505 | if (flags & FOLL_GET) |
1517 | get_page(page); | 1506 | get_page_foll(page); |
1518 | if (flags & FOLL_TOUCH) { | 1507 | if (flags & FOLL_TOUCH) { |
1519 | if ((flags & FOLL_WRITE) && | 1508 | if ((flags & FOLL_WRITE) && |
1520 | !pte_dirty(pte) && !PageDirty(page)) | 1509 | !pte_dirty(pte) && !PageDirty(page)) |
@@ -1816,7 +1805,63 @@ next_page: | |||
1816 | } | 1805 | } |
1817 | EXPORT_SYMBOL(__get_user_pages); | 1806 | EXPORT_SYMBOL(__get_user_pages); |
1818 | 1807 | ||
1819 | /** | 1808 | /* |
1809 | * fixup_user_fault() - manually resolve a user page fault | ||
1810 | * @tsk: the task_struct to use for page fault accounting, or | ||
1811 | * NULL if faults are not to be recorded. | ||
1812 | * @mm: mm_struct of target mm | ||
1813 | * @address: user address | ||
1814 | * @fault_flags: flags to pass down to handle_mm_fault() | ||
1815 | * | ||
1816 | * This is meant to be called in the specific scenario where for locking reasons | ||
1817 | * we try to access user memory in atomic context (within a pagefault_disable() | ||
1818 | * section), the access returns -EFAULT, and we want to resolve the user fault before | ||
1819 | * trying again. | ||
1820 | * | ||
1821 | * Typically this is meant to be used by the futex code. | ||
1822 | * | ||
1823 | * The main difference with get_user_pages() is that this function will | ||
1824 | * unconditionally call handle_mm_fault() which will in turn perform all the | ||
1825 | * necessary SW fixup of the dirty and young bits in the PTE, while | ||
1826 | * get_user_pages() only guarantees to update these in the struct page. | ||
1827 | * | ||
1828 | * This is important for some architectures where those bits also gate the | ||
1829 | * access permission to the page because they are maintained in software. On | ||
1830 | * such architectures, gup() will not be enough to make a subsequent access | ||
1831 | * succeed. | ||
1832 | * | ||
1833 | * This should be called with the mmap_sem held for read. | ||
1834 | */ | ||
1835 | int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | ||
1836 | unsigned long address, unsigned int fault_flags) | ||
1837 | { | ||
1838 | struct vm_area_struct *vma; | ||
1839 | int ret; | ||
1840 | |||
1841 | vma = find_extend_vma(mm, address); | ||
1842 | if (!vma || address < vma->vm_start) | ||
1843 | return -EFAULT; | ||
1844 | |||
1845 | ret = handle_mm_fault(mm, vma, address, fault_flags); | ||
1846 | if (ret & VM_FAULT_ERROR) { | ||
1847 | if (ret & VM_FAULT_OOM) | ||
1848 | return -ENOMEM; | ||
1849 | if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) | ||
1850 | return -EHWPOISON; | ||
1851 | if (ret & VM_FAULT_SIGBUS) | ||
1852 | return -EFAULT; | ||
1853 | BUG(); | ||
1854 | } | ||
1855 | if (tsk) { | ||
1856 | if (ret & VM_FAULT_MAJOR) | ||
1857 | tsk->maj_flt++; | ||
1858 | else | ||
1859 | tsk->min_flt++; | ||
1860 | } | ||
1861 | return 0; | ||
1862 | } | ||
1863 | |||
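A hedged sketch of the futex-style usage described above: after an atomic user access fails under pagefault_disable(), the caller takes mmap_sem for read and resolves the fault before retrying. uaddr is an assumed user pointer.

    /* sketch: resolve a fault on a user address before retrying an atomic access */
    int ret;

    down_read(&mm->mmap_sem);
    ret = fixup_user_fault(current, mm, (unsigned long)uaddr, FAULT_FLAG_WRITE);
    up_read(&mm->mmap_sem);
    if (ret < 0)
        return ret;             /* -EFAULT, -ENOMEM or -EHWPOISON */
    /* retry the pagefault_disable()'d access here */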
1864 | /* | ||
1820 | * get_user_pages() - pin user pages in memory | 1865 | * get_user_pages() - pin user pages in memory |
1821 | * @tsk: the task_struct to use for page fault accounting, or | 1866 | * @tsk: the task_struct to use for page fault accounting, or |
1822 | * NULL if faults are not to be recorded. | 1867 | * NULL if faults are not to be recorded. |
@@ -3104,14 +3149,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3104 | pte_t *page_table; | 3149 | pte_t *page_table; |
3105 | spinlock_t *ptl; | 3150 | spinlock_t *ptl; |
3106 | struct page *page; | 3151 | struct page *page; |
3152 | struct page *cow_page; | ||
3107 | pte_t entry; | 3153 | pte_t entry; |
3108 | int anon = 0; | 3154 | int anon = 0; |
3109 | int charged = 0; | ||
3110 | struct page *dirty_page = NULL; | 3155 | struct page *dirty_page = NULL; |
3111 | struct vm_fault vmf; | 3156 | struct vm_fault vmf; |
3112 | int ret; | 3157 | int ret; |
3113 | int page_mkwrite = 0; | 3158 | int page_mkwrite = 0; |
3114 | 3159 | ||
3160 | /* | ||
3161 | * If we do COW later, allocate page befor taking lock_page() | ||
3162 | * on the file cache page. This will reduce lock holding time. | ||
3163 | */ | ||
3164 | if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { | ||
3165 | |||
3166 | if (unlikely(anon_vma_prepare(vma))) | ||
3167 | return VM_FAULT_OOM; | ||
3168 | |||
3169 | cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | ||
3170 | if (!cow_page) | ||
3171 | return VM_FAULT_OOM; | ||
3172 | |||
3173 | if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { | ||
3174 | page_cache_release(cow_page); | ||
3175 | return VM_FAULT_OOM; | ||
3176 | } | ||
3177 | } else | ||
3178 | cow_page = NULL; | ||
3179 | |||
3115 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); | 3180 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); |
3116 | vmf.pgoff = pgoff; | 3181 | vmf.pgoff = pgoff; |
3117 | vmf.flags = flags; | 3182 | vmf.flags = flags; |
@@ -3120,12 +3185,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3120 | ret = vma->vm_ops->fault(vma, &vmf); | 3185 | ret = vma->vm_ops->fault(vma, &vmf); |
3121 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | | 3186 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | |
3122 | VM_FAULT_RETRY))) | 3187 | VM_FAULT_RETRY))) |
3123 | return ret; | 3188 | goto uncharge_out; |
3124 | 3189 | ||
3125 | if (unlikely(PageHWPoison(vmf.page))) { | 3190 | if (unlikely(PageHWPoison(vmf.page))) { |
3126 | if (ret & VM_FAULT_LOCKED) | 3191 | if (ret & VM_FAULT_LOCKED) |
3127 | unlock_page(vmf.page); | 3192 | unlock_page(vmf.page); |
3128 | return VM_FAULT_HWPOISON; | 3193 | ret = VM_FAULT_HWPOISON; |
3194 | goto uncharge_out; | ||
3129 | } | 3195 | } |
3130 | 3196 | ||
3131 | /* | 3197 | /* |
@@ -3143,23 +3209,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3143 | page = vmf.page; | 3209 | page = vmf.page; |
3144 | if (flags & FAULT_FLAG_WRITE) { | 3210 | if (flags & FAULT_FLAG_WRITE) { |
3145 | if (!(vma->vm_flags & VM_SHARED)) { | 3211 | if (!(vma->vm_flags & VM_SHARED)) { |
3212 | page = cow_page; | ||
3146 | anon = 1; | 3213 | anon = 1; |
3147 | if (unlikely(anon_vma_prepare(vma))) { | ||
3148 | ret = VM_FAULT_OOM; | ||
3149 | goto out; | ||
3150 | } | ||
3151 | page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, | ||
3152 | vma, address); | ||
3153 | if (!page) { | ||
3154 | ret = VM_FAULT_OOM; | ||
3155 | goto out; | ||
3156 | } | ||
3157 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { | ||
3158 | ret = VM_FAULT_OOM; | ||
3159 | page_cache_release(page); | ||
3160 | goto out; | ||
3161 | } | ||
3162 | charged = 1; | ||
3163 | copy_user_highpage(page, vmf.page, address, vma); | 3214 | copy_user_highpage(page, vmf.page, address, vma); |
3164 | __SetPageUptodate(page); | 3215 | __SetPageUptodate(page); |
3165 | } else { | 3216 | } else { |
@@ -3228,8 +3279,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3228 | /* no need to invalidate: a not-present page won't be cached */ | 3279 | /* no need to invalidate: a not-present page won't be cached */ |
3229 | update_mmu_cache(vma, address, page_table); | 3280 | update_mmu_cache(vma, address, page_table); |
3230 | } else { | 3281 | } else { |
3231 | if (charged) | 3282 | if (cow_page) |
3232 | mem_cgroup_uncharge_page(page); | 3283 | mem_cgroup_uncharge_page(cow_page); |
3233 | if (anon) | 3284 | if (anon) |
3234 | page_cache_release(page); | 3285 | page_cache_release(page); |
3235 | else | 3286 | else |
@@ -3238,7 +3289,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3238 | 3289 | ||
3239 | pte_unmap_unlock(page_table, ptl); | 3290 | pte_unmap_unlock(page_table, ptl); |
3240 | 3291 | ||
3241 | out: | ||
3242 | if (dirty_page) { | 3292 | if (dirty_page) { |
3243 | struct address_space *mapping = page->mapping; | 3293 | struct address_space *mapping = page->mapping; |
3244 | 3294 | ||
@@ -3268,6 +3318,13 @@ out: | |||
3268 | unwritable_page: | 3318 | unwritable_page: |
3269 | page_cache_release(page); | 3319 | page_cache_release(page); |
3270 | return ret; | 3320 | return ret; |
3321 | uncharge_out: | ||
3322 | /* the fs's fault handler returned an error */ | ||
3323 | if (cow_page) { | ||
3324 | mem_cgroup_uncharge_page(cow_page); | ||
3325 | page_cache_release(cow_page); | ||
3326 | } | ||
3327 | return ret; | ||
3271 | } | 3328 | } |
3272 | 3329 | ||
3273 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3330 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c46887b5a11..6e7d8b21dbf 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -34,6 +34,17 @@ | |||
34 | 34 | ||
35 | #include "internal.h" | 35 | #include "internal.h" |
36 | 36 | ||
37 | /* | ||
38 | * online_page_callback contains a pointer to the current page onlining function. | ||
39 | * Initially it is generic_online_page(). If required, it can be | ||
40 | * changed by calling set_online_page_callback() to register a callback | ||
41 | * and restore_online_page_callback() to restore the generic callback. | ||
42 | */ | ||
43 | |||
44 | static void generic_online_page(struct page *page); | ||
45 | |||
46 | static online_page_callback_t online_page_callback = generic_online_page; | ||
47 | |||
37 | DEFINE_MUTEX(mem_hotplug_mutex); | 48 | DEFINE_MUTEX(mem_hotplug_mutex); |
38 | 49 | ||
39 | void lock_memory_hotplug(void) | 50 | void lock_memory_hotplug(void) |
@@ -361,23 +372,74 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
361 | } | 372 | } |
362 | EXPORT_SYMBOL_GPL(__remove_pages); | 373 | EXPORT_SYMBOL_GPL(__remove_pages); |
363 | 374 | ||
364 | void online_page(struct page *page) | 375 | int set_online_page_callback(online_page_callback_t callback) |
376 | { | ||
377 | int rc = -EINVAL; | ||
378 | |||
379 | lock_memory_hotplug(); | ||
380 | |||
381 | if (online_page_callback == generic_online_page) { | ||
382 | online_page_callback = callback; | ||
383 | rc = 0; | ||
384 | } | ||
385 | |||
386 | unlock_memory_hotplug(); | ||
387 | |||
388 | return rc; | ||
389 | } | ||
390 | EXPORT_SYMBOL_GPL(set_online_page_callback); | ||
391 | |||
392 | int restore_online_page_callback(online_page_callback_t callback) | ||
393 | { | ||
394 | int rc = -EINVAL; | ||
395 | |||
396 | lock_memory_hotplug(); | ||
397 | |||
398 | if (online_page_callback == callback) { | ||
399 | online_page_callback = generic_online_page; | ||
400 | rc = 0; | ||
401 | } | ||
402 | |||
403 | unlock_memory_hotplug(); | ||
404 | |||
405 | return rc; | ||
406 | } | ||
407 | EXPORT_SYMBOL_GPL(restore_online_page_callback); | ||
408 | |||
409 | void __online_page_set_limits(struct page *page) | ||
365 | { | 410 | { |
366 | unsigned long pfn = page_to_pfn(page); | 411 | unsigned long pfn = page_to_pfn(page); |
367 | 412 | ||
368 | totalram_pages++; | ||
369 | if (pfn >= num_physpages) | 413 | if (pfn >= num_physpages) |
370 | num_physpages = pfn + 1; | 414 | num_physpages = pfn + 1; |
415 | } | ||
416 | EXPORT_SYMBOL_GPL(__online_page_set_limits); | ||
417 | |||
418 | void __online_page_increment_counters(struct page *page) | ||
419 | { | ||
420 | totalram_pages++; | ||
371 | 421 | ||
372 | #ifdef CONFIG_HIGHMEM | 422 | #ifdef CONFIG_HIGHMEM |
373 | if (PageHighMem(page)) | 423 | if (PageHighMem(page)) |
374 | totalhigh_pages++; | 424 | totalhigh_pages++; |
375 | #endif | 425 | #endif |
426 | } | ||
427 | EXPORT_SYMBOL_GPL(__online_page_increment_counters); | ||
376 | 428 | ||
429 | void __online_page_free(struct page *page) | ||
430 | { | ||
377 | ClearPageReserved(page); | 431 | ClearPageReserved(page); |
378 | init_page_count(page); | 432 | init_page_count(page); |
379 | __free_page(page); | 433 | __free_page(page); |
380 | } | 434 | } |
435 | EXPORT_SYMBOL_GPL(__online_page_free); | ||
436 | |||
437 | static void generic_online_page(struct page *page) | ||
438 | { | ||
439 | __online_page_set_limits(page); | ||
440 | __online_page_increment_counters(page); | ||
441 | __online_page_free(page); | ||
442 | } | ||
381 | 443 | ||
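A sketch of how a driver (for instance a memory ballooning driver) might use the new hook; the my_* names are hypothetical, while the __online_page_*() helpers and the register/restore calls are the ones exported above.

    /* sketch: capture newly onlined pages instead of freeing them to the allocator */
    static void my_online_page(struct page *page)
    {
        __online_page_set_limits(page);
        my_driver_capture_page(page);          /* hypothetical driver helper */
    }

    static int my_driver_init(void)
    {
        /* fails with -EINVAL if another callback is already registered */
        return set_online_page_callback(&my_online_page);
    }

    static void my_driver_exit(void)
    {
        restore_online_page_callback(&my_online_page);
    }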
382 | static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, | 444 | static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, |
383 | void *arg) | 445 | void *arg) |
@@ -388,7 +450,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, | |||
388 | if (PageReserved(pfn_to_page(start_pfn))) | 450 | if (PageReserved(pfn_to_page(start_pfn))) |
389 | for (i = 0; i < nr_pages; i++) { | 451 | for (i = 0; i < nr_pages; i++) { |
390 | page = pfn_to_page(start_pfn + i); | 452 | page = pfn_to_page(start_pfn + i); |
391 | online_page(page); | 453 | (*online_page_callback)(page); |
392 | onlined_pages++; | 454 | onlined_pages++; |
393 | } | 455 | } |
394 | *(unsigned long *)arg = onlined_pages; | 456 | *(unsigned long *)arg = onlined_pages; |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e7fb9d25c54..2775fd04924 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -93,6 +93,7 @@ | |||
93 | 93 | ||
94 | #include <asm/tlbflush.h> | 94 | #include <asm/tlbflush.h> |
95 | #include <asm/uaccess.h> | 95 | #include <asm/uaccess.h> |
96 | #include <linux/random.h> | ||
96 | 97 | ||
97 | #include "internal.h" | 98 | #include "internal.h" |
98 | 99 | ||
@@ -643,14 +644,22 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
643 | if (!vma || vma->vm_start > start) | 644 | if (!vma || vma->vm_start > start) |
644 | return -EFAULT; | 645 | return -EFAULT; |
645 | 646 | ||
647 | if (start > vma->vm_start) | ||
648 | prev = vma; | ||
649 | |||
646 | for (; vma && vma->vm_start < end; prev = vma, vma = next) { | 650 | for (; vma && vma->vm_start < end; prev = vma, vma = next) { |
647 | next = vma->vm_next; | 651 | next = vma->vm_next; |
648 | vmstart = max(start, vma->vm_start); | 652 | vmstart = max(start, vma->vm_start); |
649 | vmend = min(end, vma->vm_end); | 653 | vmend = min(end, vma->vm_end); |
650 | 654 | ||
651 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | 655 | if (mpol_equal(vma_policy(vma), new_pol)) |
656 | continue; | ||
657 | |||
658 | pgoff = vma->vm_pgoff + | ||
659 | ((vmstart - vma->vm_start) >> PAGE_SHIFT); | ||
652 | prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, | 660 | prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, |
653 | vma->anon_vma, vma->vm_file, pgoff, new_pol); | 661 | vma->anon_vma, vma->vm_file, pgoff, |
662 | new_pol); | ||
654 | if (prev) { | 663 | if (prev) { |
655 | vma = prev; | 664 | vma = prev; |
656 | next = vma->vm_next; | 665 | next = vma->vm_next; |
@@ -1411,7 +1420,9 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy, | |||
1411 | err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); | 1420 | err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); |
1412 | 1421 | ||
1413 | if (!err && nmask) { | 1422 | if (!err && nmask) { |
1414 | err = copy_from_user(bm, nm, alloc_size); | 1423 | unsigned long copy_size; |
1424 | copy_size = min_t(unsigned long, sizeof(bm), alloc_size); | ||
1425 | err = copy_from_user(bm, nm, copy_size); | ||
1415 | /* ensure entire bitmap is zeroed */ | 1426 | /* ensure entire bitmap is zeroed */ |
1416 | err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); | 1427 | err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); |
1417 | err |= compat_put_bitmap(nmask, bm, nr_bits); | 1428 | err |= compat_put_bitmap(nmask, bm, nr_bits); |
@@ -1645,6 +1656,21 @@ static inline unsigned interleave_nid(struct mempolicy *pol, | |||
1645 | return interleave_nodes(pol); | 1656 | return interleave_nodes(pol); |
1646 | } | 1657 | } |
1647 | 1658 | ||
1659 | /* | ||
1660 | * Return the bit number of a random bit set in the nodemask. | ||
1661 | * (returns -1 if nodemask is empty) | ||
1662 | */ | ||
1663 | int node_random(const nodemask_t *maskp) | ||
1664 | { | ||
1665 | int w, bit = -1; | ||
1666 | |||
1667 | w = nodes_weight(*maskp); | ||
1668 | if (w) | ||
1669 | bit = bitmap_ord_to_pos(maskp->bits, | ||
1670 | get_random_int() % w, MAX_NUMNODES); | ||
1671 | return bit; | ||
1672 | } | ||
1673 | |||
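A small sketch of using node_random(); the source nodemask and the fallback to the local node are assumptions for illustration.

    /* sketch: pick a random node with memory, fall back to the local node */
    nodemask_t mask = node_states[N_HIGH_MEMORY];
    int nid = node_random(&mask);

    if (nid < 0)
        nid = numa_node_id();   /* empty nodemask */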
1648 | #ifdef CONFIG_HUGETLBFS | 1674 | #ifdef CONFIG_HUGETLBFS |
1649 | /* | 1675 | /* |
1650 | * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) | 1676 | * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) |
diff --git a/mm/migrate.c b/mm/migrate.c index 666e4e67741..14d0a6a632f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -120,10 +120,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
120 | 120 | ||
121 | ptep = pte_offset_map(pmd, addr); | 121 | ptep = pte_offset_map(pmd, addr); |
122 | 122 | ||
123 | if (!is_swap_pte(*ptep)) { | 123 | /* |
124 | pte_unmap(ptep); | 124 | * Peek to check is_swap_pte() before taking ptlock? No, we |
125 | goto out; | 125 | * can race mremap's move_ptes(), which skips anon_vma lock. |
126 | } | 126 | */ |
127 | 127 | ||
128 | ptl = pte_lockptr(mm, pmd); | 128 | ptl = pte_lockptr(mm, pmd); |
129 | } | 129 | } |
diff --git a/mm/mincore.c b/mm/mincore.c index a4e6b9d75c7..636a86876ff 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -69,12 +69,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) | |||
69 | * file will not get a swp_entry_t in its pte, but rather it is like | 69 | * file will not get a swp_entry_t in its pte, but rather it is like |
70 | * any other file mapping (ie. marked !present and faulted in with | 70 | * any other file mapping (ie. marked !present and faulted in with |
71 | * tmpfs's .fault). So swapped out tmpfs mappings are tested here. | 71 | * tmpfs's .fault). So swapped out tmpfs mappings are tested here. |
72 | * | ||
73 | * However when tmpfs moves the page from pagecache and into swapcache, | ||
74 | * it is still in core, but the find_get_page below won't find it. | ||
75 | * No big deal, but make a note of it. | ||
76 | */ | 72 | */ |
77 | page = find_get_page(mapping, pgoff); | 73 | page = find_get_page(mapping, pgoff); |
74 | #ifdef CONFIG_SWAP | ||
75 | /* shmem/tmpfs may return swap: account for swapcache page too. */ | ||
76 | if (radix_tree_exceptional_entry(page)) { | ||
77 | swp_entry_t swap = radix_to_swp_entry(page); | ||
78 | page = find_get_page(&swapper_space, swap.val); | ||
79 | } | ||
80 | #endif | ||
78 | if (page) { | 81 | if (page) { |
79 | present = PageUptodate(page); | 82 | present = PageUptodate(page); |
80 | page_cache_release(page); | 83 | page_cache_release(page); |
@@ -122,9 +122,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
122 | return 0; | 122 | return 0; |
123 | 123 | ||
124 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 124 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { |
125 | unsigned long n; | 125 | free = global_page_state(NR_FREE_PAGES); |
126 | free += global_page_state(NR_FILE_PAGES); | ||
127 | |||
128 | /* | ||
129 | * shmem pages shouldn't be counted as free in this | ||
130 | * case, they can't be purged, only swapped out, and | ||
131 | * that won't affect the overall amount of available | ||
132 | * memory in the system. | ||
133 | */ | ||
134 | free -= global_page_state(NR_SHMEM); | ||
126 | 135 | ||
127 | free = global_page_state(NR_FILE_PAGES); | ||
128 | free += nr_swap_pages; | 136 | free += nr_swap_pages; |
129 | 137 | ||
130 | /* | 138 | /* |
@@ -136,34 +144,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
136 | free += global_page_state(NR_SLAB_RECLAIMABLE); | 144 | free += global_page_state(NR_SLAB_RECLAIMABLE); |
137 | 145 | ||
138 | /* | 146 | /* |
139 | * Leave the last 3% for root | ||
140 | */ | ||
141 | if (!cap_sys_admin) | ||
142 | free -= free / 32; | ||
143 | |||
144 | if (free > pages) | ||
145 | return 0; | ||
146 | |||
147 | /* | ||
148 | * nr_free_pages() is very expensive on large systems, | ||
149 | * only call if we're about to fail. | ||
150 | */ | ||
151 | n = nr_free_pages(); | ||
152 | |||
153 | /* | ||
154 | * Leave reserved pages. The pages are not for anonymous pages. | 147 | * Leave reserved pages. The pages are not for anonymous pages. |
155 | */ | 148 | */ |
156 | if (n <= totalreserve_pages) | 149 | if (free <= totalreserve_pages) |
157 | goto error; | 150 | goto error; |
158 | else | 151 | else |
159 | n -= totalreserve_pages; | 152 | free -= totalreserve_pages; |
160 | 153 | ||
161 | /* | 154 | /* |
162 | * Leave the last 3% for root | 155 | * Leave the last 3% for root |
163 | */ | 156 | */ |
164 | if (!cap_sys_admin) | 157 | if (!cap_sys_admin) |
165 | n -= n / 32; | 158 | free -= free / 32; |
166 | free += n; | ||
167 | 159 | ||
168 | if (free > pages) | 160 | if (free > pages) |
169 | return 0; | 161 | return 0; |
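To make the reworked OVERCOMMIT_GUESS estimate concrete, a worked example with assumed page counts (all figures hypothetical):

    free  = 10000      /* NR_FREE_PAGES */
          + 50000      /* NR_FILE_PAGES */
          -  8000;     /* NR_SHMEM: not purgeable, so not counted as free */
    free += 20000      /* nr_swap_pages */
          +  3000;     /* NR_SLAB_RECLAIMABLE */
    free -=  2000;     /* totalreserve_pages */
    /* = 73000; non-root callers then keep ~3% back: 73000 - 73000/32 = 70719 */
    /* the request is allowed when free > pages */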
diff --git a/mm/nommu.c b/mm/nommu.c index 9edc897a397..4358032566e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -22,7 +22,6 @@ | |||
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/vmalloc.h> | 24 | #include <linux/vmalloc.h> |
25 | #include <linux/tracehook.h> | ||
26 | #include <linux/blkdev.h> | 25 | #include <linux/blkdev.h> |
27 | #include <linux/backing-dev.h> | 26 | #include <linux/backing-dev.h> |
28 | #include <linux/mount.h> | 27 | #include <linux/mount.h> |
@@ -1087,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file, | |||
1087 | * it's being traced - otherwise breakpoints set in it may interfere | 1086 | * it's being traced - otherwise breakpoints set in it may interfere |
1088 | * with another untraced process | 1087 | * with another untraced process |
1089 | */ | 1088 | */ |
1090 | if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) | 1089 | if ((flags & MAP_PRIVATE) && current->ptrace) |
1091 | vm_flags &= ~VM_MAYSHARE; | 1090 | vm_flags &= ~VM_MAYSHARE; |
1092 | 1091 | ||
1093 | return vm_flags; | 1092 | return vm_flags; |
@@ -1885,9 +1884,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
1885 | return 0; | 1884 | return 0; |
1886 | 1885 | ||
1887 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 1886 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { |
1888 | unsigned long n; | 1887 | free = global_page_state(NR_FREE_PAGES); |
1888 | free += global_page_state(NR_FILE_PAGES); | ||
1889 | |||
1890 | /* | ||
1891 | * shmem pages shouldn't be counted as free in this | ||
1892 | * case, they can't be purged, only swapped out, and | ||
1893 | * that won't affect the overall amount of available | ||
1894 | * memory in the system. | ||
1895 | */ | ||
1896 | free -= global_page_state(NR_SHMEM); | ||
1889 | 1897 | ||
1890 | free = global_page_state(NR_FILE_PAGES); | ||
1891 | free += nr_swap_pages; | 1898 | free += nr_swap_pages; |
1892 | 1899 | ||
1893 | /* | 1900 | /* |
@@ -1899,34 +1906,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
1899 | free += global_page_state(NR_SLAB_RECLAIMABLE); | 1906 | free += global_page_state(NR_SLAB_RECLAIMABLE); |
1900 | 1907 | ||
1901 | /* | 1908 | /* |
1902 | * Leave the last 3% for root | ||
1903 | */ | ||
1904 | if (!cap_sys_admin) | ||
1905 | free -= free / 32; | ||
1906 | |||
1907 | if (free > pages) | ||
1908 | return 0; | ||
1909 | |||
1910 | /* | ||
1911 | * nr_free_pages() is very expensive on large systems, | ||
1912 | * only call if we're about to fail. | ||
1913 | */ | ||
1914 | n = nr_free_pages(); | ||
1915 | |||
1916 | /* | ||
1917 | * Leave reserved pages. The pages are not for anonymous pages. | 1909 | * Leave reserved pages. The pages are not for anonymous pages. |
1918 | */ | 1910 | */ |
1919 | if (n <= totalreserve_pages) | 1911 | if (free <= totalreserve_pages) |
1920 | goto error; | 1912 | goto error; |
1921 | else | 1913 | else |
1922 | n -= totalreserve_pages; | 1914 | free -= totalreserve_pages; |
1923 | 1915 | ||
1924 | /* | 1916 | /* |
1925 | * Leave the last 3% for root | 1917 | * Leave the last 3% for root |
1926 | */ | 1918 | */ |
1927 | if (!cap_sys_admin) | 1919 | if (!cap_sys_admin) |
1928 | n -= n / 32; | 1920 | free -= free / 32; |
1929 | free += n; | ||
1930 | 1921 | ||
1931 | if (free > pages) | 1922 | if (free > pages) |
1932 | return 0; | 1923 | return 0; |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e4b0991ca35..e9a17857a20 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -162,7 +162,7 @@ static bool oom_unkillable_task(struct task_struct *p, | |||
162 | unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, | 162 | unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, |
163 | const nodemask_t *nodemask, unsigned long totalpages) | 163 | const nodemask_t *nodemask, unsigned long totalpages) |
164 | { | 164 | { |
165 | int points; | 165 | long points; |
166 | 166 | ||
167 | if (oom_unkillable_task(p, mem, nodemask)) | 167 | if (oom_unkillable_task(p, mem, nodemask)) |
168 | return 0; | 168 | return 0; |
@@ -303,7 +303,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
303 | do_each_thread(g, p) { | 303 | do_each_thread(g, p) { |
304 | unsigned int points; | 304 | unsigned int points; |
305 | 305 | ||
306 | if (!p->mm) | 306 | if (p->exit_state) |
307 | continue; | 307 | continue; |
308 | if (oom_unkillable_task(p, mem, nodemask)) | 308 | if (oom_unkillable_task(p, mem, nodemask)) |
309 | continue; | 309 | continue; |
@@ -319,6 +319,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
319 | */ | 319 | */ |
320 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) | 320 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) |
321 | return ERR_PTR(-1UL); | 321 | return ERR_PTR(-1UL); |
322 | if (!p->mm) | ||
323 | continue; | ||
322 | 324 | ||
323 | if (p->flags & PF_EXITING) { | 325 | if (p->flags & PF_EXITING) { |
324 | /* | 326 | /* |
@@ -339,8 +341,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
339 | * then wait for it to finish before killing | 341 | * then wait for it to finish before killing |
340 | * some other task unnecessarily. | 342 | * some other task unnecessarily. |
341 | */ | 343 | */ |
342 | if (!(task_ptrace(p->group_leader) & | 344 | if (!(p->group_leader->ptrace & PT_TRACE_EXIT)) |
343 | PT_TRACE_EXIT)) | ||
344 | return ERR_PTR(-1UL); | 345 | return ERR_PTR(-1UL); |
345 | } | 346 | } |
346 | } | 347 | } |
@@ -488,7 +489,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
488 | 489 | ||
489 | /* | 490 | /* |
490 | * If any of p's children has a different mm and is eligible for kill, | 491 | * If any of p's children has a different mm and is eligible for kill, |
491 | * the one with the highest badness() score is sacrificed for its | 492 | * the one with the highest oom_badness() score is sacrificed for its |
492 | * parent. This attempts to lose the minimal amount of work done while | 493 | * parent. This attempts to lose the minimal amount of work done while |
493 | * still freeing memory. | 494 | * still freeing memory. |
494 | */ | 495 | */ |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 31f69886242..0e309cd1b5b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -37,6 +37,16 @@ | |||
37 | #include <trace/events/writeback.h> | 37 | #include <trace/events/writeback.h> |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * Sleep at most 200ms at a time in balance_dirty_pages(). | ||
41 | */ | ||
42 | #define MAX_PAUSE max(HZ/5, 1) | ||
43 | |||
44 | /* | ||
45 | * Estimate write bandwidth at 200ms intervals. | ||
46 | */ | ||
47 | #define BANDWIDTH_INTERVAL max(HZ/5, 1) | ||
48 | |||
49 | /* | ||
40 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited | 50 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited |
41 | * will look to see if it needs to force writeback or throttling. | 51 | * will look to see if it needs to force writeback or throttling. |
42 | */ | 52 | */ |
@@ -111,6 +121,7 @@ EXPORT_SYMBOL(laptop_mode); | |||
111 | 121 | ||
112 | /* End of sysctl-exported parameters */ | 122 | /* End of sysctl-exported parameters */ |
113 | 123 | ||
124 | unsigned long global_dirty_limit; | ||
114 | 125 | ||
115 | /* | 126 | /* |
116 | * Scale the writeback cache size proportional to the relative writeout speeds. | 127 | * Scale the writeback cache size proportional to the relative writeout speeds. |
@@ -219,6 +230,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write, | |||
219 | */ | 230 | */ |
220 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) | 231 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) |
221 | { | 232 | { |
233 | __inc_bdi_stat(bdi, BDI_WRITTEN); | ||
222 | __prop_inc_percpu_max(&vm_completions, &bdi->completions, | 234 | __prop_inc_percpu_max(&vm_completions, &bdi->completions, |
223 | bdi->max_prop_frac); | 235 | bdi->max_prop_frac); |
224 | } | 236 | } |
@@ -244,13 +256,8 @@ void task_dirty_inc(struct task_struct *tsk) | |||
244 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, | 256 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, |
245 | long *numerator, long *denominator) | 257 | long *numerator, long *denominator) |
246 | { | 258 | { |
247 | if (bdi_cap_writeback_dirty(bdi)) { | 259 | prop_fraction_percpu(&vm_completions, &bdi->completions, |
248 | prop_fraction_percpu(&vm_completions, &bdi->completions, | ||
249 | numerator, denominator); | 260 | numerator, denominator); |
250 | } else { | ||
251 | *numerator = 0; | ||
252 | *denominator = 1; | ||
253 | } | ||
254 | } | 261 | } |
255 | 262 | ||
256 | static inline void task_dirties_fraction(struct task_struct *tsk, | 263 | static inline void task_dirties_fraction(struct task_struct *tsk, |
@@ -274,12 +281,13 @@ static inline void task_dirties_fraction(struct task_struct *tsk, | |||
274 | * effectively curb the growth of dirty pages. Light dirtiers with high enough | 281 | * effectively curb the growth of dirty pages. Light dirtiers with high enough |
275 | * dirty threshold may never get throttled. | 282 | * dirty threshold may never get throttled. |
276 | */ | 283 | */ |
284 | #define TASK_LIMIT_FRACTION 8 | ||
277 | static unsigned long task_dirty_limit(struct task_struct *tsk, | 285 | static unsigned long task_dirty_limit(struct task_struct *tsk, |
278 | unsigned long bdi_dirty) | 286 | unsigned long bdi_dirty) |
279 | { | 287 | { |
280 | long numerator, denominator; | 288 | long numerator, denominator; |
281 | unsigned long dirty = bdi_dirty; | 289 | unsigned long dirty = bdi_dirty; |
282 | u64 inv = dirty >> 3; | 290 | u64 inv = dirty / TASK_LIMIT_FRACTION; |
283 | 291 | ||
284 | task_dirties_fraction(tsk, &numerator, &denominator); | 292 | task_dirties_fraction(tsk, &numerator, &denominator); |
285 | inv *= numerator; | 293 | inv *= numerator; |
@@ -290,6 +298,12 @@ static unsigned long task_dirty_limit(struct task_struct *tsk, | |||
290 | return max(dirty, bdi_dirty/2); | 298 | return max(dirty, bdi_dirty/2); |
291 | } | 299 | } |
292 | 300 | ||
301 | /* Minimum limit for any task */ | ||
302 | static unsigned long task_min_dirty_limit(unsigned long bdi_dirty) | ||
303 | { | ||
304 | return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION; | ||
305 | } | ||
306 | |||
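A worked example of the TASK_LIMIT_FRACTION bound, assuming bdi_dirty = 8000 pages:

    unsigned long bdi_dirty = 8000;                                         /* assumed */
    unsigned long min_limit = bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;  /* 7000 */
    /* a task responsible for all recent dirtying is throttled near 7000,   */
    /* while a task that dirtied nothing keeps the full 8000.               */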
293 | /* | 307 | /* |
294 | * | 308 | * |
295 | */ | 309 | */ |
@@ -397,6 +411,11 @@ unsigned long determine_dirtyable_memory(void) | |||
397 | return x + 1; /* Ensure that we never return 0 */ | 411 | return x + 1; /* Ensure that we never return 0 */ |
398 | } | 412 | } |
399 | 413 | ||
414 | static unsigned long hard_dirty_limit(unsigned long thresh) | ||
415 | { | ||
416 | return max(thresh, global_dirty_limit); | ||
417 | } | ||
418 | |||
400 | /* | 419 | /* |
401 | * global_dirty_limits - background-writeback and dirty-throttling thresholds | 420 | * global_dirty_limits - background-writeback and dirty-throttling thresholds |
402 | * | 421 | * |
@@ -435,12 +454,20 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | |||
435 | } | 454 | } |
436 | *pbackground = background; | 455 | *pbackground = background; |
437 | *pdirty = dirty; | 456 | *pdirty = dirty; |
457 | trace_global_dirty_state(background, dirty); | ||
438 | } | 458 | } |
439 | 459 | ||
440 | /* | 460 | /** |
441 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold | 461 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold |
462 | * @bdi: the backing_dev_info to query | ||
463 | * @dirty: global dirty limit in pages | ||
442 | * | 464 | * |
443 | * Allocate high/low dirty limits to fast/slow devices, in order to prevent | 465 | * Returns @bdi's dirty limit in pages. The term "dirty" in the context of |
466 | * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. | ||
467 | * And the "limit" in the name is not taken as a hard limit by | ||
468 | * balance_dirty_pages(). | ||
469 | * | ||
470 | * It allocates high/low dirty limits to fast/slow devices, in order to prevent | ||
444 | * - starving fast devices | 471 | * - starving fast devices |
445 | * - piling up dirty pages (that will take long time to sync) on slow devices | 472 | * - piling up dirty pages (that will take long time to sync) on slow devices |
446 | * | 473 | * |
@@ -468,6 +495,153 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) | |||
468 | return bdi_dirty; | 495 | return bdi_dirty; |
469 | } | 496 | } |
470 | 497 | ||
498 | static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, | ||
499 | unsigned long elapsed, | ||
500 | unsigned long written) | ||
501 | { | ||
502 | const unsigned long period = roundup_pow_of_two(3 * HZ); | ||
503 | unsigned long avg = bdi->avg_write_bandwidth; | ||
504 | unsigned long old = bdi->write_bandwidth; | ||
505 | u64 bw; | ||
506 | |||
507 | /* | ||
508 | * bw = written * HZ / elapsed | ||
509 | * | ||
510 | * bw * elapsed + write_bandwidth * (period - elapsed) | ||
511 | * write_bandwidth = --------------------------------------------------- | ||
512 | * period | ||
513 | */ | ||
514 | bw = written - bdi->written_stamp; | ||
515 | bw *= HZ; | ||
516 | if (unlikely(elapsed > period)) { | ||
517 | do_div(bw, elapsed); | ||
518 | avg = bw; | ||
519 | goto out; | ||
520 | } | ||
521 | bw += (u64)bdi->write_bandwidth * (period - elapsed); | ||
522 | bw >>= ilog2(period); | ||
523 | |||
524 | /* | ||
525 | * one more level of smoothing, for filtering out sudden spikes | ||
526 | */ | ||
527 | if (avg > old && old >= (unsigned long)bw) | ||
528 | avg -= (avg - old) >> 3; | ||
529 | |||
530 | if (avg < old && old <= (unsigned long)bw) | ||
531 | avg += (old - avg) >> 3; | ||
532 | |||
533 | out: | ||
534 | bdi->write_bandwidth = bw; | ||
535 | bdi->avg_write_bandwidth = avg; | ||
536 | } | ||
537 | |||
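A worked example of the smoothing above, with assumed values: HZ = 1000, so period = roundup_pow_of_two(3 * HZ) = 4096; elapsed = 200 jiffies; 2000 pages written since the last sample; previous write_bandwidth = 8000 pages/s.

    bw              = 2000 * 1000 / 200                        = 10000 pages/s
    write_bandwidth = (2000*1000 + 8000*(4096 - 200)) / 4096   ≈  8097 pages/s

The estimate therefore moves only a small step toward the new rate each interval, and the extra avg pass damps sudden spikes further.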
538 | /* | ||
539 | * The global dirtyable memory and dirty threshold could be suddenly knocked | ||
540 | * down by a large amount (eg. on the startup of KVM in a swapless system). | ||
541 | * This may throw the system into deep dirty exceeded state and throttle | ||
542 | * heavy/light dirtiers alike. To retain good responsiveness, maintain | ||
543 | * global_dirty_limit, which tracks slowly down toward the knocked-down dirty | ||
544 | * threshold. | ||
545 | */ | ||
546 | static void update_dirty_limit(unsigned long thresh, unsigned long dirty) | ||
547 | { | ||
548 | unsigned long limit = global_dirty_limit; | ||
549 | |||
550 | /* | ||
551 | * Follow up in one step. | ||
552 | */ | ||
553 | if (limit < thresh) { | ||
554 | limit = thresh; | ||
555 | goto update; | ||
556 | } | ||
557 | |||
558 | /* | ||
559 | * Follow down slowly. Use the higher one as the target, because thresh | ||
560 | * may drop below dirty. This is exactly the reason to introduce | ||
561 | * global_dirty_limit which is guaranteed to lie above the dirty pages. | ||
562 | */ | ||
563 | thresh = max(thresh, dirty); | ||
564 | if (limit > thresh) { | ||
565 | limit -= (limit - thresh) >> 5; | ||
566 | goto update; | ||
567 | } | ||
568 | return; | ||
569 | update: | ||
570 | global_dirty_limit = limit; | ||
571 | } | ||
572 | |||
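A worked example of the slow follow-down, with assumed numbers: global_dirty_limit = 40000 pages, a new thresh of 10000 and dirty = 12000.

    target = max(thresh, dirty)      = 12000
    step   = (40000 - 12000) >> 5    = 875
    limit  = 40000 - 875             = 39125   (next interval ≈ 38278, and so on)

The limit thus decays geometrically toward the knocked-down threshold rather than dropping in one step.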
573 | static void global_update_bandwidth(unsigned long thresh, | ||
574 | unsigned long dirty, | ||
575 | unsigned long now) | ||
576 | { | ||
577 | static DEFINE_SPINLOCK(dirty_lock); | ||
578 | static unsigned long update_time; | ||
579 | |||
580 | /* | ||
581 | * check locklessly first to optimize away locking for the most time | ||
582 | * check locklessly first to optimize away locking most of the time | ||
583 | if (time_before(now, update_time + BANDWIDTH_INTERVAL)) | ||
584 | return; | ||
585 | |||
586 | spin_lock(&dirty_lock); | ||
587 | if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { | ||
588 | update_dirty_limit(thresh, dirty); | ||
589 | update_time = now; | ||
590 | } | ||
591 | spin_unlock(&dirty_lock); | ||
592 | } | ||
593 | |||
594 | void __bdi_update_bandwidth(struct backing_dev_info *bdi, | ||
595 | unsigned long thresh, | ||
596 | unsigned long dirty, | ||
597 | unsigned long bdi_thresh, | ||
598 | unsigned long bdi_dirty, | ||
599 | unsigned long start_time) | ||
600 | { | ||
601 | unsigned long now = jiffies; | ||
602 | unsigned long elapsed = now - bdi->bw_time_stamp; | ||
603 | unsigned long written; | ||
604 | |||
605 | /* | ||
606 | * rate-limit, only update once every 200ms. | ||
607 | */ | ||
608 | if (elapsed < BANDWIDTH_INTERVAL) | ||
609 | return; | ||
610 | |||
611 | written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); | ||
612 | |||
613 | /* | ||
614 | * Skip quiet periods when disk bandwidth is under-utilized. | ||
615 | * (at least 1s idle time between two flusher runs) | ||
616 | */ | ||
617 | if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) | ||
618 | goto snapshot; | ||
619 | |||
620 | if (thresh) | ||
621 | global_update_bandwidth(thresh, dirty, now); | ||
622 | |||
623 | bdi_update_write_bandwidth(bdi, elapsed, written); | ||
624 | |||
625 | snapshot: | ||
626 | bdi->written_stamp = written; | ||
627 | bdi->bw_time_stamp = now; | ||
628 | } | ||
629 | |||
630 | static void bdi_update_bandwidth(struct backing_dev_info *bdi, | ||
631 | unsigned long thresh, | ||
632 | unsigned long dirty, | ||
633 | unsigned long bdi_thresh, | ||
634 | unsigned long bdi_dirty, | ||
635 | unsigned long start_time) | ||
636 | { | ||
637 | if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) | ||
638 | return; | ||
639 | spin_lock(&bdi->wb.list_lock); | ||
640 | __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty, | ||
641 | start_time); | ||
642 | spin_unlock(&bdi->wb.list_lock); | ||
643 | } | ||
644 | |||
471 | /* | 645 | /* |
472 | * balance_dirty_pages() must be called by processes which are generating dirty | 646 | * balance_dirty_pages() must be called by processes which are generating dirty |
473 | * data. It looks at the number of dirty pages in the machine and will force | 647 | * data. It looks at the number of dirty pages in the machine and will force |
@@ -478,27 +652,25 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) | |||
478 | static void balance_dirty_pages(struct address_space *mapping, | 652 | static void balance_dirty_pages(struct address_space *mapping, |
479 | unsigned long write_chunk) | 653 | unsigned long write_chunk) |
480 | { | 654 | { |
481 | long nr_reclaimable, bdi_nr_reclaimable; | 655 | unsigned long nr_reclaimable, bdi_nr_reclaimable; |
482 | long nr_writeback, bdi_nr_writeback; | 656 | unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ |
657 | unsigned long bdi_dirty; | ||
483 | unsigned long background_thresh; | 658 | unsigned long background_thresh; |
484 | unsigned long dirty_thresh; | 659 | unsigned long dirty_thresh; |
485 | unsigned long bdi_thresh; | 660 | unsigned long bdi_thresh; |
661 | unsigned long task_bdi_thresh; | ||
662 | unsigned long min_task_bdi_thresh; | ||
486 | unsigned long pages_written = 0; | 663 | unsigned long pages_written = 0; |
487 | unsigned long pause = 1; | 664 | unsigned long pause = 1; |
488 | bool dirty_exceeded = false; | 665 | bool dirty_exceeded = false; |
666 | bool clear_dirty_exceeded = true; | ||
489 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 667 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
668 | unsigned long start_time = jiffies; | ||
490 | 669 | ||
491 | for (;;) { | 670 | for (;;) { |
492 | struct writeback_control wbc = { | ||
493 | .sync_mode = WB_SYNC_NONE, | ||
494 | .older_than_this = NULL, | ||
495 | .nr_to_write = write_chunk, | ||
496 | .range_cyclic = 1, | ||
497 | }; | ||
498 | |||
499 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | 671 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + |
500 | global_page_state(NR_UNSTABLE_NFS); | 672 | global_page_state(NR_UNSTABLE_NFS); |
501 | nr_writeback = global_page_state(NR_WRITEBACK); | 673 | nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); |
502 | 674 | ||
503 | global_dirty_limits(&background_thresh, &dirty_thresh); | 675 | global_dirty_limits(&background_thresh, &dirty_thresh); |
504 | 676 | ||
@@ -507,12 +679,12 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
507 | * catch-up. This avoids (excessively) small writeouts | 679 | * catch-up. This avoids (excessively) small writeouts |
508 | * when the bdi limits are ramping up. | 680 | * when the bdi limits are ramping up. |
509 | */ | 681 | */ |
510 | if (nr_reclaimable + nr_writeback <= | 682 | if (nr_dirty <= (background_thresh + dirty_thresh) / 2) |
511 | (background_thresh + dirty_thresh) / 2) | ||
512 | break; | 683 | break; |
513 | 684 | ||
514 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | 685 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
515 | bdi_thresh = task_dirty_limit(current, bdi_thresh); | 686 | min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh); |
687 | task_bdi_thresh = task_dirty_limit(current, bdi_thresh); | ||
516 | 688 | ||
517 | /* | 689 | /* |
518 | * In order to avoid the stacked BDI deadlock we need | 690 | * In order to avoid the stacked BDI deadlock we need |
@@ -524,12 +696,14 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
524 | * actually dirty; with m+n sitting in the percpu | 696 | * actually dirty; with m+n sitting in the percpu |
525 | * deltas. | 697 | * deltas. |
526 | */ | 698 | */ |
527 | if (bdi_thresh < 2*bdi_stat_error(bdi)) { | 699 | if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) { |
528 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); | 700 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); |
529 | bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); | 701 | bdi_dirty = bdi_nr_reclaimable + |
702 | bdi_stat_sum(bdi, BDI_WRITEBACK); | ||
530 | } else { | 703 | } else { |
531 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); | 704 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); |
532 | bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); | 705 | bdi_dirty = bdi_nr_reclaimable + |
706 | bdi_stat(bdi, BDI_WRITEBACK); | ||
533 | } | 707 | } |
534 | 708 | ||
535 | /* | 709 | /* |
@@ -538,9 +712,10 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
538 | * bdi or process from holding back light ones; The latter is | 712 | * bdi or process from holding back light ones; The latter is |
539 | * the last resort safeguard. | 713 | * the last resort safeguard. |
540 | */ | 714 | */ |
541 | dirty_exceeded = | 715 | dirty_exceeded = (bdi_dirty > task_bdi_thresh) || |
542 | (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) | 716 | (nr_dirty > dirty_thresh); |
543 | || (nr_reclaimable + nr_writeback > dirty_thresh); | 717 | clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) && |
718 | (nr_dirty <= dirty_thresh); | ||
544 | 719 | ||
545 | if (!dirty_exceeded) | 720 | if (!dirty_exceeded) |
546 | break; | 721 | break; |
@@ -548,6 +723,9 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
548 | if (!bdi->dirty_exceeded) | 723 | if (!bdi->dirty_exceeded) |
549 | bdi->dirty_exceeded = 1; | 724 | bdi->dirty_exceeded = 1; |
550 | 725 | ||
726 | bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, | ||
727 | bdi_thresh, bdi_dirty, start_time); | ||
728 | |||
551 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. | 729 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. |
552 | * Unstable writes are a feature of certain networked | 730 | * Unstable writes are a feature of certain networked |
553 | * filesystems (i.e. NFS) in which data may have been | 731 | * filesystems (i.e. NFS) in which data may have been |
@@ -557,17 +735,29 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
557 | * threshold otherwise wait until the disk writes catch | 735 | * threshold otherwise wait until the disk writes catch |
558 | * up. | 736 | * up. |
559 | */ | 737 | */ |
560 | trace_wbc_balance_dirty_start(&wbc, bdi); | 738 | trace_balance_dirty_start(bdi); |
561 | if (bdi_nr_reclaimable > bdi_thresh) { | 739 | if (bdi_nr_reclaimable > task_bdi_thresh) { |
562 | writeback_inodes_wb(&bdi->wb, &wbc); | 740 | pages_written += writeback_inodes_wb(&bdi->wb, |
563 | pages_written += write_chunk - wbc.nr_to_write; | 741 | write_chunk); |
564 | trace_wbc_balance_dirty_written(&wbc, bdi); | 742 | trace_balance_dirty_written(bdi, pages_written); |
565 | if (pages_written >= write_chunk) | 743 | if (pages_written >= write_chunk) |
566 | break; /* We've done our duty */ | 744 | break; /* We've done our duty */ |
567 | } | 745 | } |
568 | trace_wbc_balance_dirty_wait(&wbc, bdi); | ||
569 | __set_current_state(TASK_UNINTERRUPTIBLE); | 746 | __set_current_state(TASK_UNINTERRUPTIBLE); |
570 | io_schedule_timeout(pause); | 747 | io_schedule_timeout(pause); |
748 | trace_balance_dirty_wait(bdi); | ||
749 | |||
750 | dirty_thresh = hard_dirty_limit(dirty_thresh); | ||
751 | /* | ||
752 | * max-pause area. If dirty exceeded but still within this | ||
753 | * area, no need to sleep for more than 200ms: (a) 8 pages per | ||
754 | * 200ms is typically more than enough to curb heavy dirtiers; | ||
755 | * (b) the pause time limit makes the dirtiers more responsive. | ||
756 | */ | ||
757 | if (nr_dirty < dirty_thresh && | ||
758 | bdi_dirty < (task_bdi_thresh + bdi_thresh) / 2 && | ||
759 | time_after(jiffies, start_time + MAX_PAUSE)) | ||
760 | break; | ||
571 | 761 | ||
572 | /* | 762 | /* |
573 | * Increase the delay for each loop, up to our previous | 763 | * Increase the delay for each loop, up to our previous |
@@ -578,7 +768,8 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
578 | pause = HZ / 10; | 768 | pause = HZ / 10; |
579 | } | 769 | } |
580 | 770 | ||
581 | if (!dirty_exceeded && bdi->dirty_exceeded) | 771 | /* Clear dirty_exceeded flag only when no task can exceed the limit */ |
772 | if (clear_dirty_exceeded && bdi->dirty_exceeded) | ||
582 | bdi->dirty_exceeded = 0; | 773 | bdi->dirty_exceeded = 0; |
583 | 774 | ||
584 | if (writeback_in_progress(bdi)) | 775 | if (writeback_in_progress(bdi)) |
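Two details of the rewritten loop are easy to miss: throttling now starts when the per-task threshold (task_bdi_thresh) is crossed, but bdi->dirty_exceeded is only cleared once even the lowest per-task threshold (min_task_bdi_thresh) is satisfied, giving the exceeded state some hysteresis; and the max-pause test caps how long a dirtier is held once the counts are back under the hard limits. A sketch of the two predicates with hypothetical page counts:

#include <stdbool.h>
#include <stdio.h>

struct dirty_state {
    unsigned long nr_dirty, dirty_thresh;
    unsigned long bdi_dirty, task_bdi_thresh, min_task_bdi_thresh;
};

static bool dirty_exceeded(const struct dirty_state *s)
{
    return s->bdi_dirty > s->task_bdi_thresh || s->nr_dirty > s->dirty_thresh;
}

static bool clear_dirty_exceeded(const struct dirty_state *s)
{
    /* only clear once no task, however small its threshold, is over it */
    return s->bdi_dirty <= s->min_task_bdi_thresh &&
           s->nr_dirty <= s->dirty_thresh;
}

int main(void)
{
    struct dirty_state s = {
        .nr_dirty = 900, .dirty_thresh = 1000,
        .bdi_dirty = 520, .task_bdi_thresh = 500,
        .min_task_bdi_thresh = 480,
    };

    printf("exceeded=%d clear=%d\n", dirty_exceeded(&s), clear_dirty_exceeded(&s));

    s.bdi_dirty = 490;  /* below the task threshold but not the minimum one */
    printf("exceeded=%d clear=%d\n", dirty_exceeded(&s), clear_dirty_exceeded(&s));
    return 0;
}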
@@ -626,9 +817,13 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; | |||
626 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | 817 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, |
627 | unsigned long nr_pages_dirtied) | 818 | unsigned long nr_pages_dirtied) |
628 | { | 819 | { |
820 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
629 | unsigned long ratelimit; | 821 | unsigned long ratelimit; |
630 | unsigned long *p; | 822 | unsigned long *p; |
631 | 823 | ||
824 | if (!bdi_cap_account_dirty(bdi)) | ||
825 | return; | ||
826 | |||
632 | ratelimit = ratelimit_pages; | 827 | ratelimit = ratelimit_pages; |
633 | if (mapping->backing_dev_info->dirty_exceeded) | 828 | if (mapping->backing_dev_info->dirty_exceeded) |
634 | ratelimit = 8; | 829 | ratelimit = 8; |
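balance_dirty_pages_ratelimited_nr() accumulates recently dirtied pages in a per-CPU counter and only enters the expensive balancing path once that counter reaches ratelimit_pages (dropped to 8 while the bdi is over its limit); the new early return skips all of this for backing devices that do not account dirty pages. A single-threaded sketch of the accumulate-then-balance idea (the per-CPU counter is collapsed to one variable, values are illustrative):

#include <stdio.h>

static unsigned long ratelimit_pages = 32;  /* illustrative value */
static unsigned long pending;               /* stands in for the per-CPU counter */
static int dirty_exceeded;                  /* stands in for bdi->dirty_exceeded */

static void balance_dirty_pages(unsigned long write_chunk)
{
    printf("balancing, write_chunk=%lu\n", write_chunk);
}

static void balance_dirty_pages_ratelimited_nr(unsigned long nr_dirtied)
{
    unsigned long ratelimit = ratelimit_pages;

    if (dirty_exceeded)
        ratelimit = 8;      /* throttle much sooner when over the limit */

    pending += nr_dirtied;
    if (pending >= ratelimit) {
        unsigned long chunk = pending;

        pending = 0;
        balance_dirty_pages(chunk);
    }
}

int main(void)
{
    int i;

    for (i = 0; i < 40; i++)
        balance_dirty_pages_ratelimited_nr(1);  /* balances after 32 pages */
    return 0;
}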
@@ -892,12 +1087,12 @@ int write_cache_pages(struct address_space *mapping, | |||
892 | range_whole = 1; | 1087 | range_whole = 1; |
893 | cycled = 1; /* ignore range_cyclic tests */ | 1088 | cycled = 1; /* ignore range_cyclic tests */ |
894 | } | 1089 | } |
895 | if (wbc->sync_mode == WB_SYNC_ALL) | 1090 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
896 | tag = PAGECACHE_TAG_TOWRITE; | 1091 | tag = PAGECACHE_TAG_TOWRITE; |
897 | else | 1092 | else |
898 | tag = PAGECACHE_TAG_DIRTY; | 1093 | tag = PAGECACHE_TAG_DIRTY; |
899 | retry: | 1094 | retry: |
900 | if (wbc->sync_mode == WB_SYNC_ALL) | 1095 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
901 | tag_pages_for_writeback(mapping, index, end); | 1096 | tag_pages_for_writeback(mapping, index, end); |
902 | done_index = index; | 1097 | done_index = index; |
903 | while (!done && (index <= end)) { | 1098 | while (!done && (index <= end)) { |
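Tagging the pages to be written before the pass starts (PAGECACHE_TAG_TOWRITE) freezes the work set, so pages dirtied while the pass runs cannot extend it forever; the hunk above extends that treatment from WB_SYNC_ALL to tagged_writepages writeback. A toy illustration of snapshotting the work set before processing it (plain arrays stand in for the radix-tree tags):

#include <stdbool.h>
#include <stdio.h>

#define NR_PAGES 8

static bool dirty[NR_PAGES];
static bool towrite[NR_PAGES];  /* stands in for PAGECACHE_TAG_TOWRITE */

static void tag_pages_for_writeback(void)
{
    int i;

    for (i = 0; i < NR_PAGES; i++)
        towrite[i] = dirty[i];  /* snapshot the current work set */
}

int main(void)
{
    int i;

    dirty[0] = dirty[1] = dirty[2] = true;
    tag_pages_for_writeback;    /* see call below */
    tag_pages_for_writeback();

    for (i = 0; i < NR_PAGES; i++) {
        if (!towrite[i])
            continue;
        dirty[i] = towrite[i] = false;
        printf("wrote page %d\n", i);
        dirty[5] = true;        /* concurrently redirtied page: not in this pass */
    }
    return 0;
}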
@@ -1141,7 +1336,6 @@ EXPORT_SYMBOL(account_page_dirtied); | |||
1141 | void account_page_writeback(struct page *page) | 1336 | void account_page_writeback(struct page *page) |
1142 | { | 1337 | { |
1143 | inc_zone_page_state(page, NR_WRITEBACK); | 1338 | inc_zone_page_state(page, NR_WRITEBACK); |
1144 | inc_zone_page_state(page, NR_WRITTEN); | ||
1145 | } | 1339 | } |
1146 | EXPORT_SYMBOL(account_page_writeback); | 1340 | EXPORT_SYMBOL(account_page_writeback); |
1147 | 1341 | ||
@@ -1358,8 +1552,10 @@ int test_clear_page_writeback(struct page *page) | |||
1358 | } else { | 1552 | } else { |
1359 | ret = TestClearPageWriteback(page); | 1553 | ret = TestClearPageWriteback(page); |
1360 | } | 1554 | } |
1361 | if (ret) | 1555 | if (ret) { |
1362 | dec_zone_page_state(page, NR_WRITEBACK); | 1556 | dec_zone_page_state(page, NR_WRITEBACK); |
1557 | inc_zone_page_state(page, NR_WRITTEN); | ||
1558 | } | ||
1363 | return ret; | 1559 | return ret; |
1364 | } | 1560 | } |
1365 | 1561 | ||
@@ -1405,10 +1601,6 @@ EXPORT_SYMBOL(test_set_page_writeback); | |||
1405 | */ | 1601 | */ |
1406 | int mapping_tagged(struct address_space *mapping, int tag) | 1602 | int mapping_tagged(struct address_space *mapping, int tag) |
1407 | { | 1603 | { |
1408 | int ret; | 1604 | return radix_tree_tagged(&mapping->page_tree, tag); |
1409 | rcu_read_lock(); | ||
1410 | ret = radix_tree_tagged(&mapping->page_tree, tag); | ||
1411 | rcu_read_unlock(); | ||
1412 | return ret; | ||
1413 | } | 1605 | } |
1414 | EXPORT_SYMBOL(mapping_tagged); | 1606 | EXPORT_SYMBOL(mapping_tagged); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4e8985acdab..8859578e4bd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -127,6 +127,20 @@ void pm_restrict_gfp_mask(void) | |||
127 | saved_gfp_mask = gfp_allowed_mask; | 127 | saved_gfp_mask = gfp_allowed_mask; |
128 | gfp_allowed_mask &= ~GFP_IOFS; | 128 | gfp_allowed_mask &= ~GFP_IOFS; |
129 | } | 129 | } |
130 | |||
131 | static bool pm_suspending(void) | ||
132 | { | ||
133 | if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) | ||
134 | return false; | ||
135 | return true; | ||
136 | } | ||
137 | |||
138 | #else | ||
139 | |||
140 | static bool pm_suspending(void) | ||
141 | { | ||
142 | return false; | ||
143 | } | ||
130 | #endif /* CONFIG_PM_SLEEP */ | 144 | #endif /* CONFIG_PM_SLEEP */ |
131 | 145 | ||
132 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 146 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
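pm_suspending() is defined once for CONFIG_PM_SLEEP and once as a stub that returns false, so the allocator slow path can call it unconditionally instead of growing another #ifdef. A minimal compilable illustration of that stub pattern (the config symbol and mask value here are only stand-ins):

#include <stdbool.h>
#include <stdio.h>

/* #define CONFIG_PM_SLEEP */

#ifdef CONFIG_PM_SLEEP
#define GFP_IOFS 0x3u
static unsigned int gfp_allowed_mask;   /* IO/FS bits cleared while suspending */

static bool pm_suspending(void)
{
    return (gfp_allowed_mask & GFP_IOFS) != GFP_IOFS;
}
#else
static bool pm_suspending(void)
{
    return false;   /* stub keeps callers free of #ifdef */
}
#endif

int main(void)
{
    if (pm_suspending())
        printf("bail out rather than wait on reclaim\n");
    else
        printf("normal allocation path\n");
    return 0;
}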
@@ -176,6 +190,7 @@ static char * const zone_names[MAX_NR_ZONES] = { | |||
176 | }; | 190 | }; |
177 | 191 | ||
178 | int min_free_kbytes = 1024; | 192 | int min_free_kbytes = 1024; |
193 | int min_free_order_shift = 1; | ||
179 | 194 | ||
180 | static unsigned long __meminitdata nr_kernel_pages; | 195 | static unsigned long __meminitdata nr_kernel_pages; |
181 | static unsigned long __meminitdata nr_all_pages; | 196 | static unsigned long __meminitdata nr_all_pages; |
@@ -355,8 +370,8 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
355 | __SetPageHead(page); | 370 | __SetPageHead(page); |
356 | for (i = 1; i < nr_pages; i++) { | 371 | for (i = 1; i < nr_pages; i++) { |
357 | struct page *p = page + i; | 372 | struct page *p = page + i; |
358 | |||
359 | __SetPageTail(p); | 373 | __SetPageTail(p); |
374 | set_page_count(p, 0); | ||
360 | p->first_page = page; | 375 | p->first_page = page; |
361 | } | 376 | } |
362 | } | 377 | } |
@@ -1370,21 +1385,12 @@ failed: | |||
1370 | 1385 | ||
1371 | #ifdef CONFIG_FAIL_PAGE_ALLOC | 1386 | #ifdef CONFIG_FAIL_PAGE_ALLOC |
1372 | 1387 | ||
1373 | static struct fail_page_alloc_attr { | 1388 | static struct { |
1374 | struct fault_attr attr; | 1389 | struct fault_attr attr; |
1375 | 1390 | ||
1376 | u32 ignore_gfp_highmem; | 1391 | u32 ignore_gfp_highmem; |
1377 | u32 ignore_gfp_wait; | 1392 | u32 ignore_gfp_wait; |
1378 | u32 min_order; | 1393 | u32 min_order; |
1379 | |||
1380 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
1381 | |||
1382 | struct dentry *ignore_gfp_highmem_file; | ||
1383 | struct dentry *ignore_gfp_wait_file; | ||
1384 | struct dentry *min_order_file; | ||
1385 | |||
1386 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | ||
1387 | |||
1388 | } fail_page_alloc = { | 1394 | } fail_page_alloc = { |
1389 | .attr = FAULT_ATTR_INITIALIZER, | 1395 | .attr = FAULT_ATTR_INITIALIZER, |
1390 | .ignore_gfp_wait = 1, | 1396 | .ignore_gfp_wait = 1, |
@@ -1418,36 +1424,27 @@ static int __init fail_page_alloc_debugfs(void) | |||
1418 | { | 1424 | { |
1419 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | 1425 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
1420 | struct dentry *dir; | 1426 | struct dentry *dir; |
1421 | int err; | ||
1422 | |||
1423 | err = init_fault_attr_dentries(&fail_page_alloc.attr, | ||
1424 | "fail_page_alloc"); | ||
1425 | if (err) | ||
1426 | return err; | ||
1427 | dir = fail_page_alloc.attr.dentries.dir; | ||
1428 | |||
1429 | fail_page_alloc.ignore_gfp_wait_file = | ||
1430 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | ||
1431 | &fail_page_alloc.ignore_gfp_wait); | ||
1432 | |||
1433 | fail_page_alloc.ignore_gfp_highmem_file = | ||
1434 | debugfs_create_bool("ignore-gfp-highmem", mode, dir, | ||
1435 | &fail_page_alloc.ignore_gfp_highmem); | ||
1436 | fail_page_alloc.min_order_file = | ||
1437 | debugfs_create_u32("min-order", mode, dir, | ||
1438 | &fail_page_alloc.min_order); | ||
1439 | |||
1440 | if (!fail_page_alloc.ignore_gfp_wait_file || | ||
1441 | !fail_page_alloc.ignore_gfp_highmem_file || | ||
1442 | !fail_page_alloc.min_order_file) { | ||
1443 | err = -ENOMEM; | ||
1444 | debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); | ||
1445 | debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); | ||
1446 | debugfs_remove(fail_page_alloc.min_order_file); | ||
1447 | cleanup_fault_attr_dentries(&fail_page_alloc.attr); | ||
1448 | } | ||
1449 | 1427 | ||
1450 | return err; | 1428 | dir = fault_create_debugfs_attr("fail_page_alloc", NULL, |
1429 | &fail_page_alloc.attr); | ||
1430 | if (IS_ERR(dir)) | ||
1431 | return PTR_ERR(dir); | ||
1432 | |||
1433 | if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, | ||
1434 | &fail_page_alloc.ignore_gfp_wait)) | ||
1435 | goto fail; | ||
1436 | if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, | ||
1437 | &fail_page_alloc.ignore_gfp_highmem)) | ||
1438 | goto fail; | ||
1439 | if (!debugfs_create_u32("min-order", mode, dir, | ||
1440 | &fail_page_alloc.min_order)) | ||
1441 | goto fail; | ||
1442 | |||
1443 | return 0; | ||
1444 | fail: | ||
1445 | debugfs_remove_recursive(dir); | ||
1446 | |||
1447 | return -ENOMEM; | ||
1451 | } | 1448 | } |
1452 | 1449 | ||
1453 | late_initcall(fail_page_alloc_debugfs); | 1450 | late_initcall(fail_page_alloc_debugfs); |
@@ -1487,7 +1484,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1487 | free_pages -= z->free_area[o].nr_free << o; | 1484 | free_pages -= z->free_area[o].nr_free << o; |
1488 | 1485 | ||
1489 | /* Require fewer higher order pages to be free */ | 1486 | /* Require fewer higher order pages to be free */ |
1490 | min >>= 1; | 1487 | min >>= min_free_order_shift; |
1491 | 1488 | ||
1492 | if (free_pages <= min) | 1489 | if (free_pages <= min) |
1493 | return false; | 1490 | return false; |
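With min_free_order_shift the per-order discount in __zone_watermark_ok() becomes tunable: instead of halving the requirement at every order, the watermark is shifted right by the configured amount, so a larger shift relaxes higher-order checks faster. A self-contained sketch of that loop over an array of per-order free counts (the numbers, and the omission of lowmem_reserve, are simplifications):

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

static int min_free_order_shift = 1;

static bool zone_watermark_ok(int order, unsigned long min,
                              const unsigned long nr_free[MAX_ORDER])
{
    long free_pages = 0;
    int o;

    for (o = 0; o < MAX_ORDER; o++)
        free_pages += (long)(nr_free[o] << o);  /* total free pages */

    if (free_pages <= (long)min)
        return false;

    for (o = 0; o < order; o++) {
        /* pages of this order are too small for the request, drop them */
        free_pages -= (long)(nr_free[o] << o);
        /* require fewer higher-order pages to be free */
        min >>= min_free_order_shift;
        if (free_pages <= (long)min)
            return false;
    }
    return true;
}

int main(void)
{
    unsigned long nr_free[MAX_ORDER] = { 512, 128, 32, 8, 2 };

    printf("order-3 ok: %d\n", zone_watermark_ok(3, 256, nr_free));
    return 0;
}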
@@ -1616,6 +1613,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | |||
1616 | set_bit(i, zlc->fullzones); | 1613 | set_bit(i, zlc->fullzones); |
1617 | } | 1614 | } |
1618 | 1615 | ||
1616 | /* | ||
1617 | * clear all zones full, called after direct reclaim makes progress so that | ||
1618 | * a zone that was recently full is not skipped over for up to a second | ||
1619 | */ | ||
1620 | static void zlc_clear_zones_full(struct zonelist *zonelist) | ||
1621 | { | ||
1622 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
1623 | |||
1624 | zlc = zonelist->zlcache_ptr; | ||
1625 | if (!zlc) | ||
1626 | return; | ||
1627 | |||
1628 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | ||
1629 | } | ||
1630 | |||
1619 | #else /* CONFIG_NUMA */ | 1631 | #else /* CONFIG_NUMA */ |
1620 | 1632 | ||
1621 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | 1633 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) |
@@ -1632,6 +1644,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, | |||
1632 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | 1644 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) |
1633 | { | 1645 | { |
1634 | } | 1646 | } |
1647 | |||
1648 | static void zlc_clear_zones_full(struct zonelist *zonelist) | ||
1649 | { | ||
1650 | } | ||
1635 | #endif /* CONFIG_NUMA */ | 1651 | #endif /* CONFIG_NUMA */ |
1636 | 1652 | ||
1637 | /* | 1653 | /* |
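The zonelist cache remembers zones that recently failed so they are skipped for roughly a second; the new zlc_clear_zones_full() wipes those bits once direct reclaim makes progress, so a zone that was only briefly full is reconsidered right away. A toy bitmap version of the mark-full / worth-trying / clear-all trio (sizes and names are illustrative):

#include <stdbool.h>
#include <stdio.h>

static unsigned long fullzones;     /* one bit per zone in the zonelist */

static void zlc_mark_zone_full(int zone)
{
    fullzones |= 1UL << zone;
}

static bool zlc_zone_worth_trying(int zone)
{
    return !(fullzones & (1UL << zone));
}

static void zlc_clear_zones_full(void)
{
    fullzones = 0;                  /* reconsider every zone after reclaim */
}

int main(void)
{
    zlc_mark_zone_full(2);
    printf("zone 2 worth trying: %d\n", zlc_zone_worth_trying(2));
    zlc_clear_zones_full();
    printf("zone 2 worth trying: %d\n", zlc_zone_worth_trying(2));
    return 0;
}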
@@ -1664,7 +1680,7 @@ zonelist_scan: | |||
1664 | continue; | 1680 | continue; |
1665 | if ((alloc_flags & ALLOC_CPUSET) && | 1681 | if ((alloc_flags & ALLOC_CPUSET) && |
1666 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1682 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1667 | goto try_next_zone; | 1683 | continue; |
1668 | 1684 | ||
1669 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | 1685 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); |
1670 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1686 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
@@ -1676,17 +1692,36 @@ zonelist_scan: | |||
1676 | classzone_idx, alloc_flags)) | 1692 | classzone_idx, alloc_flags)) |
1677 | goto try_this_zone; | 1693 | goto try_this_zone; |
1678 | 1694 | ||
1695 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { | ||
1696 | /* | ||
1697 | * we do zlc_setup if there are multiple nodes | ||
1698 | * and before considering the first zone allowed | ||
1699 | * by the cpuset. | ||
1700 | */ | ||
1701 | allowednodes = zlc_setup(zonelist, alloc_flags); | ||
1702 | zlc_active = 1; | ||
1703 | did_zlc_setup = 1; | ||
1704 | } | ||
1705 | |||
1679 | if (zone_reclaim_mode == 0) | 1706 | if (zone_reclaim_mode == 0) |
1680 | goto this_zone_full; | 1707 | goto this_zone_full; |
1681 | 1708 | ||
1709 | /* | ||
1710 | * As we may have just activated ZLC, check if the first | ||
1711 | * eligible zone has failed zone_reclaim recently. | ||
1712 | */ | ||
1713 | if (NUMA_BUILD && zlc_active && | ||
1714 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | ||
1715 | continue; | ||
1716 | |||
1682 | ret = zone_reclaim(zone, gfp_mask, order); | 1717 | ret = zone_reclaim(zone, gfp_mask, order); |
1683 | switch (ret) { | 1718 | switch (ret) { |
1684 | case ZONE_RECLAIM_NOSCAN: | 1719 | case ZONE_RECLAIM_NOSCAN: |
1685 | /* did not scan */ | 1720 | /* did not scan */ |
1686 | goto try_next_zone; | 1721 | continue; |
1687 | case ZONE_RECLAIM_FULL: | 1722 | case ZONE_RECLAIM_FULL: |
1688 | /* scanned but unreclaimable */ | 1723 | /* scanned but unreclaimable */ |
1689 | goto this_zone_full; | 1724 | continue; |
1690 | default: | 1725 | default: |
1691 | /* did we reclaim enough */ | 1726 | /* did we reclaim enough */ |
1692 | if (!zone_watermark_ok(zone, order, mark, | 1727 | if (!zone_watermark_ok(zone, order, mark, |
@@ -1703,16 +1738,6 @@ try_this_zone: | |||
1703 | this_zone_full: | 1738 | this_zone_full: |
1704 | if (NUMA_BUILD) | 1739 | if (NUMA_BUILD) |
1705 | zlc_mark_zone_full(zonelist, z); | 1740 | zlc_mark_zone_full(zonelist, z); |
1706 | try_next_zone: | ||
1707 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { | ||
1708 | /* | ||
1709 | * we do zlc_setup after the first zone is tried but only | ||
1710 | * if there are multiple nodes make it worthwhile | ||
1711 | */ | ||
1712 | allowednodes = zlc_setup(zonelist, alloc_flags); | ||
1713 | zlc_active = 1; | ||
1714 | did_zlc_setup = 1; | ||
1715 | } | ||
1716 | } | 1741 | } |
1717 | 1742 | ||
1718 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | 1743 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { |
@@ -1954,6 +1979,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
1954 | if (unlikely(!(*did_some_progress))) | 1979 | if (unlikely(!(*did_some_progress))) |
1955 | return NULL; | 1980 | return NULL; |
1956 | 1981 | ||
1982 | /* After successful reclaim, reconsider all zones for allocation */ | ||
1983 | if (NUMA_BUILD) | ||
1984 | zlc_clear_zones_full(zonelist); | ||
1985 | |||
1957 | retry: | 1986 | retry: |
1958 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 1987 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
1959 | zonelist, high_zoneidx, | 1988 | zonelist, high_zoneidx, |
@@ -2193,6 +2222,14 @@ rebalance: | |||
2193 | 2222 | ||
2194 | goto restart; | 2223 | goto restart; |
2195 | } | 2224 | } |
2225 | |||
2226 | /* | ||
2227 | * Suspend converts GFP_KERNEL to __GFP_WAIT which can | ||
2228 | * prevent reclaim making forward progress without | ||
2229 | * invoking OOM. Bail if we are suspending | ||
2230 | */ | ||
2231 | if (pm_suspending()) | ||
2232 | goto nopage; | ||
2196 | } | 2233 | } |
2197 | 2234 | ||
2198 | /* Check if we should retry the allocation */ | 2235 | /* Check if we should retry the allocation */ |
@@ -3356,9 +3393,15 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3356 | unsigned long block_migratetype; | 3393 | unsigned long block_migratetype; |
3357 | int reserve; | 3394 | int reserve; |
3358 | 3395 | ||
3359 | /* Get the start pfn, end pfn and the number of blocks to reserve */ | 3396 | /* |
3397 | * Get the start pfn, end pfn and the number of blocks to reserve | ||
3398 | * We have to be careful to be aligned to pageblock_nr_pages to | ||
3399 | * make sure that we always check pfn_valid for the first page in | ||
3400 | * the block. | ||
3401 | */ | ||
3360 | start_pfn = zone->zone_start_pfn; | 3402 | start_pfn = zone->zone_start_pfn; |
3361 | end_pfn = start_pfn + zone->spanned_pages; | 3403 | end_pfn = start_pfn + zone->spanned_pages; |
3404 | start_pfn = roundup(start_pfn, pageblock_nr_pages); | ||
3362 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> | 3405 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> |
3363 | pageblock_order; | 3406 | pageblock_order; |
3364 | 3407 | ||
@@ -4585,6 +4628,60 @@ void __init sort_node_map(void) | |||
4585 | cmp_node_active_region, NULL); | 4628 | cmp_node_active_region, NULL); |
4586 | } | 4629 | } |
4587 | 4630 | ||
4631 | /** | ||
4632 | * node_map_pfn_alignment - determine the maximum internode alignment | ||
4633 | * | ||
4634 | * This function should be called after node map is populated and sorted. | ||
4635 | * It calculates the maximum power of two alignment which can distinguish | ||
4636 | * all the nodes. | ||
4637 | * | ||
4638 | * For example, if all nodes are 1GiB and aligned to 1GiB, the return value | ||
4639 | * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the | ||
4640 | * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is | ||
4641 | * shifted, 1GiB is enough and this function will indicate so. | ||
4642 | * | ||
4643 | * This is used to test whether pfn -> nid mapping of the chosen memory | ||
4644 | * model has fine enough granularity to avoid incorrect mapping for the | ||
4645 | * populated node map. | ||
4646 | * | ||
4647 | * Returns the determined alignment in pfn's. 0 if there is no alignment | ||
4648 | * requirement (single node). | ||
4649 | */ | ||
4650 | unsigned long __init node_map_pfn_alignment(void) | ||
4651 | { | ||
4652 | unsigned long accl_mask = 0, last_end = 0; | ||
4653 | int last_nid = -1; | ||
4654 | int i; | ||
4655 | |||
4656 | for_each_active_range_index_in_nid(i, MAX_NUMNODES) { | ||
4657 | int nid = early_node_map[i].nid; | ||
4658 | unsigned long start = early_node_map[i].start_pfn; | ||
4659 | unsigned long end = early_node_map[i].end_pfn; | ||
4660 | unsigned long mask; | ||
4661 | |||
4662 | if (!start || last_nid < 0 || last_nid == nid) { | ||
4663 | last_nid = nid; | ||
4664 | last_end = end; | ||
4665 | continue; | ||
4666 | } | ||
4667 | |||
4668 | /* | ||
4669 | * Start with a mask granular enough to pin-point to the | ||
4670 | * start pfn and tick off bits one-by-one until it becomes | ||
4671 | * too coarse to separate the current node from the last. | ||
4672 | */ | ||
4673 | mask = ~((1 << __ffs(start)) - 1); | ||
4674 | while (mask && last_end <= (start & (mask << 1))) | ||
4675 | mask <<= 1; | ||
4676 | |||
4677 | /* accumulate all internode masks */ | ||
4678 | accl_mask |= mask; | ||
4679 | } | ||
4680 | |||
4681 | /* convert mask to number of pages */ | ||
4682 | return ~accl_mask + 1; | ||
4683 | } | ||
4684 | |||
4588 | /* Find the lowest pfn for a node */ | 4685 | /* Find the lowest pfn for a node */ |
4589 | static unsigned long __init find_min_pfn_for_node(int nid) | 4686 | static unsigned long __init find_min_pfn_for_node(int nid) |
4590 | { | 4687 | { |
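node_map_pfn_alignment() looks at each boundary where the node id changes, finds the widest power-of-two mask that still places the end of the previous node and the start of the next in different blocks, and accumulates those masks; the complement of the accumulated mask is the granularity a pfn-to-nid lookup must support. A standalone sketch of the same bit manipulation over a made-up node map (pfn values are illustrative; __ffs() is replaced by the start & -start idiom):

#include <stdio.h>

struct node_range {
    int nid;
    unsigned long start_pfn;
    unsigned long end_pfn;
};

static unsigned long node_map_pfn_alignment(const struct node_range *map, int nr)
{
    unsigned long accl_mask = 0, last_end = 0;
    int last_nid = -1;
    int i;

    for (i = 0; i < nr; i++) {
        unsigned long start = map[i].start_pfn;
        unsigned long mask;

        if (!start || last_nid < 0 || last_nid == map[i].nid) {
            last_nid = map[i].nid;
            last_end = map[i].end_pfn;
            continue;
        }

        /*
         * Start with a mask granular enough to pin-point the start pfn
         * and widen it until it would merge this node's start into the
         * same block as the previous node's end.
         */
        mask = ~((start & -start) - 1);     /* ~((1 << __ffs(start)) - 1) */
        while (mask && last_end <= (start & (mask << 1)))
            mask <<= 1;

        accl_mask |= mask;                  /* accumulate internode masks */
    }

    return ~accl_mask + 1;                  /* mask -> number of pages */
}

int main(void)
{
    /* two 1GiB nodes (4KiB pages); only the second is shifted by 256MiB */
    struct node_range map[] = {
        { 0, 0x00000, 0x40000 },
        { 1, 0x50000, 0x90000 },
    };

    /* prints 262144 pfns, i.e. 1GiB granularity is still enough */
    printf("alignment: %lu pfns\n", node_map_pfn_alignment(map, 2));
    return 0;
}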
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 53bffc6c293..39d216d535e 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -225,8 +225,8 @@ int __meminit online_page_cgroup(unsigned long start_pfn, | |||
225 | unsigned long start, end, pfn; | 225 | unsigned long start, end, pfn; |
226 | int fail = 0; | 226 | int fail = 0; |
227 | 227 | ||
228 | start = start_pfn & ~(PAGES_PER_SECTION - 1); | 228 | start = SECTION_ALIGN_DOWN(start_pfn); |
229 | end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); | 229 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); |
230 | 230 | ||
231 | if (nid == -1) { | 231 | if (nid == -1) { |
232 | /* | 232 | /* |
@@ -258,8 +258,8 @@ int __meminit offline_page_cgroup(unsigned long start_pfn, | |||
258 | { | 258 | { |
259 | unsigned long start, end, pfn; | 259 | unsigned long start, end, pfn; |
260 | 260 | ||
261 | start = start_pfn & ~(PAGES_PER_SECTION - 1); | 261 | start = SECTION_ALIGN_DOWN(start_pfn); |
262 | end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); | 262 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); |
263 | 263 | ||
264 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | 264 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) |
265 | __free_page_cgroup(pfn); | 265 | __free_page_cgroup(pfn); |
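SECTION_ALIGN_DOWN() and SECTION_ALIGN_UP() replace the open-coded mask and ALIGN() arithmetic above; both simply round a pfn to a PAGES_PER_SECTION boundary, which has to be a power of two. A quick illustration of the rounding (the section size chosen here is only an example):

#include <stdio.h>

#define PAGES_PER_SECTION 0x8000UL  /* example: 128MiB sections, 4KiB pages */

#define SECTION_ALIGN_DOWN(pfn) ((pfn) & ~(PAGES_PER_SECTION - 1))
#define SECTION_ALIGN_UP(pfn)   (((pfn) + PAGES_PER_SECTION - 1) & \
                                 ~(PAGES_PER_SECTION - 1))

int main(void)
{
    unsigned long start_pfn = 0x9000, nr_pages = 0x100;

    /* prints start=0x8000 end=0x10000: the covering section range */
    printf("start=%#lx end=%#lx\n",
           SECTION_ALIGN_DOWN(start_pfn),
           SECTION_ALIGN_UP(start_pfn + nr_pages));
    return 0;
}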
@@ -537,7 +537,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) | |||
537 | nomem: | 537 | nomem: |
538 | printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); | 538 | printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); |
539 | printk(KERN_INFO | 539 | printk(KERN_INFO |
540 | "swap_cgroup can be disabled by noswapaccount boot option\n"); | 540 | "swap_cgroup can be disabled by swapaccount=0 boot option\n"); |
541 | return -ENOMEM; | 541 | return -ENOMEM; |
542 | } | 542 | } |
543 | 543 | ||
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index c3450d53361..2f5cf10ff66 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -126,7 +126,39 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
126 | 126 | ||
127 | return 0; | 127 | return 0; |
128 | } | 128 | } |
129 | #endif | 129 | |
130 | static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) | ||
131 | { | ||
132 | struct vm_area_struct *vma; | ||
133 | |||
134 | /* We don't need vma lookup at all. */ | ||
135 | if (!walk->hugetlb_entry) | ||
136 | return NULL; | ||
137 | |||
138 | VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); | ||
139 | vma = find_vma(walk->mm, addr); | ||
140 | if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) | ||
141 | return vma; | ||
142 | |||
143 | return NULL; | ||
144 | } | ||
145 | |||
146 | #else /* CONFIG_HUGETLB_PAGE */ | ||
147 | static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) | ||
148 | { | ||
149 | return NULL; | ||
150 | } | ||
151 | |||
152 | static int walk_hugetlb_range(struct vm_area_struct *vma, | ||
153 | unsigned long addr, unsigned long end, | ||
154 | struct mm_walk *walk) | ||
155 | { | ||
156 | return 0; | ||
157 | } | ||
158 | |||
159 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
160 | |||
161 | |||
130 | 162 | ||
131 | /** | 163 | /** |
132 | * walk_page_range - walk a memory map's page tables with a callback | 164 | * walk_page_range - walk a memory map's page tables with a callback |
@@ -144,11 +176,15 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
144 | * associated range, and a copy of the original mm_walk for access to | 176 | * associated range, and a copy of the original mm_walk for access to |
145 | * the ->private or ->mm fields. | 177 | * the ->private or ->mm fields. |
146 | * | 178 | * |
147 | * No locks are taken, but the bottom level iterator will map PTE | 179 | * Usually no locks are taken, but splitting transparent huge page may |
180 | * take page table lock. And the bottom level iterator will map PTE | ||
148 | * directories from highmem if necessary. | 181 | * directories from highmem if necessary. |
149 | * | 182 | * |
150 | * If any callback returns a non-zero value, the walk is aborted and | 183 | * If any callback returns a non-zero value, the walk is aborted and |
151 | * the return value is propagated back to the caller. Otherwise 0 is returned. | 184 | * the return value is propagated back to the caller. Otherwise 0 is returned. |
185 | * | ||
186 | * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry | ||
187 | * is !NULL. | ||
152 | */ | 188 | */ |
153 | int walk_page_range(unsigned long addr, unsigned long end, | 189 | int walk_page_range(unsigned long addr, unsigned long end, |
154 | struct mm_walk *walk) | 190 | struct mm_walk *walk) |
@@ -165,18 +201,17 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
165 | 201 | ||
166 | pgd = pgd_offset(walk->mm, addr); | 202 | pgd = pgd_offset(walk->mm, addr); |
167 | do { | 203 | do { |
168 | struct vm_area_struct *uninitialized_var(vma); | 204 | struct vm_area_struct *vma; |
169 | 205 | ||
170 | next = pgd_addr_end(addr, end); | 206 | next = pgd_addr_end(addr, end); |
171 | 207 | ||
172 | #ifdef CONFIG_HUGETLB_PAGE | ||
173 | /* | 208 | /* |
174 | * handle hugetlb vma individually because pagetable walk for | 209 | * handle hugetlb vma individually because pagetable walk for |
175 | * the hugetlb page is dependent on the architecture and | 210 | * the hugetlb page is dependent on the architecture and |
176 | * we can't handle it in the same manner as non-huge pages. | 211 | * we can't handle it in the same manner as non-huge pages. |
177 | */ | 212 | */ |
178 | vma = find_vma(walk->mm, addr); | 213 | vma = hugetlb_vma(addr, walk); |
179 | if (vma && is_vm_hugetlb_page(vma)) { | 214 | if (vma) { |
180 | if (vma->vm_end < next) | 215 | if (vma->vm_end < next) |
181 | next = vma->vm_end; | 216 | next = vma->vm_end; |
182 | /* | 217 | /* |
@@ -189,7 +224,7 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
189 | pgd = pgd_offset(walk->mm, next); | 224 | pgd = pgd_offset(walk->mm, next); |
190 | continue; | 225 | continue; |
191 | } | 226 | } |
192 | #endif | 227 | |
193 | if (pgd_none_or_clear_bad(pgd)) { | 228 | if (pgd_none_or_clear_bad(pgd)) { |
194 | if (walk->pte_hole) | 229 | if (walk->pte_hole) |
195 | err = walk->pte_hole(addr, next, walk); | 230 | err = walk->pte_hole(addr, next, walk); |
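hugetlb_vma() encodes two separate requirements: the walker must hold mmap_sem (asserted with VM_BUG_ON), and because find_vma() returns the first vma whose end lies above the address, the result only covers addr when vma->vm_start <= addr as well; otherwise addr sits in a hole. A small userspace analogue of that lookup contract over a sorted region list (types and numbers are invented):

#include <stdbool.h>
#include <stdio.h>

struct region {
    unsigned long start, end;   /* [start, end) */
    bool is_huge;
};

/* sorted by start; mimics find_vma(): first region with end > addr */
static const struct region *find_region(const struct region *r, int nr,
                                        unsigned long addr)
{
    int i;

    for (i = 0; i < nr; i++)
        if (addr < r[i].end)
            return &r[i];
    return NULL;
}

static const struct region *hugetlb_region(const struct region *r, int nr,
                                           unsigned long addr)
{
    const struct region *reg = find_region(r, nr, addr);

    /* reg may start above addr (addr falls in a hole): reject that case */
    if (reg && reg->start <= addr && reg->is_huge)
        return reg;
    return NULL;
}

int main(void)
{
    struct region map[] = {
        { 0x1000, 0x2000, false },
        { 0x4000, 0x8000, true },
    };

    printf("addr 0x3000 -> %s\n", hugetlb_region(map, 2, 0x3000) ? "huge" : "none");
    printf("addr 0x5000 -> %s\n", hugetlb_region(map, 2, 0x5000) ? "huge" : "none");
    return 0;
}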
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index ea534960a04..bfad7246665 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c | |||
@@ -143,8 +143,8 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, | |||
143 | int page_start, int page_end) | 143 | int page_start, int page_end) |
144 | { | 144 | { |
145 | flush_cache_vunmap( | 145 | flush_cache_vunmap( |
146 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | 146 | pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), |
147 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | 147 | pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); |
148 | } | 148 | } |
149 | 149 | ||
150 | static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) | 150 | static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) |
@@ -206,8 +206,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, | |||
206 | int page_start, int page_end) | 206 | int page_start, int page_end) |
207 | { | 207 | { |
208 | flush_tlb_kernel_range( | 208 | flush_tlb_kernel_range( |
209 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | 209 | pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), |
210 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | 210 | pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); |
211 | } | 211 | } |
212 | 212 | ||
213 | static int __pcpu_map_pages(unsigned long addr, struct page **pages, | 213 | static int __pcpu_map_pages(unsigned long addr, struct page **pages, |
@@ -284,8 +284,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, | |||
284 | int page_start, int page_end) | 284 | int page_start, int page_end) |
285 | { | 285 | { |
286 | flush_cache_vmap( | 286 | flush_cache_vmap( |
287 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | 287 | pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), |
288 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | 288 | pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); |
289 | } | 289 | } |
290 | 290 | ||
291 | /** | 291 | /** |
diff --git a/mm/percpu.c b/mm/percpu.c index bf80e55dbed..0ae7a09141e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -116,9 +116,9 @@ static int pcpu_atom_size __read_mostly; | |||
116 | static int pcpu_nr_slots __read_mostly; | 116 | static int pcpu_nr_slots __read_mostly; |
117 | static size_t pcpu_chunk_struct_size __read_mostly; | 117 | static size_t pcpu_chunk_struct_size __read_mostly; |
118 | 118 | ||
119 | /* cpus with the lowest and highest unit numbers */ | 119 | /* cpus with the lowest and highest unit addresses */ |
120 | static unsigned int pcpu_first_unit_cpu __read_mostly; | 120 | static unsigned int pcpu_low_unit_cpu __read_mostly; |
121 | static unsigned int pcpu_last_unit_cpu __read_mostly; | 121 | static unsigned int pcpu_high_unit_cpu __read_mostly; |
122 | 122 | ||
123 | /* the address of the first chunk which starts with the kernel static area */ | 123 | /* the address of the first chunk which starts with the kernel static area */ |
124 | void *pcpu_base_addr __read_mostly; | 124 | void *pcpu_base_addr __read_mostly; |
@@ -984,19 +984,19 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) | |||
984 | { | 984 | { |
985 | void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); | 985 | void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); |
986 | bool in_first_chunk = false; | 986 | bool in_first_chunk = false; |
987 | unsigned long first_start, first_end; | 987 | unsigned long first_low, first_high; |
988 | unsigned int cpu; | 988 | unsigned int cpu; |
989 | 989 | ||
990 | /* | 990 | /* |
991 | * The following test on first_start/end isn't strictly | 991 | * The following test on unit_low/high isn't strictly |
992 | * necessary but will speed up lookups of addresses which | 992 | * necessary but will speed up lookups of addresses which |
993 | * aren't in the first chunk. | 993 | * aren't in the first chunk. |
994 | */ | 994 | */ |
995 | first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0); | 995 | first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0); |
996 | first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu, | 996 | first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu, |
997 | pcpu_unit_pages); | 997 | pcpu_unit_pages); |
998 | if ((unsigned long)addr >= first_start && | 998 | if ((unsigned long)addr >= first_low && |
999 | (unsigned long)addr < first_end) { | 999 | (unsigned long)addr < first_high) { |
1000 | for_each_possible_cpu(cpu) { | 1000 | for_each_possible_cpu(cpu) { |
1001 | void *start = per_cpu_ptr(base, cpu); | 1001 | void *start = per_cpu_ptr(base, cpu); |
1002 | 1002 | ||
@@ -1011,9 +1011,11 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) | |||
1011 | if (!is_vmalloc_addr(addr)) | 1011 | if (!is_vmalloc_addr(addr)) |
1012 | return __pa(addr); | 1012 | return __pa(addr); |
1013 | else | 1013 | else |
1014 | return page_to_phys(vmalloc_to_page(addr)); | 1014 | return page_to_phys(vmalloc_to_page(addr)) + |
1015 | offset_in_page(addr); | ||
1015 | } else | 1016 | } else |
1016 | return page_to_phys(pcpu_addr_to_page(addr)); | 1017 | return page_to_phys(pcpu_addr_to_page(addr)) + |
1018 | offset_in_page(addr); | ||
1017 | } | 1019 | } |
1018 | 1020 | ||
1019 | /** | 1021 | /** |
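The offset_in_page() additions matter because page_to_phys() yields the physical address of the start of the page; for an address in the middle of a vmalloc'ed or percpu page the in-page offset has to be added back. A tiny sketch of the arithmetic, assuming 4KiB pages and made-up addresses:

#include <stdio.h>

#define PAGE_SIZE           4096UL
#define PAGE_MASK           (~(PAGE_SIZE - 1))
#define offset_in_page(p)   ((unsigned long)(p) & ~PAGE_MASK)

int main(void)
{
    unsigned long vaddr = 0xffffc90000123abcUL;  /* hypothetical vmalloc address */
    unsigned long page_phys = 0x12345000UL;      /* what page_to_phys() would give */

    /* translating the page alone loses the low 12 bits of the address */
    printf("page only: %#lx\n", page_phys);
    printf("with offset: %#lx\n", page_phys + offset_in_page(vaddr));
    return 0;
}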
@@ -1233,7 +1235,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1233 | 1235 | ||
1234 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) | 1236 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) |
1235 | unit_map[cpu] = UINT_MAX; | 1237 | unit_map[cpu] = UINT_MAX; |
1236 | pcpu_first_unit_cpu = NR_CPUS; | 1238 | |
1239 | pcpu_low_unit_cpu = NR_CPUS; | ||
1240 | pcpu_high_unit_cpu = NR_CPUS; | ||
1237 | 1241 | ||
1238 | for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { | 1242 | for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { |
1239 | const struct pcpu_group_info *gi = &ai->groups[group]; | 1243 | const struct pcpu_group_info *gi = &ai->groups[group]; |
@@ -1253,9 +1257,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1253 | unit_map[cpu] = unit + i; | 1257 | unit_map[cpu] = unit + i; |
1254 | unit_off[cpu] = gi->base_offset + i * ai->unit_size; | 1258 | unit_off[cpu] = gi->base_offset + i * ai->unit_size; |
1255 | 1259 | ||
1256 | if (pcpu_first_unit_cpu == NR_CPUS) | 1260 | /* determine low/high unit_cpu */ |
1257 | pcpu_first_unit_cpu = cpu; | 1261 | if (pcpu_low_unit_cpu == NR_CPUS || |
1258 | pcpu_last_unit_cpu = cpu; | 1262 | unit_off[cpu] < unit_off[pcpu_low_unit_cpu]) |
1263 | pcpu_low_unit_cpu = cpu; | ||
1264 | if (pcpu_high_unit_cpu == NR_CPUS || | ||
1265 | unit_off[cpu] > unit_off[pcpu_high_unit_cpu]) | ||
1266 | pcpu_high_unit_cpu = cpu; | ||
1259 | } | 1267 | } |
1260 | } | 1268 | } |
1261 | pcpu_nr_units = unit; | 1269 | pcpu_nr_units = unit; |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -21,7 +21,6 @@ | |||
21 | * Lock ordering in mm: | 21 | * Lock ordering in mm: |
22 | * | 22 | * |
23 | * inode->i_mutex (while writing or truncating, not reading or faulting) | 23 | * inode->i_mutex (while writing or truncating, not reading or faulting) |
24 | * inode->i_alloc_sem (vmtruncate_range) | ||
25 | * mm->mmap_sem | 24 | * mm->mmap_sem |
26 | * page->flags PG_locked (lock_page) | 25 | * page->flags PG_locked (lock_page) |
27 | * mapping->i_mmap_mutex | 26 | * mapping->i_mmap_mutex |
@@ -32,11 +31,11 @@ | |||
32 | * mmlist_lock (in mmput, drain_mmlist and others) | 31 | * mmlist_lock (in mmput, drain_mmlist and others) |
33 | * mapping->private_lock (in __set_page_dirty_buffers) | 32 | * mapping->private_lock (in __set_page_dirty_buffers) |
34 | * inode->i_lock (in set_page_dirty's __mark_inode_dirty) | 33 | * inode->i_lock (in set_page_dirty's __mark_inode_dirty) |
35 | * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) | 34 | * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) |
36 | * sb_lock (within inode_lock in fs/fs-writeback.c) | 35 | * sb_lock (within inode_lock in fs/fs-writeback.c) |
37 | * mapping->tree_lock (widely used, in set_page_dirty, | 36 | * mapping->tree_lock (widely used, in set_page_dirty, |
38 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
39 | * within inode_wb_list_lock in __sync_single_inode) | 38 | * within bdi.wb->list_lock in __sync_single_inode) |
40 | * | 39 | * |
41 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) | 40 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) |
42 | * ->tasklist_lock | 41 | * ->tasklist_lock |
@@ -870,11 +869,11 @@ int page_referenced(struct page *page, | |||
870 | vm_flags); | 869 | vm_flags); |
871 | if (we_locked) | 870 | if (we_locked) |
872 | unlock_page(page); | 871 | unlock_page(page); |
872 | |||
873 | if (page_test_and_clear_young(page_to_pfn(page))) | ||
874 | referenced++; | ||
873 | } | 875 | } |
874 | out: | 876 | out: |
875 | if (page_test_and_clear_young(page_to_pfn(page))) | ||
876 | referenced++; | ||
877 | |||
878 | return referenced; | 877 | return referenced; |
879 | } | 878 | } |
880 | 879 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index fcedf5464eb..fba53caba0d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -6,7 +6,8 @@ | |||
6 | * 2000-2001 Christoph Rohland | 6 | * 2000-2001 Christoph Rohland |
7 | * 2000-2001 SAP AG | 7 | * 2000-2001 SAP AG |
8 | * 2002 Red Hat Inc. | 8 | * 2002 Red Hat Inc. |
9 | * Copyright (C) 2002-2005 Hugh Dickins. | 9 | * Copyright (C) 2002-2011 Hugh Dickins. |
10 | * Copyright (C) 2011 Google Inc. | ||
10 | * Copyright (C) 2002-2005 VERITAS Software Corporation. | 11 | * Copyright (C) 2002-2005 VERITAS Software Corporation. |
11 | * Copyright (C) 2004 Andi Kleen, SuSE Labs | 12 | * Copyright (C) 2004 Andi Kleen, SuSE Labs |
12 | * | 13 | * |
@@ -28,7 +29,6 @@ | |||
28 | #include <linux/file.h> | 29 | #include <linux/file.h> |
29 | #include <linux/mm.h> | 30 | #include <linux/mm.h> |
30 | #include <linux/module.h> | 31 | #include <linux/module.h> |
31 | #include <linux/percpu_counter.h> | ||
32 | #include <linux/swap.h> | 32 | #include <linux/swap.h> |
33 | 33 | ||
34 | static struct vfsmount *shm_mnt; | 34 | static struct vfsmount *shm_mnt; |
@@ -51,6 +51,9 @@ static struct vfsmount *shm_mnt; | |||
51 | #include <linux/shmem_fs.h> | 51 | #include <linux/shmem_fs.h> |
52 | #include <linux/writeback.h> | 52 | #include <linux/writeback.h> |
53 | #include <linux/blkdev.h> | 53 | #include <linux/blkdev.h> |
54 | #include <linux/pagevec.h> | ||
55 | #include <linux/percpu_counter.h> | ||
56 | #include <linux/splice.h> | ||
54 | #include <linux/security.h> | 57 | #include <linux/security.h> |
55 | #include <linux/swapops.h> | 58 | #include <linux/swapops.h> |
56 | #include <linux/mempolicy.h> | 59 | #include <linux/mempolicy.h> |
@@ -62,43 +65,17 @@ static struct vfsmount *shm_mnt; | |||
62 | #include <linux/magic.h> | 65 | #include <linux/magic.h> |
63 | 66 | ||
64 | #include <asm/uaccess.h> | 67 | #include <asm/uaccess.h> |
65 | #include <asm/div64.h> | ||
66 | #include <asm/pgtable.h> | 68 | #include <asm/pgtable.h> |
67 | 69 | ||
68 | /* | ||
69 | * The maximum size of a shmem/tmpfs file is limited by the maximum size of | ||
70 | * its triple-indirect swap vector - see illustration at shmem_swp_entry(). | ||
71 | * | ||
72 | * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel, | ||
73 | * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum | ||
74 | * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel, | ||
75 | * MAX_LFS_FILESIZE being then more restrictive than swap vector layout. | ||
76 | * | ||
77 | * We use / and * instead of shifts in the definitions below, so that the swap | ||
78 | * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE. | ||
79 | */ | ||
80 | #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) | ||
81 | #define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) | ||
82 | |||
83 | #define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) | ||
84 | #define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT) | ||
85 | |||
86 | #define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE) | ||
87 | #define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT)) | ||
88 | |||
89 | #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) | 70 | #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) |
90 | #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) | 71 | #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) |
91 | 72 | ||
92 | /* info->flags needs VM_flags to handle pagein/truncate races efficiently */ | ||
93 | #define SHMEM_PAGEIN VM_READ | ||
94 | #define SHMEM_TRUNCATE VM_WRITE | ||
95 | |||
96 | /* Definition to limit shmem_truncate's steps between cond_rescheds */ | ||
97 | #define LATENCY_LIMIT 64 | ||
98 | |||
99 | /* Pretend that each entry is of this size in directory's i_size */ | 73 | /* Pretend that each entry is of this size in directory's i_size */ |
100 | #define BOGO_DIRENT_SIZE 20 | 74 | #define BOGO_DIRENT_SIZE 20 |
101 | 75 | ||
76 | /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ | ||
77 | #define SHORT_SYMLINK_LEN 128 | ||
78 | |||
102 | struct shmem_xattr { | 79 | struct shmem_xattr { |
103 | struct list_head list; /* anchored by shmem_inode_info->xattr_list */ | 80 | struct list_head list; /* anchored by shmem_inode_info->xattr_list */ |
104 | char *name; /* xattr name */ | 81 | char *name; /* xattr name */ |
@@ -106,7 +83,7 @@ struct shmem_xattr { | |||
106 | char value[0]; | 83 | char value[0]; |
107 | }; | 84 | }; |
108 | 85 | ||
109 | /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ | 86 | /* Flag allocation requirements to shmem_getpage */ |
110 | enum sgp_type { | 87 | enum sgp_type { |
111 | SGP_READ, /* don't exceed i_size, don't allocate page */ | 88 | SGP_READ, /* don't exceed i_size, don't allocate page */ |
112 | SGP_CACHE, /* don't exceed i_size, may allocate page */ | 89 | SGP_CACHE, /* don't exceed i_size, may allocate page */ |
@@ -126,57 +103,14 @@ static unsigned long shmem_default_max_inodes(void) | |||
126 | } | 103 | } |
127 | #endif | 104 | #endif |
128 | 105 | ||
129 | static int shmem_getpage(struct inode *inode, unsigned long idx, | 106 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, |
130 | struct page **pagep, enum sgp_type sgp, int *type); | 107 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); |
131 | |||
132 | static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) | ||
133 | { | ||
134 | /* | ||
135 | * The above definition of ENTRIES_PER_PAGE, and the use of | ||
136 | * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: | ||
137 | * might be reconsidered if it ever diverges from PAGE_SIZE. | ||
138 | * | ||
139 | * Mobility flags are masked out as swap vectors cannot move | ||
140 | */ | ||
141 | return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO, | ||
142 | PAGE_CACHE_SHIFT-PAGE_SHIFT); | ||
143 | } | ||
144 | |||
145 | static inline void shmem_dir_free(struct page *page) | ||
146 | { | ||
147 | __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT); | ||
148 | } | ||
149 | |||
150 | static struct page **shmem_dir_map(struct page *page) | ||
151 | { | ||
152 | return (struct page **)kmap_atomic(page, KM_USER0); | ||
153 | } | ||
154 | |||
155 | static inline void shmem_dir_unmap(struct page **dir) | ||
156 | { | ||
157 | kunmap_atomic(dir, KM_USER0); | ||
158 | } | ||
159 | |||
160 | static swp_entry_t *shmem_swp_map(struct page *page) | ||
161 | { | ||
162 | return (swp_entry_t *)kmap_atomic(page, KM_USER1); | ||
163 | } | ||
164 | |||
165 | static inline void shmem_swp_balance_unmap(void) | ||
166 | { | ||
167 | /* | ||
168 | * When passing a pointer to an i_direct entry, to code which | ||
169 | * also handles indirect entries and so will shmem_swp_unmap, | ||
170 | * we must arrange for the preempt count to remain in balance. | ||
171 | * What kmap_atomic of a lowmem page does depends on config | ||
172 | * and architecture, so pretend to kmap_atomic some lowmem page. | ||
173 | */ | ||
174 | (void) kmap_atomic(ZERO_PAGE(0), KM_USER1); | ||
175 | } | ||
176 | 108 | ||
177 | static inline void shmem_swp_unmap(swp_entry_t *entry) | 109 | static inline int shmem_getpage(struct inode *inode, pgoff_t index, |
110 | struct page **pagep, enum sgp_type sgp, int *fault_type) | ||
178 | { | 111 | { |
179 | kunmap_atomic(entry, KM_USER1); | 112 | return shmem_getpage_gfp(inode, index, pagep, sgp, |
113 | mapping_gfp_mask(inode->i_mapping), fault_type); | ||
180 | } | 114 | } |
181 | 115 | ||
182 | static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) | 116 | static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) |
@@ -236,17 +170,6 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = { | |||
236 | static LIST_HEAD(shmem_swaplist); | 170 | static LIST_HEAD(shmem_swaplist); |
237 | static DEFINE_MUTEX(shmem_swaplist_mutex); | 171 | static DEFINE_MUTEX(shmem_swaplist_mutex); |
238 | 172 | ||
239 | static void shmem_free_blocks(struct inode *inode, long pages) | ||
240 | { | ||
241 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
242 | if (sbinfo->max_blocks) { | ||
243 | percpu_counter_add(&sbinfo->used_blocks, -pages); | ||
244 | spin_lock(&inode->i_lock); | ||
245 | inode->i_blocks -= pages*BLOCKS_PER_PAGE; | ||
246 | spin_unlock(&inode->i_lock); | ||
247 | } | ||
248 | } | ||
249 | |||
250 | static int shmem_reserve_inode(struct super_block *sb) | 173 | static int shmem_reserve_inode(struct super_block *sb) |
251 | { | 174 | { |
252 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | 175 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); |
@@ -273,7 +196,7 @@ static void shmem_free_inode(struct super_block *sb) | |||
273 | } | 196 | } |
274 | 197 | ||
275 | /** | 198 | /** |
276 | * shmem_recalc_inode - recalculate the size of an inode | 199 | * shmem_recalc_inode - recalculate the block usage of an inode |
277 | * @inode: inode to recalc | 200 | * @inode: inode to recalc |
278 | * | 201 | * |
279 | * We have to calculate the free blocks since the mm can drop | 202 | * We have to calculate the free blocks since the mm can drop |
@@ -291,474 +214,297 @@ static void shmem_recalc_inode(struct inode *inode) | |||
291 | 214 | ||
292 | freed = info->alloced - info->swapped - inode->i_mapping->nrpages; | 215 | freed = info->alloced - info->swapped - inode->i_mapping->nrpages; |
293 | if (freed > 0) { | 216 | if (freed > 0) { |
217 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
218 | if (sbinfo->max_blocks) | ||
219 | percpu_counter_add(&sbinfo->used_blocks, -freed); | ||
294 | info->alloced -= freed; | 220 | info->alloced -= freed; |
221 | inode->i_blocks -= freed * BLOCKS_PER_PAGE; | ||
295 | shmem_unacct_blocks(info->flags, freed); | 222 | shmem_unacct_blocks(info->flags, freed); |
296 | shmem_free_blocks(inode, freed); | ||
297 | } | 223 | } |
298 | } | 224 | } |
299 | 225 | ||
300 | /** | 226 | /* |
301 | * shmem_swp_entry - find the swap vector position in the info structure | 227 | * Replace item expected in radix tree by a new item, while holding tree lock. |
302 | * @info: info structure for the inode | 228 | */ |
303 | * @index: index of the page to find | 229 | static int shmem_radix_tree_replace(struct address_space *mapping, |
304 | * @page: optional page to add to the structure. Has to be preset to | 230 | pgoff_t index, void *expected, void *replacement) |
305 | * all zeros | 231 | { |
306 | * | 232 | void **pslot; |
307 | * If there is no space allocated yet it will return NULL when | 233 | void *item = NULL; |
308 | * page is NULL, else it will use the page for the needed block, | 234 | |
309 | * setting it to NULL on return to indicate that it has been used. | 235 | VM_BUG_ON(!expected); |
310 | * | 236 | pslot = radix_tree_lookup_slot(&mapping->page_tree, index); |
311 | * The swap vector is organized the following way: | 237 | if (pslot) |
312 | * | 238 | item = radix_tree_deref_slot_protected(pslot, |
313 | * There are SHMEM_NR_DIRECT entries directly stored in the | 239 | &mapping->tree_lock); |
314 | * shmem_inode_info structure. So small files do not need an addional | 240 | if (item != expected) |
315 | * allocation. | 241 | return -ENOENT; |
316 | * | 242 | if (replacement) |
317 | * For pages with index > SHMEM_NR_DIRECT there is the pointer | 243 | radix_tree_replace_slot(pslot, replacement); |
318 | * i_indirect which points to a page which holds in the first half | 244 | else |
319 | * doubly indirect blocks, in the second half triple indirect blocks: | 245 | radix_tree_delete(&mapping->page_tree, index); |
320 | * | 246 | return 0; |
321 | * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the | 247 | } |
322 | * following layout (for SHMEM_NR_DIRECT == 16): | 248 | |
323 | * | 249 | /* |
324 | * i_indirect -> dir --> 16-19 | 250 | * Like add_to_page_cache_locked, but error if expected item has gone. |
325 | * | +-> 20-23 | ||
326 | * | | ||
327 | * +-->dir2 --> 24-27 | ||
328 | * | +-> 28-31 | ||
329 | * | +-> 32-35 | ||
330 | * | +-> 36-39 | ||
331 | * | | ||
332 | * +-->dir3 --> 40-43 | ||
333 | * +-> 44-47 | ||
334 | * +-> 48-51 | ||
335 | * +-> 52-55 | ||
336 | */ | 251 | */ |
337 | static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) | 252 | static int shmem_add_to_page_cache(struct page *page, |
253 | struct address_space *mapping, | ||
254 | pgoff_t index, gfp_t gfp, void *expected) | ||
338 | { | 255 | { |
339 | unsigned long offset; | 256 | int error = 0; |
340 | struct page **dir; | ||
341 | struct page *subdir; | ||
342 | 257 | ||
343 | if (index < SHMEM_NR_DIRECT) { | 258 | VM_BUG_ON(!PageLocked(page)); |
344 | shmem_swp_balance_unmap(); | 259 | VM_BUG_ON(!PageSwapBacked(page)); |
345 | return info->i_direct+index; | ||
346 | } | ||
347 | if (!info->i_indirect) { | ||
348 | if (page) { | ||
349 | info->i_indirect = *page; | ||
350 | *page = NULL; | ||
351 | } | ||
352 | return NULL; /* need another page */ | ||
353 | } | ||
354 | 260 | ||
355 | index -= SHMEM_NR_DIRECT; | 261 | if (!expected) |
356 | offset = index % ENTRIES_PER_PAGE; | 262 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); |
357 | index /= ENTRIES_PER_PAGE; | 263 | if (!error) { |
358 | dir = shmem_dir_map(info->i_indirect); | 264 | page_cache_get(page); |
359 | 265 | page->mapping = mapping; | |
360 | if (index >= ENTRIES_PER_PAGE/2) { | 266 | page->index = index; |
361 | index -= ENTRIES_PER_PAGE/2; | 267 | |
362 | dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE; | 268 | spin_lock_irq(&mapping->tree_lock); |
363 | index %= ENTRIES_PER_PAGE; | 269 | if (!expected) |
364 | subdir = *dir; | 270 | error = radix_tree_insert(&mapping->page_tree, |
365 | if (!subdir) { | 271 | index, page); |
366 | if (page) { | 272 | else |
367 | *dir = *page; | 273 | error = shmem_radix_tree_replace(mapping, index, |
368 | *page = NULL; | 274 | expected, page); |
369 | } | 275 | if (!error) { |
370 | shmem_dir_unmap(dir); | 276 | mapping->nrpages++; |
371 | return NULL; /* need another page */ | 277 | __inc_zone_page_state(page, NR_FILE_PAGES); |
372 | } | 278 | __inc_zone_page_state(page, NR_SHMEM); |
373 | shmem_dir_unmap(dir); | 279 | spin_unlock_irq(&mapping->tree_lock); |
374 | dir = shmem_dir_map(subdir); | 280 | } else { |
375 | } | 281 | page->mapping = NULL; |
376 | 282 | spin_unlock_irq(&mapping->tree_lock); | |
377 | dir += index; | 283 | page_cache_release(page); |
378 | subdir = *dir; | ||
379 | if (!subdir) { | ||
380 | if (!page || !(subdir = *page)) { | ||
381 | shmem_dir_unmap(dir); | ||
382 | return NULL; /* need a page */ | ||
383 | } | 284 | } |
384 | *dir = subdir; | 285 | if (!expected) |
385 | *page = NULL; | 286 | radix_tree_preload_end(); |
386 | } | 287 | } |
387 | shmem_dir_unmap(dir); | 288 | if (error) |
388 | return shmem_swp_map(subdir) + offset; | 289 | mem_cgroup_uncharge_cache_page(page); |
290 | return error; | ||
389 | } | 291 | } |
390 | 292 | ||
391 | static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) | 293 | /* |
294 | * Like delete_from_page_cache, but substitutes swap for page. | ||
295 | */ | ||
296 | static void shmem_delete_from_page_cache(struct page *page, void *radswap) | ||
392 | { | 297 | { |
393 | long incdec = value? 1: -1; | 298 | struct address_space *mapping = page->mapping; |
299 | int error; | ||
394 | 300 | ||
395 | entry->val = value; | 301 | spin_lock_irq(&mapping->tree_lock); |
396 | info->swapped += incdec; | 302 | error = shmem_radix_tree_replace(mapping, page->index, page, radswap); |
397 | if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { | 303 | page->mapping = NULL; |
398 | struct page *page = kmap_atomic_to_page(entry); | 304 | mapping->nrpages--; |
399 | set_page_private(page, page_private(page) + incdec); | 305 | __dec_zone_page_state(page, NR_FILE_PAGES); |
400 | } | 306 | __dec_zone_page_state(page, NR_SHMEM); |
307 | spin_unlock_irq(&mapping->tree_lock); | ||
308 | page_cache_release(page); | ||
309 | BUG_ON(error); | ||
401 | } | 310 | } |
402 | 311 | ||
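shmem_radix_tree_replace() and shmem_delete_from_page_cache() above both hinge on the same contract: only touch the slot if it still holds exactly what the caller expects, and report -ENOENT otherwise so the caller can back off. A toy user-space sketch of that compare-and-replace pattern, with a flat array standing in for the radix tree (names are illustrative only):

/*
 * "Replace only if the slot still holds what we expect" - the -ENOENT
 * path is what a racing caller sees when someone else changed the slot
 * first.  Passing NULL as the replacement plays the role of delete.
 */
#include <assert.h>
#include <errno.h>
#include <stdio.h>

#define NSLOTS	8
static void *slots[NSLOTS];

static int replace_if_expected(unsigned long index, void *expected, void *replacement)
{
	if (slots[index] != expected)
		return -ENOENT;		/* somebody else got there first */
	slots[index] = replacement;	/* NULL acts like radix_tree_delete() */
	return 0;
}

int main(void)
{
	int page = 42, newer = 43;

	slots[3] = &page;
	assert(replace_if_expected(3, &page, &newer) == 0);
	assert(replace_if_expected(3, &page, NULL) == -ENOENT);	/* stale expectation */
	printf("slot 3 now holds %d\n", *(int *)slots[3]);
	return 0;
}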
403 | /** | 312 | /* |
404 | * shmem_swp_alloc - get the position of the swap entry for the page. | 313 | * Like find_get_pages, but collecting swap entries as well as pages. |
405 | * @info: info structure for the inode | ||
406 | * @index: index of the page to find | ||
407 | * @sgp: check and recheck i_size? skip allocation? | ||
408 | * | ||
409 | * If the entry does not exist, allocate it. | ||
410 | */ | 314 | */ |
411 | static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) | 315 | static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, |
412 | { | 316 | pgoff_t start, unsigned int nr_pages, |
413 | struct inode *inode = &info->vfs_inode; | 317 | struct page **pages, pgoff_t *indices) |
414 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | 318 | { |
415 | struct page *page = NULL; | 319 | unsigned int i; |
416 | swp_entry_t *entry; | 320 | unsigned int ret; |
417 | 321 | unsigned int nr_found; | |
418 | if (sgp != SGP_WRITE && | 322 | |
419 | ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | 323 | rcu_read_lock(); |
420 | return ERR_PTR(-EINVAL); | 324 | restart: |
421 | 325 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | |
422 | while (!(entry = shmem_swp_entry(info, index, &page))) { | 326 | (void ***)pages, indices, start, nr_pages); |
423 | if (sgp == SGP_READ) | 327 | ret = 0; |
424 | return shmem_swp_map(ZERO_PAGE(0)); | 328 | for (i = 0; i < nr_found; i++) { |
425 | /* | 329 | struct page *page; |
426 | * Test used_blocks against 1 less max_blocks, since we have 1 data | 330 | repeat: |
427 | * page (and perhaps indirect index pages) yet to allocate: | 331 | page = radix_tree_deref_slot((void **)pages[i]); |
428 | * a waste to allocate index if we cannot allocate data. | 332 | if (unlikely(!page)) |
429 | */ | 333 | continue; |
430 | if (sbinfo->max_blocks) { | 334 | if (radix_tree_exception(page)) { |
431 | if (percpu_counter_compare(&sbinfo->used_blocks, | 335 | if (radix_tree_deref_retry(page)) |
432 | sbinfo->max_blocks - 1) >= 0) | 336 | goto restart; |
433 | return ERR_PTR(-ENOSPC); | 337 | /* |
434 | percpu_counter_inc(&sbinfo->used_blocks); | 338 | * Otherwise, we must be storing a swap entry |
435 | spin_lock(&inode->i_lock); | 339 | * here as an exceptional entry: so return it |
436 | inode->i_blocks += BLOCKS_PER_PAGE; | 340 | * without attempting to raise page count. |
437 | spin_unlock(&inode->i_lock); | 341 | */ |
342 | goto export; | ||
438 | } | 343 | } |
344 | if (!page_cache_get_speculative(page)) | ||
345 | goto repeat; | ||
439 | 346 | ||
440 | spin_unlock(&info->lock); | 347 | /* Has the page moved? */ |
441 | page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); | 348 | if (unlikely(page != *((void **)pages[i]))) { |
442 | spin_lock(&info->lock); | 349 | page_cache_release(page); |
443 | 350 | goto repeat; | |
444 | if (!page) { | ||
445 | shmem_free_blocks(inode, 1); | ||
446 | return ERR_PTR(-ENOMEM); | ||
447 | } | ||
448 | if (sgp != SGP_WRITE && | ||
449 | ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { | ||
450 | entry = ERR_PTR(-EINVAL); | ||
451 | break; | ||
452 | } | 351 | } |
453 | if (info->next_index <= index) | 352 | export: |
454 | info->next_index = index + 1; | 353 | indices[ret] = indices[i]; |
455 | } | 354 | pages[ret] = page; |
456 | if (page) { | 355 | ret++; |
457 | /* another task gave its page, or truncated the file */ | 356 | } |
458 | shmem_free_blocks(inode, 1); | 357 | if (unlikely(!ret && nr_found)) |
459 | shmem_dir_free(page); | 358 | goto restart; |
460 | } | 359 | rcu_read_unlock(); |
461 | if (info->next_index <= index && !IS_ERR(entry)) | 360 | return ret; |
462 | info->next_index = index + 1; | ||
463 | return entry; | ||
464 | } | 361 | } |
465 | 362 | ||
466 | /** | 363 | /* |
467 | * shmem_free_swp - free some swap entries in a directory | 364 | * Remove swap entry from radix tree, free the swap and its page cache. |
468 | * @dir: pointer to the directory | ||
469 | * @edir: pointer after last entry of the directory | ||
470 | * @punch_lock: pointer to spinlock when needed for the holepunch case | ||
471 | */ | 365 | */ |
472 | static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, | 366 | static int shmem_free_swap(struct address_space *mapping, |
473 | spinlock_t *punch_lock) | 367 | pgoff_t index, void *radswap) |
474 | { | 368 | { |
475 | spinlock_t *punch_unlock = NULL; | 369 | int error; |
476 | swp_entry_t *ptr; | 370 | |
477 | int freed = 0; | 371 | spin_lock_irq(&mapping->tree_lock); |
478 | 372 | error = shmem_radix_tree_replace(mapping, index, radswap, NULL); | |
479 | for (ptr = dir; ptr < edir; ptr++) { | 373 | spin_unlock_irq(&mapping->tree_lock); |
480 | if (ptr->val) { | 374 | if (!error) |
481 | if (unlikely(punch_lock)) { | 375 | free_swap_and_cache(radix_to_swp_entry(radswap)); |
482 | punch_unlock = punch_lock; | 376 | return error; |
483 | punch_lock = NULL; | ||
484 | spin_lock(punch_unlock); | ||
485 | if (!ptr->val) | ||
486 | continue; | ||
487 | } | ||
488 | free_swap_and_cache(*ptr); | ||
489 | *ptr = (swp_entry_t){0}; | ||
490 | freed++; | ||
491 | } | ||
492 | } | ||
493 | if (punch_unlock) | ||
494 | spin_unlock(punch_unlock); | ||
495 | return freed; | ||
496 | } | ||
497 | |||
498 | static int shmem_map_and_free_swp(struct page *subdir, int offset, | ||
499 | int limit, struct page ***dir, spinlock_t *punch_lock) | ||
500 | { | ||
501 | swp_entry_t *ptr; | ||
502 | int freed = 0; | ||
503 | |||
504 | ptr = shmem_swp_map(subdir); | ||
505 | for (; offset < limit; offset += LATENCY_LIMIT) { | ||
506 | int size = limit - offset; | ||
507 | if (size > LATENCY_LIMIT) | ||
508 | size = LATENCY_LIMIT; | ||
509 | freed += shmem_free_swp(ptr+offset, ptr+offset+size, | ||
510 | punch_lock); | ||
511 | if (need_resched()) { | ||
512 | shmem_swp_unmap(ptr); | ||
513 | if (*dir) { | ||
514 | shmem_dir_unmap(*dir); | ||
515 | *dir = NULL; | ||
516 | } | ||
517 | cond_resched(); | ||
518 | ptr = shmem_swp_map(subdir); | ||
519 | } | ||
520 | } | ||
521 | shmem_swp_unmap(ptr); | ||
522 | return freed; | ||
523 | } | 377 | } |
524 | 378 | ||
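shmem_free_swap() above pulls a swap entry back out of a radix-tree slot with radix_to_swp_entry(): the slot holds no struct page at all, the swap value having been shifted up and tagged in its low bits so lookups can tell it apart from an aligned page pointer (the radix_tree_exceptional_entry() test). The sketch below is a user-space illustration of that tagging idea; the shift and tag values are assumptions chosen for the demo, not a copy of the kernel's definitions.

/*
 * Demo encoding of a swap entry into a slot that normally holds an
 * aligned pointer.  EXCEPTIONAL_* values are hypothetical.
 */
#include <assert.h>
#include <stdio.h>

#define EXCEPTIONAL_ENTRY	2UL	/* low-bit tag: "not a page pointer" */
#define EXCEPTIONAL_SHIFT	2

typedef struct { unsigned long val; } swp_entry_demo;

static void *swp_to_slot(swp_entry_demo e)
{
	return (void *)((e.val << EXCEPTIONAL_SHIFT) | EXCEPTIONAL_ENTRY);
}

static int slot_is_exceptional(void *slot)
{
	return (unsigned long)slot & EXCEPTIONAL_ENTRY;
}

static swp_entry_demo slot_to_swp(void *slot)
{
	swp_entry_demo e = { .val = (unsigned long)slot >> EXCEPTIONAL_SHIFT };
	return e;
}

int main(void)
{
	swp_entry_demo e = { .val = 0x1234 };
	void *slot = swp_to_slot(e);

	assert(slot_is_exceptional(slot));
	assert(slot_to_swp(slot).val == e.val);
	printf("slot=%p decodes back to swap value %#lx\n", slot, slot_to_swp(slot).val);
	return 0;
}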
525 | static void shmem_free_pages(struct list_head *next) | 379 | /* |
380 | * Pagevec may contain swap entries, so shuffle up pages before releasing. | ||
381 | */ | ||
382 | static void shmem_pagevec_release(struct pagevec *pvec) | ||
526 | { | 383 | { |
527 | struct page *page; | 384 | int i, j; |
528 | int freed = 0; | 385 | |
529 | 386 | for (i = 0, j = 0; i < pagevec_count(pvec); i++) { | |
530 | do { | 387 | struct page *page = pvec->pages[i]; |
531 | page = container_of(next, struct page, lru); | 388 | if (!radix_tree_exceptional_entry(page)) |
532 | next = next->next; | 389 | pvec->pages[j++] = page; |
533 | shmem_dir_free(page); | 390 | } |
534 | freed++; | 391 | pvec->nr = j; |
535 | if (freed >= LATENCY_LIMIT) { | 392 | pagevec_release(pvec); |
536 | cond_resched(); | ||
537 | freed = 0; | ||
538 | } | ||
539 | } while (next); | ||
540 | } | 393 | } |
541 | 394 | ||
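shmem_pagevec_release() above has to cope with the gang lookup handing back a mix of real page pointers and tagged swap markers in the same array: real pages are shuffled to the front so an unmodified pagevec_release() can drop their references. A user-space analogue of that in-place compaction (the tag test is stubbed with the same low-bit trick, purely for illustration):

/*
 * Keep only "real pointer" entries, compacting them to the front of the
 * vector; tagged entries stand in for swap markers.
 */
#include <stdio.h>

#define NR		6
#define SWAP_TAG(x)	((void *)(((unsigned long)(x) << 2) | 2))
#define IS_SWAP(p)	((unsigned long)(p) & 2)

int main(void)
{
	int a = 1, b = 2, c = 3;
	void *vec[NR] = { &a, SWAP_TAG(7), &b, SWAP_TAG(9), &c, SWAP_TAG(11) };
	int i, j = 0;

	for (i = 0; i < NR; i++)		/* shuffle real pages to the front */
		if (!IS_SWAP(vec[i]))
			vec[j++] = vec[i];

	printf("%d real entries kept out of %d slots\n", j, NR);	/* 3 of 6 */
	return 0;
}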
542 | void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) | 395 | /* |
396 | * Remove range of pages and swap entries from radix tree, and free them. | ||
397 | */ | ||
398 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | ||
543 | { | 399 | { |
400 | struct address_space *mapping = inode->i_mapping; | ||
544 | struct shmem_inode_info *info = SHMEM_I(inode); | 401 | struct shmem_inode_info *info = SHMEM_I(inode); |
545 | unsigned long idx; | 402 | pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
546 | unsigned long size; | 403 | unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); |
547 | unsigned long limit; | 404 | pgoff_t end = (lend >> PAGE_CACHE_SHIFT); |
548 | unsigned long stage; | 405 | struct pagevec pvec; |
549 | unsigned long diroff; | 406 | pgoff_t indices[PAGEVEC_SIZE]; |
550 | struct page **dir; | ||
551 | struct page *topdir; | ||
552 | struct page *middir; | ||
553 | struct page *subdir; | ||
554 | swp_entry_t *ptr; | ||
555 | LIST_HEAD(pages_to_free); | ||
556 | long nr_pages_to_free = 0; | ||
557 | long nr_swaps_freed = 0; | 407 | long nr_swaps_freed = 0; |
558 | int offset; | 408 | pgoff_t index; |
559 | int freed; | 409 | int i; |
560 | int punch_hole; | ||
561 | spinlock_t *needs_lock; | ||
562 | spinlock_t *punch_lock; | ||
563 | unsigned long upper_limit; | ||
564 | 410 | ||
565 | truncate_inode_pages_range(inode->i_mapping, start, end); | 411 | BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); |
566 | 412 | ||
567 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 413 | pagevec_init(&pvec, 0); |
568 | idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 414 | index = start; |
569 | if (idx >= info->next_index) | 415 | while (index <= end) { |
570 | return; | 416 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, |
417 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, | ||
418 | pvec.pages, indices); | ||
419 | if (!pvec.nr) | ||
420 | break; | ||
421 | mem_cgroup_uncharge_start(); | ||
422 | for (i = 0; i < pagevec_count(&pvec); i++) { | ||
423 | struct page *page = pvec.pages[i]; | ||
571 | 424 | ||
572 | spin_lock(&info->lock); | 425 | index = indices[i]; |
573 | info->flags |= SHMEM_TRUNCATE; | 426 | if (index > end) |
574 | if (likely(end == (loff_t) -1)) { | 427 | break; |
575 | limit = info->next_index; | ||
576 | upper_limit = SHMEM_MAX_INDEX; | ||
577 | info->next_index = idx; | ||
578 | needs_lock = NULL; | ||
579 | punch_hole = 0; | ||
580 | } else { | ||
581 | if (end + 1 >= inode->i_size) { /* we may free a little more */ | ||
582 | limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >> | ||
583 | PAGE_CACHE_SHIFT; | ||
584 | upper_limit = SHMEM_MAX_INDEX; | ||
585 | } else { | ||
586 | limit = (end + 1) >> PAGE_CACHE_SHIFT; | ||
587 | upper_limit = limit; | ||
588 | } | ||
589 | needs_lock = &info->lock; | ||
590 | punch_hole = 1; | ||
591 | } | ||
592 | 428 | ||
593 | topdir = info->i_indirect; | 429 | if (radix_tree_exceptional_entry(page)) { |
594 | if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { | 430 | nr_swaps_freed += !shmem_free_swap(mapping, |
595 | info->i_indirect = NULL; | 431 | index, page); |
596 | nr_pages_to_free++; | 432 | continue; |
597 | list_add(&topdir->lru, &pages_to_free); | 433 | } |
434 | |||
435 | if (!trylock_page(page)) | ||
436 | continue; | ||
437 | if (page->mapping == mapping) { | ||
438 | VM_BUG_ON(PageWriteback(page)); | ||
439 | truncate_inode_page(mapping, page); | ||
440 | } | ||
441 | unlock_page(page); | ||
442 | } | ||
443 | shmem_pagevec_release(&pvec); | ||
444 | mem_cgroup_uncharge_end(); | ||
445 | cond_resched(); | ||
446 | index++; | ||
598 | } | 447 | } |
599 | spin_unlock(&info->lock); | ||
600 | 448 | ||
601 | if (info->swapped && idx < SHMEM_NR_DIRECT) { | 449 | if (partial) { |
602 | ptr = info->i_direct; | 450 | struct page *page = NULL; |
603 | size = limit; | 451 | shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); |
604 | if (size > SHMEM_NR_DIRECT) | 452 | if (page) { |
605 | size = SHMEM_NR_DIRECT; | 453 | zero_user_segment(page, partial, PAGE_CACHE_SIZE); |
606 | nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); | 454 | set_page_dirty(page); |
455 | unlock_page(page); | ||
456 | page_cache_release(page); | ||
457 | } | ||
607 | } | 458 | } |
608 | 459 | ||
609 | /* | 460 | index = start; |
610 | * If there are no indirect blocks or we are punching a hole | 461 | for ( ; ; ) { |
611 | * below indirect blocks, nothing to be done. | 462 | cond_resched(); |
612 | */ | 463 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, |
613 | if (!topdir || limit <= SHMEM_NR_DIRECT) | 464 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, |
614 | goto done2; | 465 | pvec.pages, indices); |
466 | if (!pvec.nr) { | ||
467 | if (index == start) | ||
468 | break; | ||
469 | index = start; | ||
470 | continue; | ||
471 | } | ||
472 | if (index == start && indices[0] > end) { | ||
473 | shmem_pagevec_release(&pvec); | ||
474 | break; | ||
475 | } | ||
476 | mem_cgroup_uncharge_start(); | ||
477 | for (i = 0; i < pagevec_count(&pvec); i++) { | ||
478 | struct page *page = pvec.pages[i]; | ||
615 | 479 | ||
616 | /* | 480 | index = indices[i]; |
617 | * The truncation case has already dropped info->lock, and we're safe | 481 | if (index > end) |
618 | * because i_size and next_index have already been lowered, preventing | 482 | break; |
619 | * access beyond. But in the punch_hole case, we still need to take | ||
620 | * the lock when updating the swap directory, because there might be | ||
621 | * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or | ||
622 | * shmem_writepage. However, whenever we find we can remove a whole | ||
623 | * directory page (not at the misaligned start or end of the range), | ||
624 | * we first NULLify its pointer in the level above, and then have no | ||
625 | * need to take the lock when updating its contents: needs_lock and | ||
626 | * punch_lock (either pointing to info->lock or NULL) manage this. | ||
627 | */ | ||
628 | 483 | ||
629 | upper_limit -= SHMEM_NR_DIRECT; | 484 | if (radix_tree_exceptional_entry(page)) { |
630 | limit -= SHMEM_NR_DIRECT; | 485 | nr_swaps_freed += !shmem_free_swap(mapping, |
631 | idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0; | 486 | index, page); |
632 | offset = idx % ENTRIES_PER_PAGE; | 487 | continue; |
633 | idx -= offset; | ||
634 | |||
635 | dir = shmem_dir_map(topdir); | ||
636 | stage = ENTRIES_PER_PAGEPAGE/2; | ||
637 | if (idx < ENTRIES_PER_PAGEPAGE/2) { | ||
638 | middir = topdir; | ||
639 | diroff = idx/ENTRIES_PER_PAGE; | ||
640 | } else { | ||
641 | dir += ENTRIES_PER_PAGE/2; | ||
642 | dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE; | ||
643 | while (stage <= idx) | ||
644 | stage += ENTRIES_PER_PAGEPAGE; | ||
645 | middir = *dir; | ||
646 | if (*dir) { | ||
647 | diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) % | ||
648 | ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE; | ||
649 | if (!diroff && !offset && upper_limit >= stage) { | ||
650 | if (needs_lock) { | ||
651 | spin_lock(needs_lock); | ||
652 | *dir = NULL; | ||
653 | spin_unlock(needs_lock); | ||
654 | needs_lock = NULL; | ||
655 | } else | ||
656 | *dir = NULL; | ||
657 | nr_pages_to_free++; | ||
658 | list_add(&middir->lru, &pages_to_free); | ||
659 | } | 488 | } |
660 | shmem_dir_unmap(dir); | ||
661 | dir = shmem_dir_map(middir); | ||
662 | } else { | ||
663 | diroff = 0; | ||
664 | offset = 0; | ||
665 | idx = stage; | ||
666 | } | ||
667 | } | ||
668 | 489 | ||
669 | for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) { | 490 | lock_page(page); |
670 | if (unlikely(idx == stage)) { | 491 | if (page->mapping == mapping) { |
671 | shmem_dir_unmap(dir); | 492 | VM_BUG_ON(PageWriteback(page)); |
672 | dir = shmem_dir_map(topdir) + | 493 | truncate_inode_page(mapping, page); |
673 | ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; | ||
674 | while (!*dir) { | ||
675 | dir++; | ||
676 | idx += ENTRIES_PER_PAGEPAGE; | ||
677 | if (idx >= limit) | ||
678 | goto done1; | ||
679 | } | 494 | } |
680 | stage = idx + ENTRIES_PER_PAGEPAGE; | 495 | unlock_page(page); |
681 | middir = *dir; | ||
682 | if (punch_hole) | ||
683 | needs_lock = &info->lock; | ||
684 | if (upper_limit >= stage) { | ||
685 | if (needs_lock) { | ||
686 | spin_lock(needs_lock); | ||
687 | *dir = NULL; | ||
688 | spin_unlock(needs_lock); | ||
689 | needs_lock = NULL; | ||
690 | } else | ||
691 | *dir = NULL; | ||
692 | nr_pages_to_free++; | ||
693 | list_add(&middir->lru, &pages_to_free); | ||
694 | } | ||
695 | shmem_dir_unmap(dir); | ||
696 | cond_resched(); | ||
697 | dir = shmem_dir_map(middir); | ||
698 | diroff = 0; | ||
699 | } | ||
700 | punch_lock = needs_lock; | ||
701 | subdir = dir[diroff]; | ||
702 | if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) { | ||
703 | if (needs_lock) { | ||
704 | spin_lock(needs_lock); | ||
705 | dir[diroff] = NULL; | ||
706 | spin_unlock(needs_lock); | ||
707 | punch_lock = NULL; | ||
708 | } else | ||
709 | dir[diroff] = NULL; | ||
710 | nr_pages_to_free++; | ||
711 | list_add(&subdir->lru, &pages_to_free); | ||
712 | } | ||
713 | if (subdir && page_private(subdir) /* has swap entries */) { | ||
714 | size = limit - idx; | ||
715 | if (size > ENTRIES_PER_PAGE) | ||
716 | size = ENTRIES_PER_PAGE; | ||
717 | freed = shmem_map_and_free_swp(subdir, | ||
718 | offset, size, &dir, punch_lock); | ||
719 | if (!dir) | ||
720 | dir = shmem_dir_map(middir); | ||
721 | nr_swaps_freed += freed; | ||
722 | if (offset || punch_lock) { | ||
723 | spin_lock(&info->lock); | ||
724 | set_page_private(subdir, | ||
725 | page_private(subdir) - freed); | ||
726 | spin_unlock(&info->lock); | ||
727 | } else | ||
728 | BUG_ON(page_private(subdir) != freed); | ||
729 | } | 496 | } |
730 | offset = 0; | 497 | shmem_pagevec_release(&pvec); |
731 | } | 498 | mem_cgroup_uncharge_end(); |
732 | done1: | 499 | index++; |
733 | shmem_dir_unmap(dir); | ||
734 | done2: | ||
735 | if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) { | ||
736 | /* | ||
737 | * Call truncate_inode_pages again: racing shmem_unuse_inode | ||
738 | * may have swizzled a page in from swap since | ||
739 | * truncate_pagecache or generic_delete_inode did it, before we | ||
740 | * lowered next_index. Also, though shmem_getpage checks | ||
741 | * i_size before adding to cache, no recheck after: so fix the | ||
742 | * narrow window there too. | ||
743 | */ | ||
744 | truncate_inode_pages_range(inode->i_mapping, start, end); | ||
745 | } | 500 | } |
746 | 501 | ||
747 | spin_lock(&info->lock); | 502 | spin_lock(&info->lock); |
748 | info->flags &= ~SHMEM_TRUNCATE; | ||
749 | info->swapped -= nr_swaps_freed; | 503 | info->swapped -= nr_swaps_freed; |
750 | if (nr_pages_to_free) | ||
751 | shmem_free_blocks(inode, nr_pages_to_free); | ||
752 | shmem_recalc_inode(inode); | 504 | shmem_recalc_inode(inode); |
753 | spin_unlock(&info->lock); | 505 | spin_unlock(&info->lock); |
754 | 506 | ||
755 | /* | 507 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
756 | * Empty swap vector directory pages to be freed? | ||
757 | */ | ||
758 | if (!list_empty(&pages_to_free)) { | ||
759 | pages_to_free.prev->next = NULL; | ||
760 | shmem_free_pages(pages_to_free.next); | ||
761 | } | ||
762 | } | 508 | } |
763 | EXPORT_SYMBOL_GPL(shmem_truncate_range); | 509 | EXPORT_SYMBOL_GPL(shmem_truncate_range); |
764 | 510 | ||
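The rewritten shmem_truncate_range() above derives three values from the byte range before it touches the tree: the first whole page index to remove, the in-page offset at which a straddled first page must be zeroed via shmem_getpage(SGP_READ), and the last index covered by lend. A worked example of that arithmetic in user space, assuming 4096-byte pages for the demo:

/*
 * Index arithmetic for truncating from byte 10000 to EOF with 4 KiB
 * pages: pages 3.. are dropped outright, page 2 is zeroed from offset
 * 1808 to the end of the page.
 */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_SHIFT	12

int main(void)
{
	unsigned long long lstart = 10000, lend = ~0ULL;	/* truncate to EOF */
	unsigned long start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;	/* 3 */
	unsigned long partial = lstart & (PAGE_SIZE - 1);		/* 1808 */
	unsigned long long end = lend >> PAGE_SHIFT;

	printf("first whole page to drop: %lu; zero page %lu from byte %lu; last index %llu\n",
	       start, start - 1, partial, end);
	return 0;
}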
@@ -774,37 +520,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
774 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { | 520 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { |
775 | loff_t oldsize = inode->i_size; | 521 | loff_t oldsize = inode->i_size; |
776 | loff_t newsize = attr->ia_size; | 522 | loff_t newsize = attr->ia_size; |
777 | struct page *page = NULL; | ||
778 | 523 | ||
779 | if (newsize < oldsize) { | ||
780 | /* | ||
781 | * If truncating down to a partial page, then | ||
782 | * if that page is already allocated, hold it | ||
783 | * in memory until the truncation is over, so | ||
784 | * truncate_partial_page cannot miss it were | ||
785 | * it assigned to swap. | ||
786 | */ | ||
787 | if (newsize & (PAGE_CACHE_SIZE-1)) { | ||
788 | (void) shmem_getpage(inode, | ||
789 | newsize >> PAGE_CACHE_SHIFT, | ||
790 | &page, SGP_READ, NULL); | ||
791 | if (page) | ||
792 | unlock_page(page); | ||
793 | } | ||
794 | /* | ||
795 | * Reset SHMEM_PAGEIN flag so that shmem_truncate can | ||
796 | * detect if any pages might have been added to cache | ||
797 | * after truncate_inode_pages. But we needn't bother | ||
798 | * if it's being fully truncated to zero-length: the | ||
799 | * nrpages check is efficient enough in that case. | ||
800 | */ | ||
801 | if (newsize) { | ||
802 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
803 | spin_lock(&info->lock); | ||
804 | info->flags &= ~SHMEM_PAGEIN; | ||
805 | spin_unlock(&info->lock); | ||
806 | } | ||
807 | } | ||
808 | if (newsize != oldsize) { | 524 | if (newsize != oldsize) { |
809 | i_size_write(inode, newsize); | 525 | i_size_write(inode, newsize); |
810 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 526 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
@@ -816,8 +532,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
816 | /* unmap again to remove racily COWed private pages */ | 532 | /* unmap again to remove racily COWed private pages */ |
817 | unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); | 533 | unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); |
818 | } | 534 | } |
819 | if (page) | ||
820 | page_cache_release(page); | ||
821 | } | 535 | } |
822 | 536 | ||
823 | setattr_copy(inode, attr); | 537 | setattr_copy(inode, attr); |
@@ -842,7 +556,8 @@ static void shmem_evict_inode(struct inode *inode) | |||
842 | list_del_init(&info->swaplist); | 556 | list_del_init(&info->swaplist); |
843 | mutex_unlock(&shmem_swaplist_mutex); | 557 | mutex_unlock(&shmem_swaplist_mutex); |
844 | } | 558 | } |
845 | } | 559 | } else |
560 | kfree(info->symlink); | ||
846 | 561 | ||
847 | list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { | 562 | list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { |
848 | kfree(xattr->name); | 563 | kfree(xattr->name); |
@@ -853,106 +568,27 @@ static void shmem_evict_inode(struct inode *inode) | |||
853 | end_writeback(inode); | 568 | end_writeback(inode); |
854 | } | 569 | } |
855 | 570 | ||
856 | static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) | 571 | /* |
857 | { | 572 | * If swap found in inode, free it and move page from swapcache to filecache. |
858 | swp_entry_t *ptr; | 573 | */ |
859 | 574 | static int shmem_unuse_inode(struct shmem_inode_info *info, | |
860 | for (ptr = dir; ptr < edir; ptr++) { | 575 | swp_entry_t swap, struct page *page) |
861 | if (ptr->val == entry.val) | ||
862 | return ptr - dir; | ||
863 | } | ||
864 | return -1; | ||
865 | } | ||
866 | |||
867 | static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) | ||
868 | { | 576 | { |
869 | struct address_space *mapping; | 577 | struct address_space *mapping = info->vfs_inode.i_mapping; |
870 | unsigned long idx; | 578 | void *radswap; |
871 | unsigned long size; | 579 | pgoff_t index; |
872 | unsigned long limit; | ||
873 | unsigned long stage; | ||
874 | struct page **dir; | ||
875 | struct page *subdir; | ||
876 | swp_entry_t *ptr; | ||
877 | int offset; | ||
878 | int error; | 580 | int error; |
879 | 581 | ||
880 | idx = 0; | 582 | radswap = swp_to_radix_entry(swap); |
881 | ptr = info->i_direct; | 583 | index = radix_tree_locate_item(&mapping->page_tree, radswap); |
882 | spin_lock(&info->lock); | 584 | if (index == -1) |
883 | if (!info->swapped) { | 585 | return 0; |
884 | list_del_init(&info->swaplist); | ||
885 | goto lost2; | ||
886 | } | ||
887 | limit = info->next_index; | ||
888 | size = limit; | ||
889 | if (size > SHMEM_NR_DIRECT) | ||
890 | size = SHMEM_NR_DIRECT; | ||
891 | offset = shmem_find_swp(entry, ptr, ptr+size); | ||
892 | if (offset >= 0) { | ||
893 | shmem_swp_balance_unmap(); | ||
894 | goto found; | ||
895 | } | ||
896 | if (!info->i_indirect) | ||
897 | goto lost2; | ||
898 | |||
899 | dir = shmem_dir_map(info->i_indirect); | ||
900 | stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2; | ||
901 | |||
902 | for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { | ||
903 | if (unlikely(idx == stage)) { | ||
904 | shmem_dir_unmap(dir-1); | ||
905 | if (cond_resched_lock(&info->lock)) { | ||
906 | /* check it has not been truncated */ | ||
907 | if (limit > info->next_index) { | ||
908 | limit = info->next_index; | ||
909 | if (idx >= limit) | ||
910 | goto lost2; | ||
911 | } | ||
912 | } | ||
913 | dir = shmem_dir_map(info->i_indirect) + | ||
914 | ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; | ||
915 | while (!*dir) { | ||
916 | dir++; | ||
917 | idx += ENTRIES_PER_PAGEPAGE; | ||
918 | if (idx >= limit) | ||
919 | goto lost1; | ||
920 | } | ||
921 | stage = idx + ENTRIES_PER_PAGEPAGE; | ||
922 | subdir = *dir; | ||
923 | shmem_dir_unmap(dir); | ||
924 | dir = shmem_dir_map(subdir); | ||
925 | } | ||
926 | subdir = *dir; | ||
927 | if (subdir && page_private(subdir)) { | ||
928 | ptr = shmem_swp_map(subdir); | ||
929 | size = limit - idx; | ||
930 | if (size > ENTRIES_PER_PAGE) | ||
931 | size = ENTRIES_PER_PAGE; | ||
932 | offset = shmem_find_swp(entry, ptr, ptr+size); | ||
933 | shmem_swp_unmap(ptr); | ||
934 | if (offset >= 0) { | ||
935 | shmem_dir_unmap(dir); | ||
936 | ptr = shmem_swp_map(subdir); | ||
937 | goto found; | ||
938 | } | ||
939 | } | ||
940 | } | ||
941 | lost1: | ||
942 | shmem_dir_unmap(dir-1); | ||
943 | lost2: | ||
944 | spin_unlock(&info->lock); | ||
945 | return 0; | ||
946 | found: | ||
947 | idx += offset; | ||
948 | ptr += offset; | ||
949 | 586 | ||
950 | /* | 587 | /* |
951 | * Move _head_ to start search for next from here. | 588 | * Move _head_ to start search for next from here. |
952 | * But be careful: shmem_evict_inode checks list_empty without taking | 589 | * But be careful: shmem_evict_inode checks list_empty without taking |
953 | * mutex, and there's an instant in list_move_tail when info->swaplist | 590 | * mutex, and there's an instant in list_move_tail when info->swaplist |
954 | * would appear empty, if it were the only one on shmem_swaplist. We | 591 | * would appear empty, if it were the only one on shmem_swaplist. |
955 | * could avoid doing it if inode NULL; or use this minor optimization. | ||
956 | */ | 592 | */ |
957 | if (shmem_swaplist.next != &info->swaplist) | 593 | if (shmem_swaplist.next != &info->swaplist) |
958 | list_move_tail(&shmem_swaplist, &info->swaplist); | 594 | list_move_tail(&shmem_swaplist, &info->swaplist); |
@@ -962,42 +598,34 @@ found: | |||
962 | * but also to hold up shmem_evict_inode(): so inode cannot be freed | 598 | * but also to hold up shmem_evict_inode(): so inode cannot be freed |
963 | * beneath us (pagelock doesn't help until the page is in pagecache). | 599 | * beneath us (pagelock doesn't help until the page is in pagecache). |
964 | */ | 600 | */ |
965 | mapping = info->vfs_inode.i_mapping; | 601 | error = shmem_add_to_page_cache(page, mapping, index, |
966 | error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); | 602 | GFP_NOWAIT, radswap); |
967 | /* which does mem_cgroup_uncharge_cache_page on error */ | 603 | /* which does mem_cgroup_uncharge_cache_page on error */ |
968 | 604 | ||
969 | if (error == -EEXIST) { | 605 | if (error != -ENOMEM) { |
970 | struct page *filepage = find_get_page(mapping, idx); | 606 | /* |
971 | error = 1; | 607 | * Truncation and eviction use free_swap_and_cache(), which |
972 | if (filepage) { | 608 | * only does trylock page: if we raced, best clean up here. |
973 | /* | 609 | */ |
974 | * There might be a more uptodate page coming down | ||
975 | * from a stacked writepage: forget our swappage if so. | ||
976 | */ | ||
977 | if (PageUptodate(filepage)) | ||
978 | error = 0; | ||
979 | page_cache_release(filepage); | ||
980 | } | ||
981 | } | ||
982 | if (!error) { | ||
983 | delete_from_swap_cache(page); | 610 | delete_from_swap_cache(page); |
984 | set_page_dirty(page); | 611 | set_page_dirty(page); |
985 | info->flags |= SHMEM_PAGEIN; | 612 | if (!error) { |
986 | shmem_swp_set(info, ptr, 0); | 613 | spin_lock(&info->lock); |
987 | swap_free(entry); | 614 | info->swapped--; |
615 | spin_unlock(&info->lock); | ||
616 | swap_free(swap); | ||
617 | } | ||
988 | error = 1; /* not an error, but entry was found */ | 618 | error = 1; /* not an error, but entry was found */ |
989 | } | 619 | } |
990 | shmem_swp_unmap(ptr); | ||
991 | spin_unlock(&info->lock); | ||
992 | return error; | 620 | return error; |
993 | } | 621 | } |
994 | 622 | ||
995 | /* | 623 | /* |
996 | * shmem_unuse() search for an eventually swapped out shmem page. | 624 | * Search through swapped inodes to find and replace swap by page. |
997 | */ | 625 | */ |
998 | int shmem_unuse(swp_entry_t entry, struct page *page) | 626 | int shmem_unuse(swp_entry_t swap, struct page *page) |
999 | { | 627 | { |
1000 | struct list_head *p, *next; | 628 | struct list_head *this, *next; |
1001 | struct shmem_inode_info *info; | 629 | struct shmem_inode_info *info; |
1002 | int found = 0; | 630 | int found = 0; |
1003 | int error; | 631 | int error; |
@@ -1006,32 +634,25 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
1006 | * Charge page using GFP_KERNEL while we can wait, before taking | 634 | * Charge page using GFP_KERNEL while we can wait, before taking |
1007 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). | 635 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). |
1008 | * Charged back to the user (not to caller) when swap account is used. | 636 | * Charged back to the user (not to caller) when swap account is used. |
1009 | * add_to_page_cache() will be called with GFP_NOWAIT. | ||
1010 | */ | 637 | */ |
1011 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); | 638 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); |
1012 | if (error) | 639 | if (error) |
1013 | goto out; | 640 | goto out; |
1014 | /* | 641 | /* No radix_tree_preload: swap entry keeps a place for page in tree */ |
1015 | * Try to preload while we can wait, to not make a habit of | ||
1016 | * draining atomic reserves; but don't latch on to this cpu, | ||
1017 | * it's okay if sometimes we get rescheduled after this. | ||
1018 | */ | ||
1019 | error = radix_tree_preload(GFP_KERNEL); | ||
1020 | if (error) | ||
1021 | goto uncharge; | ||
1022 | radix_tree_preload_end(); | ||
1023 | 642 | ||
1024 | mutex_lock(&shmem_swaplist_mutex); | 643 | mutex_lock(&shmem_swaplist_mutex); |
1025 | list_for_each_safe(p, next, &shmem_swaplist) { | 644 | list_for_each_safe(this, next, &shmem_swaplist) { |
1026 | info = list_entry(p, struct shmem_inode_info, swaplist); | 645 | info = list_entry(this, struct shmem_inode_info, swaplist); |
1027 | found = shmem_unuse_inode(info, entry, page); | 646 | if (info->swapped) |
647 | found = shmem_unuse_inode(info, swap, page); | ||
648 | else | ||
649 | list_del_init(&info->swaplist); | ||
1028 | cond_resched(); | 650 | cond_resched(); |
1029 | if (found) | 651 | if (found) |
1030 | break; | 652 | break; |
1031 | } | 653 | } |
1032 | mutex_unlock(&shmem_swaplist_mutex); | 654 | mutex_unlock(&shmem_swaplist_mutex); |
1033 | 655 | ||
1034 | uncharge: | ||
1035 | if (!found) | 656 | if (!found) |
1036 | mem_cgroup_uncharge_cache_page(page); | 657 | mem_cgroup_uncharge_cache_page(page); |
1037 | if (found < 0) | 658 | if (found < 0) |
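In the new shmem_unuse() above, the swaplist walk does double duty: inodes whose swapped count has dropped to zero are pruned from the list on the spot, and the walk stops at the first inode in which shmem_unuse_inode() reports a hit. A stand-alone sketch of that scan-and-prune loop over a plain singly linked list (all names are illustrative, not kernel code):

/*
 * Walk the list: drop nodes that no longer hold any swap, stop at the
 * first node containing the entry we are looking for.
 */
#include <stdio.h>

struct demo_info {
	int swapped;			/* count of swap entries held */
	int has_target;			/* stands in for shmem_unuse_inode() hit */
	struct demo_info *next;
};

static struct demo_info *scan(struct demo_info **head)
{
	struct demo_info **pp = head, *info;

	while ((info = *pp) != NULL) {
		if (!info->swapped) {		/* prune, like list_del_init() */
			*pp = info->next;
			continue;
		}
		if (info->has_target)
			return info;		/* "found": caller stops here */
		pp = &info->next;
	}
	return NULL;
}

int main(void)
{
	struct demo_info c = { 2, 1, NULL };
	struct demo_info b = { 0, 0, &c };	/* no swap left: gets pruned */
	struct demo_info a = { 1, 0, &b };
	struct demo_info *head = &a;
	struct demo_info *hit = scan(&head);

	printf("hit=%s, pruned node skipped: %s\n",
	       hit == &c ? "c" : "none", head->next == &c ? "yes" : "no");
	return 0;
}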
@@ -1048,10 +669,10 @@ out: | |||
1048 | static int shmem_writepage(struct page *page, struct writeback_control *wbc) | 669 | static int shmem_writepage(struct page *page, struct writeback_control *wbc) |
1049 | { | 670 | { |
1050 | struct shmem_inode_info *info; | 671 | struct shmem_inode_info *info; |
1051 | swp_entry_t *entry, swap; | ||
1052 | struct address_space *mapping; | 672 | struct address_space *mapping; |
1053 | unsigned long index; | ||
1054 | struct inode *inode; | 673 | struct inode *inode; |
674 | swp_entry_t swap; | ||
675 | pgoff_t index; | ||
1055 | 676 | ||
1056 | BUG_ON(!PageLocked(page)); | 677 | BUG_ON(!PageLocked(page)); |
1057 | mapping = page->mapping; | 678 | mapping = page->mapping; |
@@ -1066,69 +687,46 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1066 | /* | 687 | /* |
1067 | * shmem_backing_dev_info's capabilities prevent regular writeback or | 688 | * shmem_backing_dev_info's capabilities prevent regular writeback or |
1068 | * sync from ever calling shmem_writepage; but a stacking filesystem | 689 | * sync from ever calling shmem_writepage; but a stacking filesystem |
1069 | * may use the ->writepage of its underlying filesystem, in which case | 690 | * might use ->writepage of its underlying filesystem, in which case |
1070 | * tmpfs should write out to swap only in response to memory pressure, | 691 | * tmpfs should write out to swap only in response to memory pressure, |
1071 | * and not for the writeback threads or sync. However, in those cases, | 692 | * and not for the writeback threads or sync. |
1072 | * we do still want to check if there's a redundant swappage to be | ||
1073 | * discarded. | ||
1074 | */ | 693 | */ |
1075 | if (wbc->for_reclaim) | 694 | if (!wbc->for_reclaim) { |
1076 | swap = get_swap_page(); | 695 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ |
1077 | else | 696 | goto redirty; |
1078 | swap.val = 0; | 697 | } |
698 | swap = get_swap_page(); | ||
699 | if (!swap.val) | ||
700 | goto redirty; | ||
1079 | 701 | ||
1080 | /* | 702 | /* |
1081 | * Add inode to shmem_unuse()'s list of swapped-out inodes, | 703 | * Add inode to shmem_unuse()'s list of swapped-out inodes, |
1082 | * if it's not already there. Do it now because we cannot take | 704 | * if it's not already there. Do it now before the page is |
1083 | * mutex while holding spinlock, and must do so before the page | 705 | * moved to swap cache, when its pagelock no longer protects |
1084 | * is moved to swap cache, when its pagelock no longer protects | ||
1085 | * the inode from eviction. But don't unlock the mutex until | 706 | * the inode from eviction. But don't unlock the mutex until |
1086 | * we've taken the spinlock, because shmem_unuse_inode() will | 707 | * we've incremented swapped, because shmem_unuse_inode() will |
1087 | * prune a !swapped inode from the swaplist under both locks. | 708 | * prune a !swapped inode from the swaplist under this mutex. |
1088 | */ | 709 | */ |
1089 | if (swap.val) { | 710 | mutex_lock(&shmem_swaplist_mutex); |
1090 | mutex_lock(&shmem_swaplist_mutex); | 711 | if (list_empty(&info->swaplist)) |
1091 | if (list_empty(&info->swaplist)) | 712 | list_add_tail(&info->swaplist, &shmem_swaplist); |
1092 | list_add_tail(&info->swaplist, &shmem_swaplist); | ||
1093 | } | ||
1094 | |||
1095 | spin_lock(&info->lock); | ||
1096 | if (swap.val) | ||
1097 | mutex_unlock(&shmem_swaplist_mutex); | ||
1098 | |||
1099 | if (index >= info->next_index) { | ||
1100 | BUG_ON(!(info->flags & SHMEM_TRUNCATE)); | ||
1101 | goto unlock; | ||
1102 | } | ||
1103 | entry = shmem_swp_entry(info, index, NULL); | ||
1104 | if (entry->val) { | ||
1105 | /* | ||
1106 | * The more uptodate page coming down from a stacked | ||
1107 | * writepage should replace our old swappage. | ||
1108 | */ | ||
1109 | free_swap_and_cache(*entry); | ||
1110 | shmem_swp_set(info, entry, 0); | ||
1111 | } | ||
1112 | shmem_recalc_inode(inode); | ||
1113 | 713 | ||
1114 | if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { | 714 | if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { |
1115 | delete_from_page_cache(page); | ||
1116 | shmem_swp_set(info, entry, swap.val); | ||
1117 | shmem_swp_unmap(entry); | ||
1118 | swap_shmem_alloc(swap); | 715 | swap_shmem_alloc(swap); |
716 | shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); | ||
717 | |||
718 | spin_lock(&info->lock); | ||
719 | info->swapped++; | ||
720 | shmem_recalc_inode(inode); | ||
1119 | spin_unlock(&info->lock); | 721 | spin_unlock(&info->lock); |
722 | |||
723 | mutex_unlock(&shmem_swaplist_mutex); | ||
1120 | BUG_ON(page_mapped(page)); | 724 | BUG_ON(page_mapped(page)); |
1121 | swap_writepage(page, wbc); | 725 | swap_writepage(page, wbc); |
1122 | return 0; | 726 | return 0; |
1123 | } | 727 | } |
1124 | 728 | ||
1125 | shmem_swp_unmap(entry); | 729 | mutex_unlock(&shmem_swaplist_mutex); |
1126 | unlock: | ||
1127 | spin_unlock(&info->lock); | ||
1128 | /* | ||
1129 | * add_to_swap_cache() doesn't return -EEXIST, so we can safely | ||
1130 | * clear SWAP_HAS_CACHE flag. | ||
1131 | */ | ||
1132 | swapcache_free(swap, NULL); | 730 | swapcache_free(swap, NULL); |
1133 | redirty: | 731 | redirty: |
1134 | set_page_dirty(page); | 732 | set_page_dirty(page); |
@@ -1165,35 +763,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | |||
1165 | } | 763 | } |
1166 | #endif /* CONFIG_TMPFS */ | 764 | #endif /* CONFIG_TMPFS */ |
1167 | 765 | ||
1168 | static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, | 766 | static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, |
1169 | struct shmem_inode_info *info, unsigned long idx) | 767 | struct shmem_inode_info *info, pgoff_t index) |
1170 | { | 768 | { |
1171 | struct mempolicy mpol, *spol; | 769 | struct mempolicy mpol, *spol; |
1172 | struct vm_area_struct pvma; | 770 | struct vm_area_struct pvma; |
1173 | struct page *page; | ||
1174 | 771 | ||
1175 | spol = mpol_cond_copy(&mpol, | 772 | spol = mpol_cond_copy(&mpol, |
1176 | mpol_shared_policy_lookup(&info->policy, idx)); | 773 | mpol_shared_policy_lookup(&info->policy, index)); |
1177 | 774 | ||
1178 | /* Create a pseudo vma that just contains the policy */ | 775 | /* Create a pseudo vma that just contains the policy */ |
1179 | pvma.vm_start = 0; | 776 | pvma.vm_start = 0; |
1180 | pvma.vm_pgoff = idx; | 777 | pvma.vm_pgoff = index; |
1181 | pvma.vm_ops = NULL; | 778 | pvma.vm_ops = NULL; |
1182 | pvma.vm_policy = spol; | 779 | pvma.vm_policy = spol; |
1183 | page = swapin_readahead(entry, gfp, &pvma, 0); | 780 | return swapin_readahead(swap, gfp, &pvma, 0); |
1184 | return page; | ||
1185 | } | 781 | } |
1186 | 782 | ||
1187 | static struct page *shmem_alloc_page(gfp_t gfp, | 783 | static struct page *shmem_alloc_page(gfp_t gfp, |
1188 | struct shmem_inode_info *info, unsigned long idx) | 784 | struct shmem_inode_info *info, pgoff_t index) |
1189 | { | 785 | { |
1190 | struct vm_area_struct pvma; | 786 | struct vm_area_struct pvma; |
1191 | 787 | ||
1192 | /* Create a pseudo vma that just contains the policy */ | 788 | /* Create a pseudo vma that just contains the policy */ |
1193 | pvma.vm_start = 0; | 789 | pvma.vm_start = 0; |
1194 | pvma.vm_pgoff = idx; | 790 | pvma.vm_pgoff = index; |
1195 | pvma.vm_ops = NULL; | 791 | pvma.vm_ops = NULL; |
1196 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); | 792 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); |
1197 | 793 | ||
1198 | /* | 794 | /* |
1199 | * alloc_page_vma() will drop the shared policy reference | 795 | * alloc_page_vma() will drop the shared policy reference |
@@ -1202,19 +798,19 @@ static struct page *shmem_alloc_page(gfp_t gfp, | |||
1202 | } | 798 | } |
1203 | #else /* !CONFIG_NUMA */ | 799 | #else /* !CONFIG_NUMA */ |
1204 | #ifdef CONFIG_TMPFS | 800 | #ifdef CONFIG_TMPFS |
1205 | static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) | 801 | static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) |
1206 | { | 802 | { |
1207 | } | 803 | } |
1208 | #endif /* CONFIG_TMPFS */ | 804 | #endif /* CONFIG_TMPFS */ |
1209 | 805 | ||
1210 | static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, | 806 | static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, |
1211 | struct shmem_inode_info *info, unsigned long idx) | 807 | struct shmem_inode_info *info, pgoff_t index) |
1212 | { | 808 | { |
1213 | return swapin_readahead(entry, gfp, NULL, 0); | 809 | return swapin_readahead(swap, gfp, NULL, 0); |
1214 | } | 810 | } |
1215 | 811 | ||
1216 | static inline struct page *shmem_alloc_page(gfp_t gfp, | 812 | static inline struct page *shmem_alloc_page(gfp_t gfp, |
1217 | struct shmem_inode_info *info, unsigned long idx) | 813 | struct shmem_inode_info *info, pgoff_t index) |
1218 | { | 814 | { |
1219 | return alloc_page(gfp); | 815 | return alloc_page(gfp); |
1220 | } | 816 | } |
@@ -1228,311 +824,195 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | |||
1228 | #endif | 824 | #endif |
1229 | 825 | ||
1230 | /* | 826 | /* |
1231 | * shmem_getpage - either get the page from swap or allocate a new one | 827 | * shmem_getpage_gfp - find page in cache, or get from swap, or allocate |
1232 | * | 828 | * |
1233 | * If we allocate a new one we do not mark it dirty. That's up to the | 829 | * If we allocate a new one we do not mark it dirty. That's up to the |
1234 | * vm. If we swap it in we mark it dirty since we also free the swap | 830 | * vm. If we swap it in we mark it dirty since we also free the swap |
1235 | * entry since a page cannot live in both the swap and page cache | 831 | * entry since a page cannot live in both the swap and page cache |
1236 | */ | 832 | */ |
1237 | static int shmem_getpage(struct inode *inode, unsigned long idx, | 833 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, |
1238 | struct page **pagep, enum sgp_type sgp, int *type) | 834 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) |
1239 | { | 835 | { |
1240 | struct address_space *mapping = inode->i_mapping; | 836 | struct address_space *mapping = inode->i_mapping; |
1241 | struct shmem_inode_info *info = SHMEM_I(inode); | 837 | struct shmem_inode_info *info; |
1242 | struct shmem_sb_info *sbinfo; | 838 | struct shmem_sb_info *sbinfo; |
1243 | struct page *filepage = *pagep; | 839 | struct page *page; |
1244 | struct page *swappage; | ||
1245 | struct page *prealloc_page = NULL; | ||
1246 | swp_entry_t *entry; | ||
1247 | swp_entry_t swap; | 840 | swp_entry_t swap; |
1248 | gfp_t gfp; | ||
1249 | int error; | 841 | int error; |
842 | int once = 0; | ||
1250 | 843 | ||
1251 | if (idx >= SHMEM_MAX_INDEX) | 844 | if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) |
1252 | return -EFBIG; | 845 | return -EFBIG; |
846 | repeat: | ||
847 | swap.val = 0; | ||
848 | page = find_lock_page(mapping, index); | ||
849 | if (radix_tree_exceptional_entry(page)) { | ||
850 | swap = radix_to_swp_entry(page); | ||
851 | page = NULL; | ||
852 | } | ||
1253 | 853 | ||
1254 | if (type) | 854 | if (sgp != SGP_WRITE && |
1255 | *type = 0; | 855 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { |
856 | error = -EINVAL; | ||
857 | goto failed; | ||
858 | } | ||
1256 | 859 | ||
1257 | /* | 860 | if (page || (sgp == SGP_READ && !swap.val)) { |
1258 | * Normally, filepage is NULL on entry, and either found | ||
1259 | * uptodate immediately, or allocated and zeroed, or read | ||
1260 | * in under swappage, which is then assigned to filepage. | ||
1261 | * But shmem_readpage (required for splice) passes in a locked | ||
1262 | * filepage, which may be found not uptodate by other callers | ||
1263 | * too, and may need to be copied from the swappage read in. | ||
1264 | */ | ||
1265 | repeat: | ||
1266 | if (!filepage) | ||
1267 | filepage = find_lock_page(mapping, idx); | ||
1268 | if (filepage && PageUptodate(filepage)) | ||
1269 | goto done; | ||
1270 | gfp = mapping_gfp_mask(mapping); | ||
1271 | if (!filepage) { | ||
1272 | /* | 861 | /* |
1273 | * Try to preload while we can wait, to not make a habit of | 862 | * Once we can get the page lock, it must be uptodate: |
1274 | * draining atomic reserves; but don't latch on to this cpu. | 863 | * if there were an error in reading back from swap, |
864 | * the page would not be inserted into the filecache. | ||
1275 | */ | 865 | */ |
1276 | error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); | 866 | BUG_ON(page && !PageUptodate(page)); |
1277 | if (error) | 867 | *pagep = page; |
1278 | goto failed; | 868 | return 0; |
1279 | radix_tree_preload_end(); | ||
1280 | if (sgp != SGP_READ && !prealloc_page) { | ||
1281 | /* We don't care if this fails */ | ||
1282 | prealloc_page = shmem_alloc_page(gfp, info, idx); | ||
1283 | if (prealloc_page) { | ||
1284 | if (mem_cgroup_cache_charge(prealloc_page, | ||
1285 | current->mm, GFP_KERNEL)) { | ||
1286 | page_cache_release(prealloc_page); | ||
1287 | prealloc_page = NULL; | ||
1288 | } | ||
1289 | } | ||
1290 | } | ||
1291 | } | 869 | } |
1292 | error = 0; | ||
1293 | 870 | ||
1294 | spin_lock(&info->lock); | 871 | /* |
1295 | shmem_recalc_inode(inode); | 872 | * Fast cache lookup did not find it: |
1296 | entry = shmem_swp_alloc(info, idx, sgp); | 873 | * bring it back from swap or allocate. |
1297 | if (IS_ERR(entry)) { | 874 | */ |
1298 | spin_unlock(&info->lock); | 875 | info = SHMEM_I(inode); |
1299 | error = PTR_ERR(entry); | 876 | sbinfo = SHMEM_SB(inode->i_sb); |
1300 | goto failed; | ||
1301 | } | ||
1302 | swap = *entry; | ||
1303 | 877 | ||
1304 | if (swap.val) { | 878 | if (swap.val) { |
1305 | /* Look it up and read it in.. */ | 879 | /* Look it up and read it in.. */ |
1306 | swappage = lookup_swap_cache(swap); | 880 | page = lookup_swap_cache(swap); |
1307 | if (!swappage) { | 881 | if (!page) { |
1308 | shmem_swp_unmap(entry); | ||
1309 | spin_unlock(&info->lock); | ||
1310 | /* here we actually do the io */ | 882 | /* here we actually do the io */ |
1311 | if (type) | 883 | if (fault_type) |
1312 | *type |= VM_FAULT_MAJOR; | 884 | *fault_type |= VM_FAULT_MAJOR; |
1313 | swappage = shmem_swapin(swap, gfp, info, idx); | 885 | page = shmem_swapin(swap, gfp, info, index); |
1314 | if (!swappage) { | 886 | if (!page) { |
1315 | spin_lock(&info->lock); | 887 | error = -ENOMEM; |
1316 | entry = shmem_swp_alloc(info, idx, sgp); | 888 | goto failed; |
1317 | if (IS_ERR(entry)) | ||
1318 | error = PTR_ERR(entry); | ||
1319 | else { | ||
1320 | if (entry->val == swap.val) | ||
1321 | error = -ENOMEM; | ||
1322 | shmem_swp_unmap(entry); | ||
1323 | } | ||
1324 | spin_unlock(&info->lock); | ||
1325 | if (error) | ||
1326 | goto failed; | ||
1327 | goto repeat; | ||
1328 | } | 889 | } |
1329 | wait_on_page_locked(swappage); | ||
1330 | page_cache_release(swappage); | ||
1331 | goto repeat; | ||
1332 | } | 890 | } |
1333 | 891 | ||
1334 | /* We have to do this with page locked to prevent races */ | 892 | /* We have to do this with page locked to prevent races */ |
1335 | if (!trylock_page(swappage)) { | 893 | lock_page(page); |
1336 | shmem_swp_unmap(entry); | 894 | if (!PageUptodate(page)) { |
1337 | spin_unlock(&info->lock); | ||
1338 | wait_on_page_locked(swappage); | ||
1339 | page_cache_release(swappage); | ||
1340 | goto repeat; | ||
1341 | } | ||
1342 | if (PageWriteback(swappage)) { | ||
1343 | shmem_swp_unmap(entry); | ||
1344 | spin_unlock(&info->lock); | ||
1345 | wait_on_page_writeback(swappage); | ||
1346 | unlock_page(swappage); | ||
1347 | page_cache_release(swappage); | ||
1348 | goto repeat; | ||
1349 | } | ||
1350 | if (!PageUptodate(swappage)) { | ||
1351 | shmem_swp_unmap(entry); | ||
1352 | spin_unlock(&info->lock); | ||
1353 | unlock_page(swappage); | ||
1354 | page_cache_release(swappage); | ||
1355 | error = -EIO; | 895 | error = -EIO; |
1356 | goto failed; | 896 | goto failed; |
1357 | } | 897 | } |
1358 | 898 | wait_on_page_writeback(page); | |
1359 | if (filepage) { | 899 | |
1360 | shmem_swp_set(info, entry, 0); | 900 | /* Someone may have already done it for us */ |
1361 | shmem_swp_unmap(entry); | 901 | if (page->mapping) { |
1362 | delete_from_swap_cache(swappage); | 902 | if (page->mapping == mapping && |
1363 | spin_unlock(&info->lock); | 903 | page->index == index) |
1364 | copy_highpage(filepage, swappage); | 904 | goto done; |
1365 | unlock_page(swappage); | 905 | error = -EEXIST; |
1366 | page_cache_release(swappage); | 906 | goto failed; |
1367 | flush_dcache_page(filepage); | ||
1368 | SetPageUptodate(filepage); | ||
1369 | set_page_dirty(filepage); | ||
1370 | swap_free(swap); | ||
1371 | } else if (!(error = add_to_page_cache_locked(swappage, mapping, | ||
1372 | idx, GFP_NOWAIT))) { | ||
1373 | info->flags |= SHMEM_PAGEIN; | ||
1374 | shmem_swp_set(info, entry, 0); | ||
1375 | shmem_swp_unmap(entry); | ||
1376 | delete_from_swap_cache(swappage); | ||
1377 | spin_unlock(&info->lock); | ||
1378 | filepage = swappage; | ||
1379 | set_page_dirty(filepage); | ||
1380 | swap_free(swap); | ||
1381 | } else { | ||
1382 | shmem_swp_unmap(entry); | ||
1383 | spin_unlock(&info->lock); | ||
1384 | if (error == -ENOMEM) { | ||
1385 | /* | ||
1386 | * reclaim from proper memory cgroup and | ||
1387 | * call memcg's OOM if needed. | ||
1388 | */ | ||
1389 | error = mem_cgroup_shmem_charge_fallback( | ||
1390 | swappage, | ||
1391 | current->mm, | ||
1392 | gfp); | ||
1393 | if (error) { | ||
1394 | unlock_page(swappage); | ||
1395 | page_cache_release(swappage); | ||
1396 | goto failed; | ||
1397 | } | ||
1398 | } | ||
1399 | unlock_page(swappage); | ||
1400 | page_cache_release(swappage); | ||
1401 | goto repeat; | ||
1402 | } | ||
1403 | } else if (sgp == SGP_READ && !filepage) { | ||
1404 | shmem_swp_unmap(entry); | ||
1405 | filepage = find_get_page(mapping, idx); | ||
1406 | if (filepage && | ||
1407 | (!PageUptodate(filepage) || !trylock_page(filepage))) { | ||
1408 | spin_unlock(&info->lock); | ||
1409 | wait_on_page_locked(filepage); | ||
1410 | page_cache_release(filepage); | ||
1411 | filepage = NULL; | ||
1412 | goto repeat; | ||
1413 | } | 907 | } |
908 | |||
909 | error = mem_cgroup_cache_charge(page, current->mm, | ||
910 | gfp & GFP_RECLAIM_MASK); | ||
911 | if (!error) | ||
912 | error = shmem_add_to_page_cache(page, mapping, index, | ||
913 | gfp, swp_to_radix_entry(swap)); | ||
914 | if (error) | ||
915 | goto failed; | ||
916 | |||
917 | spin_lock(&info->lock); | ||
918 | info->swapped--; | ||
919 | shmem_recalc_inode(inode); | ||
1414 | spin_unlock(&info->lock); | 920 | spin_unlock(&info->lock); |
921 | |||
922 | delete_from_swap_cache(page); | ||
923 | set_page_dirty(page); | ||
924 | swap_free(swap); | ||
925 | |||
1415 | } else { | 926 | } else { |
1416 | shmem_swp_unmap(entry); | 927 | if (shmem_acct_block(info->flags)) { |
1417 | sbinfo = SHMEM_SB(inode->i_sb); | 928 | error = -ENOSPC; |
929 | goto failed; | ||
930 | } | ||
1418 | if (sbinfo->max_blocks) { | 931 | if (sbinfo->max_blocks) { |
1419 | if (percpu_counter_compare(&sbinfo->used_blocks, | 932 | if (percpu_counter_compare(&sbinfo->used_blocks, |
1420 | sbinfo->max_blocks) >= 0 || | 933 | sbinfo->max_blocks) >= 0) { |
1421 | shmem_acct_block(info->flags)) | 934 | error = -ENOSPC; |
1422 | goto nospace; | 935 | goto unacct; |
1423 | percpu_counter_inc(&sbinfo->used_blocks); | ||
1424 | spin_lock(&inode->i_lock); | ||
1425 | inode->i_blocks += BLOCKS_PER_PAGE; | ||
1426 | spin_unlock(&inode->i_lock); | ||
1427 | } else if (shmem_acct_block(info->flags)) | ||
1428 | goto nospace; | ||
1429 | |||
1430 | if (!filepage) { | ||
1431 | int ret; | ||
1432 | |||
1433 | if (!prealloc_page) { | ||
1434 | spin_unlock(&info->lock); | ||
1435 | filepage = shmem_alloc_page(gfp, info, idx); | ||
1436 | if (!filepage) { | ||
1437 | shmem_unacct_blocks(info->flags, 1); | ||
1438 | shmem_free_blocks(inode, 1); | ||
1439 | error = -ENOMEM; | ||
1440 | goto failed; | ||
1441 | } | ||
1442 | SetPageSwapBacked(filepage); | ||
1443 | |||
1444 | /* | ||
1445 | * Precharge page while we can wait, compensate | ||
1446 | * after | ||
1447 | */ | ||
1448 | error = mem_cgroup_cache_charge(filepage, | ||
1449 | current->mm, GFP_KERNEL); | ||
1450 | if (error) { | ||
1451 | page_cache_release(filepage); | ||
1452 | shmem_unacct_blocks(info->flags, 1); | ||
1453 | shmem_free_blocks(inode, 1); | ||
1454 | filepage = NULL; | ||
1455 | goto failed; | ||
1456 | } | ||
1457 | |||
1458 | spin_lock(&info->lock); | ||
1459 | } else { | ||
1460 | filepage = prealloc_page; | ||
1461 | prealloc_page = NULL; | ||
1462 | SetPageSwapBacked(filepage); | ||
1463 | } | 936 | } |
937 | percpu_counter_inc(&sbinfo->used_blocks); | ||
938 | } | ||
1464 | 939 | ||
1465 | entry = shmem_swp_alloc(info, idx, sgp); | 940 | page = shmem_alloc_page(gfp, info, index); |
1466 | if (IS_ERR(entry)) | 941 | if (!page) { |
1467 | error = PTR_ERR(entry); | 942 | error = -ENOMEM; |
1468 | else { | 943 | goto decused; |
1469 | swap = *entry; | ||
1470 | shmem_swp_unmap(entry); | ||
1471 | } | ||
1472 | ret = error || swap.val; | ||
1473 | if (ret) | ||
1474 | mem_cgroup_uncharge_cache_page(filepage); | ||
1475 | else | ||
1476 | ret = add_to_page_cache_lru(filepage, mapping, | ||
1477 | idx, GFP_NOWAIT); | ||
1478 | /* | ||
1479 | * At add_to_page_cache_lru() failure, uncharge will | ||
1480 | * be done automatically. | ||
1481 | */ | ||
1482 | if (ret) { | ||
1483 | spin_unlock(&info->lock); | ||
1484 | page_cache_release(filepage); | ||
1485 | shmem_unacct_blocks(info->flags, 1); | ||
1486 | shmem_free_blocks(inode, 1); | ||
1487 | filepage = NULL; | ||
1488 | if (error) | ||
1489 | goto failed; | ||
1490 | goto repeat; | ||
1491 | } | ||
1492 | info->flags |= SHMEM_PAGEIN; | ||
1493 | } | 944 | } |
1494 | 945 | ||
946 | SetPageSwapBacked(page); | ||
947 | __set_page_locked(page); | ||
948 | error = mem_cgroup_cache_charge(page, current->mm, | ||
949 | gfp & GFP_RECLAIM_MASK); | ||
950 | if (!error) | ||
951 | error = shmem_add_to_page_cache(page, mapping, index, | ||
952 | gfp, NULL); | ||
953 | if (error) | ||
954 | goto decused; | ||
955 | lru_cache_add_anon(page); | ||
956 | |||
957 | spin_lock(&info->lock); | ||
1495 | info->alloced++; | 958 | info->alloced++; |
959 | inode->i_blocks += BLOCKS_PER_PAGE; | ||
960 | shmem_recalc_inode(inode); | ||
1496 | spin_unlock(&info->lock); | 961 | spin_unlock(&info->lock); |
1497 | clear_highpage(filepage); | 962 | |
1498 | flush_dcache_page(filepage); | 963 | clear_highpage(page); |
1499 | SetPageUptodate(filepage); | 964 | flush_dcache_page(page); |
965 | SetPageUptodate(page); | ||
1500 | if (sgp == SGP_DIRTY) | 966 | if (sgp == SGP_DIRTY) |
1501 | set_page_dirty(filepage); | 967 | set_page_dirty(page); |
1502 | } | 968 | } |
1503 | done: | 969 | done: |
1504 | *pagep = filepage; | 970 | /* Perhaps the file has been truncated since we checked */ |
1505 | error = 0; | 971 | if (sgp != SGP_WRITE && |
1506 | goto out; | 972 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { |
973 | error = -EINVAL; | ||
974 | goto trunc; | ||
975 | } | ||
976 | *pagep = page; | ||
977 | return 0; | ||
1507 | 978 | ||
1508 | nospace: | ||
1509 | /* | 979 | /* |
1510 | * Perhaps the page was brought in from swap between find_lock_page | 980 | * Error recovery. |
1511 | * and taking info->lock? We allow for that at add_to_page_cache_lru, | ||
1512 | * but must also avoid reporting a spurious ENOSPC while working on a | ||
1513 | * full tmpfs. (When filepage has been passed in to shmem_getpage, it | ||
1514 | * is already in page cache, which prevents this race from occurring.) | ||
1515 | */ | 981 | */ |
1516 | if (!filepage) { | 982 | trunc: |
1517 | struct page *page = find_get_page(mapping, idx); | 983 | ClearPageDirty(page); |
1518 | if (page) { | 984 | delete_from_page_cache(page); |
1519 | spin_unlock(&info->lock); | 985 | spin_lock(&info->lock); |
1520 | page_cache_release(page); | 986 | info->alloced--; |
1521 | goto repeat; | 987 | inode->i_blocks -= BLOCKS_PER_PAGE; |
1522 | } | ||
1523 | } | ||
1524 | spin_unlock(&info->lock); | 988 | spin_unlock(&info->lock); |
1525 | error = -ENOSPC; | 989 | decused: |
990 | if (sbinfo->max_blocks) | ||
991 | percpu_counter_add(&sbinfo->used_blocks, -1); | ||
992 | unacct: | ||
993 | shmem_unacct_blocks(info->flags, 1); | ||
1526 | failed: | 994 | failed: |
1527 | if (*pagep != filepage) { | 995 | if (swap.val && error != -EINVAL) { |
1528 | unlock_page(filepage); | 996 | struct page *test = find_get_page(mapping, index); |
1529 | page_cache_release(filepage); | 997 | if (test && !radix_tree_exceptional_entry(test)) |
998 | page_cache_release(test); | ||
999 | /* Have another try if the entry has changed */ | ||
1000 | if (test != swp_to_radix_entry(swap)) | ||
1001 | error = -EEXIST; | ||
1530 | } | 1002 | } |
1531 | out: | 1003 | if (page) { |
1532 | if (prealloc_page) { | 1004 | unlock_page(page); |
1533 | mem_cgroup_uncharge_cache_page(prealloc_page); | 1005 | page_cache_release(page); |
1534 | page_cache_release(prealloc_page); | ||
1535 | } | 1006 | } |
1007 | if (error == -ENOSPC && !once++) { | ||
1008 | info = SHMEM_I(inode); | ||
1009 | spin_lock(&info->lock); | ||
1010 | shmem_recalc_inode(inode); | ||
1011 | spin_unlock(&info->lock); | ||
1012 | goto repeat; | ||
1013 | } | ||
1014 | if (error == -EEXIST) | ||
1015 | goto repeat; | ||
1536 | return error; | 1016 | return error; |
1537 | } | 1017 | } |
1538 | 1018 | ||
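The rewritten allocation path above accounts a block with shmem_acct_block(), checks the per-superblock limit through a percpu_counter before bumping used_blocks, and unwinds through the decused/unacct labels on failure. A minimal sketch of that reserve-and-undo pattern, assuming only the generic percpu_counter API (the helper names below are illustrative, not from the patch):

/* Sketch: the used_blocks accounting pattern in shmem_getpage_gfp(). */
#include <linux/errno.h>
#include <linux/percpu_counter.h>

static int sketch_reserve_block(struct percpu_counter *used, long max_blocks)
{
	if (!max_blocks)
		return 0;		/* unlimited mount: nothing to count */
	/* >= 0 means the counter has already reached the limit */
	if (percpu_counter_compare(used, max_blocks) >= 0)
		return -ENOSPC;
	percpu_counter_inc(used);
	return 0;
}

static void sketch_unreserve_block(struct percpu_counter *used, long max_blocks)
{
	if (max_blocks)
		percpu_counter_add(used, -1);	/* the "decused" error path */
}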
@@ -1540,36 +1020,34 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1540 | { | 1020 | { |
1541 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | 1021 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1542 | int error; | 1022 | int error; |
1543 | int ret; | 1023 | int ret = VM_FAULT_LOCKED; |
1544 | |||
1545 | if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | ||
1546 | return VM_FAULT_SIGBUS; | ||
1547 | 1024 | ||
1548 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); | 1025 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); |
1549 | if (error) | 1026 | if (error) |
1550 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); | 1027 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); |
1028 | |||
1551 | if (ret & VM_FAULT_MAJOR) { | 1029 | if (ret & VM_FAULT_MAJOR) { |
1552 | count_vm_event(PGMAJFAULT); | 1030 | count_vm_event(PGMAJFAULT); |
1553 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); | 1031 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); |
1554 | } | 1032 | } |
1555 | return ret | VM_FAULT_LOCKED; | 1033 | return ret; |
1556 | } | 1034 | } |
1557 | 1035 | ||
1558 | #ifdef CONFIG_NUMA | 1036 | #ifdef CONFIG_NUMA |
1559 | static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) | 1037 | static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) |
1560 | { | 1038 | { |
1561 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; | 1039 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1562 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); | 1040 | return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); |
1563 | } | 1041 | } |
1564 | 1042 | ||
1565 | static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, | 1043 | static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, |
1566 | unsigned long addr) | 1044 | unsigned long addr) |
1567 | { | 1045 | { |
1568 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; | 1046 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1569 | unsigned long idx; | 1047 | pgoff_t index; |
1570 | 1048 | ||
1571 | idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 1049 | index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
1572 | return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); | 1050 | return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); |
1573 | } | 1051 | } |
1574 | #endif | 1052 | #endif |
1575 | 1053 | ||
@@ -1667,20 +1145,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
1667 | 1145 | ||
1668 | #ifdef CONFIG_TMPFS | 1146 | #ifdef CONFIG_TMPFS |
1669 | static const struct inode_operations shmem_symlink_inode_operations; | 1147 | static const struct inode_operations shmem_symlink_inode_operations; |
1670 | static const struct inode_operations shmem_symlink_inline_operations; | 1148 | static const struct inode_operations shmem_short_symlink_operations; |
1671 | |||
1672 | /* | ||
1673 | * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin; | ||
1674 | * but providing them allows a tmpfs file to be used for splice, sendfile, and | ||
1675 | * below the loop driver, in the generic fashion that many filesystems support. | ||
1676 | */ | ||
1677 | static int shmem_readpage(struct file *file, struct page *page) | ||
1678 | { | ||
1679 | struct inode *inode = page->mapping->host; | ||
1680 | int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL); | ||
1681 | unlock_page(page); | ||
1682 | return error; | ||
1683 | } | ||
1684 | 1149 | ||
1685 | static int | 1150 | static int |
1686 | shmem_write_begin(struct file *file, struct address_space *mapping, | 1151 | shmem_write_begin(struct file *file, struct address_space *mapping, |
@@ -1689,7 +1154,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping, | |||
1689 | { | 1154 | { |
1690 | struct inode *inode = mapping->host; | 1155 | struct inode *inode = mapping->host; |
1691 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 1156 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
1692 | *pagep = NULL; | ||
1693 | return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); | 1157 | return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); |
1694 | } | 1158 | } |
1695 | 1159 | ||
@@ -1714,7 +1178,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ | |||
1714 | { | 1178 | { |
1715 | struct inode *inode = filp->f_path.dentry->d_inode; | 1179 | struct inode *inode = filp->f_path.dentry->d_inode; |
1716 | struct address_space *mapping = inode->i_mapping; | 1180 | struct address_space *mapping = inode->i_mapping; |
1717 | unsigned long index, offset; | 1181 | pgoff_t index; |
1182 | unsigned long offset; | ||
1718 | enum sgp_type sgp = SGP_READ; | 1183 | enum sgp_type sgp = SGP_READ; |
1719 | 1184 | ||
1720 | /* | 1185 | /* |
@@ -1730,7 +1195,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ | |||
1730 | 1195 | ||
1731 | for (;;) { | 1196 | for (;;) { |
1732 | struct page *page = NULL; | 1197 | struct page *page = NULL; |
1733 | unsigned long end_index, nr, ret; | 1198 | pgoff_t end_index; |
1199 | unsigned long nr, ret; | ||
1734 | loff_t i_size = i_size_read(inode); | 1200 | loff_t i_size = i_size_read(inode); |
1735 | 1201 | ||
1736 | end_index = i_size >> PAGE_CACHE_SHIFT; | 1202 | end_index = i_size >> PAGE_CACHE_SHIFT; |
@@ -1846,6 +1312,119 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb, | |||
1846 | return retval; | 1312 | return retval; |
1847 | } | 1313 | } |
1848 | 1314 | ||
1315 | static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | ||
1316 | struct pipe_inode_info *pipe, size_t len, | ||
1317 | unsigned int flags) | ||
1318 | { | ||
1319 | struct address_space *mapping = in->f_mapping; | ||
1320 | struct inode *inode = mapping->host; | ||
1321 | unsigned int loff, nr_pages, req_pages; | ||
1322 | struct page *pages[PIPE_DEF_BUFFERS]; | ||
1323 | struct partial_page partial[PIPE_DEF_BUFFERS]; | ||
1324 | struct page *page; | ||
1325 | pgoff_t index, end_index; | ||
1326 | loff_t isize, left; | ||
1327 | int error, page_nr; | ||
1328 | struct splice_pipe_desc spd = { | ||
1329 | .pages = pages, | ||
1330 | .partial = partial, | ||
1331 | .flags = flags, | ||
1332 | .ops = &page_cache_pipe_buf_ops, | ||
1333 | .spd_release = spd_release_page, | ||
1334 | }; | ||
1335 | |||
1336 | isize = i_size_read(inode); | ||
1337 | if (unlikely(*ppos >= isize)) | ||
1338 | return 0; | ||
1339 | |||
1340 | left = isize - *ppos; | ||
1341 | if (unlikely(left < len)) | ||
1342 | len = left; | ||
1343 | |||
1344 | if (splice_grow_spd(pipe, &spd)) | ||
1345 | return -ENOMEM; | ||
1346 | |||
1347 | index = *ppos >> PAGE_CACHE_SHIFT; | ||
1348 | loff = *ppos & ~PAGE_CACHE_MASK; | ||
1349 | req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1350 | nr_pages = min(req_pages, pipe->buffers); | ||
1351 | |||
1352 | spd.nr_pages = find_get_pages_contig(mapping, index, | ||
1353 | nr_pages, spd.pages); | ||
1354 | index += spd.nr_pages; | ||
1355 | error = 0; | ||
1356 | |||
1357 | while (spd.nr_pages < nr_pages) { | ||
1358 | error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); | ||
1359 | if (error) | ||
1360 | break; | ||
1361 | unlock_page(page); | ||
1362 | spd.pages[spd.nr_pages++] = page; | ||
1363 | index++; | ||
1364 | } | ||
1365 | |||
1366 | index = *ppos >> PAGE_CACHE_SHIFT; | ||
1367 | nr_pages = spd.nr_pages; | ||
1368 | spd.nr_pages = 0; | ||
1369 | |||
1370 | for (page_nr = 0; page_nr < nr_pages; page_nr++) { | ||
1371 | unsigned int this_len; | ||
1372 | |||
1373 | if (!len) | ||
1374 | break; | ||
1375 | |||
1376 | this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); | ||
1377 | page = spd.pages[page_nr]; | ||
1378 | |||
1379 | if (!PageUptodate(page) || page->mapping != mapping) { | ||
1380 | error = shmem_getpage(inode, index, &page, | ||
1381 | SGP_CACHE, NULL); | ||
1382 | if (error) | ||
1383 | break; | ||
1384 | unlock_page(page); | ||
1385 | page_cache_release(spd.pages[page_nr]); | ||
1386 | spd.pages[page_nr] = page; | ||
1387 | } | ||
1388 | |||
1389 | isize = i_size_read(inode); | ||
1390 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; | ||
1391 | if (unlikely(!isize || index > end_index)) | ||
1392 | break; | ||
1393 | |||
1394 | if (end_index == index) { | ||
1395 | unsigned int plen; | ||
1396 | |||
1397 | plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; | ||
1398 | if (plen <= loff) | ||
1399 | break; | ||
1400 | |||
1401 | this_len = min(this_len, plen - loff); | ||
1402 | len = this_len; | ||
1403 | } | ||
1404 | |||
1405 | spd.partial[page_nr].offset = loff; | ||
1406 | spd.partial[page_nr].len = this_len; | ||
1407 | len -= this_len; | ||
1408 | loff = 0; | ||
1409 | spd.nr_pages++; | ||
1410 | index++; | ||
1411 | } | ||
1412 | |||
1413 | while (page_nr < nr_pages) | ||
1414 | page_cache_release(spd.pages[page_nr++]); | ||
1415 | |||
1416 | if (spd.nr_pages) | ||
1417 | error = splice_to_pipe(pipe, &spd); | ||
1418 | |||
1419 | splice_shrink_spd(pipe, &spd); | ||
1420 | |||
1421 | if (error > 0) { | ||
1422 | *ppos += error; | ||
1423 | file_accessed(in); | ||
1424 | } | ||
1425 | return error; | ||
1426 | } | ||
1427 | |||
1849 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) | 1428 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) |
1850 | { | 1429 | { |
1851 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); | 1430 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); |
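With .splice_read switched to shmem_file_splice_read() (see the shmem_file_operations hunk further down), tmpfs can feed a pipe directly from its page cache and no longer needs shmem_readpage(). A hedged userspace sketch of the call this serves; the /dev/shm path is only an example:

/* Userspace sketch: splice a tmpfs file into a pipe. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int pipefd[2];
	int fd = open("/dev/shm/example", O_RDONLY);	/* hypothetical tmpfs file */
	ssize_t n;

	if (fd < 0 || pipe(pipefd) < 0)
		return 1;
	/* For tmpfs files this request ends up in shmem_file_splice_read(). */
	n = splice(fd, NULL, pipefd[1], NULL, 4096, SPLICE_F_MOVE);
	if (n < 0)
		perror("splice");
	close(pipefd[0]);
	close(pipefd[1]);
	close(fd);
	return n < 0 ? 1 : 0;
}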
@@ -1855,8 +1434,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
1855 | buf->f_namelen = NAME_MAX; | 1434 | buf->f_namelen = NAME_MAX; |
1856 | if (sbinfo->max_blocks) { | 1435 | if (sbinfo->max_blocks) { |
1857 | buf->f_blocks = sbinfo->max_blocks; | 1436 | buf->f_blocks = sbinfo->max_blocks; |
1858 | buf->f_bavail = buf->f_bfree = | 1437 | buf->f_bavail = |
1859 | sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); | 1438 | buf->f_bfree = sbinfo->max_blocks - |
1439 | percpu_counter_sum(&sbinfo->used_blocks); | ||
1860 | } | 1440 | } |
1861 | if (sbinfo->max_inodes) { | 1441 | if (sbinfo->max_inodes) { |
1862 | buf->f_files = sbinfo->max_inodes; | 1442 | buf->f_files = sbinfo->max_inodes; |
@@ -2006,7 +1586,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
2006 | int error; | 1586 | int error; |
2007 | int len; | 1587 | int len; |
2008 | struct inode *inode; | 1588 | struct inode *inode; |
2009 | struct page *page = NULL; | 1589 | struct page *page; |
2010 | char *kaddr; | 1590 | char *kaddr; |
2011 | struct shmem_inode_info *info; | 1591 | struct shmem_inode_info *info; |
2012 | 1592 | ||
@@ -2030,10 +1610,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
2030 | 1610 | ||
2031 | info = SHMEM_I(inode); | 1611 | info = SHMEM_I(inode); |
2032 | inode->i_size = len-1; | 1612 | inode->i_size = len-1; |
2033 | if (len <= SHMEM_SYMLINK_INLINE_LEN) { | 1613 | if (len <= SHORT_SYMLINK_LEN) { |
2034 | /* do it inline */ | 1614 | info->symlink = kmemdup(symname, len, GFP_KERNEL); |
2035 | memcpy(info->inline_symlink, symname, len); | 1615 | if (!info->symlink) { |
2036 | inode->i_op = &shmem_symlink_inline_operations; | 1616 | iput(inode); |
1617 | return -ENOMEM; | ||
1618 | } | ||
1619 | inode->i_op = &shmem_short_symlink_operations; | ||
2037 | } else { | 1620 | } else { |
2038 | error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); | 1621 | error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); |
2039 | if (error) { | 1622 | if (error) { |
@@ -2056,17 +1639,17 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
2056 | return 0; | 1639 | return 0; |
2057 | } | 1640 | } |
2058 | 1641 | ||
2059 | static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) | 1642 | static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) |
2060 | { | 1643 | { |
2061 | nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); | 1644 | nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink); |
2062 | return NULL; | 1645 | return NULL; |
2063 | } | 1646 | } |
2064 | 1647 | ||
2065 | static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) | 1648 | static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) |
2066 | { | 1649 | { |
2067 | struct page *page = NULL; | 1650 | struct page *page = NULL; |
2068 | int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); | 1651 | int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); |
2069 | nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); | 1652 | nd_set_link(nd, error ? ERR_PTR(error) : kmap(page)); |
2070 | if (page) | 1653 | if (page) |
2071 | unlock_page(page); | 1654 | unlock_page(page); |
2072 | return page; | 1655 | return page; |
@@ -2177,7 +1760,6 @@ out: | |||
2177 | return err; | 1760 | return err; |
2178 | } | 1761 | } |
2179 | 1762 | ||
2180 | |||
2181 | static const struct xattr_handler *shmem_xattr_handlers[] = { | 1763 | static const struct xattr_handler *shmem_xattr_handlers[] = { |
2182 | #ifdef CONFIG_TMPFS_POSIX_ACL | 1764 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2183 | &generic_acl_access_handler, | 1765 | &generic_acl_access_handler, |
@@ -2307,9 +1889,9 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) | |||
2307 | } | 1889 | } |
2308 | #endif /* CONFIG_TMPFS_XATTR */ | 1890 | #endif /* CONFIG_TMPFS_XATTR */ |
2309 | 1891 | ||
2310 | static const struct inode_operations shmem_symlink_inline_operations = { | 1892 | static const struct inode_operations shmem_short_symlink_operations = { |
2311 | .readlink = generic_readlink, | 1893 | .readlink = generic_readlink, |
2312 | .follow_link = shmem_follow_link_inline, | 1894 | .follow_link = shmem_follow_short_symlink, |
2313 | #ifdef CONFIG_TMPFS_XATTR | 1895 | #ifdef CONFIG_TMPFS_XATTR |
2314 | .setxattr = shmem_setxattr, | 1896 | .setxattr = shmem_setxattr, |
2315 | .getxattr = shmem_getxattr, | 1897 | .getxattr = shmem_getxattr, |
@@ -2509,8 +2091,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) | |||
2509 | if (config.max_inodes < inodes) | 2091 | if (config.max_inodes < inodes) |
2510 | goto out; | 2092 | goto out; |
2511 | /* | 2093 | /* |
2512 | * Those tests also disallow limited->unlimited while any are in | 2094 | * Those tests disallow limited->unlimited while any are in use; |
2513 | * use, so i_blocks will always be zero when max_blocks is zero; | ||
2514 | * but we must separately disallow unlimited->limited, because | 2095 | * but we must separately disallow unlimited->limited, because |
2515 | * in that case we have no record of how much is already in use. | 2096 | * in that case we have no record of how much is already in use. |
2516 | */ | 2097 | */ |
@@ -2602,7 +2183,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) | |||
2602 | goto failed; | 2183 | goto failed; |
2603 | sbinfo->free_inodes = sbinfo->max_inodes; | 2184 | sbinfo->free_inodes = sbinfo->max_inodes; |
2604 | 2185 | ||
2605 | sb->s_maxbytes = SHMEM_MAX_BYTES; | 2186 | sb->s_maxbytes = MAX_LFS_FILESIZE; |
2606 | sb->s_blocksize = PAGE_CACHE_SIZE; | 2187 | sb->s_blocksize = PAGE_CACHE_SIZE; |
2607 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; | 2188 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; |
2608 | sb->s_magic = TMPFS_MAGIC; | 2189 | sb->s_magic = TMPFS_MAGIC; |
@@ -2637,14 +2218,14 @@ static struct kmem_cache *shmem_inode_cachep; | |||
2637 | 2218 | ||
2638 | static struct inode *shmem_alloc_inode(struct super_block *sb) | 2219 | static struct inode *shmem_alloc_inode(struct super_block *sb) |
2639 | { | 2220 | { |
2640 | struct shmem_inode_info *p; | 2221 | struct shmem_inode_info *info; |
2641 | p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); | 2222 | info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); |
2642 | if (!p) | 2223 | if (!info) |
2643 | return NULL; | 2224 | return NULL; |
2644 | return &p->vfs_inode; | 2225 | return &info->vfs_inode; |
2645 | } | 2226 | } |
2646 | 2227 | ||
2647 | static void shmem_i_callback(struct rcu_head *head) | 2228 | static void shmem_destroy_callback(struct rcu_head *head) |
2648 | { | 2229 | { |
2649 | struct inode *inode = container_of(head, struct inode, i_rcu); | 2230 | struct inode *inode = container_of(head, struct inode, i_rcu); |
2650 | INIT_LIST_HEAD(&inode->i_dentry); | 2231 | INIT_LIST_HEAD(&inode->i_dentry); |
@@ -2653,29 +2234,26 @@ static void shmem_i_callback(struct rcu_head *head) | |||
2653 | 2234 | ||
2654 | static void shmem_destroy_inode(struct inode *inode) | 2235 | static void shmem_destroy_inode(struct inode *inode) |
2655 | { | 2236 | { |
2656 | if ((inode->i_mode & S_IFMT) == S_IFREG) { | 2237 | if ((inode->i_mode & S_IFMT) == S_IFREG) |
2657 | /* only struct inode is valid if it's an inline symlink */ | ||
2658 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); | 2238 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); |
2659 | } | 2239 | call_rcu(&inode->i_rcu, shmem_destroy_callback); |
2660 | call_rcu(&inode->i_rcu, shmem_i_callback); | ||
2661 | } | 2240 | } |
2662 | 2241 | ||
2663 | static void init_once(void *foo) | 2242 | static void shmem_init_inode(void *foo) |
2664 | { | 2243 | { |
2665 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | 2244 | struct shmem_inode_info *info = foo; |
2666 | 2245 | inode_init_once(&info->vfs_inode); | |
2667 | inode_init_once(&p->vfs_inode); | ||
2668 | } | 2246 | } |
2669 | 2247 | ||
2670 | static int init_inodecache(void) | 2248 | static int shmem_init_inodecache(void) |
2671 | { | 2249 | { |
2672 | shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", | 2250 | shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", |
2673 | sizeof(struct shmem_inode_info), | 2251 | sizeof(struct shmem_inode_info), |
2674 | 0, SLAB_PANIC, init_once); | 2252 | 0, SLAB_PANIC, shmem_init_inode); |
2675 | return 0; | 2253 | return 0; |
2676 | } | 2254 | } |
2677 | 2255 | ||
2678 | static void destroy_inodecache(void) | 2256 | static void shmem_destroy_inodecache(void) |
2679 | { | 2257 | { |
2680 | kmem_cache_destroy(shmem_inode_cachep); | 2258 | kmem_cache_destroy(shmem_inode_cachep); |
2681 | } | 2259 | } |
@@ -2684,7 +2262,6 @@ static const struct address_space_operations shmem_aops = { | |||
2684 | .writepage = shmem_writepage, | 2262 | .writepage = shmem_writepage, |
2685 | .set_page_dirty = __set_page_dirty_no_writeback, | 2263 | .set_page_dirty = __set_page_dirty_no_writeback, |
2686 | #ifdef CONFIG_TMPFS | 2264 | #ifdef CONFIG_TMPFS |
2687 | .readpage = shmem_readpage, | ||
2688 | .write_begin = shmem_write_begin, | 2265 | .write_begin = shmem_write_begin, |
2689 | .write_end = shmem_write_end, | 2266 | .write_end = shmem_write_end, |
2690 | #endif | 2267 | #endif |
@@ -2701,7 +2278,7 @@ static const struct file_operations shmem_file_operations = { | |||
2701 | .aio_read = shmem_file_aio_read, | 2278 | .aio_read = shmem_file_aio_read, |
2702 | .aio_write = generic_file_aio_write, | 2279 | .aio_write = generic_file_aio_write, |
2703 | .fsync = noop_fsync, | 2280 | .fsync = noop_fsync, |
2704 | .splice_read = generic_file_splice_read, | 2281 | .splice_read = shmem_file_splice_read, |
2705 | .splice_write = generic_file_splice_write, | 2282 | .splice_write = generic_file_splice_write, |
2706 | #endif | 2283 | #endif |
2707 | }; | 2284 | }; |
@@ -2715,10 +2292,6 @@ static const struct inode_operations shmem_inode_operations = { | |||
2715 | .listxattr = shmem_listxattr, | 2292 | .listxattr = shmem_listxattr, |
2716 | .removexattr = shmem_removexattr, | 2293 | .removexattr = shmem_removexattr, |
2717 | #endif | 2294 | #endif |
2718 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
2719 | .check_acl = generic_check_acl, | ||
2720 | #endif | ||
2721 | |||
2722 | }; | 2295 | }; |
2723 | 2296 | ||
2724 | static const struct inode_operations shmem_dir_inode_operations = { | 2297 | static const struct inode_operations shmem_dir_inode_operations = { |
@@ -2741,7 +2314,6 @@ static const struct inode_operations shmem_dir_inode_operations = { | |||
2741 | #endif | 2314 | #endif |
2742 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2315 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2743 | .setattr = shmem_setattr, | 2316 | .setattr = shmem_setattr, |
2744 | .check_acl = generic_check_acl, | ||
2745 | #endif | 2317 | #endif |
2746 | }; | 2318 | }; |
2747 | 2319 | ||
@@ -2754,7 +2326,6 @@ static const struct inode_operations shmem_special_inode_operations = { | |||
2754 | #endif | 2326 | #endif |
2755 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2327 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2756 | .setattr = shmem_setattr, | 2328 | .setattr = shmem_setattr, |
2757 | .check_acl = generic_check_acl, | ||
2758 | #endif | 2329 | #endif |
2759 | }; | 2330 | }; |
2760 | 2331 | ||
@@ -2779,21 +2350,20 @@ static const struct vm_operations_struct shmem_vm_ops = { | |||
2779 | #endif | 2350 | #endif |
2780 | }; | 2351 | }; |
2781 | 2352 | ||
2782 | |||
2783 | static struct dentry *shmem_mount(struct file_system_type *fs_type, | 2353 | static struct dentry *shmem_mount(struct file_system_type *fs_type, |
2784 | int flags, const char *dev_name, void *data) | 2354 | int flags, const char *dev_name, void *data) |
2785 | { | 2355 | { |
2786 | return mount_nodev(fs_type, flags, data, shmem_fill_super); | 2356 | return mount_nodev(fs_type, flags, data, shmem_fill_super); |
2787 | } | 2357 | } |
2788 | 2358 | ||
2789 | static struct file_system_type tmpfs_fs_type = { | 2359 | static struct file_system_type shmem_fs_type = { |
2790 | .owner = THIS_MODULE, | 2360 | .owner = THIS_MODULE, |
2791 | .name = "tmpfs", | 2361 | .name = "tmpfs", |
2792 | .mount = shmem_mount, | 2362 | .mount = shmem_mount, |
2793 | .kill_sb = kill_litter_super, | 2363 | .kill_sb = kill_litter_super, |
2794 | }; | 2364 | }; |
2795 | 2365 | ||
2796 | int __init init_tmpfs(void) | 2366 | int __init shmem_init(void) |
2797 | { | 2367 | { |
2798 | int error; | 2368 | int error; |
2799 | 2369 | ||
@@ -2801,18 +2371,18 @@ int __init init_tmpfs(void) | |||
2801 | if (error) | 2371 | if (error) |
2802 | goto out4; | 2372 | goto out4; |
2803 | 2373 | ||
2804 | error = init_inodecache(); | 2374 | error = shmem_init_inodecache(); |
2805 | if (error) | 2375 | if (error) |
2806 | goto out3; | 2376 | goto out3; |
2807 | 2377 | ||
2808 | error = register_filesystem(&tmpfs_fs_type); | 2378 | error = register_filesystem(&shmem_fs_type); |
2809 | if (error) { | 2379 | if (error) { |
2810 | printk(KERN_ERR "Could not register tmpfs\n"); | 2380 | printk(KERN_ERR "Could not register tmpfs\n"); |
2811 | goto out2; | 2381 | goto out2; |
2812 | } | 2382 | } |
2813 | 2383 | ||
2814 | shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, | 2384 | shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER, |
2815 | tmpfs_fs_type.name, NULL); | 2385 | shmem_fs_type.name, NULL); |
2816 | if (IS_ERR(shm_mnt)) { | 2386 | if (IS_ERR(shm_mnt)) { |
2817 | error = PTR_ERR(shm_mnt); | 2387 | error = PTR_ERR(shm_mnt); |
2818 | printk(KERN_ERR "Could not kern_mount tmpfs\n"); | 2388 | printk(KERN_ERR "Could not kern_mount tmpfs\n"); |
@@ -2821,9 +2391,9 @@ int __init init_tmpfs(void) | |||
2821 | return 0; | 2391 | return 0; |
2822 | 2392 | ||
2823 | out1: | 2393 | out1: |
2824 | unregister_filesystem(&tmpfs_fs_type); | 2394 | unregister_filesystem(&shmem_fs_type); |
2825 | out2: | 2395 | out2: |
2826 | destroy_inodecache(); | 2396 | shmem_destroy_inodecache(); |
2827 | out3: | 2397 | out3: |
2828 | bdi_destroy(&shmem_backing_dev_info); | 2398 | bdi_destroy(&shmem_backing_dev_info); |
2829 | out4: | 2399 | out4: |
@@ -2831,45 +2401,6 @@ out4: | |||
2831 | return error; | 2401 | return error; |
2832 | } | 2402 | } |
2833 | 2403 | ||
2834 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
2835 | /** | ||
2836 | * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file | ||
2837 | * @inode: the inode to be searched | ||
2838 | * @pgoff: the offset to be searched | ||
2839 | * @pagep: the pointer for the found page to be stored | ||
2840 | * @ent: the pointer for the found swap entry to be stored | ||
2841 | * | ||
2842 | * If a page is found, refcount of it is incremented. Callers should handle | ||
2843 | * these refcount. | ||
2844 | */ | ||
2845 | void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, | ||
2846 | struct page **pagep, swp_entry_t *ent) | ||
2847 | { | ||
2848 | swp_entry_t entry = { .val = 0 }, *ptr; | ||
2849 | struct page *page = NULL; | ||
2850 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
2851 | |||
2852 | if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | ||
2853 | goto out; | ||
2854 | |||
2855 | spin_lock(&info->lock); | ||
2856 | ptr = shmem_swp_entry(info, pgoff, NULL); | ||
2857 | #ifdef CONFIG_SWAP | ||
2858 | if (ptr && ptr->val) { | ||
2859 | entry.val = ptr->val; | ||
2860 | page = find_get_page(&swapper_space, entry.val); | ||
2861 | } else | ||
2862 | #endif | ||
2863 | page = find_get_page(inode->i_mapping, pgoff); | ||
2864 | if (ptr) | ||
2865 | shmem_swp_unmap(ptr); | ||
2866 | spin_unlock(&info->lock); | ||
2867 | out: | ||
2868 | *pagep = page; | ||
2869 | *ent = entry; | ||
2870 | } | ||
2871 | #endif | ||
2872 | |||
2873 | #else /* !CONFIG_SHMEM */ | 2404 | #else /* !CONFIG_SHMEM */ |
2874 | 2405 | ||
2875 | /* | 2406 | /* |
@@ -2883,23 +2414,23 @@ out: | |||
2883 | 2414 | ||
2884 | #include <linux/ramfs.h> | 2415 | #include <linux/ramfs.h> |
2885 | 2416 | ||
2886 | static struct file_system_type tmpfs_fs_type = { | 2417 | static struct file_system_type shmem_fs_type = { |
2887 | .name = "tmpfs", | 2418 | .name = "tmpfs", |
2888 | .mount = ramfs_mount, | 2419 | .mount = ramfs_mount, |
2889 | .kill_sb = kill_litter_super, | 2420 | .kill_sb = kill_litter_super, |
2890 | }; | 2421 | }; |
2891 | 2422 | ||
2892 | int __init init_tmpfs(void) | 2423 | int __init shmem_init(void) |
2893 | { | 2424 | { |
2894 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); | 2425 | BUG_ON(register_filesystem(&shmem_fs_type) != 0); |
2895 | 2426 | ||
2896 | shm_mnt = kern_mount(&tmpfs_fs_type); | 2427 | shm_mnt = kern_mount(&shmem_fs_type); |
2897 | BUG_ON(IS_ERR(shm_mnt)); | 2428 | BUG_ON(IS_ERR(shm_mnt)); |
2898 | 2429 | ||
2899 | return 0; | 2430 | return 0; |
2900 | } | 2431 | } |
2901 | 2432 | ||
2902 | int shmem_unuse(swp_entry_t entry, struct page *page) | 2433 | int shmem_unuse(swp_entry_t swap, struct page *page) |
2903 | { | 2434 | { |
2904 | return 0; | 2435 | return 0; |
2905 | } | 2436 | } |
@@ -2909,43 +2440,17 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) | |||
2909 | return 0; | 2440 | return 0; |
2910 | } | 2441 | } |
2911 | 2442 | ||
2912 | void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) | 2443 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) |
2913 | { | 2444 | { |
2914 | truncate_inode_pages_range(inode->i_mapping, start, end); | 2445 | truncate_inode_pages_range(inode->i_mapping, lstart, lend); |
2915 | } | 2446 | } |
2916 | EXPORT_SYMBOL_GPL(shmem_truncate_range); | 2447 | EXPORT_SYMBOL_GPL(shmem_truncate_range); |
2917 | 2448 | ||
2918 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
2919 | /** | ||
2920 | * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file | ||
2921 | * @inode: the inode to be searched | ||
2922 | * @pgoff: the offset to be searched | ||
2923 | * @pagep: the pointer for the found page to be stored | ||
2924 | * @ent: the pointer for the found swap entry to be stored | ||
2925 | * | ||
2926 | * If a page is found, refcount of it is incremented. Callers should handle | ||
2927 | * these refcount. | ||
2928 | */ | ||
2929 | void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, | ||
2930 | struct page **pagep, swp_entry_t *ent) | ||
2931 | { | ||
2932 | struct page *page = NULL; | ||
2933 | |||
2934 | if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | ||
2935 | goto out; | ||
2936 | page = find_get_page(inode->i_mapping, pgoff); | ||
2937 | out: | ||
2938 | *pagep = page; | ||
2939 | *ent = (swp_entry_t){ .val = 0 }; | ||
2940 | } | ||
2941 | #endif | ||
2942 | |||
2943 | #define shmem_vm_ops generic_file_vm_ops | 2449 | #define shmem_vm_ops generic_file_vm_ops |
2944 | #define shmem_file_operations ramfs_file_operations | 2450 | #define shmem_file_operations ramfs_file_operations |
2945 | #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) | 2451 | #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) |
2946 | #define shmem_acct_size(flags, size) 0 | 2452 | #define shmem_acct_size(flags, size) 0 |
2947 | #define shmem_unacct_size(flags, size) do {} while (0) | 2453 | #define shmem_unacct_size(flags, size) do {} while (0) |
2948 | #define SHMEM_MAX_BYTES MAX_LFS_FILESIZE | ||
2949 | 2454 | ||
2950 | #endif /* CONFIG_SHMEM */ | 2455 | #endif /* CONFIG_SHMEM */ |
2951 | 2456 | ||
@@ -2969,7 +2474,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2969 | if (IS_ERR(shm_mnt)) | 2474 | if (IS_ERR(shm_mnt)) |
2970 | return (void *)shm_mnt; | 2475 | return (void *)shm_mnt; |
2971 | 2476 | ||
2972 | if (size < 0 || size > SHMEM_MAX_BYTES) | 2477 | if (size < 0 || size > MAX_LFS_FILESIZE) |
2973 | return ERR_PTR(-EINVAL); | 2478 | return ERR_PTR(-EINVAL); |
2974 | 2479 | ||
2975 | if (shmem_acct_size(flags, size)) | 2480 | if (shmem_acct_size(flags, size)) |
@@ -3015,6 +2520,15 @@ put_memory: | |||
3015 | } | 2520 | } |
3016 | EXPORT_SYMBOL_GPL(shmem_file_setup); | 2521 | EXPORT_SYMBOL_GPL(shmem_file_setup); |
3017 | 2522 | ||
2523 | void shmem_set_file(struct vm_area_struct *vma, struct file *file) | ||
2524 | { | ||
2525 | if (vma->vm_file) | ||
2526 | fput(vma->vm_file); | ||
2527 | vma->vm_file = file; | ||
2528 | vma->vm_ops = &shmem_vm_ops; | ||
2529 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
2530 | } | ||
2531 | |||
3018 | /** | 2532 | /** |
3019 | * shmem_zero_setup - setup a shared anonymous mapping | 2533 | * shmem_zero_setup - setup a shared anonymous mapping |
3020 | * @vma: the vma to be mmapped is prepared by do_mmap_pgoff | 2534 | * @vma: the vma to be mmapped is prepared by do_mmap_pgoff |
@@ -3028,11 +2542,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
3028 | if (IS_ERR(file)) | 2542 | if (IS_ERR(file)) |
3029 | return PTR_ERR(file); | 2543 | return PTR_ERR(file); |
3030 | 2544 | ||
3031 | if (vma->vm_file) | 2545 | shmem_set_file(vma, file); |
3032 | fput(vma->vm_file); | ||
3033 | vma->vm_file = file; | ||
3034 | vma->vm_ops = &shmem_vm_ops; | ||
3035 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
3036 | return 0; | 2546 | return 0; |
3037 | } | 2547 | } |
3038 | 2548 | ||
@@ -3048,13 +2558,29 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
3048 | * suit tmpfs, since it may have pages in swapcache, and needs to find those | 2558 | * suit tmpfs, since it may have pages in swapcache, and needs to find those |
3049 | * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. | 2559 | * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. |
3050 | * | 2560 | * |
3051 | * Provide a stub for those callers to start using now, then later | 2561 | * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in |
3052 | * flesh it out to call shmem_getpage() with additional gfp mask, when | 2562 | * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. |
3053 | * shmem_file_splice_read() is added and shmem_readpage() is removed. | ||
3054 | */ | 2563 | */ |
3055 | struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, | 2564 | struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, |
3056 | pgoff_t index, gfp_t gfp) | 2565 | pgoff_t index, gfp_t gfp) |
3057 | { | 2566 | { |
2567 | #ifdef CONFIG_SHMEM | ||
2568 | struct inode *inode = mapping->host; | ||
2569 | struct page *page; | ||
2570 | int error; | ||
2571 | |||
2572 | BUG_ON(mapping->a_ops != &shmem_aops); | ||
2573 | error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); | ||
2574 | if (error) | ||
2575 | page = ERR_PTR(error); | ||
2576 | else | ||
2577 | unlock_page(page); | ||
2578 | return page; | ||
2579 | #else | ||
2580 | /* | ||
2581 | * The tiny !SHMEM case uses ramfs without swap | ||
2582 | */ | ||
3058 | return read_cache_page_gfp(mapping, index, gfp); | 2583 | return read_cache_page_gfp(mapping, index, gfp); |
2584 | #endif | ||
3059 | } | 2585 | } |
3060 | EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); | 2586 | EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); |
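The comment above mentions how the i915 driver relaxes the gfp mask when pulling pages from a shmem backing file. A rough sketch of that calling pattern (the helper name and error handling are illustrative, not taken from the patch):

/* Sketch: fetch one page of a shmem-backed object without risking OOM. */
#include <linux/err.h>
#include <linux/pagemap.h>
#include <linux/shmem_fs.h>

static struct page *sketch_get_backing_page(struct address_space *mapping,
					    pgoff_t index)
{
	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;
	struct page *page;

	page = shmem_read_mapping_page_gfp(mapping, index, gfp);
	if (IS_ERR(page))
		return NULL;	/* caller may retry under a stricter policy */
	return page;		/* returned with an elevated refcount */
}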
@@ -574,7 +574,9 @@ static struct arraycache_init initarray_generic = | |||
574 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | 574 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; |
575 | 575 | ||
576 | /* internal cache of cache description objs */ | 576 | /* internal cache of cache description objs */ |
577 | static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES]; | ||
577 | static struct kmem_cache cache_cache = { | 578 | static struct kmem_cache cache_cache = { |
579 | .nodelists = cache_cache_nodelists, | ||
578 | .batchcount = 1, | 580 | .batchcount = 1, |
579 | .limit = BOOT_CPUCACHE_ENTRIES, | 581 | .limit = BOOT_CPUCACHE_ENTRIES, |
580 | .shared = 1, | 582 | .shared = 1, |
@@ -593,6 +595,7 @@ static enum { | |||
593 | PARTIAL_AC, | 595 | PARTIAL_AC, |
594 | PARTIAL_L3, | 596 | PARTIAL_L3, |
595 | EARLY, | 597 | EARLY, |
598 | LATE, | ||
596 | FULL | 599 | FULL |
597 | } g_cpucache_up; | 600 | } g_cpucache_up; |
598 | 601 | ||
@@ -620,37 +623,67 @@ int slab_is_available(void) | |||
620 | static struct lock_class_key on_slab_l3_key; | 623 | static struct lock_class_key on_slab_l3_key; |
621 | static struct lock_class_key on_slab_alc_key; | 624 | static struct lock_class_key on_slab_alc_key; |
622 | 625 | ||
626 | static struct lock_class_key debugobj_l3_key; | ||
627 | static struct lock_class_key debugobj_alc_key; | ||
628 | |||
629 | static void slab_set_lock_classes(struct kmem_cache *cachep, | ||
630 | struct lock_class_key *l3_key, struct lock_class_key *alc_key, | ||
631 | int q) | ||
632 | { | ||
633 | struct array_cache **alc; | ||
634 | struct kmem_list3 *l3; | ||
635 | int r; | ||
636 | |||
637 | l3 = cachep->nodelists[q]; | ||
638 | if (!l3) | ||
639 | return; | ||
640 | |||
641 | lockdep_set_class(&l3->list_lock, l3_key); | ||
642 | alc = l3->alien; | ||
643 | /* | ||
644 | * FIXME: This check for BAD_ALIEN_MAGIC | ||
645 | * should go away when common slab code is taught to | ||
646 | * work even without alien caches. | ||
647 | * Currently, non-NUMA code returns BAD_ALIEN_MAGIC | ||
648 | * for alloc_alien_cache. | ||
649 | */ | ||
650 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) | ||
651 | return; | ||
652 | for_each_node(r) { | ||
653 | if (alc[r]) | ||
654 | lockdep_set_class(&alc[r]->lock, alc_key); | ||
655 | } | ||
656 | } | ||
657 | |||
658 | static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) | ||
659 | { | ||
660 | slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node); | ||
661 | } | ||
662 | |||
663 | static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) | ||
664 | { | ||
665 | int node; | ||
666 | |||
667 | for_each_online_node(node) | ||
668 | slab_set_debugobj_lock_classes_node(cachep, node); | ||
669 | } | ||
670 | |||
623 | static void init_node_lock_keys(int q) | 671 | static void init_node_lock_keys(int q) |
624 | { | 672 | { |
625 | struct cache_sizes *s = malloc_sizes; | 673 | struct cache_sizes *s = malloc_sizes; |
626 | 674 | ||
627 | if (g_cpucache_up != FULL) | 675 | if (g_cpucache_up < LATE) |
628 | return; | 676 | return; |
629 | 677 | ||
630 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { | 678 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { |
631 | struct array_cache **alc; | ||
632 | struct kmem_list3 *l3; | 679 | struct kmem_list3 *l3; |
633 | int r; | ||
634 | 680 | ||
635 | l3 = s->cs_cachep->nodelists[q]; | 681 | l3 = s->cs_cachep->nodelists[q]; |
636 | if (!l3 || OFF_SLAB(s->cs_cachep)) | 682 | if (!l3 || OFF_SLAB(s->cs_cachep)) |
637 | continue; | 683 | continue; |
638 | lockdep_set_class(&l3->list_lock, &on_slab_l3_key); | 684 | |
639 | alc = l3->alien; | 685 | slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key, |
640 | /* | 686 | &on_slab_alc_key, q); |
641 | * FIXME: This check for BAD_ALIEN_MAGIC | ||
642 | * should go away when common slab code is taught to | ||
643 | * work even without alien caches. | ||
644 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC | ||
645 | * for alloc_alien_cache, | ||
646 | */ | ||
647 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) | ||
648 | continue; | ||
649 | for_each_node(r) { | ||
650 | if (alc[r]) | ||
651 | lockdep_set_class(&alc[r]->lock, | ||
652 | &on_slab_alc_key); | ||
653 | } | ||
654 | } | 687 | } |
655 | } | 688 | } |
656 | 689 | ||
@@ -669,6 +702,14 @@ static void init_node_lock_keys(int q) | |||
669 | static inline void init_lock_keys(void) | 702 | static inline void init_lock_keys(void) |
670 | { | 703 | { |
671 | } | 704 | } |
705 | |||
706 | static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) | ||
707 | { | ||
708 | } | ||
709 | |||
710 | static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) | ||
711 | { | ||
712 | } | ||
672 | #endif | 713 | #endif |
673 | 714 | ||
674 | /* | 715 | /* |
@@ -1262,6 +1303,8 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1262 | spin_unlock_irq(&l3->list_lock); | 1303 | spin_unlock_irq(&l3->list_lock); |
1263 | kfree(shared); | 1304 | kfree(shared); |
1264 | free_alien_cache(alien); | 1305 | free_alien_cache(alien); |
1306 | if (cachep->flags & SLAB_DEBUG_OBJECTS) | ||
1307 | slab_set_debugobj_lock_classes_node(cachep, node); | ||
1265 | } | 1308 | } |
1266 | init_node_lock_keys(node); | 1309 | init_node_lock_keys(node); |
1267 | 1310 | ||
@@ -1492,11 +1535,10 @@ void __init kmem_cache_init(void) | |||
1492 | cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; | 1535 | cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; |
1493 | 1536 | ||
1494 | /* | 1537 | /* |
1495 | * struct kmem_cache size depends on nr_node_ids, which | 1538 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids |
1496 | * can be less than MAX_NUMNODES. | ||
1497 | */ | 1539 | */ |
1498 | cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + | 1540 | cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + |
1499 | nr_node_ids * sizeof(struct kmem_list3 *); | 1541 | nr_node_ids * sizeof(struct kmem_list3 *); |
1500 | #if DEBUG | 1542 | #if DEBUG |
1501 | cache_cache.obj_size = cache_cache.buffer_size; | 1543 | cache_cache.obj_size = cache_cache.buffer_size; |
1502 | #endif | 1544 | #endif |
@@ -1625,6 +1667,11 @@ void __init kmem_cache_init_late(void) | |||
1625 | { | 1667 | { |
1626 | struct kmem_cache *cachep; | 1668 | struct kmem_cache *cachep; |
1627 | 1669 | ||
1670 | g_cpucache_up = LATE; | ||
1671 | |||
1672 | /* Annotate slab for lockdep -- annotate the malloc caches */ | ||
1673 | init_lock_keys(); | ||
1674 | |||
1628 | /* 6) resize the head arrays to their final sizes */ | 1675 | /* 6) resize the head arrays to their final sizes */ |
1629 | mutex_lock(&cache_chain_mutex); | 1676 | mutex_lock(&cache_chain_mutex); |
1630 | list_for_each_entry(cachep, &cache_chain, next) | 1677 | list_for_each_entry(cachep, &cache_chain, next) |
@@ -1635,9 +1682,6 @@ void __init kmem_cache_init_late(void) | |||
1635 | /* Done! */ | 1682 | /* Done! */ |
1636 | g_cpucache_up = FULL; | 1683 | g_cpucache_up = FULL; |
1637 | 1684 | ||
1638 | /* Annotate slab for lockdep -- annotate the malloc caches */ | ||
1639 | init_lock_keys(); | ||
1640 | |||
1641 | /* | 1685 | /* |
1642 | * Register a cpu startup notifier callback that initializes | 1686 | * Register a cpu startup notifier callback that initializes |
1643 | * cpu_cache_get for all new cpus | 1687 | * cpu_cache_get for all new cpus |
@@ -2308,6 +2352,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2308 | if (!cachep) | 2352 | if (!cachep) |
2309 | goto oops; | 2353 | goto oops; |
2310 | 2354 | ||
2355 | cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; | ||
2311 | #if DEBUG | 2356 | #if DEBUG |
2312 | cachep->obj_size = size; | 2357 | cachep->obj_size = size; |
2313 | 2358 | ||
@@ -2424,6 +2469,16 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2424 | goto oops; | 2469 | goto oops; |
2425 | } | 2470 | } |
2426 | 2471 | ||
2472 | if (flags & SLAB_DEBUG_OBJECTS) { | ||
2473 | /* | ||
2474 | * Would deadlock through slab_destroy()->call_rcu()-> | ||
2475 | * debug_object_activate()->kmem_cache_alloc(). | ||
2476 | */ | ||
2477 | WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); | ||
2478 | |||
2479 | slab_set_debugobj_lock_classes(cachep); | ||
2480 | } | ||
2481 | |||
2427 | /* cache setup completed, link it into the list */ | 2482 | /* cache setup completed, link it into the list */ |
2428 | list_add(&cachep->next, &cache_chain); | 2483 | list_add(&cachep->next, &cache_chain); |
2429 | oops: | 2484 | oops: |
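The SLAB_DEBUG_OBJECTS branch added above refuses to combine debugobjects tracking with SLAB_DESTROY_BY_RCU, since the RCU callback that destroys a slab would recurse back into kmem_cache_alloc() through the debugobjects core. A hedged sketch of a cache creation that stays on the safe side of that WARN_ON_ONCE() (the cache and struct names are illustrative):

/* Sketch: debugobjects-tracked cache created without SLAB_DESTROY_BY_RCU. */
#include <linux/errno.h>
#include <linux/slab.h>

struct sketch_obj {
	int val;
};

static struct kmem_cache *sketch_cache;

static int __init sketch_cache_init(void)
{
	/* Adding SLAB_DESTROY_BY_RCU here would now trip the WARN_ON_ONCE(). */
	sketch_cache = kmem_cache_create("sketch_cache",
					 sizeof(struct sketch_obj), 0,
					 SLAB_DEBUG_OBJECTS, NULL);
	return sketch_cache ? 0 : -ENOMEM;
}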
@@ -3153,12 +3208,11 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3153 | objp += obj_offset(cachep); | 3208 | objp += obj_offset(cachep); |
3154 | if (cachep->ctor && cachep->flags & SLAB_POISON) | 3209 | if (cachep->ctor && cachep->flags & SLAB_POISON) |
3155 | cachep->ctor(objp); | 3210 | cachep->ctor(objp); |
3156 | #if ARCH_SLAB_MINALIGN | 3211 | if (ARCH_SLAB_MINALIGN && |
3157 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { | 3212 | ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { |
3158 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", | 3213 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", |
3159 | objp, ARCH_SLAB_MINALIGN); | 3214 | objp, (int)ARCH_SLAB_MINALIGN); |
3160 | } | 3215 | } |
3161 | #endif | ||
3162 | return objp; | 3216 | return objp; |
3163 | } | 3217 | } |
3164 | #else | 3218 | #else |
@@ -3402,7 +3456,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3402 | cache_alloc_debugcheck_before(cachep, flags); | 3456 | cache_alloc_debugcheck_before(cachep, flags); |
3403 | local_irq_save(save_flags); | 3457 | local_irq_save(save_flags); |
3404 | 3458 | ||
3405 | if (nodeid == -1) | 3459 | if (nodeid == NUMA_NO_NODE) |
3406 | nodeid = slab_node; | 3460 | nodeid = slab_node; |
3407 | 3461 | ||
3408 | if (unlikely(!cachep->nodelists[nodeid])) { | 3462 | if (unlikely(!cachep->nodelists[nodeid])) { |
@@ -3933,7 +3987,7 @@ fail: | |||
3933 | 3987 | ||
3934 | struct ccupdate_struct { | 3988 | struct ccupdate_struct { |
3935 | struct kmem_cache *cachep; | 3989 | struct kmem_cache *cachep; |
3936 | struct array_cache *new[NR_CPUS]; | 3990 | struct array_cache *new[0]; |
3937 | }; | 3991 | }; |
3938 | 3992 | ||
3939 | static void do_ccupdate_local(void *info) | 3993 | static void do_ccupdate_local(void *info) |
@@ -3955,7 +4009,8 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
3955 | struct ccupdate_struct *new; | 4009 | struct ccupdate_struct *new; |
3956 | int i; | 4010 | int i; |
3957 | 4011 | ||
3958 | new = kzalloc(sizeof(*new), gfp); | 4012 | new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), |
4013 | gfp); | ||
3959 | if (!new) | 4014 | if (!new) |
3960 | return -ENOMEM; | 4015 | return -ENOMEM; |
3961 | 4016 | ||
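The ccupdate_struct change just above drops the fixed new[NR_CPUS] array in favour of a zero-length trailing member sized at runtime from nr_cpu_ids. The general pattern, sketched with a generic structure (the names are illustrative):

/* Sketch: the trailing-array allocation pattern used for ccupdate_struct. */
#include <linux/slab.h>

struct sketch_percpu_ptrs {
	int nr;
	void *ptrs[0];		/* storage follows the header in one allocation */
};

static struct sketch_percpu_ptrs *sketch_alloc(int nr, gfp_t gfp)
{
	struct sketch_percpu_ptrs *p;

	/* One kzalloc() covers the header plus nr pointer slots. */
	p = kzalloc(sizeof(*p) + nr * sizeof(void *), gfp);
	if (p)
		p->nr = nr;
	return p;
}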
@@ -70,7 +70,7 @@ | |||
70 | 70 | ||
71 | #include <trace/events/kmem.h> | 71 | #include <trace/events/kmem.h> |
72 | 72 | ||
73 | #include <asm/atomic.h> | 73 | #include <linux/atomic.h> |
74 | 74 | ||
75 | /* | 75 | /* |
76 | * slob_block has a field 'units', which indicates size of block if +ve, | 76 | * slob_block has a field 'units', which indicates size of block if +ve, |
@@ -482,6 +482,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) | |||
482 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | 482 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
483 | void *ret; | 483 | void *ret; |
484 | 484 | ||
485 | gfp &= gfp_allowed_mask; | ||
486 | |||
485 | lockdep_trace_alloc(gfp); | 487 | lockdep_trace_alloc(gfp); |
486 | 488 | ||
487 | if (size < PAGE_SIZE - align) { | 489 | if (size < PAGE_SIZE - align) { |
@@ -608,6 +610,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | |||
608 | { | 610 | { |
609 | void *b; | 611 | void *b; |
610 | 612 | ||
613 | flags &= gfp_allowed_mask; | ||
614 | |||
615 | lockdep_trace_alloc(flags); | ||
616 | |||
611 | if (c->size < PAGE_SIZE) { | 617 | if (c->size < PAGE_SIZE) { |
612 | b = slob_alloc(c->size, flags, c->align, node); | 618 | b = slob_alloc(c->size, flags, c->align, node); |
613 | trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, | 619 | trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, |
@@ -2,10 +2,11 @@ | |||
2 | * SLUB: A slab allocator that limits cache line use instead of queuing | 2 | * SLUB: A slab allocator that limits cache line use instead of queuing |
3 | * objects in per cpu and per node lists. | 3 | * objects in per cpu and per node lists. |
4 | * | 4 | * |
5 | * The allocator synchronizes using per slab locks and only | 5 | * The allocator synchronizes using per slab locks or atomic operations |
6 | * uses a centralized lock to manage a pool of partial slabs. | 6 | * and only uses a centralized lock to manage a pool of partial slabs. |
7 | * | 7 | * |
8 | * (C) 2007 SGI, Christoph Lameter | 8 | * (C) 2007 SGI, Christoph Lameter |
9 | * (C) 2011 Linux Foundation, Christoph Lameter | ||
9 | */ | 10 | */ |
10 | 11 | ||
11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
@@ -27,20 +28,33 @@ | |||
27 | #include <linux/memory.h> | 28 | #include <linux/memory.h> |
28 | #include <linux/math64.h> | 29 | #include <linux/math64.h> |
29 | #include <linux/fault-inject.h> | 30 | #include <linux/fault-inject.h> |
31 | #include <linux/stacktrace.h> | ||
30 | 32 | ||
31 | #include <trace/events/kmem.h> | 33 | #include <trace/events/kmem.h> |
32 | 34 | ||
33 | /* | 35 | /* |
34 | * Lock order: | 36 | * Lock order: |
35 | * 1. slab_lock(page) | 37 | * 1. slub_lock (Global Semaphore) |
36 | * 2. slab->list_lock | 38 | * 2. node->list_lock |
39 | * 3. slab_lock(page) (Only on some arches and for debugging) | ||
37 | * | 40 | * |
38 | * The slab_lock protects operations on the object of a particular | 41 | * slub_lock |
39 | * slab and its metadata in the page struct. If the slab lock | 42 | * |
40 | * has been taken then no allocations nor frees can be performed | 43 | * The role of the slub_lock is to protect the list of all the slabs |
41 | * on the objects in the slab nor can the slab be added or removed | 44 | * and to synchronize major metadata changes to slab cache structures. |
42 | * from the partial or full lists since this would mean modifying | 45 | * |
43 | * the page_struct of the slab. | 46 | * The slab_lock is only used for debugging and on arches that do not |
47 | * have the ability to do a cmpxchg_double. It only protects the second | ||
48 | * double word in the page struct. Meaning | ||
49 | * A. page->freelist -> List of object free in a page | ||
50 | * B. page->counters -> Counters of objects | ||
51 | * C. page->frozen -> frozen state | ||
52 | * | ||
53 | * If a slab is frozen then it is exempt from list management. It is not | ||
54 | * on any list. The processor that froze the slab is the one who can | ||
55 | * perform list operations on the page. Other processors may put objects | ||
56 | * onto the freelist but the processor that froze the slab is the only | ||
57 | * one that can retrieve the objects from the page's freelist. | ||
44 | * | 58 | * |
45 | * The list_lock protects the partial and full list on each node and | 59 | * The list_lock protects the partial and full list on each node and |
46 | * the partial slab counter. If taken then no new slabs may be added or | 60 | * the partial slab counter. If taken then no new slabs may be added or |
@@ -53,20 +67,6 @@ | |||
53 | * slabs, operations can continue without any centralized lock. F.e. | 67 | * slabs, operations can continue without any centralized lock. F.e. |
54 | * allocating a long series of objects that fill up slabs does not require | 68 | * allocating a long series of objects that fill up slabs does not require |
55 | * the list lock. | 69 | * the list lock. |
56 | * | ||
57 | * The lock order is sometimes inverted when we are trying to get a slab | ||
58 | * off a list. We take the list_lock and then look for a page on the list | ||
59 | * to use. While we do that objects in the slabs may be freed. We can | ||
60 | * only operate on the slab if we have also taken the slab_lock. So we use | ||
61 | * a slab_trylock() on the slab. If trylock was successful then no frees | ||
62 | * can occur anymore and we can use the slab for allocations etc. If the | ||
63 | * slab_trylock() does not succeed then frees are in progress in the slab and | ||
64 | * we must stay away from it for a while since we may cause a bouncing | ||
65 | * cacheline if we try to acquire the lock. So go onto the next slab. | ||
66 | * If all pages are busy then we may allocate a new slab instead of reusing | ||
67 | * a partial slab. A new slab has no one operating on it and thus there is | ||
68 | * no danger of cacheline contention. | ||
69 | * | ||
70 | * Interrupts are disabled during allocation and deallocation in order to | 70 | * Interrupts are disabled during allocation and deallocation in order to |
71 | * make the slab allocator safe to use in the context of an irq. In addition | 71 | * make the slab allocator safe to use in the context of an irq. In addition |
72 | * interrupts are disabled to ensure that the processor does not change | 72 | * interrupts are disabled to ensure that the processor does not change |
@@ -131,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s) | |||
131 | /* Enable to test recovery from slab corruption on boot */ | 131 | /* Enable to test recovery from slab corruption on boot */ |
132 | #undef SLUB_RESILIENCY_TEST | 132 | #undef SLUB_RESILIENCY_TEST |
133 | 133 | ||
134 | /* Enable to log cmpxchg failures */ | ||
135 | #undef SLUB_DEBUG_CMPXCHG | ||
136 | |||
134 | /* | 137 | /* |
135 | * Mininum number of partial slabs. These will be left on the partial | 138 | * Mininum number of partial slabs. These will be left on the partial |
136 | * lists even if they are empty. kmem_cache_shrink may reclaim them. | 139 | * lists even if they are empty. kmem_cache_shrink may reclaim them. |
@@ -166,10 +169,11 @@ static inline int kmem_cache_debug(struct kmem_cache *s) | |||
166 | 169 | ||
167 | #define OO_SHIFT 16 | 170 | #define OO_SHIFT 16 |
168 | #define OO_MASK ((1 << OO_SHIFT) - 1) | 171 | #define OO_MASK ((1 << OO_SHIFT) - 1) |
169 | #define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ | 172 | #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ |
170 | 173 | ||
171 | /* Internal SLUB flags */ | 174 | /* Internal SLUB flags */ |
172 | #define __OBJECT_POISON 0x80000000UL /* Poison object */ | 175 | #define __OBJECT_POISON 0x80000000UL /* Poison object */ |
176 | #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ | ||
173 | 177 | ||
174 | static int kmem_size = sizeof(struct kmem_cache); | 178 | static int kmem_size = sizeof(struct kmem_cache); |
175 | 179 | ||
@@ -191,8 +195,12 @@ static LIST_HEAD(slab_caches); | |||
191 | /* | 195 | /* |
192 | * Tracking user of a slab. | 196 | * Tracking user of a slab. |
193 | */ | 197 | */ |
198 | #define TRACK_ADDRS_COUNT 16 | ||
194 | struct track { | 199 | struct track { |
195 | unsigned long addr; /* Called from address */ | 200 | unsigned long addr; /* Called from address */ |
201 | #ifdef CONFIG_STACKTRACE | ||
202 | unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ | ||
203 | #endif | ||
196 | int cpu; /* Was running on cpu */ | 204 | int cpu; /* Was running on cpu */ |
197 | int pid; /* Pid context */ | 205 | int pid; /* Pid context */ |
198 | unsigned long when; /* When did the operation occur */ | 206 | unsigned long when; /* When did the operation occur */ |
@@ -338,11 +346,99 @@ static inline int oo_objects(struct kmem_cache_order_objects x) | |||
338 | return x.x & OO_MASK; | 346 | return x.x & OO_MASK; |
339 | } | 347 | } |
340 | 348 | ||
349 | /* | ||
350 | * Per slab locking using the pagelock | ||
351 | */ | ||
352 | static __always_inline void slab_lock(struct page *page) | ||
353 | { | ||
354 | bit_spin_lock(PG_locked, &page->flags); | ||
355 | } | ||
356 | |||
357 | static __always_inline void slab_unlock(struct page *page) | ||
358 | { | ||
359 | __bit_spin_unlock(PG_locked, &page->flags); | ||
360 | } | ||
361 | |||
362 | /* Interrupts must be disabled (for the fallback code to work right) */ | ||
363 | static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | ||
364 | void *freelist_old, unsigned long counters_old, | ||
365 | void *freelist_new, unsigned long counters_new, | ||
366 | const char *n) | ||
367 | { | ||
368 | VM_BUG_ON(!irqs_disabled()); | ||
369 | #ifdef CONFIG_CMPXCHG_DOUBLE | ||
370 | if (s->flags & __CMPXCHG_DOUBLE) { | ||
371 | if (cmpxchg_double(&page->freelist, | ||
372 | freelist_old, counters_old, | ||
373 | freelist_new, counters_new)) | ||
374 | return 1; | ||
375 | } else | ||
376 | #endif | ||
377 | { | ||
378 | slab_lock(page); | ||
379 | if (page->freelist == freelist_old && page->counters == counters_old) { | ||
380 | page->freelist = freelist_new; | ||
381 | page->counters = counters_new; | ||
382 | slab_unlock(page); | ||
383 | return 1; | ||
384 | } | ||
385 | slab_unlock(page); | ||
386 | } | ||
387 | |||
388 | cpu_relax(); | ||
389 | stat(s, CMPXCHG_DOUBLE_FAIL); | ||
390 | |||
391 | #ifdef SLUB_DEBUG_CMPXCHG | ||
392 | printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); | ||
393 | #endif | ||
394 | |||
395 | return 0; | ||
396 | } | ||
397 | |||
398 | static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | ||
399 | void *freelist_old, unsigned long counters_old, | ||
400 | void *freelist_new, unsigned long counters_new, | ||
401 | const char *n) | ||
402 | { | ||
403 | #ifdef CONFIG_CMPXCHG_DOUBLE | ||
404 | if (s->flags & __CMPXCHG_DOUBLE) { | ||
405 | if (cmpxchg_double(&page->freelist, | ||
406 | freelist_old, counters_old, | ||
407 | freelist_new, counters_new)) | ||
408 | return 1; | ||
409 | } else | ||
410 | #endif | ||
411 | { | ||
412 | unsigned long flags; | ||
413 | |||
414 | local_irq_save(flags); | ||
415 | slab_lock(page); | ||
416 | if (page->freelist == freelist_old && page->counters == counters_old) { | ||
417 | page->freelist = freelist_new; | ||
418 | page->counters = counters_new; | ||
419 | slab_unlock(page); | ||
420 | local_irq_restore(flags); | ||
421 | return 1; | ||
422 | } | ||
423 | slab_unlock(page); | ||
424 | local_irq_restore(flags); | ||
425 | } | ||
426 | |||
427 | cpu_relax(); | ||
428 | stat(s, CMPXCHG_DOUBLE_FAIL); | ||
429 | |||
430 | #ifdef SLUB_DEBUG_CMPXCHG | ||
431 | printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); | ||
432 | #endif | ||
433 | |||
434 | return 0; | ||
435 | } | ||
436 | |||
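The helpers added above are deliberately dual-path: on CONFIG_CMPXCHG_DOUBLE kernels a cache flagged __CMPXCHG_DOUBLE updates page->freelist and page->counters with one lock-free cmpxchg_double(), while every other configuration falls back to the PG_locked bit spinlock (with IRQ disabling in the variant without the leading underscores). Below is a minimal user-space sketch of the same dual-path shape, shrunk to a single 64-bit word with a pthread mutex as the fallback; the struct and function names and the use of C11 atomics are illustrative only and are not part of the patch.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <pthread.h>

    /* Hypothetical stand-in for the freelist/counters pair, packed into one word. */
    struct slot {
        _Atomic uint64_t word;      /* fast path: plain 64-bit CAS         */
        pthread_mutex_t lock;       /* slow path: lock, compare, update    */
        bool have_fast_cas;         /* decided once, like __CMPXCHG_DOUBLE */
    };

    /* Apply the transition old_val -> new_val atomically; caller retries on failure. */
    static bool update_slot(struct slot *s, uint64_t old_val, uint64_t new_val)
    {
        if (s->have_fast_cas)
            return atomic_compare_exchange_strong(&s->word, &old_val, new_val);

        bool ok = false;
        pthread_mutex_lock(&s->lock);
        if (atomic_load_explicit(&s->word, memory_order_relaxed) == old_val) {
            atomic_store_explicit(&s->word, new_val, memory_order_relaxed);
            ok = true;
        }
        pthread_mutex_unlock(&s->lock);
        return ok;
    }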
341 | #ifdef CONFIG_SLUB_DEBUG | 437 | #ifdef CONFIG_SLUB_DEBUG |
342 | /* | 438 | /* |
343 | * Determine a map of object in use on a page. | 439 | * Determine a map of object in use on a page. |
344 | * | 440 | * |
345 | * Slab lock or node listlock must be held to guarantee that the page does | 441 | * Node listlock must be held to guarantee that the page does |
346 | * not vanish from under us. | 442 | * not vanish from under us. |
347 | */ | 443 | */ |
348 | static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) | 444 | static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) |
@@ -420,6 +516,24 @@ static void set_track(struct kmem_cache *s, void *object, | |||
420 | struct track *p = get_track(s, object, alloc); | 516 | struct track *p = get_track(s, object, alloc); |
421 | 517 | ||
422 | if (addr) { | 518 | if (addr) { |
519 | #ifdef CONFIG_STACKTRACE | ||
520 | struct stack_trace trace; | ||
521 | int i; | ||
522 | |||
523 | trace.nr_entries = 0; | ||
524 | trace.max_entries = TRACK_ADDRS_COUNT; | ||
525 | trace.entries = p->addrs; | ||
526 | trace.skip = 3; | ||
527 | save_stack_trace(&trace); | ||
528 | |||
529 | /* See rant in lockdep.c */ | ||
530 | if (trace.nr_entries != 0 && | ||
531 | trace.entries[trace.nr_entries - 1] == ULONG_MAX) | ||
532 | trace.nr_entries--; | ||
533 | |||
534 | for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) | ||
535 | p->addrs[i] = 0; | ||
536 | #endif | ||
423 | p->addr = addr; | 537 | p->addr = addr; |
424 | p->cpu = smp_processor_id(); | 538 | p->cpu = smp_processor_id(); |
425 | p->pid = current->pid; | 539 | p->pid = current->pid; |
@@ -444,6 +558,16 @@ static void print_track(const char *s, struct track *t) | |||
444 | 558 | ||
445 | printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", | 559 | printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", |
446 | s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); | 560 | s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); |
561 | #ifdef CONFIG_STACKTRACE | ||
562 | { | ||
563 | int i; | ||
564 | for (i = 0; i < TRACK_ADDRS_COUNT; i++) | ||
565 | if (t->addrs[i]) | ||
566 | printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); | ||
567 | else | ||
568 | break; | ||
569 | } | ||
570 | #endif | ||
447 | } | 571 | } |
448 | 572 | ||
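With CONFIG_STACKTRACE the track record above now keeps up to TRACK_ADDRS_COUNT (16) caller addresses and zero-fills the unused slots, so print_track() can stop at the first empty entry. The same bounded-capture idea can be reproduced in plain user space with glibc's <execinfo.h>; the structure and function names below are invented for illustration and do not mirror the kernel API.

    #include <execinfo.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define TRACK_ADDRS_COUNT 16

    struct track {
        void *addrs[TRACK_ADDRS_COUNT]; /* call chain, zero-padded */
        int nr;
    };

    static void record_track(struct track *t)
    {
        t->nr = backtrace(t->addrs, TRACK_ADDRS_COUNT);
        /* Zero the unused slots so a printer can stop at the first NULL. */
        memset(&t->addrs[t->nr], 0,
               (TRACK_ADDRS_COUNT - t->nr) * sizeof(void *));
    }

    static void print_track(const struct track *t)
    {
        char **syms = backtrace_symbols(t->addrs, t->nr);

        for (int i = 0; syms && i < t->nr; i++)
            printf("\t%s\n", syms[i]);
        free(syms);
    }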
449 | static void print_tracking(struct kmem_cache *s, void *object) | 573 | static void print_tracking(struct kmem_cache *s, void *object) |
@@ -557,10 +681,10 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) | |||
557 | memset(p + s->objsize, val, s->inuse - s->objsize); | 681 | memset(p + s->objsize, val, s->inuse - s->objsize); |
558 | } | 682 | } |
559 | 683 | ||
560 | static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) | 684 | static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes) |
561 | { | 685 | { |
562 | while (bytes) { | 686 | while (bytes) { |
563 | if (*start != (u8)value) | 687 | if (*start != value) |
564 | return start; | 688 | return start; |
565 | start++; | 689 | start++; |
566 | bytes--; | 690 | bytes--; |
@@ -568,6 +692,38 @@ static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) | |||
568 | return NULL; | 692 | return NULL; |
569 | } | 693 | } |
570 | 694 | ||
695 | static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes) | ||
696 | { | ||
697 | u64 value64; | ||
698 | unsigned int words, prefix; | ||
699 | |||
700 | if (bytes <= 16) | ||
701 | return check_bytes8(start, value, bytes); | ||
702 | |||
703 | value64 = value | value << 8 | value << 16 | value << 24; | ||
704 | value64 = (value64 & 0xffffffff) | value64 << 32; | ||
705 | prefix = 8 - ((unsigned long)start) % 8; | ||
706 | |||
707 | if (prefix) { | ||
708 | u8 *r = check_bytes8(start, value, prefix); | ||
709 | if (r) | ||
710 | return r; | ||
711 | start += prefix; | ||
712 | bytes -= prefix; | ||
713 | } | ||
714 | |||
715 | words = bytes / 8; | ||
716 | |||
717 | while (words) { | ||
718 | if (*(u64 *)start != value64) | ||
719 | return check_bytes8(start, value, 8); | ||
720 | start += 8; | ||
721 | words--; | ||
722 | } | ||
723 | |||
724 | return check_bytes8(start, value, bytes % 8); | ||
725 | } | ||
726 | |||
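check_bytes() above now aligns to an 8-byte boundary, compares whole 64-bit words against the replicated byte value, and only drops back to the byte loop for the prefix, the tail and the word that mismatched. Below is a self-contained sketch of the same technique; it replicates the byte with a multiplication and uses memcpy for the word loads to stay portable, and it models the idea rather than the kernel code itself.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Return a pointer to the first byte that differs from 'value', or NULL. */
    static const uint8_t *first_mismatch(const uint8_t *p, uint8_t value, size_t n)
    {
        const uint64_t pattern = 0x0101010101010101ULL * value;

        /* Byte-wise prefix until 'p' is 8-byte aligned. */
        while (n && ((uintptr_t)p & 7)) {
            if (*p != value)
                return p;
            p++, n--;
        }

        /* Whole words in the middle. */
        while (n >= 8) {
            uint64_t w;

            memcpy(&w, p, 8);
            if (w != pattern)
                break;          /* rescan this word byte-wise below */
            p += 8, n -= 8;
        }

        /* Byte-wise tail, including the word that failed (if any). */
        for (; n; p++, n--)
            if (*p != value)
                return p;
        return NULL;
    }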
571 | static void restore_bytes(struct kmem_cache *s, char *message, u8 data, | 727 | static void restore_bytes(struct kmem_cache *s, char *message, u8 data, |
572 | void *from, void *to) | 728 | void *from, void *to) |
573 | { | 729 | { |
@@ -773,10 +929,11 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
773 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | 929 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) |
774 | { | 930 | { |
775 | int nr = 0; | 931 | int nr = 0; |
776 | void *fp = page->freelist; | 932 | void *fp; |
777 | void *object = NULL; | 933 | void *object = NULL; |
778 | unsigned long max_objects; | 934 | unsigned long max_objects; |
779 | 935 | ||
936 | fp = page->freelist; | ||
780 | while (fp && nr <= page->objects) { | 937 | while (fp && nr <= page->objects) { |
781 | if (fp == search) | 938 | if (fp == search) |
782 | return 1; | 939 | return 1; |
@@ -881,26 +1038,27 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
881 | 1038 | ||
882 | /* | 1039 | /* |
883 | * Tracking of fully allocated slabs for debugging purposes. | 1040 | * Tracking of fully allocated slabs for debugging purposes. |
1041 | * | ||
1042 | * list_lock must be held. | ||
884 | */ | 1043 | */ |
885 | static void add_full(struct kmem_cache_node *n, struct page *page) | 1044 | static void add_full(struct kmem_cache *s, |
1045 | struct kmem_cache_node *n, struct page *page) | ||
886 | { | 1046 | { |
887 | spin_lock(&n->list_lock); | 1047 | if (!(s->flags & SLAB_STORE_USER)) |
1048 | return; | ||
1049 | |||
888 | list_add(&page->lru, &n->full); | 1050 | list_add(&page->lru, &n->full); |
889 | spin_unlock(&n->list_lock); | ||
890 | } | 1051 | } |
891 | 1052 | ||
1053 | /* | ||
1054 | * list_lock must be held. | ||
1055 | */ | ||
892 | static void remove_full(struct kmem_cache *s, struct page *page) | 1056 | static void remove_full(struct kmem_cache *s, struct page *page) |
893 | { | 1057 | { |
894 | struct kmem_cache_node *n; | ||
895 | |||
896 | if (!(s->flags & SLAB_STORE_USER)) | 1058 | if (!(s->flags & SLAB_STORE_USER)) |
897 | return; | 1059 | return; |
898 | 1060 | ||
899 | n = get_node(s, page_to_nid(page)); | ||
900 | |||
901 | spin_lock(&n->list_lock); | ||
902 | list_del(&page->lru); | 1061 | list_del(&page->lru); |
903 | spin_unlock(&n->list_lock); | ||
904 | } | 1062 | } |
905 | 1063 | ||
906 | /* Tracking of the number of slabs for debugging purposes */ | 1064 | /* Tracking of the number of slabs for debugging purposes */ |
@@ -956,11 +1114,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa | |||
956 | if (!check_slab(s, page)) | 1114 | if (!check_slab(s, page)) |
957 | goto bad; | 1115 | goto bad; |
958 | 1116 | ||
959 | if (!on_freelist(s, page, object)) { | ||
960 | object_err(s, page, object, "Object already allocated"); | ||
961 | goto bad; | ||
962 | } | ||
963 | |||
964 | if (!check_valid_pointer(s, page, object)) { | 1117 | if (!check_valid_pointer(s, page, object)) { |
965 | object_err(s, page, object, "Freelist Pointer check fails"); | 1118 | object_err(s, page, object, "Freelist Pointer check fails"); |
966 | goto bad; | 1119 | goto bad; |
@@ -993,6 +1146,12 @@ bad: | |||
993 | static noinline int free_debug_processing(struct kmem_cache *s, | 1146 | static noinline int free_debug_processing(struct kmem_cache *s, |
994 | struct page *page, void *object, unsigned long addr) | 1147 | struct page *page, void *object, unsigned long addr) |
995 | { | 1148 | { |
1149 | unsigned long flags; | ||
1150 | int rc = 0; | ||
1151 | |||
1152 | local_irq_save(flags); | ||
1153 | slab_lock(page); | ||
1154 | |||
996 | if (!check_slab(s, page)) | 1155 | if (!check_slab(s, page)) |
997 | goto fail; | 1156 | goto fail; |
998 | 1157 | ||
@@ -1007,7 +1166,7 @@ static noinline int free_debug_processing(struct kmem_cache *s, | |||
1007 | } | 1166 | } |
1008 | 1167 | ||
1009 | if (!check_object(s, page, object, SLUB_RED_ACTIVE)) | 1168 | if (!check_object(s, page, object, SLUB_RED_ACTIVE)) |
1010 | return 0; | 1169 | goto out; |
1011 | 1170 | ||
1012 | if (unlikely(s != page->slab)) { | 1171 | if (unlikely(s != page->slab)) { |
1013 | if (!PageSlab(page)) { | 1172 | if (!PageSlab(page)) { |
@@ -1024,18 +1183,19 @@ static noinline int free_debug_processing(struct kmem_cache *s, | |||
1024 | goto fail; | 1183 | goto fail; |
1025 | } | 1184 | } |
1026 | 1185 | ||
1027 | /* Special debug activities for freeing objects */ | ||
1028 | if (!PageSlubFrozen(page) && !page->freelist) | ||
1029 | remove_full(s, page); | ||
1030 | if (s->flags & SLAB_STORE_USER) | 1186 | if (s->flags & SLAB_STORE_USER) |
1031 | set_track(s, object, TRACK_FREE, addr); | 1187 | set_track(s, object, TRACK_FREE, addr); |
1032 | trace(s, page, object, 0); | 1188 | trace(s, page, object, 0); |
1033 | init_object(s, object, SLUB_RED_INACTIVE); | 1189 | init_object(s, object, SLUB_RED_INACTIVE); |
1034 | return 1; | 1190 | rc = 1; |
1191 | out: | ||
1192 | slab_unlock(page); | ||
1193 | local_irq_restore(flags); | ||
1194 | return rc; | ||
1035 | 1195 | ||
1036 | fail: | 1196 | fail: |
1037 | slab_fix(s, "Object at 0x%p not freed", object); | 1197 | slab_fix(s, "Object at 0x%p not freed", object); |
1038 | return 0; | 1198 | goto out; |
1039 | } | 1199 | } |
1040 | 1200 | ||
1041 | static int __init setup_slub_debug(char *str) | 1201 | static int __init setup_slub_debug(char *str) |
@@ -1135,7 +1295,9 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page) | |||
1135 | { return 1; } | 1295 | { return 1; } |
1136 | static inline int check_object(struct kmem_cache *s, struct page *page, | 1296 | static inline int check_object(struct kmem_cache *s, struct page *page, |
1137 | void *object, u8 val) { return 1; } | 1297 | void *object, u8 val) { return 1; } |
1138 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} | 1298 | static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, |
1299 | struct page *page) {} | ||
1300 | static inline void remove_full(struct kmem_cache *s, struct page *page) {} | ||
1139 | static inline unsigned long kmem_cache_flags(unsigned long objsize, | 1301 | static inline unsigned long kmem_cache_flags(unsigned long objsize, |
1140 | unsigned long flags, const char *name, | 1302 | unsigned long flags, const char *name, |
1141 | void (*ctor)(void *)) | 1303 | void (*ctor)(void *)) |
@@ -1187,6 +1349,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1187 | struct kmem_cache_order_objects oo = s->oo; | 1349 | struct kmem_cache_order_objects oo = s->oo; |
1188 | gfp_t alloc_gfp; | 1350 | gfp_t alloc_gfp; |
1189 | 1351 | ||
1352 | flags &= gfp_allowed_mask; | ||
1353 | |||
1354 | if (flags & __GFP_WAIT) | ||
1355 | local_irq_enable(); | ||
1356 | |||
1190 | flags |= s->allocflags; | 1357 | flags |= s->allocflags; |
1191 | 1358 | ||
1192 | /* | 1359 | /* |
@@ -1203,12 +1370,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1203 | * Try a lower order alloc if possible | 1370 | * Try a lower order alloc if possible |
1204 | */ | 1371 | */ |
1205 | page = alloc_slab_page(flags, node, oo); | 1372 | page = alloc_slab_page(flags, node, oo); |
1206 | if (!page) | ||
1207 | return NULL; | ||
1208 | 1373 | ||
1209 | stat(s, ORDER_FALLBACK); | 1374 | if (page) |
1375 | stat(s, ORDER_FALLBACK); | ||
1210 | } | 1376 | } |
1211 | 1377 | ||
1378 | if (flags & __GFP_WAIT) | ||
1379 | local_irq_disable(); | ||
1380 | |||
1381 | if (!page) | ||
1382 | return NULL; | ||
1383 | |||
1212 | if (kmemcheck_enabled | 1384 | if (kmemcheck_enabled |
1213 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { | 1385 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { |
1214 | int pages = 1 << oo_order(oo); | 1386 | int pages = 1 << oo_order(oo); |
@@ -1276,6 +1448,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1276 | 1448 | ||
1277 | page->freelist = start; | 1449 | page->freelist = start; |
1278 | page->inuse = 0; | 1450 | page->inuse = 0; |
1451 | page->frozen = 1; | ||
1279 | out: | 1452 | out: |
1280 | return page; | 1453 | return page; |
1281 | } | 1454 | } |
@@ -1353,77 +1526,87 @@ static void discard_slab(struct kmem_cache *s, struct page *page) | |||
1353 | } | 1526 | } |
1354 | 1527 | ||
1355 | /* | 1528 | /* |
1356 | * Per slab locking using the pagelock | 1529 | * Management of partially allocated slabs. |
1357 | */ | 1530 | * |
1358 | static __always_inline void slab_lock(struct page *page) | 1531 | * list_lock must be held. |
1359 | { | ||
1360 | bit_spin_lock(PG_locked, &page->flags); | ||
1361 | } | ||
1362 | |||
1363 | static __always_inline void slab_unlock(struct page *page) | ||
1364 | { | ||
1365 | __bit_spin_unlock(PG_locked, &page->flags); | ||
1366 | } | ||
1367 | |||
1368 | static __always_inline int slab_trylock(struct page *page) | ||
1369 | { | ||
1370 | int rc = 1; | ||
1371 | |||
1372 | rc = bit_spin_trylock(PG_locked, &page->flags); | ||
1373 | return rc; | ||
1374 | } | ||
1375 | |||
1376 | /* | ||
1377 | * Management of partially allocated slabs | ||
1378 | */ | 1532 | */ |
1379 | static void add_partial(struct kmem_cache_node *n, | 1533 | static inline void add_partial(struct kmem_cache_node *n, |
1380 | struct page *page, int tail) | 1534 | struct page *page, int tail) |
1381 | { | 1535 | { |
1382 | spin_lock(&n->list_lock); | ||
1383 | n->nr_partial++; | 1536 | n->nr_partial++; |
1384 | if (tail) | 1537 | if (tail) |
1385 | list_add_tail(&page->lru, &n->partial); | 1538 | list_add_tail(&page->lru, &n->partial); |
1386 | else | 1539 | else |
1387 | list_add(&page->lru, &n->partial); | 1540 | list_add(&page->lru, &n->partial); |
1388 | spin_unlock(&n->list_lock); | ||
1389 | } | 1541 | } |
1390 | 1542 | ||
1391 | static inline void __remove_partial(struct kmem_cache_node *n, | 1543 | /* |
1544 | * list_lock must be held. | ||
1545 | */ | ||
1546 | static inline void remove_partial(struct kmem_cache_node *n, | ||
1392 | struct page *page) | 1547 | struct page *page) |
1393 | { | 1548 | { |
1394 | list_del(&page->lru); | 1549 | list_del(&page->lru); |
1395 | n->nr_partial--; | 1550 | n->nr_partial--; |
1396 | } | 1551 | } |
1397 | 1552 | ||
1398 | static void remove_partial(struct kmem_cache *s, struct page *page) | ||
1399 | { | ||
1400 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | ||
1401 | |||
1402 | spin_lock(&n->list_lock); | ||
1403 | __remove_partial(n, page); | ||
1404 | spin_unlock(&n->list_lock); | ||
1405 | } | ||
1406 | |||
1407 | /* | 1553 | /* |
1408 | * Lock slab and remove from the partial list. | 1554 | * Lock slab, remove from the partial list and put the object into the |
1555 | * per cpu freelist. | ||
1409 | * | 1556 | * |
1410 | * Must hold list_lock. | 1557 | * Must hold list_lock. |
1411 | */ | 1558 | */ |
1412 | static inline int lock_and_freeze_slab(struct kmem_cache_node *n, | 1559 | static inline int acquire_slab(struct kmem_cache *s, |
1413 | struct page *page) | 1560 | struct kmem_cache_node *n, struct page *page) |
1414 | { | 1561 | { |
1415 | if (slab_trylock(page)) { | 1562 | void *freelist; |
1416 | __remove_partial(n, page); | 1563 | unsigned long counters; |
1417 | __SetPageSlubFrozen(page); | 1564 | struct page new; |
1565 | |||
1566 | /* | ||
1567 | * Zap the freelist and set the frozen bit. | ||
1568 | * The old freelist is the list of objects for the | ||
1569 | * per cpu allocation list. | ||
1570 | */ | ||
1571 | do { | ||
1572 | freelist = page->freelist; | ||
1573 | counters = page->counters; | ||
1574 | new.counters = counters; | ||
1575 | new.inuse = page->objects; | ||
1576 | |||
1577 | VM_BUG_ON(new.frozen); | ||
1578 | new.frozen = 1; | ||
1579 | |||
1580 | } while (!__cmpxchg_double_slab(s, page, | ||
1581 | freelist, counters, | ||
1582 | NULL, new.counters, | ||
1583 | "lock and freeze")); | ||
1584 | |||
1585 | remove_partial(n, page); | ||
1586 | |||
1587 | if (freelist) { | ||
1588 | /* Populate the per cpu freelist */ | ||
1589 | this_cpu_write(s->cpu_slab->freelist, freelist); | ||
1590 | this_cpu_write(s->cpu_slab->page, page); | ||
1591 | this_cpu_write(s->cpu_slab->node, page_to_nid(page)); | ||
1418 | return 1; | 1592 | return 1; |
1593 | } else { | ||
1594 | /* | ||
1595 | * Slab page came from the wrong list. No object to allocate | ||
1596 | * from. Put it onto the correct list and continue partial | ||
1597 | * scan. | ||
1598 | */ | ||
1599 | printk(KERN_ERR "SLUB: %s : Page without available objects on" | ||
1600 | " partial list\n", s->name); | ||
1601 | return 0; | ||
1419 | } | 1602 | } |
1420 | return 0; | ||
1421 | } | 1603 | } |
1422 | 1604 | ||
1423 | /* | 1605 | /* |
1424 | * Try to allocate a partial slab from a specific node. | 1606 | * Try to allocate a partial slab from a specific node. |
1425 | */ | 1607 | */ |
1426 | static struct page *get_partial_node(struct kmem_cache_node *n) | 1608 | static struct page *get_partial_node(struct kmem_cache *s, |
1609 | struct kmem_cache_node *n) | ||
1427 | { | 1610 | { |
1428 | struct page *page; | 1611 | struct page *page; |
1429 | 1612 | ||
@@ -1438,7 +1621,7 @@ static struct page *get_partial_node(struct kmem_cache_node *n) | |||
1438 | 1621 | ||
1439 | spin_lock(&n->list_lock); | 1622 | spin_lock(&n->list_lock); |
1440 | list_for_each_entry(page, &n->partial, lru) | 1623 | list_for_each_entry(page, &n->partial, lru) |
1441 | if (lock_and_freeze_slab(n, page)) | 1624 | if (acquire_slab(s, n, page)) |
1442 | goto out; | 1625 | goto out; |
1443 | page = NULL; | 1626 | page = NULL; |
1444 | out: | 1627 | out: |
@@ -1489,7 +1672,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1489 | 1672 | ||
1490 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && | 1673 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && |
1491 | n->nr_partial > s->min_partial) { | 1674 | n->nr_partial > s->min_partial) { |
1492 | page = get_partial_node(n); | 1675 | page = get_partial_node(s, n); |
1493 | if (page) { | 1676 | if (page) { |
1494 | put_mems_allowed(); | 1677 | put_mems_allowed(); |
1495 | return page; | 1678 | return page; |
@@ -1509,60 +1692,13 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) | |||
1509 | struct page *page; | 1692 | struct page *page; |
1510 | int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; | 1693 | int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; |
1511 | 1694 | ||
1512 | page = get_partial_node(get_node(s, searchnode)); | 1695 | page = get_partial_node(s, get_node(s, searchnode)); |
1513 | if (page || node != NUMA_NO_NODE) | 1696 | if (page || node != NUMA_NO_NODE) |
1514 | return page; | 1697 | return page; |
1515 | 1698 | ||
1516 | return get_any_partial(s, flags); | 1699 | return get_any_partial(s, flags); |
1517 | } | 1700 | } |
1518 | 1701 | ||
1519 | /* | ||
1520 | * Move a page back to the lists. | ||
1521 | * | ||
1522 | * Must be called with the slab lock held. | ||
1523 | * | ||
1524 | * On exit the slab lock will have been dropped. | ||
1525 | */ | ||
1526 | static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | ||
1527 | __releases(bitlock) | ||
1528 | { | ||
1529 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | ||
1530 | |||
1531 | __ClearPageSlubFrozen(page); | ||
1532 | if (page->inuse) { | ||
1533 | |||
1534 | if (page->freelist) { | ||
1535 | add_partial(n, page, tail); | ||
1536 | stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); | ||
1537 | } else { | ||
1538 | stat(s, DEACTIVATE_FULL); | ||
1539 | if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) | ||
1540 | add_full(n, page); | ||
1541 | } | ||
1542 | slab_unlock(page); | ||
1543 | } else { | ||
1544 | stat(s, DEACTIVATE_EMPTY); | ||
1545 | if (n->nr_partial < s->min_partial) { | ||
1546 | /* | ||
1547 | * Adding an empty slab to the partial slabs in order | ||
1548 | * to avoid page allocator overhead. This slab needs | ||
1549 | * to come after the other slabs with objects in | ||
1550 | * so that the others get filled first. That way the | ||
1551 | * size of the partial list stays small. | ||
1552 | * | ||
1553 | * kmem_cache_shrink can reclaim any empty slabs from | ||
1554 | * the partial list. | ||
1555 | */ | ||
1556 | add_partial(n, page, 1); | ||
1557 | slab_unlock(page); | ||
1558 | } else { | ||
1559 | slab_unlock(page); | ||
1560 | stat(s, FREE_SLAB); | ||
1561 | discard_slab(s, page); | ||
1562 | } | ||
1563 | } | ||
1564 | } | ||
1565 | |||
1566 | #ifdef CONFIG_PREEMPT | 1702 | #ifdef CONFIG_PREEMPT |
1567 | /* | 1703 | /* |
1568 | * Calculate the next globally unique transaction for disambiguation | 1704 |
@@ -1632,42 +1768,161 @@ void init_kmem_cache_cpus(struct kmem_cache *s) | |||
1632 | /* | 1768 | /* |
1633 | * Remove the cpu slab | 1769 | * Remove the cpu slab |
1634 | */ | 1770 | */ |
1771 | |||
1772 | /* | ||
1773 | * Remove the cpu slab | ||
1774 | */ | ||
1635 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1775 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1636 | __releases(bitlock) | ||
1637 | { | 1776 | { |
1777 | enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; | ||
1638 | struct page *page = c->page; | 1778 | struct page *page = c->page; |
1639 | int tail = 1; | 1779 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
1640 | 1780 | int lock = 0; | |
1641 | if (page->freelist) | 1781 | enum slab_modes l = M_NONE, m = M_NONE; |
1782 | void *freelist; | ||
1783 | void *nextfree; | ||
1784 | int tail = 0; | ||
1785 | struct page new; | ||
1786 | struct page old; | ||
1787 | |||
1788 | if (page->freelist) { | ||
1642 | stat(s, DEACTIVATE_REMOTE_FREES); | 1789 | stat(s, DEACTIVATE_REMOTE_FREES); |
1790 | tail = 1; | ||
1791 | } | ||
1792 | |||
1793 | c->tid = next_tid(c->tid); | ||
1794 | c->page = NULL; | ||
1795 | freelist = c->freelist; | ||
1796 | c->freelist = NULL; | ||
1797 | |||
1643 | /* | 1798 | /* |
1644 | * Merge cpu freelist into slab freelist. Typically we get here | 1799 | * Stage one: Free all available per cpu objects back |
1645 | * because both freelists are empty. So this is unlikely | 1800 | * to the page freelist while it is still frozen. Leave the |
1646 | * to occur. | 1801 | * last one. |
1802 | * | ||
1803 | * There is no need to take the list->lock because the page | ||
1804 | * is still frozen. | ||
1647 | */ | 1805 | */ |
1648 | while (unlikely(c->freelist)) { | 1806 | while (freelist && (nextfree = get_freepointer(s, freelist))) { |
1649 | void **object; | 1807 | void *prior; |
1808 | unsigned long counters; | ||
1809 | |||
1810 | do { | ||
1811 | prior = page->freelist; | ||
1812 | counters = page->counters; | ||
1813 | set_freepointer(s, freelist, prior); | ||
1814 | new.counters = counters; | ||
1815 | new.inuse--; | ||
1816 | VM_BUG_ON(!new.frozen); | ||
1817 | |||
1818 | } while (!__cmpxchg_double_slab(s, page, | ||
1819 | prior, counters, | ||
1820 | freelist, new.counters, | ||
1821 | "drain percpu freelist")); | ||
1822 | |||
1823 | freelist = nextfree; | ||
1824 | } | ||
1650 | 1825 | ||
1651 | tail = 0; /* Hot objects. Put the slab first */ | 1826 | /* |
1827 | * Stage two: Ensure that the page is unfrozen while the | ||
1828 | * list presence reflects the actual number of objects | ||
1829 | * during unfreeze. | ||
1830 | * | ||
1831 | * We setup the list membership and then perform a cmpxchg | ||
1832 | * with the count. If there is a mismatch then the page | ||
1833 | * is not unfrozen but the page is on the wrong list. | ||
1834 | * | ||
1835 | * Then we restart the process which may have to remove | ||
1836 | * the page from the list that we just put it on again | ||
1837 | * because the number of objects in the slab may have | ||
1838 | * changed. | ||
1839 | */ | ||
1840 | redo: | ||
1652 | 1841 | ||
1653 | /* Retrieve object from cpu_freelist */ | 1842 | old.freelist = page->freelist; |
1654 | object = c->freelist; | 1843 | old.counters = page->counters; |
1655 | c->freelist = get_freepointer(s, c->freelist); | 1844 | VM_BUG_ON(!old.frozen); |
1656 | 1845 | ||
1657 | /* And put onto the regular freelist */ | 1846 | /* Determine target state of the slab */ |
1658 | set_freepointer(s, object, page->freelist); | 1847 | new.counters = old.counters; |
1659 | page->freelist = object; | 1848 | if (freelist) { |
1660 | page->inuse--; | 1849 | new.inuse--; |
1850 | set_freepointer(s, freelist, old.freelist); | ||
1851 | new.freelist = freelist; | ||
1852 | } else | ||
1853 | new.freelist = old.freelist; | ||
1854 | |||
1855 | new.frozen = 0; | ||
1856 | |||
1857 | if (!new.inuse && n->nr_partial > s->min_partial) | ||
1858 | m = M_FREE; | ||
1859 | else if (new.freelist) { | ||
1860 | m = M_PARTIAL; | ||
1861 | if (!lock) { | ||
1862 | lock = 1; | ||
1863 | /* | ||
1864 | * Taking the spinlock removes the possibility | ||
1865 | * that acquire_slab() will see a slab page that | ||
1866 | * is frozen | ||
1867 | */ | ||
1868 | spin_lock(&n->list_lock); | ||
1869 | } | ||
1870 | } else { | ||
1871 | m = M_FULL; | ||
1872 | if (kmem_cache_debug(s) && !lock) { | ||
1873 | lock = 1; | ||
1874 | /* | ||
1875 | * This also ensures that the scanning of full | ||
1876 | * slabs from diagnostic functions will not see | ||
1877 | * any frozen slabs. | ||
1878 | */ | ||
1879 | spin_lock(&n->list_lock); | ||
1880 | } | ||
1881 | } | ||
1882 | |||
1883 | if (l != m) { | ||
1884 | |||
1885 | if (l == M_PARTIAL) | ||
1886 | |||
1887 | remove_partial(n, page); | ||
1888 | |||
1889 | else if (l == M_FULL) | ||
1890 | |||
1891 | remove_full(s, page); | ||
1892 | |||
1893 | if (m == M_PARTIAL) { | ||
1894 | |||
1895 | add_partial(n, page, tail); | ||
1896 | stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); | ||
1897 | |||
1898 | } else if (m == M_FULL) { | ||
1899 | |||
1900 | stat(s, DEACTIVATE_FULL); | ||
1901 | add_full(s, n, page); | ||
1902 | |||
1903 | } | ||
1904 | } | ||
1905 | |||
1906 | l = m; | ||
1907 | if (!__cmpxchg_double_slab(s, page, | ||
1908 | old.freelist, old.counters, | ||
1909 | new.freelist, new.counters, | ||
1910 | "unfreezing slab")) | ||
1911 | goto redo; | ||
1912 | |||
1913 | if (lock) | ||
1914 | spin_unlock(&n->list_lock); | ||
1915 | |||
1916 | if (m == M_FREE) { | ||
1917 | stat(s, DEACTIVATE_EMPTY); | ||
1918 | discard_slab(s, page); | ||
1919 | stat(s, FREE_SLAB); | ||
1661 | } | 1920 | } |
1662 | c->page = NULL; | ||
1663 | c->tid = next_tid(c->tid); | ||
1664 | unfreeze_slab(s, page, tail); | ||
1665 | } | 1921 | } |
1666 | 1922 | ||
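The rewritten deactivate_slab() above first drains the per-cpu freelist back into the page and then picks one of three destinations before the final unfreezing cmpxchg: discard the page, keep it on the partial list, or (for debug caches) track it on the full list. That decision is a small pure function; the sketch below models it with field and threshold names chosen for illustration only.

    enum slab_dest { DEST_FREE, DEST_PARTIAL, DEST_FULL };

    /*
     * Pick the destination for a slab page once its per-cpu objects have
     * been merged back:
     *  - nothing in use and enough partial slabs already: free the page
     *  - free objects remain: keep it on the partial list
     *  - fully allocated: track it on the full list (debug caches only)
     */
    static enum slab_dest choose_dest(unsigned int inuse, int has_free_objects,
                                      unsigned long nr_partial,
                                      unsigned long min_partial)
    {
        if (!inuse && nr_partial > min_partial)
            return DEST_FREE;
        if (has_free_objects)
            return DEST_PARTIAL;
        return DEST_FULL;
    }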
1667 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1923 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1668 | { | 1924 | { |
1669 | stat(s, CPUSLAB_FLUSH); | 1925 | stat(s, CPUSLAB_FLUSH); |
1670 | slab_lock(c->page); | ||
1671 | deactivate_slab(s, c); | 1926 | deactivate_slab(s, c); |
1672 | } | 1927 | } |
1673 | 1928 | ||
@@ -1796,6 +2051,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1796 | void **object; | 2051 | void **object; |
1797 | struct page *page; | 2052 | struct page *page; |
1798 | unsigned long flags; | 2053 | unsigned long flags; |
2054 | struct page new; | ||
2055 | unsigned long counters; | ||
1799 | 2056 | ||
1800 | local_irq_save(flags); | 2057 | local_irq_save(flags); |
1801 | #ifdef CONFIG_PREEMPT | 2058 | #ifdef CONFIG_PREEMPT |
@@ -1814,72 +2071,102 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1814 | if (!page) | 2071 | if (!page) |
1815 | goto new_slab; | 2072 | goto new_slab; |
1816 | 2073 | ||
1817 | slab_lock(page); | 2074 | if (unlikely(!node_match(c, node))) { |
1818 | if (unlikely(!node_match(c, node))) | 2075 | stat(s, ALLOC_NODE_MISMATCH); |
1819 | goto another_slab; | 2076 | deactivate_slab(s, c); |
2077 | goto new_slab; | ||
2078 | } | ||
2079 | |||
2080 | /* must check again c->freelist in case of cpu migration or IRQ */ | ||
2081 | object = c->freelist; | ||
2082 | if (object) | ||
2083 | goto load_freelist; | ||
2084 | |||
2085 | stat(s, ALLOC_SLOWPATH); | ||
2086 | |||
2087 | do { | ||
2088 | object = page->freelist; | ||
2089 | counters = page->counters; | ||
2090 | new.counters = counters; | ||
2091 | VM_BUG_ON(!new.frozen); | ||
2092 | |||
2093 | /* | ||
2094 | * If there is no object left then we use this loop to | ||
2095 | * deactivate the slab which is simple since no objects | ||
2096 | * are left in the slab and therefore we do not need to | ||
2097 | * put the page back onto the partial list. | ||
2098 | * | ||
2099 | * If there are objects left then we retrieve them | ||
2100 | * and use them to refill the per cpu queue. | ||
2101 | */ | ||
2102 | |||
2103 | new.inuse = page->objects; | ||
2104 | new.frozen = object != NULL; | ||
2105 | |||
2106 | } while (!__cmpxchg_double_slab(s, page, | ||
2107 | object, counters, | ||
2108 | NULL, new.counters, | ||
2109 | "__slab_alloc")); | ||
2110 | |||
2111 | if (unlikely(!object)) { | ||
2112 | c->page = NULL; | ||
2113 | stat(s, DEACTIVATE_BYPASS); | ||
2114 | goto new_slab; | ||
2115 | } | ||
1820 | 2116 | ||
1821 | stat(s, ALLOC_REFILL); | 2117 | stat(s, ALLOC_REFILL); |
1822 | 2118 | ||
1823 | load_freelist: | 2119 | load_freelist: |
1824 | object = page->freelist; | 2120 | VM_BUG_ON(!page->frozen); |
1825 | if (unlikely(!object)) | ||
1826 | goto another_slab; | ||
1827 | if (kmem_cache_debug(s)) | ||
1828 | goto debug; | ||
1829 | |||
1830 | c->freelist = get_freepointer(s, object); | 2121 | c->freelist = get_freepointer(s, object); |
1831 | page->inuse = page->objects; | ||
1832 | page->freelist = NULL; | ||
1833 | |||
1834 | slab_unlock(page); | ||
1835 | c->tid = next_tid(c->tid); | 2122 | c->tid = next_tid(c->tid); |
1836 | local_irq_restore(flags); | 2123 | local_irq_restore(flags); |
1837 | stat(s, ALLOC_SLOWPATH); | ||
1838 | return object; | 2124 | return object; |
1839 | 2125 | ||
1840 | another_slab: | ||
1841 | deactivate_slab(s, c); | ||
1842 | |||
1843 | new_slab: | 2126 | new_slab: |
1844 | page = get_partial(s, gfpflags, node); | 2127 | page = get_partial(s, gfpflags, node); |
1845 | if (page) { | 2128 | if (page) { |
1846 | stat(s, ALLOC_FROM_PARTIAL); | 2129 | stat(s, ALLOC_FROM_PARTIAL); |
1847 | c->node = page_to_nid(page); | 2130 | object = c->freelist; |
1848 | c->page = page; | 2131 | |
2132 | if (kmem_cache_debug(s)) | ||
2133 | goto debug; | ||
1849 | goto load_freelist; | 2134 | goto load_freelist; |
1850 | } | 2135 | } |
1851 | 2136 | ||
1852 | gfpflags &= gfp_allowed_mask; | ||
1853 | if (gfpflags & __GFP_WAIT) | ||
1854 | local_irq_enable(); | ||
1855 | |||
1856 | page = new_slab(s, gfpflags, node); | 2137 | page = new_slab(s, gfpflags, node); |
1857 | 2138 | ||
1858 | if (gfpflags & __GFP_WAIT) | ||
1859 | local_irq_disable(); | ||
1860 | |||
1861 | if (page) { | 2139 | if (page) { |
1862 | c = __this_cpu_ptr(s->cpu_slab); | 2140 | c = __this_cpu_ptr(s->cpu_slab); |
1863 | stat(s, ALLOC_SLAB); | ||
1864 | if (c->page) | 2141 | if (c->page) |
1865 | flush_slab(s, c); | 2142 | flush_slab(s, c); |
1866 | 2143 | ||
1867 | slab_lock(page); | 2144 | /* |
1868 | __SetPageSlubFrozen(page); | 2145 | * No other reference to the page yet so we can |
2146 | * muck around with it freely without cmpxchg | ||
2147 | */ | ||
2148 | object = page->freelist; | ||
2149 | page->freelist = NULL; | ||
2150 | page->inuse = page->objects; | ||
2151 | |||
2152 | stat(s, ALLOC_SLAB); | ||
1869 | c->node = page_to_nid(page); | 2153 | c->node = page_to_nid(page); |
1870 | c->page = page; | 2154 | c->page = page; |
2155 | |||
2156 | if (kmem_cache_debug(s)) | ||
2157 | goto debug; | ||
1871 | goto load_freelist; | 2158 | goto load_freelist; |
1872 | } | 2159 | } |
1873 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | 2160 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) |
1874 | slab_out_of_memory(s, gfpflags, node); | 2161 | slab_out_of_memory(s, gfpflags, node); |
1875 | local_irq_restore(flags); | 2162 | local_irq_restore(flags); |
1876 | return NULL; | 2163 | return NULL; |
2164 | |||
1877 | debug: | 2165 | debug: |
1878 | if (!alloc_debug_processing(s, page, object, addr)) | 2166 | if (!object || !alloc_debug_processing(s, page, object, addr)) |
1879 | goto another_slab; | 2167 | goto new_slab; |
1880 | 2168 | ||
1881 | page->inuse++; | 2169 | c->freelist = get_freepointer(s, object); |
1882 | page->freelist = get_freepointer(s, object); | ||
1883 | deactivate_slab(s, c); | 2170 | deactivate_slab(s, c); |
1884 | c->page = NULL; | 2171 | c->page = NULL; |
1885 | c->node = NUMA_NO_NODE; | 2172 | c->node = NUMA_NO_NODE; |
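Both acquire_slab() and the __slab_alloc() slow path above detach a page's entire freelist with a single cmpxchg that writes NULL back and accounts every object as in use, and the chain obtained that way refills the per-cpu queue. The "take the whole list in one atomic step" idea can be shown with a user-space Treiber-style stack; the types and names below are illustrative only.

    #include <stdatomic.h>
    #include <stddef.h>

    struct node {
        struct node *next;
    };

    struct stack {
        _Atomic(struct node *) head;
    };

    /* Push one node, lock-free. */
    static void push(struct stack *s, struct node *n)
    {
        n->next = atomic_load(&s->head);
        while (!atomic_compare_exchange_weak(&s->head, &n->next, n))
            ;   /* failed CAS reloaded the head into n->next; retry */
    }

    /* Detach the entire chain in one atomic step and hand it to the caller. */
    static struct node *pop_all(struct stack *s)
    {
        return atomic_exchange(&s->head, NULL);
    }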
@@ -2031,52 +2318,89 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2031 | { | 2318 | { |
2032 | void *prior; | 2319 | void *prior; |
2033 | void **object = (void *)x; | 2320 | void **object = (void *)x; |
2034 | unsigned long flags; | 2321 | int was_frozen; |
2322 | int inuse; | ||
2323 | struct page new; | ||
2324 | unsigned long counters; | ||
2325 | struct kmem_cache_node *n = NULL; | ||
2326 | unsigned long uninitialized_var(flags); | ||
2035 | 2327 | ||
2036 | local_irq_save(flags); | ||
2037 | slab_lock(page); | ||
2038 | stat(s, FREE_SLOWPATH); | 2328 | stat(s, FREE_SLOWPATH); |
2039 | 2329 | ||
2040 | if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) | 2330 | if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) |
2041 | goto out_unlock; | 2331 | return; |
2042 | 2332 | ||
2043 | prior = page->freelist; | 2333 | do { |
2044 | set_freepointer(s, object, prior); | 2334 | prior = page->freelist; |
2045 | page->freelist = object; | 2335 | counters = page->counters; |
2046 | page->inuse--; | 2336 | set_freepointer(s, object, prior); |
2337 | new.counters = counters; | ||
2338 | was_frozen = new.frozen; | ||
2339 | new.inuse--; | ||
2340 | if ((!new.inuse || !prior) && !was_frozen && !n) { | ||
2341 | n = get_node(s, page_to_nid(page)); | ||
2342 | /* | ||
2343 | * Speculatively acquire the list_lock. | ||
2344 | * If the cmpxchg does not succeed then we may | ||
2345 | * drop the list_lock without any processing. | ||
2346 | * | ||
2347 | * Otherwise the list_lock will synchronize with | ||
2348 | * other processors updating the list of slabs. | ||
2349 | */ | ||
2350 | spin_lock_irqsave(&n->list_lock, flags); | ||
2351 | } | ||
2352 | inuse = new.inuse; | ||
2047 | 2353 | ||
2048 | if (unlikely(PageSlubFrozen(page))) { | 2354 | } while (!cmpxchg_double_slab(s, page, |
2049 | stat(s, FREE_FROZEN); | 2355 | prior, counters, |
2050 | goto out_unlock; | 2356 | object, new.counters, |
2051 | } | 2357 | "__slab_free")); |
2052 | 2358 | ||
2053 | if (unlikely(!page->inuse)) | 2359 | if (likely(!n)) { |
2054 | goto slab_empty; | 2360 | /* |
2361 | * The list lock was not taken therefore no list | ||
2362 | * activity can be necessary. | ||
2363 | */ | ||
2364 | if (was_frozen) | ||
2365 | stat(s, FREE_FROZEN); | ||
2366 | return; | ||
2367 | } | ||
2055 | 2368 | ||
2056 | /* | 2369 | /* |
2057 | * Objects left in the slab. If it was not on the partial list before | 2370 | * was_frozen may have been set after we acquired the list_lock in |
2058 | * then add it. | 2371 | * an earlier loop. So we need to check it here again. |
2059 | */ | 2372 | */ |
2060 | if (unlikely(!prior)) { | 2373 | if (was_frozen) |
2061 | add_partial(get_node(s, page_to_nid(page)), page, 1); | 2374 | stat(s, FREE_FROZEN); |
2062 | stat(s, FREE_ADD_PARTIAL); | 2375 | else { |
2063 | } | 2376 | if (unlikely(!inuse && n->nr_partial > s->min_partial)) |
2377 | goto slab_empty; | ||
2064 | 2378 | ||
2065 | out_unlock: | 2379 | /* |
2066 | slab_unlock(page); | 2380 | * Objects left in the slab. If it was not on the partial list before |
2067 | local_irq_restore(flags); | 2381 | * then add it. |
2382 | */ | ||
2383 | if (unlikely(!prior)) { | ||
2384 | remove_full(s, page); | ||
2385 | add_partial(n, page, 1); | ||
2386 | stat(s, FREE_ADD_PARTIAL); | ||
2387 | } | ||
2388 | } | ||
2389 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
2068 | return; | 2390 | return; |
2069 | 2391 | ||
2070 | slab_empty: | 2392 | slab_empty: |
2071 | if (prior) { | 2393 | if (prior) { |
2072 | /* | 2394 | /* |
2073 | * Slab still on the partial list. | 2395 | * Slab on the partial list. |
2074 | */ | 2396 | */ |
2075 | remove_partial(s, page); | 2397 | remove_partial(n, page); |
2076 | stat(s, FREE_REMOVE_PARTIAL); | 2398 | stat(s, FREE_REMOVE_PARTIAL); |
2077 | } | 2399 | } else |
2078 | slab_unlock(page); | 2400 | /* Slab must be on the full list */ |
2079 | local_irq_restore(flags); | 2401 | remove_full(s, page); |
2402 | |||
2403 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
2080 | stat(s, FREE_SLAB); | 2404 | stat(s, FREE_SLAB); |
2081 | discard_slab(s, page); | 2405 | discard_slab(s, page); |
2082 | } | 2406 | } |
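__slab_free() above now takes n->list_lock speculatively, only when the retry loop predicts that list manipulation may be needed (the slab may become empty or may have to change lists), and as the comment notes the lock may then be released without doing any list work at all. A compressed sketch of that shape follows, with invented names and a plain mutex standing in for list_lock; it is a model of the pattern, not of the SLUB code.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    /*
     * Drop one reference. Returns true with list_lock still held when the
     * caller must also unlink the now-empty object from its list.
     */
    static bool release_object(_Atomic int *inuse)
    {
        bool locked = false;
        int old = atomic_load(inuse);
        int new_val;

        do {
            new_val = old - 1;
            /* The count may hit zero: take the lock before committing. */
            if (new_val == 0 && !locked) {
                pthread_mutex_lock(&list_lock);
                locked = true;
            }
        } while (!atomic_compare_exchange_weak(inuse, &old, new_val));

        if (locked && new_val != 0) {
            /* The prediction turned out wrong after a retry; nothing to do. */
            pthread_mutex_unlock(&list_lock);
            locked = false;
        }
        return locked;
    }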
@@ -2350,7 +2674,6 @@ static void early_kmem_cache_node_alloc(int node) | |||
2350 | { | 2674 | { |
2351 | struct page *page; | 2675 | struct page *page; |
2352 | struct kmem_cache_node *n; | 2676 | struct kmem_cache_node *n; |
2353 | unsigned long flags; | ||
2354 | 2677 | ||
2355 | BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); | 2678 | BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); |
2356 | 2679 | ||
@@ -2368,6 +2691,7 @@ static void early_kmem_cache_node_alloc(int node) | |||
2368 | BUG_ON(!n); | 2691 | BUG_ON(!n); |
2369 | page->freelist = get_freepointer(kmem_cache_node, n); | 2692 | page->freelist = get_freepointer(kmem_cache_node, n); |
2370 | page->inuse++; | 2693 | page->inuse++; |
2694 | page->frozen = 0; | ||
2371 | kmem_cache_node->node[node] = n; | 2695 | kmem_cache_node->node[node] = n; |
2372 | #ifdef CONFIG_SLUB_DEBUG | 2696 | #ifdef CONFIG_SLUB_DEBUG |
2373 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); | 2697 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); |
@@ -2376,14 +2700,7 @@ static void early_kmem_cache_node_alloc(int node) | |||
2376 | init_kmem_cache_node(n, kmem_cache_node); | 2700 | init_kmem_cache_node(n, kmem_cache_node); |
2377 | inc_slabs_node(kmem_cache_node, node, page->objects); | 2701 | inc_slabs_node(kmem_cache_node, node, page->objects); |
2378 | 2702 | ||
2379 | /* | ||
2380 | * lockdep requires consistent irq usage for each lock | ||
2381 | * so even though there cannot be a race this early in | ||
2382 | * the boot sequence, we still disable irqs. | ||
2383 | */ | ||
2384 | local_irq_save(flags); | ||
2385 | add_partial(n, page, 0); | 2703 | add_partial(n, page, 0); |
2386 | local_irq_restore(flags); | ||
2387 | } | 2704 | } |
2388 | 2705 | ||
2389 | static void free_kmem_cache_nodes(struct kmem_cache *s) | 2706 | static void free_kmem_cache_nodes(struct kmem_cache *s) |
@@ -2589,6 +2906,12 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
2589 | } | 2906 | } |
2590 | } | 2907 | } |
2591 | 2908 | ||
2909 | #ifdef CONFIG_CMPXCHG_DOUBLE | ||
2910 | if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) | ||
2911 | /* Enable fast mode */ | ||
2912 | s->flags |= __CMPXCHG_DOUBLE; | ||
2913 | #endif | ||
2914 | |||
2592 | /* | 2915 | /* |
2593 | * The larger the object size is, the more pages we want on the partial | 2916 | * The larger the object size is, the more pages we want on the partial |
2594 | * list to avoid pounding the page allocator excessively. | 2917 | * list to avoid pounding the page allocator excessively. |
@@ -2661,7 +2984,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) | |||
2661 | spin_lock_irqsave(&n->list_lock, flags); | 2984 | spin_lock_irqsave(&n->list_lock, flags); |
2662 | list_for_each_entry_safe(page, h, &n->partial, lru) { | 2985 | list_for_each_entry_safe(page, h, &n->partial, lru) { |
2663 | if (!page->inuse) { | 2986 | if (!page->inuse) { |
2664 | __remove_partial(n, page); | 2987 | remove_partial(n, page); |
2665 | discard_slab(s, page); | 2988 | discard_slab(s, page); |
2666 | } else { | 2989 | } else { |
2667 | list_slab_objects(s, page, | 2990 | list_slab_objects(s, page, |
@@ -2928,6 +3251,42 @@ size_t ksize(const void *object) | |||
2928 | } | 3251 | } |
2929 | EXPORT_SYMBOL(ksize); | 3252 | EXPORT_SYMBOL(ksize); |
2930 | 3253 | ||
3254 | #ifdef CONFIG_SLUB_DEBUG | ||
3255 | bool verify_mem_not_deleted(const void *x) | ||
3256 | { | ||
3257 | struct page *page; | ||
3258 | void *object = (void *)x; | ||
3259 | unsigned long flags; | ||
3260 | bool rv; | ||
3261 | |||
3262 | if (unlikely(ZERO_OR_NULL_PTR(x))) | ||
3263 | return false; | ||
3264 | |||
3265 | local_irq_save(flags); | ||
3266 | |||
3267 | page = virt_to_head_page(x); | ||
3268 | if (unlikely(!PageSlab(page))) { | ||
3269 | /* maybe it was from stack? */ | ||
3270 | rv = true; | ||
3271 | goto out_unlock; | ||
3272 | } | ||
3273 | |||
3274 | slab_lock(page); | ||
3275 | if (on_freelist(page->slab, page, object)) { | ||
3276 | object_err(page->slab, page, object, "Object is on free-list"); | ||
3277 | rv = false; | ||
3278 | } else { | ||
3279 | rv = true; | ||
3280 | } | ||
3281 | slab_unlock(page); | ||
3282 | |||
3283 | out_unlock: | ||
3284 | local_irq_restore(flags); | ||
3285 | return rv; | ||
3286 | } | ||
3287 | EXPORT_SYMBOL(verify_mem_not_deleted); | ||
3288 | #endif | ||
3289 | |||
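verify_mem_not_deleted() above answers "has this object already been freed?" by walking the slab's freelist under slab_lock() and reporting an error if the pointer is found there. In a toy allocator with a singly linked free list the check is a plain list walk; the sketch below uses made-up structures and is illustrative only.

    #include <stdbool.h>
    #include <stddef.h>

    struct free_obj {
        struct free_obj *next;
    };

    /* Returns false when 'p' already sits on the free list. */
    static bool mem_not_deleted(const struct free_obj *freelist, const void *p)
    {
        for (const struct free_obj *f = freelist; f; f = f->next)
            if (f == p)
                return false;   /* likely double free or stale pointer */
        return true;
    }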
2931 | void kfree(const void *x) | 3290 | void kfree(const void *x) |
2932 | { | 3291 | { |
2933 | struct page *page; | 3292 | struct page *page; |
@@ -2993,14 +3352,8 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
2993 | * list_lock. page->inuse here is the upper limit. | 3352 | * list_lock. page->inuse here is the upper limit. |
2994 | */ | 3353 | */ |
2995 | list_for_each_entry_safe(page, t, &n->partial, lru) { | 3354 | list_for_each_entry_safe(page, t, &n->partial, lru) { |
2996 | if (!page->inuse && slab_trylock(page)) { | 3355 | if (!page->inuse) { |
2997 | /* | 3356 | remove_partial(n, page); |
2998 | * Must hold slab lock here because slab_free | ||
2999 | * may have freed the last object and be | ||
3000 | * waiting to release the slab. | ||
3001 | */ | ||
3002 | __remove_partial(n, page); | ||
3003 | slab_unlock(page); | ||
3004 | discard_slab(s, page); | 3357 | discard_slab(s, page); |
3005 | } else { | 3358 | } else { |
3006 | list_move(&page->lru, | 3359 | list_move(&page->lru, |
@@ -3588,12 +3941,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page, | |||
3588 | static void validate_slab_slab(struct kmem_cache *s, struct page *page, | 3941 | static void validate_slab_slab(struct kmem_cache *s, struct page *page, |
3589 | unsigned long *map) | 3942 | unsigned long *map) |
3590 | { | 3943 | { |
3591 | if (slab_trylock(page)) { | 3944 | slab_lock(page); |
3592 | validate_slab(s, page, map); | 3945 | validate_slab(s, page, map); |
3593 | slab_unlock(page); | 3946 | slab_unlock(page); |
3594 | } else | ||
3595 | printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", | ||
3596 | s->name, page); | ||
3597 | } | 3947 | } |
3598 | 3948 | ||
3599 | static int validate_slab_node(struct kmem_cache *s, | 3949 | static int validate_slab_node(struct kmem_cache *s, |
@@ -4058,7 +4408,7 @@ static int any_slab_objects(struct kmem_cache *s) | |||
4058 | #endif | 4408 | #endif |
4059 | 4409 | ||
4060 | #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) | 4410 | #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) |
4061 | #define to_slab(n) container_of(n, struct kmem_cache, kobj); | 4411 | #define to_slab(n) container_of(n, struct kmem_cache, kobj) |
4062 | 4412 | ||
4063 | struct slab_attribute { | 4413 | struct slab_attribute { |
4064 | struct attribute attr; | 4414 | struct attribute attr; |
@@ -4241,8 +4591,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s, | |||
4241 | const char *buf, size_t length) | 4591 | const char *buf, size_t length) |
4242 | { | 4592 | { |
4243 | s->flags &= ~SLAB_DEBUG_FREE; | 4593 | s->flags &= ~SLAB_DEBUG_FREE; |
4244 | if (buf[0] == '1') | 4594 | if (buf[0] == '1') { |
4595 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4245 | s->flags |= SLAB_DEBUG_FREE; | 4596 | s->flags |= SLAB_DEBUG_FREE; |
4597 | } | ||
4246 | return length; | 4598 | return length; |
4247 | } | 4599 | } |
4248 | SLAB_ATTR(sanity_checks); | 4600 | SLAB_ATTR(sanity_checks); |
@@ -4256,8 +4608,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, | |||
4256 | size_t length) | 4608 | size_t length) |
4257 | { | 4609 | { |
4258 | s->flags &= ~SLAB_TRACE; | 4610 | s->flags &= ~SLAB_TRACE; |
4259 | if (buf[0] == '1') | 4611 | if (buf[0] == '1') { |
4612 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4260 | s->flags |= SLAB_TRACE; | 4613 | s->flags |= SLAB_TRACE; |
4614 | } | ||
4261 | return length; | 4615 | return length; |
4262 | } | 4616 | } |
4263 | SLAB_ATTR(trace); | 4617 | SLAB_ATTR(trace); |
@@ -4274,8 +4628,10 @@ static ssize_t red_zone_store(struct kmem_cache *s, | |||
4274 | return -EBUSY; | 4628 | return -EBUSY; |
4275 | 4629 | ||
4276 | s->flags &= ~SLAB_RED_ZONE; | 4630 | s->flags &= ~SLAB_RED_ZONE; |
4277 | if (buf[0] == '1') | 4631 | if (buf[0] == '1') { |
4632 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4278 | s->flags |= SLAB_RED_ZONE; | 4633 | s->flags |= SLAB_RED_ZONE; |
4634 | } | ||
4279 | calculate_sizes(s, -1); | 4635 | calculate_sizes(s, -1); |
4280 | return length; | 4636 | return length; |
4281 | } | 4637 | } |
@@ -4293,8 +4649,10 @@ static ssize_t poison_store(struct kmem_cache *s, | |||
4293 | return -EBUSY; | 4649 | return -EBUSY; |
4294 | 4650 | ||
4295 | s->flags &= ~SLAB_POISON; | 4651 | s->flags &= ~SLAB_POISON; |
4296 | if (buf[0] == '1') | 4652 | if (buf[0] == '1') { |
4653 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4297 | s->flags |= SLAB_POISON; | 4654 | s->flags |= SLAB_POISON; |
4655 | } | ||
4298 | calculate_sizes(s, -1); | 4656 | calculate_sizes(s, -1); |
4299 | return length; | 4657 | return length; |
4300 | } | 4658 | } |
@@ -4312,8 +4670,10 @@ static ssize_t store_user_store(struct kmem_cache *s, | |||
4312 | return -EBUSY; | 4670 | return -EBUSY; |
4313 | 4671 | ||
4314 | s->flags &= ~SLAB_STORE_USER; | 4672 | s->flags &= ~SLAB_STORE_USER; |
4315 | if (buf[0] == '1') | 4673 | if (buf[0] == '1') { |
4674 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4316 | s->flags |= SLAB_STORE_USER; | 4675 | s->flags |= SLAB_STORE_USER; |
4676 | } | ||
4317 | calculate_sizes(s, -1); | 4677 | calculate_sizes(s, -1); |
4318 | return length; | 4678 | return length; |
4319 | } | 4679 | } |
@@ -4478,6 +4838,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); | |||
4478 | STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); | 4838 | STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); |
4479 | STAT_ATTR(ALLOC_SLAB, alloc_slab); | 4839 | STAT_ATTR(ALLOC_SLAB, alloc_slab); |
4480 | STAT_ATTR(ALLOC_REFILL, alloc_refill); | 4840 | STAT_ATTR(ALLOC_REFILL, alloc_refill); |
4841 | STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); | ||
4481 | STAT_ATTR(FREE_SLAB, free_slab); | 4842 | STAT_ATTR(FREE_SLAB, free_slab); |
4482 | STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); | 4843 | STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); |
4483 | STAT_ATTR(DEACTIVATE_FULL, deactivate_full); | 4844 | STAT_ATTR(DEACTIVATE_FULL, deactivate_full); |
@@ -4485,7 +4846,10 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); | |||
4485 | STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); | 4846 | STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); |
4486 | STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); | 4847 | STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); |
4487 | STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); | 4848 | STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); |
4849 | STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); | ||
4488 | STAT_ATTR(ORDER_FALLBACK, order_fallback); | 4850 | STAT_ATTR(ORDER_FALLBACK, order_fallback); |
4851 | STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); | ||
4852 | STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); | ||
4489 | #endif | 4853 | #endif |
4490 | 4854 | ||
4491 | static struct attribute *slab_attrs[] = { | 4855 | static struct attribute *slab_attrs[] = { |
@@ -4535,6 +4899,7 @@ static struct attribute *slab_attrs[] = { | |||
4535 | &alloc_from_partial_attr.attr, | 4899 | &alloc_from_partial_attr.attr, |
4536 | &alloc_slab_attr.attr, | 4900 | &alloc_slab_attr.attr, |
4537 | &alloc_refill_attr.attr, | 4901 | &alloc_refill_attr.attr, |
4902 | &alloc_node_mismatch_attr.attr, | ||
4538 | &free_slab_attr.attr, | 4903 | &free_slab_attr.attr, |
4539 | &cpuslab_flush_attr.attr, | 4904 | &cpuslab_flush_attr.attr, |
4540 | &deactivate_full_attr.attr, | 4905 | &deactivate_full_attr.attr, |
@@ -4542,7 +4907,10 @@ static struct attribute *slab_attrs[] = { | |||
4542 | &deactivate_to_head_attr.attr, | 4907 | &deactivate_to_head_attr.attr, |
4543 | &deactivate_to_tail_attr.attr, | 4908 | &deactivate_to_tail_attr.attr, |
4544 | &deactivate_remote_frees_attr.attr, | 4909 | &deactivate_remote_frees_attr.attr, |
4910 | &deactivate_bypass_attr.attr, | ||
4545 | &order_fallback_attr.attr, | 4911 | &order_fallback_attr.attr, |
4912 | &cmpxchg_double_fail_attr.attr, | ||
4913 | &cmpxchg_double_cpu_fail_attr.attr, | ||
4546 | #endif | 4914 | #endif |
4547 | #ifdef CONFIG_FAILSLAB | 4915 | #ifdef CONFIG_FAILSLAB |
4548 | &failslab_attr.attr, | 4916 | &failslab_attr.attr, |
diff --git a/mm/sparse.c b/mm/sparse.c index aa64b12831a..858e1dff9b2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -40,7 +40,7 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; | |||
40 | static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; | 40 | static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; |
41 | #endif | 41 | #endif |
42 | 42 | ||
43 | int page_to_nid(struct page *page) | 43 | int page_to_nid(const struct page *page) |
44 | { | 44 | { |
45 | return section_to_node_table[page_to_section(page)]; | 45 | return section_to_node_table[page_to_section(page)]; |
46 | } | 46 | } |
@@ -78,39 +78,22 @@ static void put_compound_page(struct page *page) | |||
78 | { | 78 | { |
79 | if (unlikely(PageTail(page))) { | 79 | if (unlikely(PageTail(page))) { |
80 | /* __split_huge_page_refcount can run under us */ | 80 | /* __split_huge_page_refcount can run under us */ |
81 | struct page *page_head = page->first_page; | 81 | struct page *page_head = compound_trans_head(page); |
82 | smp_rmb(); | 82 | |
83 | /* | 83 | if (likely(page != page_head && |
84 | * If PageTail is still set after smp_rmb() we can be sure | 84 | get_page_unless_zero(page_head))) { |
85 | * that the page->first_page we read wasn't a dangling pointer. | ||
86 | * See __split_huge_page_refcount() smp_wmb(). | ||
87 | */ | ||
88 | if (likely(PageTail(page) && get_page_unless_zero(page_head))) { | ||
89 | unsigned long flags; | 85 | unsigned long flags; |
90 | /* | 86 | /* |
91 | * Verify that our page_head wasn't converted | 87 | * page_head wasn't a dangling pointer but it |
92 | * to a a regular page before we got a | 88 | * may not be a head page anymore by the time |
93 | * reference on it. | 89 | * we obtain the lock. That is ok as long as it |
90 | * can't be freed from under us. | ||
94 | */ | 91 | */ |
95 | if (unlikely(!PageHead(page_head))) { | ||
96 | /* PageHead is cleared after PageTail */ | ||
97 | smp_rmb(); | ||
98 | VM_BUG_ON(PageTail(page)); | ||
99 | goto out_put_head; | ||
100 | } | ||
101 | /* | ||
102 | * Only run compound_lock on a valid PageHead, | ||
103 | * after having it pinned with | ||
104 | * get_page_unless_zero() above. | ||
105 | */ | ||
106 | smp_mb(); | ||
107 | /* page_head wasn't a dangling pointer */ | ||
108 | flags = compound_lock_irqsave(page_head); | 92 | flags = compound_lock_irqsave(page_head); |
109 | if (unlikely(!PageTail(page))) { | 93 | if (unlikely(!PageTail(page))) { |
110 | /* __split_huge_page_refcount run before us */ | 94 | /* __split_huge_page_refcount run before us */ |
111 | compound_unlock_irqrestore(page_head, flags); | 95 | compound_unlock_irqrestore(page_head, flags); |
112 | VM_BUG_ON(PageHead(page_head)); | 96 | VM_BUG_ON(PageHead(page_head)); |
113 | out_put_head: | ||
114 | if (put_page_testzero(page_head)) | 97 | if (put_page_testzero(page_head)) |
115 | __put_single_page(page_head); | 98 | __put_single_page(page_head); |
116 | out_put_single: | 99 | out_put_single: |
@@ -121,16 +104,17 @@ static void put_compound_page(struct page *page) | |||
121 | VM_BUG_ON(page_head != page->first_page); | 104 | VM_BUG_ON(page_head != page->first_page); |
122 | /* | 105 | /* |
123 | * We can release the refcount taken by | 106 | * We can release the refcount taken by |
124 | * get_page_unless_zero now that | 107 | * get_page_unless_zero() now that |
125 | * split_huge_page_refcount is blocked on the | 108 | * __split_huge_page_refcount() is blocked on |
126 | * compound_lock. | 109 | * the compound_lock. |
127 | */ | 110 | */ |
128 | if (put_page_testzero(page_head)) | 111 | if (put_page_testzero(page_head)) |
129 | VM_BUG_ON(1); | 112 | VM_BUG_ON(1); |
130 | /* __split_huge_page_refcount will wait now */ | 113 | /* __split_huge_page_refcount will wait now */ |
131 | VM_BUG_ON(atomic_read(&page->_count) <= 0); | 114 | VM_BUG_ON(page_mapcount(page) <= 0); |
132 | atomic_dec(&page->_count); | 115 | atomic_dec(&page->_mapcount); |
133 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | 116 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); |
117 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
134 | compound_unlock_irqrestore(page_head, flags); | 118 | compound_unlock_irqrestore(page_head, flags); |
135 | if (put_page_testzero(page_head)) { | 119 | if (put_page_testzero(page_head)) { |
136 | if (PageHead(page_head)) | 120 | if (PageHead(page_head)) |
@@ -160,6 +144,45 @@ void put_page(struct page *page) | |||
160 | } | 144 | } |
161 | EXPORT_SYMBOL(put_page); | 145 | EXPORT_SYMBOL(put_page); |
162 | 146 | ||
147 | /* | ||
148 | * This function is exported but must not be called by anything other | ||
149 | * than get_page(). It implements the slow path of get_page(). | ||
150 | */ | ||
151 | bool __get_page_tail(struct page *page) | ||
152 | { | ||
153 | /* | ||
154 | * This takes care of get_page() if run on a tail page | ||
155 | * returned by one of the get_user_pages/follow_page variants. | ||
156 | * get_user_pages/follow_page itself doesn't need the compound | ||
157 | * lock because it runs __get_page_tail_foll() under the | ||
158 | * proper PT lock that already serializes against | ||
159 | * split_huge_page(). | ||
160 | */ | ||
161 | unsigned long flags; | ||
162 | bool got = false; | ||
163 | struct page *page_head = compound_trans_head(page); | ||
164 | |||
165 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | ||
166 | /* | ||
167 | * page_head wasn't a dangling pointer but it | ||
168 | * may not be a head page anymore by the time | ||
169 | * we obtain the lock. That is ok as long as it | ||
170 | * can't be freed from under us. | ||
171 | */ | ||
172 | flags = compound_lock_irqsave(page_head); | ||
173 | /* here __split_huge_page_refcount won't run anymore */ | ||
174 | if (likely(PageTail(page))) { | ||
175 | __get_page_tail_foll(page, false); | ||
176 | got = true; | ||
177 | } | ||
178 | compound_unlock_irqrestore(page_head, flags); | ||
179 | if (unlikely(!got)) | ||
180 | put_page(page_head); | ||
181 | } | ||
182 | return got; | ||
183 | } | ||
184 | EXPORT_SYMBOL(__get_page_tail); | ||
185 | |||
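The slow path added here pins the head page speculatively with get_page_unless_zero() and only trusts PageTail() once the compound lock is held. As a rough illustration of that "speculative reference, then revalidate under the lock" pattern, here is a minimal standalone C sketch using C11 atomics and a pthread mutex; obj_get_unless_zero() and still_valid are invented names for the example, not kernel APIs.

#include <stdatomic.h>
#include <stdbool.h>
#include <pthread.h>

struct obj {
	atomic_int refcount;          /* 0 means the object is being torn down */
	pthread_mutex_t lock;
	bool still_valid;             /* analogue of PageTail() rechecked under the lock */
};

/* Take a reference only if the object is not already on its way out. */
static bool obj_get_unless_zero(struct obj *o)
{
	int old = atomic_load(&o->refcount);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&o->refcount, &old, old + 1))
			return true;    /* we now hold a reference */
	}
	return false;                   /* refcount already hit zero: too late */
}

/* Speculative get: pin first, then recheck the condition under the lock. */
static bool obj_get_speculative(struct obj *o)
{
	bool got = false;

	if (!obj_get_unless_zero(o))
		return false;

	pthread_mutex_lock(&o->lock);
	if (o->still_valid)             /* may have changed before we got the lock */
		got = true;
	pthread_mutex_unlock(&o->lock);

	if (!got)
		atomic_fetch_sub(&o->refcount, 1);  /* drop the speculative reference */
	return got;
}

int main(void)
{
	struct obj o = { .refcount = 1, .lock = PTHREAD_MUTEX_INITIALIZER,
			 .still_valid = true };

	return obj_get_speculative(&o) ? 0 : 1;
}

The pin-before-lock step is what lets the lock be taken on a structure that cannot be freed underneath the caller, which is exactly the role get_page_unless_zero() plays above.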
163 | /** | 186 | /** |
164 | * put_pages_list() - release a list of pages | 187 | * put_pages_list() - release a list of pages |
165 | * @pages: list of pages threaded on page->lru | 188 | * @pages: list of pages threaded on page->lru |
diff --git a/mm/swapfile.c b/mm/swapfile.c index ff8dc1a18cb..17bc224bce6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -1681,19 +1681,14 @@ out: | |||
1681 | } | 1681 | } |
1682 | 1682 | ||
1683 | #ifdef CONFIG_PROC_FS | 1683 | #ifdef CONFIG_PROC_FS |
1684 | struct proc_swaps { | ||
1685 | struct seq_file seq; | ||
1686 | int event; | ||
1687 | }; | ||
1688 | |||
1689 | static unsigned swaps_poll(struct file *file, poll_table *wait) | 1684 | static unsigned swaps_poll(struct file *file, poll_table *wait) |
1690 | { | 1685 | { |
1691 | struct proc_swaps *s = file->private_data; | 1686 | struct seq_file *seq = file->private_data; |
1692 | 1687 | ||
1693 | poll_wait(file, &proc_poll_wait, wait); | 1688 | poll_wait(file, &proc_poll_wait, wait); |
1694 | 1689 | ||
1695 | if (s->event != atomic_read(&proc_poll_event)) { | 1690 | if (seq->poll_event != atomic_read(&proc_poll_event)) { |
1696 | s->event = atomic_read(&proc_poll_event); | 1691 | seq->poll_event = atomic_read(&proc_poll_event); |
1697 | return POLLIN | POLLRDNORM | POLLERR | POLLPRI; | 1692 | return POLLIN | POLLRDNORM | POLLERR | POLLPRI; |
1698 | } | 1693 | } |
1699 | 1694 | ||
@@ -1783,24 +1778,16 @@ static const struct seq_operations swaps_op = { | |||
1783 | 1778 | ||
1784 | static int swaps_open(struct inode *inode, struct file *file) | 1779 | static int swaps_open(struct inode *inode, struct file *file) |
1785 | { | 1780 | { |
1786 | struct proc_swaps *s; | 1781 | struct seq_file *seq; |
1787 | int ret; | 1782 | int ret; |
1788 | 1783 | ||
1789 | s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL); | ||
1790 | if (!s) | ||
1791 | return -ENOMEM; | ||
1792 | |||
1793 | file->private_data = s; | ||
1794 | |||
1795 | ret = seq_open(file, &swaps_op); | 1784 | ret = seq_open(file, &swaps_op); |
1796 | if (ret) { | 1785 | if (ret) |
1797 | kfree(s); | ||
1798 | return ret; | 1786 | return ret; |
1799 | } | ||
1800 | 1787 | ||
1801 | s->seq.private = s; | 1788 | seq = file->private_data; |
1802 | s->event = atomic_read(&proc_poll_event); | 1789 | seq->poll_event = atomic_read(&proc_poll_event); |
1803 | return ret; | 1790 | return 0; |
1804 | } | 1791 | } |
1805 | 1792 | ||
1806 | static const struct file_operations proc_swaps_operations = { | 1793 | static const struct file_operations proc_swaps_operations = { |
@@ -1937,20 +1924,24 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1937 | 1924 | ||
1938 | /* | 1925 | /* |
1939 | * Find out how many pages are allowed for a single swap | 1926 | * Find out how many pages are allowed for a single swap |
1940 | * device. There are two limiting factors: 1) the number of | 1927 | * device. There are three limiting factors: 1) the number |
1941 | * bits for the swap offset in the swp_entry_t type and | 1928 | * of bits for the swap offset in the swp_entry_t type, and |
1942 | * 2) the number of bits in the a swap pte as defined by | 1929 | * 2) the number of bits in the swap pte as defined by the |
1943 | * the different architectures. In order to find the | 1930 | * different architectures, and 3) the number of free bits |
1944 | * largest possible bit mask a swap entry with swap type 0 | 1931 | * in an exceptional radix_tree entry. In order to find the |
1932 | * largest possible bit mask, a swap entry with swap type 0 | ||
1945 | * and swap offset ~0UL is created, encoded to a swap pte, | 1933 | * and swap offset ~0UL is created, encoded to a swap pte, |
1946 | * decoded to a swp_entry_t again and finally the swap | 1934 | * decoded to a swp_entry_t again, and finally the swap |
1947 | * offset is extracted. This will mask all the bits from | 1935 | * offset is extracted. This will mask all the bits from |
1948 | * the initial ~0UL mask that can't be encoded in either | 1936 | * the initial ~0UL mask that can't be encoded in either |
1949 | * the swp_entry_t or the architecture definition of a | 1937 | * the swp_entry_t or the architecture definition of a |
1950 | * swap pte. | 1938 | * swap pte. Then the same is done for a radix_tree entry. |
1951 | */ | 1939 | */ |
1952 | maxpages = swp_offset(pte_to_swp_entry( | 1940 | maxpages = swp_offset(pte_to_swp_entry( |
1953 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; | 1941 | swp_entry_to_pte(swp_entry(0, ~0UL)))); |
1942 | maxpages = swp_offset(radix_to_swp_entry( | ||
1943 | swp_to_radix_entry(swp_entry(0, maxpages)))) + 1; | ||
1944 | |||
1954 | if (maxpages > swap_header->info.last_page) { | 1945 | if (maxpages > swap_header->info.last_page) { |
1955 | maxpages = swap_header->info.last_page + 1; | 1946 | maxpages = swap_header->info.last_page + 1; |
1956 | /* p->max is an unsigned int: don't overflow it */ | 1947 | /* p->max is an unsigned int: don't overflow it */ |
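The rewritten comment in this hunk describes finding the largest encodable swap offset by pushing ~0UL through the encode/decode helpers and keeping whatever bits survive. A small self-contained C sketch of that masking trick follows; encode(), decode() and OFFSET_BITS are hypothetical stand-ins for the architecture-specific swp_entry_to_pte()/pte_to_swp_entry() round trip.

#include <stdio.h>

#define OFFSET_BITS 27   /* hypothetical: how many offset bits the "pte" can hold */

/* Stand-in for swp_entry_to_pte(): drop the bits that do not fit. */
static unsigned long encode(unsigned long offset)
{
	return offset & ((1UL << OFFSET_BITS) - 1);
}

/* Stand-in for pte_to_swp_entry(): nothing further is lost on the way back. */
static unsigned long decode(unsigned long encoded)
{
	return encoded;
}

int main(void)
{
	/* Round-trip an all-ones offset; whatever survives is the usable mask. */
	unsigned long mask = decode(encode(~0UL));
	unsigned long maxpages = mask + 1;

	printf("max offset mask: %#lx, maxpages: %lu\n", mask, maxpages);
	return 0;
}

The kernel code simply repeats the same round trip a second time through the radix_tree entry encoding, so the final mask honours both limits.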
diff --git a/mm/thrash.c b/mm/thrash.c index fabf2d0f516..e53f7d02c17 100644 --- a/mm/thrash.c +++ b/mm/thrash.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * Released under the GPL, see the file COPYING for details. | 6 | * Released under the GPL, see the file COPYING for details. |
7 | * | 7 | * |
8 | * Simple token based thrashing protection, using the algorithm | 8 | * Simple token based thrashing protection, using the algorithm |
9 | * described in: http://www.cs.wm.edu/~sjiang/token.pdf | 9 | * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html |
10 | * | 10 | * |
11 | * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> | 11 | * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> |
12 | * Improved algorithm to pass token: | 12 | * Improved algorithm to pass token: |
@@ -30,8 +30,6 @@ | |||
30 | static DEFINE_SPINLOCK(swap_token_lock); | 30 | static DEFINE_SPINLOCK(swap_token_lock); |
31 | struct mm_struct *swap_token_mm; | 31 | struct mm_struct *swap_token_mm; |
32 | struct mem_cgroup *swap_token_memcg; | 32 | struct mem_cgroup *swap_token_memcg; |
33 | static unsigned int global_faults; | ||
34 | static unsigned int last_aging; | ||
35 | 33 | ||
36 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 34 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
37 | static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) | 35 | static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) |
@@ -55,6 +53,8 @@ void grab_swap_token(struct mm_struct *mm) | |||
55 | { | 53 | { |
56 | int current_interval; | 54 | int current_interval; |
57 | unsigned int old_prio = mm->token_priority; | 55 | unsigned int old_prio = mm->token_priority; |
56 | static unsigned int global_faults; | ||
57 | static unsigned int last_aging; | ||
58 | 58 | ||
59 | global_faults++; | 59 | global_faults++; |
60 | 60 | ||
@@ -67,6 +67,17 @@ void grab_swap_token(struct mm_struct *mm) | |||
67 | if (!swap_token_mm) | 67 | if (!swap_token_mm) |
68 | goto replace_token; | 68 | goto replace_token; |
69 | 69 | ||
70 | /* | ||
71 | * Usually, we don't need priority aging because long-interval faults | ||
72 | * make the priority decrease quickly. But there is one exception: if | ||
73 | * the token owner task is sleeping, it never takes long-interval faults. | ||
74 | * Thus, we need a priority aging mechanism instead. The requirements | ||
75 | * for priority aging are: | ||
76 | * 1) The aging interval is reasonably long. Too short an aging | ||
77 | * interval loses the swap token too quickly and hurts performance. | ||
78 | * 2) The swap token owner task gets priority aging even while it | ||
79 | * is asleep. | ||
80 | */ | ||
70 | if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { | 81 | if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { |
71 | swap_token_mm->token_priority /= 2; | 82 | swap_token_mm->token_priority /= 2; |
72 | last_aging = global_faults; | 83 | last_aging = global_faults; |
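The new comment explains why the token owner's priority must be halved once per aging interval even when the owner never faults. A toy userspace C version of that interval check is sketched below; the counters and the TOKEN_AGING_INTERVAL value are illustrative only.

#include <stdio.h>

#define TOKEN_AGING_INTERVAL 64   /* illustrative value, not the kernel's */

static unsigned int global_faults;
static unsigned int last_aging;
static unsigned int token_priority = 16;

/* Called on every fault; ages the token owner's priority at a fixed interval. */
static void age_token_priority(void)
{
	global_faults++;
	if (global_faults - last_aging > TOKEN_AGING_INTERVAL) {
		token_priority /= 2;          /* decays even if the owner is asleep */
		last_aging = global_faults;
	}
}

int main(void)
{
	for (int i = 0; i < 200; i++)
		age_token_priority();
	printf("priority after 200 faults: %u\n", token_priority);
	return 0;
}

Because the counters are unsigned, the subtraction in the interval test keeps working across wraparound, which is the same property the kernel relies on.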
diff --git a/mm/truncate.c b/mm/truncate.c index e13f22efaad..b40ac6d4e86 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -199,9 +199,6 @@ int invalidate_inode_page(struct page *page) | |||
199 | * The first pass will remove most pages, so the search cost of the second pass | 199 | * The first pass will remove most pages, so the search cost of the second pass |
200 | * is low. | 200 | * is low. |
201 | * | 201 | * |
202 | * When looking at page->index outside the page lock we need to be careful to | ||
203 | * copy it into a local to avoid races (it could change at any time). | ||
204 | * | ||
205 | * We pass down the cache-hot hint to the page freeing code. Even if the | 202 | * We pass down the cache-hot hint to the page freeing code. Even if the |
206 | * mapping is large, it is probably the case that the final pages are the most | 203 | * mapping is large, it is probably the case that the final pages are the most |
207 | * recently touched, and freeing happens in ascending file offset order. | 204 | * recently touched, and freeing happens in ascending file offset order. |
@@ -210,10 +207,10 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
210 | loff_t lstart, loff_t lend) | 207 | loff_t lstart, loff_t lend) |
211 | { | 208 | { |
212 | const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; | 209 | const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; |
213 | pgoff_t end; | ||
214 | const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); | 210 | const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); |
215 | struct pagevec pvec; | 211 | struct pagevec pvec; |
216 | pgoff_t next; | 212 | pgoff_t index; |
213 | pgoff_t end; | ||
217 | int i; | 214 | int i; |
218 | 215 | ||
219 | cleancache_flush_inode(mapping); | 216 | cleancache_flush_inode(mapping); |
@@ -224,24 +221,21 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
224 | end = (lend >> PAGE_CACHE_SHIFT); | 221 | end = (lend >> PAGE_CACHE_SHIFT); |
225 | 222 | ||
226 | pagevec_init(&pvec, 0); | 223 | pagevec_init(&pvec, 0); |
227 | next = start; | 224 | index = start; |
228 | while (next <= end && | 225 | while (index <= end && pagevec_lookup(&pvec, mapping, index, |
229 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 226 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
230 | mem_cgroup_uncharge_start(); | 227 | mem_cgroup_uncharge_start(); |
231 | for (i = 0; i < pagevec_count(&pvec); i++) { | 228 | for (i = 0; i < pagevec_count(&pvec); i++) { |
232 | struct page *page = pvec.pages[i]; | 229 | struct page *page = pvec.pages[i]; |
233 | pgoff_t page_index = page->index; | ||
234 | 230 | ||
235 | if (page_index > end) { | 231 | /* We rely upon deletion not changing page->index */ |
236 | next = page_index; | 232 | index = page->index; |
233 | if (index > end) | ||
237 | break; | 234 | break; |
238 | } | ||
239 | 235 | ||
240 | if (page_index > next) | ||
241 | next = page_index; | ||
242 | next++; | ||
243 | if (!trylock_page(page)) | 236 | if (!trylock_page(page)) |
244 | continue; | 237 | continue; |
238 | WARN_ON(page->index != index); | ||
245 | if (PageWriteback(page)) { | 239 | if (PageWriteback(page)) { |
246 | unlock_page(page); | 240 | unlock_page(page); |
247 | continue; | 241 | continue; |
@@ -252,6 +246,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
252 | pagevec_release(&pvec); | 246 | pagevec_release(&pvec); |
253 | mem_cgroup_uncharge_end(); | 247 | mem_cgroup_uncharge_end(); |
254 | cond_resched(); | 248 | cond_resched(); |
249 | index++; | ||
255 | } | 250 | } |
256 | 251 | ||
257 | if (partial) { | 252 | if (partial) { |
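The lookup count used in the loops above, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, requests at most one pagevec's worth of pages and never a page past end. A quick standalone C check of that arithmetic; PAGEVEC_SIZE is assumed to be 14, as in kernels of this era, and lookup_count() is just a local helper for the example.

#include <stdio.h>

#define PAGEVEC_SIZE 14UL   /* assumed value for illustration */

typedef unsigned long pgoff_t;

/* How many pages to request so the lookup never reaches past 'end'. */
static unsigned long lookup_count(pgoff_t index, pgoff_t end)
{
	unsigned long span = end - index;

	return (span < PAGEVEC_SIZE - 1 ? span : PAGEVEC_SIZE - 1) + 1;
}

int main(void)
{
	printf("%lu\n", lookup_count(0, 100));   /* far from end: a full pagevec, 14 */
	printf("%lu\n", lookup_count(95, 100));  /* near end: only 6 pages (95..100) */
	printf("%lu\n", lookup_count(100, 100)); /* last page: exactly 1 */
	return 0;
}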
@@ -264,16 +259,17 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
264 | } | 259 | } |
265 | } | 260 | } |
266 | 261 | ||
267 | next = start; | 262 | index = start; |
268 | for ( ; ; ) { | 263 | for ( ; ; ) { |
269 | cond_resched(); | 264 | cond_resched(); |
270 | if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 265 | if (!pagevec_lookup(&pvec, mapping, index, |
271 | if (next == start) | 266 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
267 | if (index == start) | ||
272 | break; | 268 | break; |
273 | next = start; | 269 | index = start; |
274 | continue; | 270 | continue; |
275 | } | 271 | } |
276 | if (pvec.pages[0]->index > end) { | 272 | if (index == start && pvec.pages[0]->index > end) { |
277 | pagevec_release(&pvec); | 273 | pagevec_release(&pvec); |
278 | break; | 274 | break; |
279 | } | 275 | } |
@@ -281,18 +277,20 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
281 | for (i = 0; i < pagevec_count(&pvec); i++) { | 277 | for (i = 0; i < pagevec_count(&pvec); i++) { |
282 | struct page *page = pvec.pages[i]; | 278 | struct page *page = pvec.pages[i]; |
283 | 279 | ||
284 | if (page->index > end) | 280 | /* We rely upon deletion not changing page->index */ |
281 | index = page->index; | ||
282 | if (index > end) | ||
285 | break; | 283 | break; |
284 | |||
286 | lock_page(page); | 285 | lock_page(page); |
286 | WARN_ON(page->index != index); | ||
287 | wait_on_page_writeback(page); | 287 | wait_on_page_writeback(page); |
288 | truncate_inode_page(mapping, page); | 288 | truncate_inode_page(mapping, page); |
289 | if (page->index > next) | ||
290 | next = page->index; | ||
291 | next++; | ||
292 | unlock_page(page); | 289 | unlock_page(page); |
293 | } | 290 | } |
294 | pagevec_release(&pvec); | 291 | pagevec_release(&pvec); |
295 | mem_cgroup_uncharge_end(); | 292 | mem_cgroup_uncharge_end(); |
293 | index++; | ||
296 | } | 294 | } |
297 | cleancache_flush_inode(mapping); | 295 | cleancache_flush_inode(mapping); |
298 | } | 296 | } |
@@ -333,35 +331,34 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
333 | pgoff_t start, pgoff_t end) | 331 | pgoff_t start, pgoff_t end) |
334 | { | 332 | { |
335 | struct pagevec pvec; | 333 | struct pagevec pvec; |
336 | pgoff_t next = start; | 334 | pgoff_t index = start; |
337 | unsigned long ret; | 335 | unsigned long ret; |
338 | unsigned long count = 0; | 336 | unsigned long count = 0; |
339 | int i; | 337 | int i; |
340 | 338 | ||
339 | /* | ||
340 | * Note: this function may get called on a shmem/tmpfs mapping: | ||
341 | * pagevec_lookup() might then return 0 prematurely (because it | ||
342 | * got a gangful of swap entries); but it's hardly worth worrying | ||
343 | * about - it can rarely have anything to free from such a mapping | ||
344 | * (most pages are dirty), and already skips over any difficulties. | ||
345 | */ | ||
346 | |||
341 | pagevec_init(&pvec, 0); | 347 | pagevec_init(&pvec, 0); |
342 | while (next <= end && | 348 | while (index <= end && pagevec_lookup(&pvec, mapping, index, |
343 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 349 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
344 | mem_cgroup_uncharge_start(); | 350 | mem_cgroup_uncharge_start(); |
345 | for (i = 0; i < pagevec_count(&pvec); i++) { | 351 | for (i = 0; i < pagevec_count(&pvec); i++) { |
346 | struct page *page = pvec.pages[i]; | 352 | struct page *page = pvec.pages[i]; |
347 | pgoff_t index; | ||
348 | int lock_failed; | ||
349 | 353 | ||
350 | lock_failed = !trylock_page(page); | 354 | /* We rely upon deletion not changing page->index */ |
351 | |||
352 | /* | ||
353 | * We really shouldn't be looking at the ->index of an | ||
354 | * unlocked page. But we're not allowed to lock these | ||
355 | * pages. So we rely upon nobody altering the ->index | ||
356 | * of this (pinned-by-us) page. | ||
357 | */ | ||
358 | index = page->index; | 355 | index = page->index; |
359 | if (index > next) | 356 | if (index > end) |
360 | next = index; | 357 | break; |
361 | next++; | ||
362 | if (lock_failed) | ||
363 | continue; | ||
364 | 358 | ||
359 | if (!trylock_page(page)) | ||
360 | continue; | ||
361 | WARN_ON(page->index != index); | ||
365 | ret = invalidate_inode_page(page); | 362 | ret = invalidate_inode_page(page); |
366 | unlock_page(page); | 363 | unlock_page(page); |
367 | /* | 364 | /* |
@@ -371,12 +368,11 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
371 | if (!ret) | 368 | if (!ret) |
372 | deactivate_page(page); | 369 | deactivate_page(page); |
373 | count += ret; | 370 | count += ret; |
374 | if (next > end) | ||
375 | break; | ||
376 | } | 371 | } |
377 | pagevec_release(&pvec); | 372 | pagevec_release(&pvec); |
378 | mem_cgroup_uncharge_end(); | 373 | mem_cgroup_uncharge_end(); |
379 | cond_resched(); | 374 | cond_resched(); |
375 | index++; | ||
380 | } | 376 | } |
381 | return count; | 377 | return count; |
382 | } | 378 | } |
@@ -442,37 +438,32 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
442 | pgoff_t start, pgoff_t end) | 438 | pgoff_t start, pgoff_t end) |
443 | { | 439 | { |
444 | struct pagevec pvec; | 440 | struct pagevec pvec; |
445 | pgoff_t next; | 441 | pgoff_t index; |
446 | int i; | 442 | int i; |
447 | int ret = 0; | 443 | int ret = 0; |
448 | int ret2 = 0; | 444 | int ret2 = 0; |
449 | int did_range_unmap = 0; | 445 | int did_range_unmap = 0; |
450 | int wrapped = 0; | ||
451 | 446 | ||
452 | cleancache_flush_inode(mapping); | 447 | cleancache_flush_inode(mapping); |
453 | pagevec_init(&pvec, 0); | 448 | pagevec_init(&pvec, 0); |
454 | next = start; | 449 | index = start; |
455 | while (next <= end && !wrapped && | 450 | while (index <= end && pagevec_lookup(&pvec, mapping, index, |
456 | pagevec_lookup(&pvec, mapping, next, | 451 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
457 | min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { | ||
458 | mem_cgroup_uncharge_start(); | 452 | mem_cgroup_uncharge_start(); |
459 | for (i = 0; i < pagevec_count(&pvec); i++) { | 453 | for (i = 0; i < pagevec_count(&pvec); i++) { |
460 | struct page *page = pvec.pages[i]; | 454 | struct page *page = pvec.pages[i]; |
461 | pgoff_t page_index; | 455 | |
456 | /* We rely upon deletion not changing page->index */ | ||
457 | index = page->index; | ||
458 | if (index > end) | ||
459 | break; | ||
462 | 460 | ||
463 | lock_page(page); | 461 | lock_page(page); |
462 | WARN_ON(page->index != index); | ||
464 | if (page->mapping != mapping) { | 463 | if (page->mapping != mapping) { |
465 | unlock_page(page); | 464 | unlock_page(page); |
466 | continue; | 465 | continue; |
467 | } | 466 | } |
468 | page_index = page->index; | ||
469 | next = page_index + 1; | ||
470 | if (next == 0) | ||
471 | wrapped = 1; | ||
472 | if (page_index > end) { | ||
473 | unlock_page(page); | ||
474 | break; | ||
475 | } | ||
476 | wait_on_page_writeback(page); | 467 | wait_on_page_writeback(page); |
477 | if (page_mapped(page)) { | 468 | if (page_mapped(page)) { |
478 | if (!did_range_unmap) { | 469 | if (!did_range_unmap) { |
@@ -480,9 +471,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
480 | * Zap the rest of the file in one hit. | 471 | * Zap the rest of the file in one hit. |
481 | */ | 472 | */ |
482 | unmap_mapping_range(mapping, | 473 | unmap_mapping_range(mapping, |
483 | (loff_t)page_index<<PAGE_CACHE_SHIFT, | 474 | (loff_t)index << PAGE_CACHE_SHIFT, |
484 | (loff_t)(end - page_index + 1) | 475 | (loff_t)(1 + end - index) |
485 | << PAGE_CACHE_SHIFT, | 476 | << PAGE_CACHE_SHIFT, |
486 | 0); | 477 | 0); |
487 | did_range_unmap = 1; | 478 | did_range_unmap = 1; |
488 | } else { | 479 | } else { |
@@ -490,8 +481,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
490 | * Just zap this page | 481 | * Just zap this page |
491 | */ | 482 | */ |
492 | unmap_mapping_range(mapping, | 483 | unmap_mapping_range(mapping, |
493 | (loff_t)page_index<<PAGE_CACHE_SHIFT, | 484 | (loff_t)index << PAGE_CACHE_SHIFT, |
494 | PAGE_CACHE_SIZE, 0); | 485 | PAGE_CACHE_SIZE, 0); |
495 | } | 486 | } |
496 | } | 487 | } |
497 | BUG_ON(page_mapped(page)); | 488 | BUG_ON(page_mapped(page)); |
@@ -507,6 +498,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
507 | pagevec_release(&pvec); | 498 | pagevec_release(&pvec); |
508 | mem_cgroup_uncharge_end(); | 499 | mem_cgroup_uncharge_end(); |
509 | cond_resched(); | 500 | cond_resched(); |
501 | index++; | ||
510 | } | 502 | } |
511 | cleancache_flush_inode(mapping); | 503 | cleancache_flush_inode(mapping); |
512 | return ret; | 504 | return ret; |
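The unmap calls in this hunk convert a page index range back into a byte range: the hole starts at index << PAGE_CACHE_SHIFT and spans (1 + end - index) pages. A tiny standalone C check of that conversion, assuming a PAGE_CACHE_SHIFT of 12 (4 KiB pages).

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12   /* assumed: 4 KiB pages */

typedef unsigned long pgoff_t;
typedef long long loff_t;

int main(void)
{
	pgoff_t index = 3, end = 7;   /* invalidate pages 3..7 inclusive */

	loff_t holebegin = (loff_t)index << PAGE_CACHE_SHIFT;
	loff_t holelen   = (loff_t)(1 + end - index) << PAGE_CACHE_SHIFT;

	/* 5 pages starting at byte offset 12288, i.e. bytes 12288..32767 */
	printf("offset %lld, length %lld\n", holebegin, holelen);
	return 0;
}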
@@ -531,8 +523,8 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | |||
531 | /** | 523 | /** |
532 | * truncate_pagecache - unmap and remove pagecache that has been truncated | 524 | * truncate_pagecache - unmap and remove pagecache that has been truncated |
533 | * @inode: inode | 525 | * @inode: inode |
534 | * @old: old file offset | 526 | * @oldsize: old file size |
535 | * @new: new file offset | 527 | * @newsize: new file size |
536 | * | 528 | * |
537 | * inode's new i_size must already be written before truncate_pagecache | 529 | * inode's new i_size must already be written before truncate_pagecache |
538 | * is called. | 530 | * is called. |
@@ -544,9 +536,10 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | |||
544 | * situations such as writepage being called for a page that has already | 536 | * situations such as writepage being called for a page that has already |
545 | * had its underlying blocks deallocated. | 537 | * had its underlying blocks deallocated. |
546 | */ | 538 | */ |
547 | void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) | 539 | void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize) |
548 | { | 540 | { |
549 | struct address_space *mapping = inode->i_mapping; | 541 | struct address_space *mapping = inode->i_mapping; |
542 | loff_t holebegin = round_up(newsize, PAGE_SIZE); | ||
550 | 543 | ||
551 | /* | 544 | /* |
552 | * unmap_mapping_range is called twice, first simply for | 545 | * unmap_mapping_range is called twice, first simply for |
@@ -557,9 +550,9 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) | |||
557 | * truncate_inode_pages finishes, hence the second | 550 | * truncate_inode_pages finishes, hence the second |
558 | * unmap_mapping_range call must be made for correctness. | 551 | * unmap_mapping_range call must be made for correctness. |
559 | */ | 552 | */ |
560 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); | 553 | unmap_mapping_range(mapping, holebegin, 0, 1); |
561 | truncate_inode_pages(mapping, new); | 554 | truncate_inode_pages(mapping, newsize); |
562 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); | 555 | unmap_mapping_range(mapping, holebegin, 0, 1); |
563 | } | 556 | } |
564 | EXPORT_SYMBOL(truncate_pagecache); | 557 | EXPORT_SYMBOL(truncate_pagecache); |
565 | 558 | ||
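In the hunk above, holebegin = round_up(newsize, PAGE_SIZE) makes the unmap start at the first page boundary at or above the new size, so a page that is only partially truncated is not unmapped. A standalone C sketch of that rounding; the round_up macro here mirrors the kernel's power-of-two variant and PAGE_SIZE is assumed to be 4096.

#include <stdio.h>

#define PAGE_SIZE 4096UL
/* Power-of-two round up, same idea as the kernel's round_up() macro. */
#define round_up(x, y) ((((x) - 1) | ((y) - 1)) + 1)

int main(void)
{
	unsigned long sizes[] = { 0, 1, 4095, 4096, 4097, 123456 };

	for (int i = 0; i < 6; i++) {
		unsigned long newsize = sizes[i];
		unsigned long holebegin = round_up(newsize, PAGE_SIZE);

		printf("newsize %lu -> holebegin %lu\n", newsize, holebegin);
	}
	return 0;
}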
@@ -589,29 +582,31 @@ EXPORT_SYMBOL(truncate_setsize); | |||
589 | /** | 582 | /** |
590 | * vmtruncate - unmap mappings "freed" by truncate() syscall | 583 | * vmtruncate - unmap mappings "freed" by truncate() syscall |
591 | * @inode: inode of the file used | 584 | * @inode: inode of the file used |
592 | * @offset: file offset to start truncating | 585 | * @newsize: file offset to start truncating |
593 | * | 586 | * |
594 | * This function is deprecated and truncate_setsize or truncate_pagecache | 587 | * This function is deprecated and truncate_setsize or truncate_pagecache |
595 | * should be used instead, together with filesystem specific block truncation. | 588 | * should be used instead, together with filesystem specific block truncation. |
596 | */ | 589 | */ |
597 | int vmtruncate(struct inode *inode, loff_t offset) | 590 | int vmtruncate(struct inode *inode, loff_t newsize) |
598 | { | 591 | { |
599 | int error; | 592 | int error; |
600 | 593 | ||
601 | error = inode_newsize_ok(inode, offset); | 594 | error = inode_newsize_ok(inode, newsize); |
602 | if (error) | 595 | if (error) |
603 | return error; | 596 | return error; |
604 | 597 | ||
605 | truncate_setsize(inode, offset); | 598 | truncate_setsize(inode, newsize); |
606 | if (inode->i_op->truncate) | 599 | if (inode->i_op->truncate) |
607 | inode->i_op->truncate(inode); | 600 | inode->i_op->truncate(inode); |
608 | return 0; | 601 | return 0; |
609 | } | 602 | } |
610 | EXPORT_SYMBOL(vmtruncate); | 603 | EXPORT_SYMBOL(vmtruncate); |
611 | 604 | ||
612 | int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | 605 | int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) |
613 | { | 606 | { |
614 | struct address_space *mapping = inode->i_mapping; | 607 | struct address_space *mapping = inode->i_mapping; |
608 | loff_t holebegin = round_up(lstart, PAGE_SIZE); | ||
609 | loff_t holelen = 1 + lend - holebegin; | ||
615 | 610 | ||
616 | /* | 611 | /* |
617 | * If the underlying filesystem is not going to provide | 612 | * If the underlying filesystem is not going to provide |
@@ -622,12 +617,11 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | |||
622 | return -ENOSYS; | 617 | return -ENOSYS; |
623 | 618 | ||
624 | mutex_lock(&inode->i_mutex); | 619 | mutex_lock(&inode->i_mutex); |
625 | down_write(&inode->i_alloc_sem); | 620 | inode_dio_wait(inode); |
626 | unmap_mapping_range(mapping, offset, (end - offset), 1); | 621 | unmap_mapping_range(mapping, holebegin, holelen, 1); |
627 | inode->i_op->truncate_range(inode, offset, end); | 622 | inode->i_op->truncate_range(inode, lstart, lend); |
628 | /* unmap again to remove racily COWed private pages */ | 623 | /* unmap again to remove racily COWed private pages */ |
629 | unmap_mapping_range(mapping, offset, (end - offset), 1); | 624 | unmap_mapping_range(mapping, holebegin, holelen, 1); |
630 | up_write(&inode->i_alloc_sem); | ||
631 | mutex_unlock(&inode->i_mutex); | 625 | mutex_unlock(&inode->i_mutex); |
632 | 626 | ||
633 | return 0; | 627 | return 0; |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1d34d75366a..3a65d6f7422 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -26,7 +26,7 @@ | |||
26 | #include <linux/rcupdate.h> | 26 | #include <linux/rcupdate.h> |
27 | #include <linux/pfn.h> | 27 | #include <linux/pfn.h> |
28 | #include <linux/kmemleak.h> | 28 | #include <linux/kmemleak.h> |
29 | #include <asm/atomic.h> | 29 | #include <linux/atomic.h> |
30 | #include <asm/uaccess.h> | 30 | #include <asm/uaccess.h> |
31 | #include <asm/tlbflush.h> | 31 | #include <asm/tlbflush.h> |
32 | #include <asm/shmparam.h> | 32 | #include <asm/shmparam.h> |
@@ -452,13 +452,6 @@ overflow: | |||
452 | return ERR_PTR(-EBUSY); | 452 | return ERR_PTR(-EBUSY); |
453 | } | 453 | } |
454 | 454 | ||
455 | static void rcu_free_va(struct rcu_head *head) | ||
456 | { | ||
457 | struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); | ||
458 | |||
459 | kfree(va); | ||
460 | } | ||
461 | |||
462 | static void __free_vmap_area(struct vmap_area *va) | 455 | static void __free_vmap_area(struct vmap_area *va) |
463 | { | 456 | { |
464 | BUG_ON(RB_EMPTY_NODE(&va->rb_node)); | 457 | BUG_ON(RB_EMPTY_NODE(&va->rb_node)); |
@@ -491,7 +484,7 @@ static void __free_vmap_area(struct vmap_area *va) | |||
491 | if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) | 484 | if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) |
492 | vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); | 485 | vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); |
493 | 486 | ||
494 | call_rcu(&va->rcu_head, rcu_free_va); | 487 | kfree_rcu(va, rcu_head); |
495 | } | 488 | } |
496 | 489 | ||
497 | /* | 490 | /* |
@@ -732,9 +725,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr) | |||
732 | #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) | 725 | #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) |
733 | #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ | 726 | #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ |
734 | #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ | 727 | #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ |
735 | #define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ | 728 | #define VMAP_BBMAP_BITS \ |
736 | VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ | 729 | VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ |
737 | VMALLOC_PAGES / NR_CPUS / 16)) | 730 | VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ |
731 | VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) | ||
738 | 732 | ||
739 | #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) | 733 | #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) |
740 | 734 | ||
@@ -837,13 +831,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
837 | return vb; | 831 | return vb; |
838 | } | 832 | } |
839 | 833 | ||
840 | static void rcu_free_vb(struct rcu_head *head) | ||
841 | { | ||
842 | struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); | ||
843 | |||
844 | kfree(vb); | ||
845 | } | ||
846 | |||
847 | static void free_vmap_block(struct vmap_block *vb) | 834 | static void free_vmap_block(struct vmap_block *vb) |
848 | { | 835 | { |
849 | struct vmap_block *tmp; | 836 | struct vmap_block *tmp; |
@@ -856,7 +843,7 @@ static void free_vmap_block(struct vmap_block *vb) | |||
856 | BUG_ON(tmp != vb); | 843 | BUG_ON(tmp != vb); |
857 | 844 | ||
858 | free_vmap_area_noflush(vb->va); | 845 | free_vmap_area_noflush(vb->va); |
859 | call_rcu(&vb->rcu_head, rcu_free_vb); | 846 | kfree_rcu(vb, rcu_head); |
860 | } | 847 | } |
861 | 848 | ||
862 | static void purge_fragmented_blocks(int cpu) | 849 | static void purge_fragmented_blocks(int cpu) |
@@ -1266,18 +1253,22 @@ EXPORT_SYMBOL_GPL(map_vm_area); | |||
1266 | DEFINE_RWLOCK(vmlist_lock); | 1253 | DEFINE_RWLOCK(vmlist_lock); |
1267 | struct vm_struct *vmlist; | 1254 | struct vm_struct *vmlist; |
1268 | 1255 | ||
1269 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | 1256 | static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, |
1270 | unsigned long flags, void *caller) | 1257 | unsigned long flags, void *caller) |
1271 | { | 1258 | { |
1272 | struct vm_struct *tmp, **p; | ||
1273 | |||
1274 | vm->flags = flags; | 1259 | vm->flags = flags; |
1275 | vm->addr = (void *)va->va_start; | 1260 | vm->addr = (void *)va->va_start; |
1276 | vm->size = va->va_end - va->va_start; | 1261 | vm->size = va->va_end - va->va_start; |
1277 | vm->caller = caller; | 1262 | vm->caller = caller; |
1278 | va->private = vm; | 1263 | va->private = vm; |
1279 | va->flags |= VM_VM_AREA; | 1264 | va->flags |= VM_VM_AREA; |
1265 | } | ||
1266 | |||
1267 | static void insert_vmalloc_vmlist(struct vm_struct *vm) | ||
1268 | { | ||
1269 | struct vm_struct *tmp, **p; | ||
1280 | 1270 | ||
1271 | vm->flags &= ~VM_UNLIST; | ||
1281 | write_lock(&vmlist_lock); | 1272 | write_lock(&vmlist_lock); |
1282 | for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { | 1273 | for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { |
1283 | if (tmp->addr >= vm->addr) | 1274 | if (tmp->addr >= vm->addr) |
@@ -1288,6 +1279,13 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | |||
1288 | write_unlock(&vmlist_lock); | 1279 | write_unlock(&vmlist_lock); |
1289 | } | 1280 | } |
1290 | 1281 | ||
1282 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | ||
1283 | unsigned long flags, void *caller) | ||
1284 | { | ||
1285 | setup_vmalloc_vm(vm, va, flags, caller); | ||
1286 | insert_vmalloc_vmlist(vm); | ||
1287 | } | ||
1288 | |||
1291 | static struct vm_struct *__get_vm_area_node(unsigned long size, | 1289 | static struct vm_struct *__get_vm_area_node(unsigned long size, |
1292 | unsigned long align, unsigned long flags, unsigned long start, | 1290 | unsigned long align, unsigned long flags, unsigned long start, |
1293 | unsigned long end, int node, gfp_t gfp_mask, void *caller) | 1291 | unsigned long end, int node, gfp_t gfp_mask, void *caller) |
@@ -1326,7 +1324,18 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
1326 | return NULL; | 1324 | return NULL; |
1327 | } | 1325 | } |
1328 | 1326 | ||
1329 | insert_vmalloc_vm(area, va, flags, caller); | 1327 | /* |
1328 | * When this function is called from __vmalloc_node_range, | ||
1329 | * we do not add vm_struct to vmlist here to avoid | ||
1330 | * accessing uninitialized members of vm_struct such as | ||
1331 | * pages and nr_pages fields. They will be set later. | ||
1332 | * To distinguish it from others, we use a VM_UNLIST flag. | ||
1333 | */ | ||
1334 | if (flags & VM_UNLIST) | ||
1335 | setup_vmalloc_vm(area, va, flags, caller); | ||
1336 | else | ||
1337 | insert_vmalloc_vm(area, va, flags, caller); | ||
1338 | |||
1330 | return area; | 1339 | return area; |
1331 | } | 1340 | } |
1332 | 1341 | ||
@@ -1394,17 +1403,20 @@ struct vm_struct *remove_vm_area(const void *addr) | |||
1394 | va = find_vmap_area((unsigned long)addr); | 1403 | va = find_vmap_area((unsigned long)addr); |
1395 | if (va && va->flags & VM_VM_AREA) { | 1404 | if (va && va->flags & VM_VM_AREA) { |
1396 | struct vm_struct *vm = va->private; | 1405 | struct vm_struct *vm = va->private; |
1397 | struct vm_struct *tmp, **p; | 1406 | |
1398 | /* | 1407 | if (!(vm->flags & VM_UNLIST)) { |
1399 | * remove from list and disallow access to this vm_struct | 1408 | struct vm_struct *tmp, **p; |
1400 | * before unmap. (address range confliction is maintained by | 1409 | /* |
1401 | * vmap.) | 1410 | * remove from list and disallow access to |
1402 | */ | 1411 | * this vm_struct before unmap. (address range |
1403 | write_lock(&vmlist_lock); | 1412 | * confliction is maintained by vmap.) |
1404 | for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) | 1413 | */ |
1405 | ; | 1414 | write_lock(&vmlist_lock); |
1406 | *p = tmp->next; | 1415 | for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) |
1407 | write_unlock(&vmlist_lock); | 1416 | ; |
1417 | *p = tmp->next; | ||
1418 | write_unlock(&vmlist_lock); | ||
1419 | } | ||
1408 | 1420 | ||
1409 | vmap_debug_free_range(va->va_start, va->va_end); | 1421 | vmap_debug_free_range(va->va_start, va->va_end); |
1410 | free_unmap_vmap_area(va); | 1422 | free_unmap_vmap_area(va); |
@@ -1615,13 +1627,21 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, | |||
1615 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) | 1627 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) |
1616 | return NULL; | 1628 | return NULL; |
1617 | 1629 | ||
1618 | area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, | 1630 | area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, |
1619 | gfp_mask, caller); | 1631 | start, end, node, gfp_mask, caller); |
1620 | 1632 | ||
1621 | if (!area) | 1633 | if (!area) |
1622 | return NULL; | 1634 | return NULL; |
1623 | 1635 | ||
1624 | addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); | 1636 | addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); |
1637 | if (!addr) | ||
1638 | return NULL; | ||
1639 | |||
1640 | /* | ||
1641 | * In this function, newly allocated vm_struct is not added | ||
1642 | * to vmlist at __get_vm_area_node(). so, it is added here. | ||
1643 | */ | ||
1644 | insert_vmalloc_vmlist(area); | ||
1625 | 1645 | ||
1626 | /* | 1646 | /* |
1627 | * A ref_count = 3 is needed because the vm_struct and vmap_area | 1647 | * A ref_count = 3 is needed because the vm_struct and vmap_area |
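Taken together, the VM_UNLIST changes implement an "initialize fully, then publish" pattern: the vm_struct is only linked onto the globally visible vmlist once its pages and nr_pages fields are valid. A minimal userspace sketch of the same idea with a mutex-protected list follows; struct area, setup_area() and publish_area() are invented names for illustration.

#include <stdlib.h>
#include <pthread.h>

struct area {
	void *addr;
	unsigned long nr_pages;     /* must be valid before anyone can find us */
	struct area *next;
};

static struct area *area_list;                        /* analogue of vmlist */
static pthread_mutex_t area_list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Step 1: private setup; the object is not reachable by other threads yet. */
static struct area *setup_area(void *addr, unsigned long nr_pages)
{
	struct area *a = calloc(1, sizeof(*a));

	if (!a)
		return NULL;
	a->addr = addr;
	a->nr_pages = nr_pages;
	return a;
}

/* Step 2: publish; only now can lookups over area_list observe the object. */
static void publish_area(struct area *a)
{
	pthread_mutex_lock(&area_list_lock);
	a->next = area_list;
	area_list = a;
	pthread_mutex_unlock(&area_list_lock);
}

int main(void)
{
	struct area *a = setup_area((void *)0x1000UL, 4);

	if (a)
		publish_area(a);    /* all fields are complete before this point */
	return 0;
}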
@@ -2153,6 +2173,14 @@ struct vm_struct *alloc_vm_area(size_t size) | |||
2153 | return NULL; | 2173 | return NULL; |
2154 | } | 2174 | } |
2155 | 2175 | ||
2176 | /* | ||
2177 | * If the allocated address space is passed to a hypercall | ||
2178 | * before being used then we cannot rely on a page fault to | ||
2179 | * trigger an update of the page tables. So sync all the page | ||
2180 | * tables here. | ||
2181 | */ | ||
2182 | vmalloc_sync_all(); | ||
2183 | |||
2156 | return area; | 2184 | return area; |
2157 | } | 2185 | } |
2158 | EXPORT_SYMBOL_GPL(alloc_vm_area); | 2186 | EXPORT_SYMBOL_GPL(alloc_vm_area); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index d036e59d302..b55699cd906 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -95,8 +95,6 @@ struct scan_control { | |||
95 | /* Can pages be swapped as part of reclaim? */ | 95 | /* Can pages be swapped as part of reclaim? */ |
96 | int may_swap; | 96 | int may_swap; |
97 | 97 | ||
98 | int swappiness; | ||
99 | |||
100 | int order; | 98 | int order; |
101 | 99 | ||
102 | /* | 100 | /* |
@@ -173,7 +171,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, | |||
173 | struct scan_control *sc, enum lru_list lru) | 171 | struct scan_control *sc, enum lru_list lru) |
174 | { | 172 | { |
175 | if (!scanning_global_lru(sc)) | 173 | if (!scanning_global_lru(sc)) |
176 | return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru); | 174 | return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, |
175 | zone_to_nid(zone), zone_idx(zone), BIT(lru)); | ||
177 | 176 | ||
178 | return zone_page_state(zone, NR_LRU_BASE + lru); | 177 | return zone_page_state(zone, NR_LRU_BASE + lru); |
179 | } | 178 | } |
@@ -250,49 +249,90 @@ unsigned long shrink_slab(struct shrink_control *shrink, | |||
250 | unsigned long long delta; | 249 | unsigned long long delta; |
251 | unsigned long total_scan; | 250 | unsigned long total_scan; |
252 | unsigned long max_pass; | 251 | unsigned long max_pass; |
252 | int shrink_ret = 0; | ||
253 | long nr; | ||
254 | long new_nr; | ||
255 | long batch_size = shrinker->batch ? shrinker->batch | ||
256 | : SHRINK_BATCH; | ||
257 | |||
258 | /* | ||
259 | * copy the current shrinker scan count into a local variable | ||
260 | * and zero it so that other concurrent shrinker invocations | ||
261 | * don't also do this scanning work. | ||
262 | */ | ||
263 | do { | ||
264 | nr = shrinker->nr; | ||
265 | } while (cmpxchg(&shrinker->nr, nr, 0) != nr); | ||
253 | 266 | ||
267 | total_scan = nr; | ||
254 | max_pass = do_shrinker_shrink(shrinker, shrink, 0); | 268 | max_pass = do_shrinker_shrink(shrinker, shrink, 0); |
255 | delta = (4 * nr_pages_scanned) / shrinker->seeks; | 269 | delta = (4 * nr_pages_scanned) / shrinker->seeks; |
256 | delta *= max_pass; | 270 | delta *= max_pass; |
257 | do_div(delta, lru_pages + 1); | 271 | do_div(delta, lru_pages + 1); |
258 | shrinker->nr += delta; | 272 | total_scan += delta; |
259 | if (shrinker->nr < 0) { | 273 | if (total_scan < 0) { |
260 | printk(KERN_ERR "shrink_slab: %pF negative objects to " | 274 | printk(KERN_ERR "shrink_slab: %pF negative objects to " |
261 | "delete nr=%ld\n", | 275 | "delete nr=%ld\n", |
262 | shrinker->shrink, shrinker->nr); | 276 | shrinker->shrink, total_scan); |
263 | shrinker->nr = max_pass; | 277 | total_scan = max_pass; |
264 | } | 278 | } |
265 | 279 | ||
266 | /* | 280 | /* |
281 | * We need to avoid excessive windup on filesystem shrinkers | ||
282 | * due to large numbers of GFP_NOFS allocations causing the | ||
283 | * shrinkers to return -1 all the time. This results in a large | ||
284 | * nr being built up, so when a shrink that can do some work | ||
285 | * comes along it empties the entire cache due to nr >>> | ||
286 | * max_pass. This is bad for sustaining a working set in | ||
287 | * memory. | ||
288 | * | ||
289 | * Hence only allow the shrinker to scan the entire cache when | ||
290 | * a large delta change is calculated directly. | ||
291 | */ | ||
292 | if (delta < max_pass / 4) | ||
293 | total_scan = min(total_scan, max_pass / 2); | ||
294 | |||
295 | /* | ||
267 | * Avoid risking looping forever due to too large nr value: | 296 | * Avoid risking looping forever due to too large nr value: |
268 | * never try to free more than twice the estimate number of | 297 | * never try to free more than twice the estimate number of |
269 | * freeable entries. | 298 | * freeable entries. |
270 | */ | 299 | */ |
271 | if (shrinker->nr > max_pass * 2) | 300 | if (total_scan > max_pass * 2) |
272 | shrinker->nr = max_pass * 2; | 301 | total_scan = max_pass * 2; |
273 | 302 | ||
274 | total_scan = shrinker->nr; | 303 | trace_mm_shrink_slab_start(shrinker, shrink, nr, |
275 | shrinker->nr = 0; | 304 | nr_pages_scanned, lru_pages, |
305 | max_pass, delta, total_scan); | ||
276 | 306 | ||
277 | while (total_scan >= SHRINK_BATCH) { | 307 | while (total_scan >= batch_size) { |
278 | long this_scan = SHRINK_BATCH; | ||
279 | int shrink_ret; | ||
280 | int nr_before; | 308 | int nr_before; |
281 | 309 | ||
282 | nr_before = do_shrinker_shrink(shrinker, shrink, 0); | 310 | nr_before = do_shrinker_shrink(shrinker, shrink, 0); |
283 | shrink_ret = do_shrinker_shrink(shrinker, shrink, | 311 | shrink_ret = do_shrinker_shrink(shrinker, shrink, |
284 | this_scan); | 312 | batch_size); |
285 | if (shrink_ret == -1) | 313 | if (shrink_ret == -1) |
286 | break; | 314 | break; |
287 | if (shrink_ret < nr_before) | 315 | if (shrink_ret < nr_before) |
288 | ret += nr_before - shrink_ret; | 316 | ret += nr_before - shrink_ret; |
289 | count_vm_events(SLABS_SCANNED, this_scan); | 317 | count_vm_events(SLABS_SCANNED, batch_size); |
290 | total_scan -= this_scan; | 318 | total_scan -= batch_size; |
291 | 319 | ||
292 | cond_resched(); | 320 | cond_resched(); |
293 | } | 321 | } |
294 | 322 | ||
295 | shrinker->nr += total_scan; | 323 | /* |
324 | * move the unused scan count back into the shrinker in a | ||
325 | * manner that handles concurrent updates. If we exhausted the | ||
326 | * scan, there is no need to do an update. | ||
327 | */ | ||
328 | do { | ||
329 | nr = shrinker->nr; | ||
330 | new_nr = total_scan + nr; | ||
331 | if (total_scan <= 0) | ||
332 | break; | ||
333 | } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); | ||
334 | |||
335 | trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); | ||
296 | } | 336 | } |
297 | up_read(&shrinker_rwsem); | 337 | up_read(&shrinker_rwsem); |
298 | out: | 338 | out: |
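The reworked shrink_slab() claims shrinker->nr with a cmpxchg loop so concurrent callers never scan the same deferred count twice, works in batch_size chunks, and returns any unscanned remainder the same way. A compact userspace analogue using C11 atomics; atomic_exchange() stands in for the kernel's cmpxchg loop, and BATCH and do_scan() are stand-ins.

#include <stdatomic.h>
#include <stdio.h>

#define BATCH 128L

static atomic_long pending;   /* analogue of shrinker->nr: deferred scan count */

/* Pretend to scan 'n' objects; return how many were actually freed. */
static long do_scan(long n)
{
	return n / 2;
}

static long shrink(long new_work)
{
	long freed = 0;

	/* Atomically take the whole pending count so other callers don't rescan it. */
	long nr = atomic_exchange(&pending, 0);
	long total_scan = nr + new_work;

	while (total_scan >= BATCH) {
		freed += do_scan(BATCH);
		total_scan -= BATCH;
	}

	/* Hand any unscanned remainder back for the next caller. */
	if (total_scan > 0)
		atomic_fetch_add(&pending, total_scan);

	return freed;
}

int main(void)
{
	atomic_store(&pending, 300);

	long freed = shrink(500);

	printf("freed %ld, pending now %ld\n", freed, atomic_load(&pending));
	return 0;
}

Returning the remainder rather than discarding it preserves the scan pressure across calls, which is what the kernel's second cmpxchg loop is for.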
@@ -1729,6 +1769,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | |||
1729 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); | 1769 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); |
1730 | } | 1770 | } |
1731 | 1771 | ||
1772 | static int vmscan_swappiness(struct scan_control *sc) | ||
1773 | { | ||
1774 | if (scanning_global_lru(sc)) | ||
1775 | return vm_swappiness; | ||
1776 | return mem_cgroup_swappiness(sc->mem_cgroup); | ||
1777 | } | ||
1778 | |||
1732 | /* | 1779 | /* |
1733 | * Determine how aggressively the anon and file LRU lists should be | 1780 | * Determine how aggressively the anon and file LRU lists should be |
1734 | * scanned. The relative value of each set of LRU lists is determined | 1781 | * scanned. The relative value of each set of LRU lists is determined |
@@ -1747,22 +1794,15 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1747 | u64 fraction[2], denominator; | 1794 | u64 fraction[2], denominator; |
1748 | enum lru_list l; | 1795 | enum lru_list l; |
1749 | int noswap = 0; | 1796 | int noswap = 0; |
1750 | int force_scan = 0; | 1797 | bool force_scan = false; |
1798 | unsigned long nr_force_scan[2]; | ||
1751 | 1799 | ||
1752 | 1800 | /* kswapd does zone balancing and needs to scan this zone */ |
1753 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | 1801 | if (scanning_global_lru(sc) && current_is_kswapd()) |
1754 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | 1802 | force_scan = true; |
1755 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | 1803 | /* memcg may have small limit and need to avoid priority drop */ |
1756 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | 1804 | if (!scanning_global_lru(sc)) |
1757 | 1805 | force_scan = true; | |
1758 | if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { | ||
1759 | /* kswapd does zone balancing and need to scan this zone */ | ||
1760 | if (scanning_global_lru(sc) && current_is_kswapd()) | ||
1761 | force_scan = 1; | ||
1762 | /* memcg may have small limit and need to avoid priority drop */ | ||
1763 | if (!scanning_global_lru(sc)) | ||
1764 | force_scan = 1; | ||
1765 | } | ||
1766 | 1806 | ||
1767 | /* If we have no swap space, do not bother scanning anon pages. */ | 1807 | /* If we have no swap space, do not bother scanning anon pages. */ |
1768 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | 1808 | if (!sc->may_swap || (nr_swap_pages <= 0)) { |
@@ -1770,9 +1810,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1770 | fraction[0] = 0; | 1810 | fraction[0] = 0; |
1771 | fraction[1] = 1; | 1811 | fraction[1] = 1; |
1772 | denominator = 1; | 1812 | denominator = 1; |
1813 | nr_force_scan[0] = 0; | ||
1814 | nr_force_scan[1] = SWAP_CLUSTER_MAX; | ||
1773 | goto out; | 1815 | goto out; |
1774 | } | 1816 | } |
1775 | 1817 | ||
1818 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | ||
1819 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
1820 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | ||
1821 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1822 | |||
1776 | if (scanning_global_lru(sc)) { | 1823 | if (scanning_global_lru(sc)) { |
1777 | free = zone_page_state(zone, NR_FREE_PAGES); | 1824 | free = zone_page_state(zone, NR_FREE_PAGES); |
1778 | /* If we have very few page cache pages, | 1825 | /* If we have very few page cache pages, |
@@ -1781,6 +1828,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1781 | fraction[0] = 1; | 1828 | fraction[0] = 1; |
1782 | fraction[1] = 0; | 1829 | fraction[1] = 0; |
1783 | denominator = 1; | 1830 | denominator = 1; |
1831 | nr_force_scan[0] = SWAP_CLUSTER_MAX; | ||
1832 | nr_force_scan[1] = 0; | ||
1784 | goto out; | 1833 | goto out; |
1785 | } | 1834 | } |
1786 | } | 1835 | } |
@@ -1789,8 +1838,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1789 | * With swappiness at 100, anonymous and file have the same priority. | 1838 | * With swappiness at 100, anonymous and file have the same priority. |
1790 | * This scanning priority is essentially the inverse of IO cost. | 1839 | * This scanning priority is essentially the inverse of IO cost. |
1791 | */ | 1840 | */ |
1792 | anon_prio = sc->swappiness; | 1841 | anon_prio = vmscan_swappiness(sc); |
1793 | file_prio = 200 - sc->swappiness; | 1842 | file_prio = 200 - vmscan_swappiness(sc); |
1794 | 1843 | ||
1795 | /* | 1844 | /* |
1796 | * OK, so we have swap space and a fair amount of page cache | 1845 | * OK, so we have swap space and a fair amount of page cache |
@@ -1829,6 +1878,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1829 | fraction[0] = ap; | 1878 | fraction[0] = ap; |
1830 | fraction[1] = fp; | 1879 | fraction[1] = fp; |
1831 | denominator = ap + fp + 1; | 1880 | denominator = ap + fp + 1; |
1881 | if (force_scan) { | ||
1882 | unsigned long scan = SWAP_CLUSTER_MAX; | ||
1883 | nr_force_scan[0] = div64_u64(scan * ap, denominator); | ||
1884 | nr_force_scan[1] = div64_u64(scan * fp, denominator); | ||
1885 | } | ||
1832 | out: | 1886 | out: |
1833 | for_each_evictable_lru(l) { | 1887 | for_each_evictable_lru(l) { |
1834 | int file = is_file_lru(l); | 1888 | int file = is_file_lru(l); |
@@ -1849,12 +1903,8 @@ out: | |||
1849 | * memcg, priority drop can cause big latency. So, it's better | 1903 | * memcg, priority drop can cause big latency. So, it's better |
1850 | * to scan small amount. See may_noscan above. | 1904 | * to scan small amount. See may_noscan above. |
1851 | */ | 1905 | */ |
1852 | if (!scan && force_scan) { | 1906 | if (!scan && force_scan) |
1853 | if (file) | 1907 | scan = nr_force_scan[file]; |
1854 | scan = SWAP_CLUSTER_MAX; | ||
1855 | else if (!noswap) | ||
1856 | scan = SWAP_CLUSTER_MAX; | ||
1857 | } | ||
1858 | nr[l] = scan; | 1908 | nr[l] = scan; |
1859 | } | 1909 | } |
1860 | } | 1910 | } |
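When force_scan is set, the hunk above pre-computes how SWAP_CLUSTER_MAX pages should be split between anon and file in the same ap:fp ratio as the normal scan. A small standalone C illustration of that proportional split; native 64-bit division plays the role of div64_u64(), and SWAP_CLUSTER_MAX is assumed to be 32, the example ratios are made up.

#include <stdio.h>
#include <stdint.h>

#define SWAP_CLUSTER_MAX 32ULL   /* assumed value for illustration */

int main(void)
{
	uint64_t ap = 300, fp = 900;            /* example anon/file pressure values */
	uint64_t denominator = ap + fp + 1;
	uint64_t scan = SWAP_CLUSTER_MAX;

	/* Same proportional split as nr_force_scan[0]/[1] in get_scan_count(). */
	uint64_t force_anon = scan * ap / denominator;
	uint64_t force_file = scan * fp / denominator;

	printf("anon %llu, file %llu\n",
	       (unsigned long long)force_anon, (unsigned long long)force_file);
	return 0;
}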
@@ -2179,7 +2229,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2179 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2229 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2180 | .may_unmap = 1, | 2230 | .may_unmap = 1, |
2181 | .may_swap = 1, | 2231 | .may_swap = 1, |
2182 | .swappiness = vm_swappiness, | ||
2183 | .order = order, | 2232 | .order = order, |
2184 | .mem_cgroup = NULL, | 2233 | .mem_cgroup = NULL, |
2185 | .nodemask = nodemask, | 2234 | .nodemask = nodemask, |
@@ -2203,7 +2252,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2203 | 2252 | ||
2204 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | 2253 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, |
2205 | gfp_t gfp_mask, bool noswap, | 2254 | gfp_t gfp_mask, bool noswap, |
2206 | unsigned int swappiness, | ||
2207 | struct zone *zone, | 2255 | struct zone *zone, |
2208 | unsigned long *nr_scanned) | 2256 | unsigned long *nr_scanned) |
2209 | { | 2257 | { |
@@ -2213,7 +2261,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2213 | .may_writepage = !laptop_mode, | 2261 | .may_writepage = !laptop_mode, |
2214 | .may_unmap = 1, | 2262 | .may_unmap = 1, |
2215 | .may_swap = !noswap, | 2263 | .may_swap = !noswap, |
2216 | .swappiness = swappiness, | ||
2217 | .order = 0, | 2264 | .order = 0, |
2218 | .mem_cgroup = mem, | 2265 | .mem_cgroup = mem, |
2219 | }; | 2266 | }; |
@@ -2242,8 +2289,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2242 | 2289 | ||
2243 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | 2290 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, |
2244 | gfp_t gfp_mask, | 2291 | gfp_t gfp_mask, |
2245 | bool noswap, | 2292 | bool noswap) |
2246 | unsigned int swappiness) | ||
2247 | { | 2293 | { |
2248 | struct zonelist *zonelist; | 2294 | struct zonelist *zonelist; |
2249 | unsigned long nr_reclaimed; | 2295 | unsigned long nr_reclaimed; |
@@ -2253,7 +2299,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2253 | .may_unmap = 1, | 2299 | .may_unmap = 1, |
2254 | .may_swap = !noswap, | 2300 | .may_swap = !noswap, |
2255 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2301 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2256 | .swappiness = swappiness, | ||
2257 | .order = 0, | 2302 | .order = 0, |
2258 | .mem_cgroup = mem_cont, | 2303 | .mem_cgroup = mem_cont, |
2259 | .nodemask = NULL, /* we don't care the placement */ | 2304 | .nodemask = NULL, /* we don't care the placement */ |
@@ -2404,7 +2449,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2404 | * we want to put equal scanning pressure on each zone. | 2449 | * we want to put equal scanning pressure on each zone. |
2405 | */ | 2450 | */ |
2406 | .nr_to_reclaim = ULONG_MAX, | 2451 | .nr_to_reclaim = ULONG_MAX, |
2407 | .swappiness = vm_swappiness, | ||
2408 | .order = order, | 2452 | .order = order, |
2409 | .mem_cgroup = NULL, | 2453 | .mem_cgroup = NULL, |
2410 | }; | 2454 | }; |
@@ -2453,6 +2497,9 @@ loop_again: | |||
2453 | high_wmark_pages(zone), 0, 0)) { | 2497 | high_wmark_pages(zone), 0, 0)) { |
2454 | end_zone = i; | 2498 | end_zone = i; |
2455 | break; | 2499 | break; |
2500 | } else { | ||
2501 | /* If balanced, clear the congested flag */ | ||
2502 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
2456 | } | 2503 | } |
2457 | } | 2504 | } |
2458 | if (i < 0) | 2505 | if (i < 0) |
@@ -2874,7 +2921,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
2874 | .may_writepage = 1, | 2921 | .may_writepage = 1, |
2875 | .nr_to_reclaim = nr_to_reclaim, | 2922 | .nr_to_reclaim = nr_to_reclaim, |
2876 | .hibernation_mode = 1, | 2923 | .hibernation_mode = 1, |
2877 | .swappiness = vm_swappiness, | ||
2878 | .order = 0, | 2924 | .order = 0, |
2879 | }; | 2925 | }; |
2880 | struct shrink_control shrink = { | 2926 | struct shrink_control shrink = { |
@@ -3061,7 +3107,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3061 | .nr_to_reclaim = max_t(unsigned long, nr_pages, | 3107 | .nr_to_reclaim = max_t(unsigned long, nr_pages, |
3062 | SWAP_CLUSTER_MAX), | 3108 | SWAP_CLUSTER_MAX), |
3063 | .gfp_mask = gfp_mask, | 3109 | .gfp_mask = gfp_mask, |
3064 | .swappiness = vm_swappiness, | ||
3065 | .order = order, | 3110 | .order = order, |
3066 | }; | 3111 | }; |
3067 | struct shrink_control shrink = { | 3112 | struct shrink_control shrink = { |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 20c18b7694b..d52b13d28e8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -659,7 +659,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | |||
659 | } | 659 | } |
660 | #endif | 660 | #endif |
661 | 661 | ||
662 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) | 662 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) |
663 | #ifdef CONFIG_ZONE_DMA | 663 | #ifdef CONFIG_ZONE_DMA |
664 | #define TEXT_FOR_DMA(xx) xx "_dma", | 664 | #define TEXT_FOR_DMA(xx) xx "_dma", |
665 | #else | 665 | #else |
@@ -788,7 +788,7 @@ const char * const vmstat_text[] = { | |||
788 | 788 | ||
789 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 789 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
790 | }; | 790 | }; |
791 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS */ | 791 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ |
792 | 792 | ||
793 | 793 | ||
794 | #ifdef CONFIG_PROC_FS | 794 | #ifdef CONFIG_PROC_FS |