Diffstat (limited to 'mm')

-rw-r--r--  mm/Kconfig           |   6
-rw-r--r--  mm/Makefile          |   4
-rw-r--r--  mm/backing-dev.c     |   6
-rw-r--r--  mm/bootmem.c         |   8
-rw-r--r--  mm/filemap.c         |  30
-rw-r--r--  mm/filemap_xip.c     |   2
-rw-r--r--  mm/fremap.c          |   2
-rw-r--r--  mm/hugetlb.c         |  46
-rw-r--r--  mm/internal.h        |   2
-rw-r--r--  mm/memcontrol.c      |   3
-rw-r--r--  mm/memory.c          | 176
-rw-r--r--  mm/memory_hotplug.c  |  20
-rw-r--r--  mm/migrate.c         |  89
-rw-r--r--  mm/mlock.c           |   9
-rw-r--r--  mm/mmap.c            |  22
-rw-r--r--  mm/mprotect.c        |   6
-rw-r--r--  mm/oom_kill.c        | 109
-rw-r--r--  mm/page-writeback.c  | 245
-rw-r--r--  mm/page_alloc.c      | 135
-rw-r--r--  mm/page_cgroup.c     |   2
-rw-r--r--  mm/page_io.c         |   6
-rw-r--r--  mm/rmap.c            |  60
-rw-r--r--  mm/shmem.c           |  82
-rw-r--r--  mm/swap.c            |  44
-rw-r--r--  mm/swap_state.c      |  31
-rw-r--r--  mm/swapfile.c        | 576
-rw-r--r--  mm/tiny-shmem.c      | 134
-rw-r--r--  mm/vmalloc.c         |  50
-rw-r--r--  mm/vmscan.c          | 143
29 files changed, 1183 insertions, 865 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 5b5790f8a816..a5b77811fdf2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -181,12 +181,6 @@ config MIGRATION | |||
181 | example on NUMA systems to put pages nearer to the processors accessing | 181 | example on NUMA systems to put pages nearer to the processors accessing |
182 | the page. | 182 | the page. |
183 | 183 | ||
184 | config RESOURCES_64BIT | ||
185 | bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL) | ||
186 | default 64BIT | ||
187 | help | ||
188 | This option allows memory and IO resources to be 64 bit. | ||
189 | |||
190 | config PHYS_ADDR_T_64BIT | 184 | config PHYS_ADDR_T_64BIT |
191 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT | 185 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT |
192 | 186 | ||
diff --git a/mm/Makefile b/mm/Makefile index 51c27709cc7c..72255be57f89 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | |||
9 | 9 | ||
10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ |
11 | maccess.o page_alloc.o page-writeback.o pdflush.o \ | 11 | maccess.o page_alloc.o page-writeback.o pdflush.o \ |
12 | readahead.o swap.o truncate.o vmscan.o \ | 12 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
14 | page_isolation.o mm_init.o $(mmu-y) | 14 | page_isolation.o mm_init.o $(mmu-y) |
15 | 15 | ||
@@ -21,9 +21,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o | |||
21 | obj-$(CONFIG_NUMA) += mempolicy.o | 21 | obj-$(CONFIG_NUMA) += mempolicy.o |
22 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 22 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
23 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | 23 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o |
24 | obj-$(CONFIG_SHMEM) += shmem.o | ||
25 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o | 24 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o |
26 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o | ||
27 | obj-$(CONFIG_SLOB) += slob.o | 25 | obj-$(CONFIG_SLOB) += slob.o |
28 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | 26 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o |
29 | obj-$(CONFIG_SLAB) += slab.o | 27 | obj-$(CONFIG_SLAB) += slab.o |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index a7c6c5613ec9..8e8587444132 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -24,9 +24,9 @@ static void bdi_debug_init(void) | |||
24 | static int bdi_debug_stats_show(struct seq_file *m, void *v) | 24 | static int bdi_debug_stats_show(struct seq_file *m, void *v) |
25 | { | 25 | { |
26 | struct backing_dev_info *bdi = m->private; | 26 | struct backing_dev_info *bdi = m->private; |
27 | long background_thresh; | 27 | unsigned long background_thresh; |
28 | long dirty_thresh; | 28 | unsigned long dirty_thresh; |
29 | long bdi_thresh; | 29 | unsigned long bdi_thresh; |
30 | 30 | ||
31 | get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); | 31 | get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); |
32 | 32 | ||
diff --git a/mm/bootmem.c b/mm/bootmem.c index ac5a891f142a..51a0ccf61e0e 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -435,6 +435,10 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, | |||
435 | unsigned long fallback = 0; | 435 | unsigned long fallback = 0; |
436 | unsigned long min, max, start, sidx, midx, step; | 436 | unsigned long min, max, start, sidx, midx, step; |
437 | 437 | ||
438 | bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", | ||
439 | bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, | ||
440 | align, goal, limit); | ||
441 | |||
438 | BUG_ON(!size); | 442 | BUG_ON(!size); |
439 | BUG_ON(align & (align - 1)); | 443 | BUG_ON(align & (align - 1)); |
440 | BUG_ON(limit && goal + size > limit); | 444 | BUG_ON(limit && goal + size > limit); |
@@ -442,10 +446,6 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, | |||
442 | if (!bdata->node_bootmem_map) | 446 | if (!bdata->node_bootmem_map) |
443 | return NULL; | 447 | return NULL; |
444 | 448 | ||
445 | bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", | ||
446 | bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, | ||
447 | align, goal, limit); | ||
448 | |||
449 | min = bdata->node_min_pfn; | 449 | min = bdata->node_min_pfn; |
450 | max = bdata->node_low_pfn; | 450 | max = bdata->node_low_pfn; |
451 | 451 | ||
diff --git a/mm/filemap.c b/mm/filemap.c index f5769b4dc075..2f55a1e2baf7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -210,7 +210,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | |||
210 | int ret; | 210 | int ret; |
211 | struct writeback_control wbc = { | 211 | struct writeback_control wbc = { |
212 | .sync_mode = sync_mode, | 212 | .sync_mode = sync_mode, |
213 | .nr_to_write = mapping->nrpages * 2, | 213 | .nr_to_write = LONG_MAX, |
214 | .range_start = start, | 214 | .range_start = start, |
215 | .range_end = end, | 215 | .range_end = end, |
216 | }; | 216 | }; |
@@ -741,7 +741,14 @@ repeat: | |||
741 | page = __page_cache_alloc(gfp_mask); | 741 | page = __page_cache_alloc(gfp_mask); |
742 | if (!page) | 742 | if (!page) |
743 | return NULL; | 743 | return NULL; |
744 | err = add_to_page_cache_lru(page, mapping, index, gfp_mask); | 744 | /* |
745 | * We want a regular kernel memory (not highmem or DMA etc) | ||
746 | * allocation for the radix tree nodes, but we need to honour | ||
747 | * the context-specific requirements the caller has asked for. | ||
748 | * GFP_RECLAIM_MASK collects those requirements. | ||
749 | */ | ||
750 | err = add_to_page_cache_lru(page, mapping, index, | ||
751 | (gfp_mask & GFP_RECLAIM_MASK)); | ||
745 | if (unlikely(err)) { | 752 | if (unlikely(err)) { |
746 | page_cache_release(page); | 753 | page_cache_release(page); |
747 | page = NULL; | 754 | page = NULL; |
@@ -950,7 +957,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) | |||
950 | return NULL; | 957 | return NULL; |
951 | } | 958 | } |
952 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); | 959 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); |
953 | if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { | 960 | if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { |
954 | page_cache_release(page); | 961 | page_cache_release(page); |
955 | page = NULL; | 962 | page = NULL; |
956 | } | 963 | } |
@@ -1317,7 +1324,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1317 | goto out; /* skip atime */ | 1324 | goto out; /* skip atime */ |
1318 | size = i_size_read(inode); | 1325 | size = i_size_read(inode); |
1319 | if (pos < size) { | 1326 | if (pos < size) { |
1320 | retval = filemap_write_and_wait(mapping); | 1327 | retval = filemap_write_and_wait_range(mapping, pos, |
1328 | pos + iov_length(iov, nr_segs) - 1); | ||
1321 | if (!retval) { | 1329 | if (!retval) { |
1322 | retval = mapping->a_ops->direct_IO(READ, iocb, | 1330 | retval = mapping->a_ops->direct_IO(READ, iocb, |
1323 | iov, pos, nr_segs); | 1331 | iov, pos, nr_segs); |
@@ -1530,7 +1538,6 @@ retry_find: | |||
1530 | /* | 1538 | /* |
1531 | * Found the page and have a reference on it. | 1539 | * Found the page and have a reference on it. |
1532 | */ | 1540 | */ |
1533 | mark_page_accessed(page); | ||
1534 | ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; | 1541 | ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; |
1535 | vmf->page = page; | 1542 | vmf->page = page; |
1536 | return ret | VM_FAULT_LOCKED; | 1543 | return ret | VM_FAULT_LOCKED; |
@@ -2060,18 +2067,10 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
2060 | if (count != ocount) | 2067 | if (count != ocount) |
2061 | *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); | 2068 | *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); |
2062 | 2069 | ||
2063 | /* | ||
2064 | * Unmap all mmappings of the file up-front. | ||
2065 | * | ||
2066 | * This will cause any pte dirty bits to be propagated into the | ||
2067 | * pageframes for the subsequent filemap_write_and_wait(). | ||
2068 | */ | ||
2069 | write_len = iov_length(iov, *nr_segs); | 2070 | write_len = iov_length(iov, *nr_segs); |
2070 | end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; | 2071 | end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; |
2071 | if (mapping_mapped(mapping)) | ||
2072 | unmap_mapping_range(mapping, pos, write_len, 0); | ||
2073 | 2072 | ||
2074 | written = filemap_write_and_wait(mapping); | 2073 | written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); |
2075 | if (written) | 2074 | if (written) |
2076 | goto out; | 2075 | goto out; |
2077 | 2076 | ||
@@ -2291,7 +2290,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2291 | * the file data here, to try to honour O_DIRECT expectations. | 2290 | * the file data here, to try to honour O_DIRECT expectations. |
2292 | */ | 2291 | */ |
2293 | if (unlikely(file->f_flags & O_DIRECT) && written) | 2292 | if (unlikely(file->f_flags & O_DIRECT) && written) |
2294 | status = filemap_write_and_wait(mapping); | 2293 | status = filemap_write_and_wait_range(mapping, |
2294 | pos, pos + written - 1); | ||
2295 | 2295 | ||
2296 | return written ? written : status; | 2296 | return written ? written : status; |
2297 | } | 2297 | } |
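The filemap.c changes above replace whole-file filemap_write_and_wait() calls with the range-limited filemap_write_and_wait_range() around direct I/O, so only the bytes actually involved are flushed. A minimal userspace analogue of the same idea, assuming Linux's sync_file_range(2) and a made-up demo file name, not anything from this commit:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Write back and wait on only [start, start + len) of the file. */
static int flush_range(int fd, off_t start, off_t len)
{
	return sync_file_range(fd, start, len,
			       SYNC_FILE_RANGE_WAIT_BEFORE |
			       SYNC_FILE_RANGE_WRITE |
			       SYNC_FILE_RANGE_WAIT_AFTER);
}

int main(void)
{
	int fd = open("demo.dat", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "hello", 5) != 5)
		perror("write");
	if (flush_range(fd, 0, 5))
		perror("sync_file_range");
	close(fd);
	return 0;
}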
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index b5167dfb2f2d..0c04615651b7 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -193,7 +193,7 @@ retry: | |||
193 | /* Nuke the page table entry. */ | 193 | /* Nuke the page table entry. */ |
194 | flush_cache_page(vma, address, pte_pfn(*pte)); | 194 | flush_cache_page(vma, address, pte_pfn(*pte)); |
195 | pteval = ptep_clear_flush_notify(vma, address, pte); | 195 | pteval = ptep_clear_flush_notify(vma, address, pte); |
196 | page_remove_rmap(page, vma); | 196 | page_remove_rmap(page); |
197 | dec_mm_counter(mm, file_rss); | 197 | dec_mm_counter(mm, file_rss); |
198 | BUG_ON(pte_dirty(pteval)); | 198 | BUG_ON(pte_dirty(pteval)); |
199 | pte_unmap_unlock(pte, ptl); | 199 | pte_unmap_unlock(pte, ptl); |
diff --git a/mm/fremap.c b/mm/fremap.c index 7d12ca70ef7b..62d5bbda921a 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
37 | if (page) { | 37 | if (page) { |
38 | if (pte_dirty(pte)) | 38 | if (pte_dirty(pte)) |
39 | set_page_dirty(page); | 39 | set_page_dirty(page); |
40 | page_remove_rmap(page, vma); | 40 | page_remove_rmap(page); |
41 | page_cache_release(page); | 41 | page_cache_release(page); |
42 | update_hiwater_rss(mm); | 42 | update_hiwater_rss(mm); |
43 | dec_mm_counter(mm, file_rss); | 43 | dec_mm_counter(mm, file_rss); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6058b53dcb89..618e98304080 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -220,6 +220,35 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, | |||
220 | } | 220 | } |
221 | 221 | ||
222 | /* | 222 | /* |
223 | * Return the size of the pages allocated when backing a VMA. In the majority | ||
224 | * cases this will be same size as used by the page table entries. | ||
225 | */ | ||
226 | unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) | ||
227 | { | ||
228 | struct hstate *hstate; | ||
229 | |||
230 | if (!is_vm_hugetlb_page(vma)) | ||
231 | return PAGE_SIZE; | ||
232 | |||
233 | hstate = hstate_vma(vma); | ||
234 | |||
235 | return 1UL << (hstate->order + PAGE_SHIFT); | ||
236 | } | ||
237 | |||
238 | /* | ||
239 | * Return the page size being used by the MMU to back a VMA. In the majority | ||
240 | * of cases, the page size used by the kernel matches the MMU size. On | ||
241 | * architectures where it differs, an architecture-specific version of this | ||
242 | * function is required. | ||
243 | */ | ||
244 | #ifndef vma_mmu_pagesize | ||
245 | unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) | ||
246 | { | ||
247 | return vma_kernel_pagesize(vma); | ||
248 | } | ||
249 | #endif | ||
250 | |||
251 | /* | ||
223 | * Flags for MAP_PRIVATE reservations. These are stored in the bottom | 252 | * Flags for MAP_PRIVATE reservations. These are stored in the bottom |
224 | * bits of the reservation map pointer, which are always clear due to | 253 | * bits of the reservation map pointer, which are always clear due to |
225 | * alignment. | 254 | * alignment. |
@@ -371,8 +400,10 @@ static void clear_huge_page(struct page *page, | |||
371 | { | 400 | { |
372 | int i; | 401 | int i; |
373 | 402 | ||
374 | if (unlikely(sz > MAX_ORDER_NR_PAGES)) | 403 | if (unlikely(sz > MAX_ORDER_NR_PAGES)) { |
375 | return clear_gigantic_page(page, addr, sz); | 404 | clear_gigantic_page(page, addr, sz); |
405 | return; | ||
406 | } | ||
376 | 407 | ||
377 | might_sleep(); | 408 | might_sleep(); |
378 | for (i = 0; i < sz/PAGE_SIZE; i++) { | 409 | for (i = 0; i < sz/PAGE_SIZE; i++) { |
@@ -404,8 +435,10 @@ static void copy_huge_page(struct page *dst, struct page *src, | |||
404 | int i; | 435 | int i; |
405 | struct hstate *h = hstate_vma(vma); | 436 | struct hstate *h = hstate_vma(vma); |
406 | 437 | ||
407 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) | 438 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { |
408 | return copy_gigantic_page(dst, src, addr, vma); | 439 | copy_gigantic_page(dst, src, addr, vma); |
440 | return; | ||
441 | } | ||
409 | 442 | ||
410 | might_sleep(); | 443 | might_sleep(); |
411 | for (i = 0; i < pages_per_huge_page(h); i++) { | 444 | for (i = 0; i < pages_per_huge_page(h); i++) { |
@@ -972,7 +1005,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
972 | return page; | 1005 | return page; |
973 | } | 1006 | } |
974 | 1007 | ||
975 | __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) | 1008 | int __weak alloc_bootmem_huge_page(struct hstate *h) |
976 | { | 1009 | { |
977 | struct huge_bootmem_page *m; | 1010 | struct huge_bootmem_page *m; |
978 | int nr_nodes = nodes_weight(node_online_map); | 1011 | int nr_nodes = nodes_weight(node_online_map); |
@@ -991,8 +1024,7 @@ __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) | |||
991 | * puts them into the mem_map). | 1024 | * puts them into the mem_map). |
992 | */ | 1025 | */ |
993 | m = addr; | 1026 | m = addr; |
994 | if (m) | 1027 | goto found; |
995 | goto found; | ||
996 | } | 1028 | } |
997 | hstate_next_node(h); | 1029 | hstate_next_node(h); |
998 | nr_nodes--; | 1030 | nr_nodes--; |
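The hugetlb.c hunk above also switches alloc_bootmem_huge_page() from the open-coded __attribute__((weak)) to the __weak shorthand. Both mark the function as a weak symbol, so an architecture can provide its own strong definition that wins at link time. A small self-contained sketch of the mechanism; board_init is a made-up name, not kernel API:

#include <stdio.h>

/* Weak default: any other object file defining a non-weak board_init()
 * silently replaces this one at link time. The kernel's __weak macro
 * expands to exactly this attribute. */
int __attribute__((weak)) board_init(void)
{
	return 0;	/* generic fallback */
}

int main(void)
{
	printf("board_init() returned %d\n", board_init());
	return 0;
}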
diff --git a/mm/internal.h b/mm/internal.h index 13333bc2eb68..478223b73a2a 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -49,6 +49,7 @@ extern void putback_lru_page(struct page *page); | |||
49 | /* | 49 | /* |
50 | * in mm/page_alloc.c | 50 | * in mm/page_alloc.c |
51 | */ | 51 | */ |
52 | extern unsigned long highest_memmap_pfn; | ||
52 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 53 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
53 | 54 | ||
54 | /* | 55 | /* |
@@ -275,6 +276,7 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, | |||
275 | #define GUP_FLAGS_WRITE 0x1 | 276 | #define GUP_FLAGS_WRITE 0x1 |
276 | #define GUP_FLAGS_FORCE 0x2 | 277 | #define GUP_FLAGS_FORCE 0x2 |
277 | #define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 | 278 | #define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 |
279 | #define GUP_FLAGS_IGNORE_SIGKILL 0x8 | ||
278 | 280 | ||
279 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 281 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
280 | unsigned long start, int len, int flags, | 282 | unsigned long start, int len, int flags, |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 866dcc7eeb0c..51ee96545579 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -779,7 +779,8 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) | |||
779 | return 0; | 779 | return 0; |
780 | } | 780 | } |
781 | 781 | ||
782 | int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) | 782 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, |
783 | unsigned long long val) | ||
783 | { | 784 | { |
784 | 785 | ||
785 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; | 786 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; |
diff --git a/mm/memory.c b/mm/memory.c index 7b9db658aca2..3f8fa06b963b 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -52,6 +52,9 @@ | |||
52 | #include <linux/writeback.h> | 52 | #include <linux/writeback.h> |
53 | #include <linux/memcontrol.h> | 53 | #include <linux/memcontrol.h> |
54 | #include <linux/mmu_notifier.h> | 54 | #include <linux/mmu_notifier.h> |
55 | #include <linux/kallsyms.h> | ||
56 | #include <linux/swapops.h> | ||
57 | #include <linux/elf.h> | ||
55 | 58 | ||
56 | #include <asm/pgalloc.h> | 59 | #include <asm/pgalloc.h> |
57 | #include <asm/uaccess.h> | 60 | #include <asm/uaccess.h> |
@@ -59,9 +62,6 @@ | |||
59 | #include <asm/tlbflush.h> | 62 | #include <asm/tlbflush.h> |
60 | #include <asm/pgtable.h> | 63 | #include <asm/pgtable.h> |
61 | 64 | ||
62 | #include <linux/swapops.h> | ||
63 | #include <linux/elf.h> | ||
64 | |||
65 | #include "internal.h" | 65 | #include "internal.h" |
66 | 66 | ||
67 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 67 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
@@ -375,15 +375,65 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) | |||
375 | * | 375 | * |
376 | * The calling function must still handle the error. | 376 | * The calling function must still handle the error. |
377 | */ | 377 | */ |
378 | static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, | 378 | static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, |
379 | unsigned long vaddr) | 379 | pte_t pte, struct page *page) |
380 | { | 380 | { |
381 | printk(KERN_ERR "Bad pte = %08llx, process = %s, " | 381 | pgd_t *pgd = pgd_offset(vma->vm_mm, addr); |
382 | "vm_flags = %lx, vaddr = %lx\n", | 382 | pud_t *pud = pud_offset(pgd, addr); |
383 | (long long)pte_val(pte), | 383 | pmd_t *pmd = pmd_offset(pud, addr); |
384 | (vma->vm_mm == current->mm ? current->comm : "???"), | 384 | struct address_space *mapping; |
385 | vma->vm_flags, vaddr); | 385 | pgoff_t index; |
386 | static unsigned long resume; | ||
387 | static unsigned long nr_shown; | ||
388 | static unsigned long nr_unshown; | ||
389 | |||
390 | /* | ||
391 | * Allow a burst of 60 reports, then keep quiet for that minute; | ||
392 | * or allow a steady drip of one report per second. | ||
393 | */ | ||
394 | if (nr_shown == 60) { | ||
395 | if (time_before(jiffies, resume)) { | ||
396 | nr_unshown++; | ||
397 | return; | ||
398 | } | ||
399 | if (nr_unshown) { | ||
400 | printk(KERN_ALERT | ||
401 | "BUG: Bad page map: %lu messages suppressed\n", | ||
402 | nr_unshown); | ||
403 | nr_unshown = 0; | ||
404 | } | ||
405 | nr_shown = 0; | ||
406 | } | ||
407 | if (nr_shown++ == 0) | ||
408 | resume = jiffies + 60 * HZ; | ||
409 | |||
410 | mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; | ||
411 | index = linear_page_index(vma, addr); | ||
412 | |||
413 | printk(KERN_ALERT | ||
414 | "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", | ||
415 | current->comm, | ||
416 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); | ||
417 | if (page) { | ||
418 | printk(KERN_ALERT | ||
419 | "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", | ||
420 | page, (void *)page->flags, page_count(page), | ||
421 | page_mapcount(page), page->mapping, page->index); | ||
422 | } | ||
423 | printk(KERN_ALERT | ||
424 | "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", | ||
425 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); | ||
426 | /* | ||
427 | * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y | ||
428 | */ | ||
429 | if (vma->vm_ops) | ||
430 | print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", | ||
431 | (unsigned long)vma->vm_ops->fault); | ||
432 | if (vma->vm_file && vma->vm_file->f_op) | ||
433 | print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", | ||
434 | (unsigned long)vma->vm_file->f_op->mmap); | ||
386 | dump_stack(); | 435 | dump_stack(); |
436 | add_taint(TAINT_BAD_PAGE); | ||
387 | } | 437 | } |
388 | 438 | ||
389 | static inline int is_cow_mapping(unsigned int flags) | 439 | static inline int is_cow_mapping(unsigned int flags) |
@@ -441,21 +491,18 @@ static inline int is_cow_mapping(unsigned int flags) | |||
441 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | 491 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, |
442 | pte_t pte) | 492 | pte_t pte) |
443 | { | 493 | { |
444 | unsigned long pfn; | 494 | unsigned long pfn = pte_pfn(pte); |
445 | 495 | ||
446 | if (HAVE_PTE_SPECIAL) { | 496 | if (HAVE_PTE_SPECIAL) { |
447 | if (likely(!pte_special(pte))) { | 497 | if (likely(!pte_special(pte))) |
448 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | 498 | goto check_pfn; |
449 | return pte_page(pte); | 499 | if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) |
450 | } | 500 | print_bad_pte(vma, addr, pte, NULL); |
451 | VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); | ||
452 | return NULL; | 501 | return NULL; |
453 | } | 502 | } |
454 | 503 | ||
455 | /* !HAVE_PTE_SPECIAL case follows: */ | 504 | /* !HAVE_PTE_SPECIAL case follows: */ |
456 | 505 | ||
457 | pfn = pte_pfn(pte); | ||
458 | |||
459 | if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { | 506 | if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { |
460 | if (vma->vm_flags & VM_MIXEDMAP) { | 507 | if (vma->vm_flags & VM_MIXEDMAP) { |
461 | if (!pfn_valid(pfn)) | 508 | if (!pfn_valid(pfn)) |
@@ -471,11 +518,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | |||
471 | } | 518 | } |
472 | } | 519 | } |
473 | 520 | ||
474 | VM_BUG_ON(!pfn_valid(pfn)); | 521 | check_pfn: |
522 | if (unlikely(pfn > highest_memmap_pfn)) { | ||
523 | print_bad_pte(vma, addr, pte, NULL); | ||
524 | return NULL; | ||
525 | } | ||
475 | 526 | ||
476 | /* | 527 | /* |
477 | * NOTE! We still have PageReserved() pages in the page tables. | 528 | * NOTE! We still have PageReserved() pages in the page tables. |
478 | * | ||
479 | * eg. VDSO mappings can cause them to exist. | 529 | * eg. VDSO mappings can cause them to exist. |
480 | */ | 530 | */ |
481 | out: | 531 | out: |
@@ -767,11 +817,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
767 | else { | 817 | else { |
768 | if (pte_dirty(ptent)) | 818 | if (pte_dirty(ptent)) |
769 | set_page_dirty(page); | 819 | set_page_dirty(page); |
770 | if (pte_young(ptent)) | 820 | if (pte_young(ptent) && |
771 | SetPageReferenced(page); | 821 | likely(!VM_SequentialReadHint(vma))) |
822 | mark_page_accessed(page); | ||
772 | file_rss--; | 823 | file_rss--; |
773 | } | 824 | } |
774 | page_remove_rmap(page, vma); | 825 | page_remove_rmap(page); |
826 | if (unlikely(page_mapcount(page) < 0)) | ||
827 | print_bad_pte(vma, addr, ptent, page); | ||
775 | tlb_remove_page(tlb, page); | 828 | tlb_remove_page(tlb, page); |
776 | continue; | 829 | continue; |
777 | } | 830 | } |
@@ -781,8 +834,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
781 | */ | 834 | */ |
782 | if (unlikely(details)) | 835 | if (unlikely(details)) |
783 | continue; | 836 | continue; |
784 | if (!pte_file(ptent)) | 837 | if (pte_file(ptent)) { |
785 | free_swap_and_cache(pte_to_swp_entry(ptent)); | 838 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) |
839 | print_bad_pte(vma, addr, ptent, NULL); | ||
840 | } else if | ||
841 | (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) | ||
842 | print_bad_pte(vma, addr, ptent, NULL); | ||
786 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); | 843 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); |
787 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); | 844 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); |
788 | 845 | ||
@@ -1153,6 +1210,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1153 | int write = !!(flags & GUP_FLAGS_WRITE); | 1210 | int write = !!(flags & GUP_FLAGS_WRITE); |
1154 | int force = !!(flags & GUP_FLAGS_FORCE); | 1211 | int force = !!(flags & GUP_FLAGS_FORCE); |
1155 | int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); | 1212 | int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); |
1213 | int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); | ||
1156 | 1214 | ||
1157 | if (len <= 0) | 1215 | if (len <= 0) |
1158 | return 0; | 1216 | return 0; |
@@ -1231,12 +1289,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1231 | struct page *page; | 1289 | struct page *page; |
1232 | 1290 | ||
1233 | /* | 1291 | /* |
1234 | * If tsk is ooming, cut off its access to large memory | 1292 | * If we have a pending SIGKILL, don't keep faulting |
1235 | * allocations. It has a pending SIGKILL, but it can't | 1293 | * pages and potentially allocating memory, unless |
1236 | * be processed until returning to user space. | 1294 | * current is handling munlock--e.g., on exit. In |
1295 | * that case, we are not allocating memory. Rather, | ||
1296 | * we're only unlocking already resident/mapped pages. | ||
1237 | */ | 1297 | */ |
1238 | if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) | 1298 | if (unlikely(!ignore_sigkill && |
1239 | return i ? i : -ENOMEM; | 1299 | fatal_signal_pending(current))) |
1300 | return i ? i : -ERESTARTSYS; | ||
1240 | 1301 | ||
1241 | if (write) | 1302 | if (write) |
1242 | foll_flags |= FOLL_WRITE; | 1303 | foll_flags |= FOLL_WRITE; |
@@ -1263,9 +1324,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1263 | * do_wp_page has broken COW when necessary, | 1324 | * do_wp_page has broken COW when necessary, |
1264 | * even if maybe_mkwrite decided not to set | 1325 | * even if maybe_mkwrite decided not to set |
1265 | * pte_write. We can thus safely do subsequent | 1326 | * pte_write. We can thus safely do subsequent |
1266 | * page lookups as if they were reads. | 1327 | * page lookups as if they were reads. But only |
1328 | * do so when looping for pte_write is futile: | ||
1329 | * in some cases userspace may also be wanting | ||
1330 | * to write to the gotten user page, which a | ||
1331 | * read fault here might prevent (a readonly | ||
1332 | * page might get reCOWed by userspace write). | ||
1267 | */ | 1333 | */ |
1268 | if (ret & VM_FAULT_WRITE) | 1334 | if ((ret & VM_FAULT_WRITE) && |
1335 | !(vma->vm_flags & VM_WRITE)) | ||
1269 | foll_flags &= ~FOLL_WRITE; | 1336 | foll_flags &= ~FOLL_WRITE; |
1270 | 1337 | ||
1271 | cond_resched(); | 1338 | cond_resched(); |
@@ -1644,6 +1711,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
1644 | 1711 | ||
1645 | BUG_ON(pmd_huge(*pmd)); | 1712 | BUG_ON(pmd_huge(*pmd)); |
1646 | 1713 | ||
1714 | arch_enter_lazy_mmu_mode(); | ||
1715 | |||
1647 | token = pmd_pgtable(*pmd); | 1716 | token = pmd_pgtable(*pmd); |
1648 | 1717 | ||
1649 | do { | 1718 | do { |
@@ -1652,6 +1721,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
1652 | break; | 1721 | break; |
1653 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1722 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1654 | 1723 | ||
1724 | arch_leave_lazy_mmu_mode(); | ||
1725 | |||
1655 | if (mm != &init_mm) | 1726 | if (mm != &init_mm) |
1656 | pte_unmap_unlock(pte-1, ptl); | 1727 | pte_unmap_unlock(pte-1, ptl); |
1657 | return err; | 1728 | return err; |
@@ -1837,10 +1908,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1837 | * not dirty accountable. | 1908 | * not dirty accountable. |
1838 | */ | 1909 | */ |
1839 | if (PageAnon(old_page)) { | 1910 | if (PageAnon(old_page)) { |
1840 | if (trylock_page(old_page)) { | 1911 | if (!trylock_page(old_page)) { |
1841 | reuse = can_share_swap_page(old_page); | 1912 | page_cache_get(old_page); |
1842 | unlock_page(old_page); | 1913 | pte_unmap_unlock(page_table, ptl); |
1914 | lock_page(old_page); | ||
1915 | page_table = pte_offset_map_lock(mm, pmd, address, | ||
1916 | &ptl); | ||
1917 | if (!pte_same(*page_table, orig_pte)) { | ||
1918 | unlock_page(old_page); | ||
1919 | page_cache_release(old_page); | ||
1920 | goto unlock; | ||
1921 | } | ||
1922 | page_cache_release(old_page); | ||
1843 | } | 1923 | } |
1924 | reuse = reuse_swap_page(old_page); | ||
1925 | unlock_page(old_page); | ||
1844 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 1926 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
1845 | (VM_WRITE|VM_SHARED))) { | 1927 | (VM_WRITE|VM_SHARED))) { |
1846 | /* | 1928 | /* |
@@ -1943,11 +2025,7 @@ gotten: | |||
1943 | * thread doing COW. | 2025 | * thread doing COW. |
1944 | */ | 2026 | */ |
1945 | ptep_clear_flush_notify(vma, address, page_table); | 2027 | ptep_clear_flush_notify(vma, address, page_table); |
1946 | SetPageSwapBacked(new_page); | ||
1947 | lru_cache_add_active_or_unevictable(new_page, vma); | ||
1948 | page_add_new_anon_rmap(new_page, vma, address); | 2028 | page_add_new_anon_rmap(new_page, vma, address); |
1949 | |||
1950 | //TODO: is this safe? do_anonymous_page() does it this way. | ||
1951 | set_pte_at(mm, address, page_table, entry); | 2029 | set_pte_at(mm, address, page_table, entry); |
1952 | update_mmu_cache(vma, address, entry); | 2030 | update_mmu_cache(vma, address, entry); |
1953 | if (old_page) { | 2031 | if (old_page) { |
@@ -1973,7 +2051,7 @@ gotten: | |||
1973 | * mapcount is visible. So transitively, TLBs to | 2051 | * mapcount is visible. So transitively, TLBs to |
1974 | * old page will be flushed before it can be reused. | 2052 | * old page will be flushed before it can be reused. |
1975 | */ | 2053 | */ |
1976 | page_remove_rmap(old_page, vma); | 2054 | page_remove_rmap(old_page); |
1977 | } | 2055 | } |
1978 | 2056 | ||
1979 | /* Free the old page.. */ | 2057 | /* Free the old page.. */ |
@@ -2374,7 +2452,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2374 | 2452 | ||
2375 | inc_mm_counter(mm, anon_rss); | 2453 | inc_mm_counter(mm, anon_rss); |
2376 | pte = mk_pte(page, vma->vm_page_prot); | 2454 | pte = mk_pte(page, vma->vm_page_prot); |
2377 | if (write_access && can_share_swap_page(page)) { | 2455 | if (write_access && reuse_swap_page(page)) { |
2378 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2456 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
2379 | write_access = 0; | 2457 | write_access = 0; |
2380 | } | 2458 | } |
@@ -2385,7 +2463,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2385 | 2463 | ||
2386 | swap_free(entry); | 2464 | swap_free(entry); |
2387 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) | 2465 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) |
2388 | remove_exclusive_swap_page(page); | 2466 | try_to_free_swap(page); |
2389 | unlock_page(page); | 2467 | unlock_page(page); |
2390 | 2468 | ||
2391 | if (write_access) { | 2469 | if (write_access) { |
@@ -2442,8 +2520,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2442 | if (!pte_none(*page_table)) | 2520 | if (!pte_none(*page_table)) |
2443 | goto release; | 2521 | goto release; |
2444 | inc_mm_counter(mm, anon_rss); | 2522 | inc_mm_counter(mm, anon_rss); |
2445 | SetPageSwapBacked(page); | ||
2446 | lru_cache_add_active_or_unevictable(page, vma); | ||
2447 | page_add_new_anon_rmap(page, vma, address); | 2523 | page_add_new_anon_rmap(page, vma, address); |
2448 | set_pte_at(mm, address, page_table, entry); | 2524 | set_pte_at(mm, address, page_table, entry); |
2449 | 2525 | ||
@@ -2591,8 +2667,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2591 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2667 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2592 | if (anon) { | 2668 | if (anon) { |
2593 | inc_mm_counter(mm, anon_rss); | 2669 | inc_mm_counter(mm, anon_rss); |
2594 | SetPageSwapBacked(page); | ||
2595 | lru_cache_add_active_or_unevictable(page, vma); | ||
2596 | page_add_new_anon_rmap(page, vma, address); | 2670 | page_add_new_anon_rmap(page, vma, address); |
2597 | } else { | 2671 | } else { |
2598 | inc_mm_counter(mm, file_rss); | 2672 | inc_mm_counter(mm, file_rss); |
@@ -2602,7 +2676,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2602 | get_page(dirty_page); | 2676 | get_page(dirty_page); |
2603 | } | 2677 | } |
2604 | } | 2678 | } |
2605 | //TODO: is this safe? do_anonymous_page() does it this way. | ||
2606 | set_pte_at(mm, address, page_table, entry); | 2679 | set_pte_at(mm, address, page_table, entry); |
2607 | 2680 | ||
2608 | /* no need to invalidate: a not-present page won't be cached */ | 2681 | /* no need to invalidate: a not-present page won't be cached */ |
@@ -2666,12 +2739,11 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2666 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | 2739 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) |
2667 | return 0; | 2740 | return 0; |
2668 | 2741 | ||
2669 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || | 2742 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { |
2670 | !(vma->vm_flags & VM_CAN_NONLINEAR))) { | ||
2671 | /* | 2743 | /* |
2672 | * Page table corrupted: show pte and kill process. | 2744 | * Page table corrupted: show pte and kill process. |
2673 | */ | 2745 | */ |
2674 | print_bad_pte(vma, orig_pte, address); | 2746 | print_bad_pte(vma, address, orig_pte, NULL); |
2675 | return VM_FAULT_OOM; | 2747 | return VM_FAULT_OOM; |
2676 | } | 2748 | } |
2677 | 2749 | ||
@@ -2953,7 +3025,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, | |||
2953 | { | 3025 | { |
2954 | resource_size_t phys_addr; | 3026 | resource_size_t phys_addr; |
2955 | unsigned long prot = 0; | 3027 | unsigned long prot = 0; |
2956 | void *maddr; | 3028 | void __iomem *maddr; |
2957 | int offset = addr & (PAGE_SIZE-1); | 3029 | int offset = addr & (PAGE_SIZE-1); |
2958 | 3030 | ||
2959 | if (follow_phys(vma, addr, write, &prot, &phys_addr)) | 3031 | if (follow_phys(vma, addr, write, &prot, &phys_addr)) |
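The new print_bad_pte() above rate-limits its reports: a burst of up to 60 messages, then silence for the rest of that minute, plus a count of what was suppressed. The same pattern is easy to reuse outside the kernel; here is a sketch using time(2) in place of jiffies/time_before(), with the printf standing in for the kernel's KERN_ALERT output:

#include <stdio.h>
#include <time.h>

/* Returns 1 if a report may be printed now, 0 if it should be dropped. */
static int report_allowed(void)
{
	static time_t resume;
	static unsigned long nr_shown, nr_unshown;
	time_t now = time(NULL);

	if (nr_shown == 60) {
		if (now < resume) {
			nr_unshown++;
			return 0;
		}
		if (nr_unshown) {
			printf("%lu messages suppressed\n", nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = now + 60;	/* quiet period ends a minute from now */
	return 1;
}

int main(void)
{
	int i, printed = 0;

	for (i = 0; i < 1000; i++)
		if (report_allowed())
			printed++;
	printf("printed %d of 1000 reports\n", printed);
	return 0;
}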
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b17371185468..c083cf5fd6df 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -216,7 +216,8 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
216 | return 0; | 216 | return 0; |
217 | } | 217 | } |
218 | 218 | ||
219 | static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn) | 219 | static int __meminit __add_section(int nid, struct zone *zone, |
220 | unsigned long phys_start_pfn) | ||
220 | { | 221 | { |
221 | int nr_pages = PAGES_PER_SECTION; | 222 | int nr_pages = PAGES_PER_SECTION; |
222 | int ret; | 223 | int ret; |
@@ -234,7 +235,7 @@ static int __meminit __add_section(struct zone *zone, unsigned long phys_start_p | |||
234 | if (ret < 0) | 235 | if (ret < 0) |
235 | return ret; | 236 | return ret; |
236 | 237 | ||
237 | return register_new_memory(__pfn_to_section(phys_start_pfn)); | 238 | return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); |
238 | } | 239 | } |
239 | 240 | ||
240 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | 241 | #ifdef CONFIG_SPARSEMEM_VMEMMAP |
@@ -273,8 +274,8 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) | |||
273 | * call this function after deciding the zone to which to | 274 | * call this function after deciding the zone to which to |
274 | * add the new pages. | 275 | * add the new pages. |
275 | */ | 276 | */ |
276 | int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, | 277 | int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, |
277 | unsigned long nr_pages) | 278 | unsigned long nr_pages) |
278 | { | 279 | { |
279 | unsigned long i; | 280 | unsigned long i; |
280 | int err = 0; | 281 | int err = 0; |
@@ -284,7 +285,7 @@ int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
284 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); | 285 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); |
285 | 286 | ||
286 | for (i = start_sec; i <= end_sec; i++) { | 287 | for (i = start_sec; i <= end_sec; i++) { |
287 | err = __add_section(zone, i << PFN_SECTION_SHIFT); | 288 | err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); |
288 | 289 | ||
289 | /* | 290 | /* |
290 | * EEXIST is finally dealt with by ioresource collision | 291 | * EEXIST is finally dealt with by ioresource collision |
@@ -626,15 +627,12 @@ int scan_lru_pages(unsigned long start, unsigned long end) | |||
626 | } | 627 | } |
627 | 628 | ||
628 | static struct page * | 629 | static struct page * |
629 | hotremove_migrate_alloc(struct page *page, | 630 | hotremove_migrate_alloc(struct page *page, unsigned long private, int **x) |
630 | unsigned long private, | ||
631 | int **x) | ||
632 | { | 631 | { |
633 | /* This should be improoooooved!! */ | 632 | /* This should be improooooved!! */ |
634 | return alloc_page(GFP_HIGHUSER_PAGECACHE); | 633 | return alloc_page(GFP_HIGHUSER_MOVABLE); |
635 | } | 634 | } |
636 | 635 | ||
637 | |||
638 | #define NR_OFFLINE_AT_ONCE_PAGES (256) | 636 | #define NR_OFFLINE_AT_ONCE_PAGES (256) |
639 | static int | 637 | static int |
640 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | 638 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) |
diff --git a/mm/migrate.c b/mm/migrate.c index 21631ab8c08b..55373983c9c6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -300,12 +300,10 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
300 | * Now we know that no one else is looking at the page. | 300 | * Now we know that no one else is looking at the page. |
301 | */ | 301 | */ |
302 | get_page(newpage); /* add cache reference */ | 302 | get_page(newpage); /* add cache reference */ |
303 | #ifdef CONFIG_SWAP | ||
304 | if (PageSwapCache(page)) { | 303 | if (PageSwapCache(page)) { |
305 | SetPageSwapCache(newpage); | 304 | SetPageSwapCache(newpage); |
306 | set_page_private(newpage, page_private(page)); | 305 | set_page_private(newpage, page_private(page)); |
307 | } | 306 | } |
308 | #endif | ||
309 | 307 | ||
310 | radix_tree_replace_slot(pslot, newpage); | 308 | radix_tree_replace_slot(pslot, newpage); |
311 | 309 | ||
@@ -373,9 +371,7 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
373 | 371 | ||
374 | mlock_migrate_page(newpage, page); | 372 | mlock_migrate_page(newpage, page); |
375 | 373 | ||
376 | #ifdef CONFIG_SWAP | ||
377 | ClearPageSwapCache(page); | 374 | ClearPageSwapCache(page); |
378 | #endif | ||
379 | ClearPagePrivate(page); | 375 | ClearPagePrivate(page); |
380 | set_page_private(page, 0); | 376 | set_page_private(page, 0); |
381 | /* page->mapping contains a flag for PageAnon() */ | 377 | /* page->mapping contains a flag for PageAnon() */ |
@@ -848,12 +844,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
848 | struct vm_area_struct *vma; | 844 | struct vm_area_struct *vma; |
849 | struct page *page; | 845 | struct page *page; |
850 | 846 | ||
851 | /* | ||
852 | * A valid page pointer that will not match any of the | ||
853 | * pages that will be moved. | ||
854 | */ | ||
855 | pp->page = ZERO_PAGE(0); | ||
856 | |||
857 | err = -EFAULT; | 847 | err = -EFAULT; |
858 | vma = find_vma(mm, pp->addr); | 848 | vma = find_vma(mm, pp->addr); |
859 | if (!vma || !vma_migratable(vma)) | 849 | if (!vma || !vma_migratable(vma)) |
@@ -919,41 +909,43 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, | |||
919 | const int __user *nodes, | 909 | const int __user *nodes, |
920 | int __user *status, int flags) | 910 | int __user *status, int flags) |
921 | { | 911 | { |
922 | struct page_to_node *pm = NULL; | 912 | struct page_to_node *pm; |
923 | nodemask_t task_nodes; | 913 | nodemask_t task_nodes; |
924 | int err = 0; | 914 | unsigned long chunk_nr_pages; |
925 | int i; | 915 | unsigned long chunk_start; |
916 | int err; | ||
926 | 917 | ||
927 | task_nodes = cpuset_mems_allowed(task); | 918 | task_nodes = cpuset_mems_allowed(task); |
928 | 919 | ||
929 | /* Limit nr_pages so that the multiplication may not overflow */ | 920 | err = -ENOMEM; |
930 | if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { | 921 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); |
931 | err = -E2BIG; | 922 | if (!pm) |
932 | goto out; | ||
933 | } | ||
934 | |||
935 | pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); | ||
936 | if (!pm) { | ||
937 | err = -ENOMEM; | ||
938 | goto out; | 923 | goto out; |
939 | } | ||
940 | |||
941 | /* | 924 | /* |
942 | * Get parameters from user space and initialize the pm | 925 | * Store a chunk of page_to_node array in a page, |
943 | * array. Return various errors if the user did something wrong. | 926 | * but keep the last one as a marker |
944 | */ | 927 | */ |
945 | for (i = 0; i < nr_pages; i++) { | 928 | chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1; |
946 | const void __user *p; | ||
947 | 929 | ||
948 | err = -EFAULT; | 930 | for (chunk_start = 0; |
949 | if (get_user(p, pages + i)) | 931 | chunk_start < nr_pages; |
950 | goto out_pm; | 932 | chunk_start += chunk_nr_pages) { |
933 | int j; | ||
951 | 934 | ||
952 | pm[i].addr = (unsigned long)p; | 935 | if (chunk_start + chunk_nr_pages > nr_pages) |
953 | if (nodes) { | 936 | chunk_nr_pages = nr_pages - chunk_start; |
937 | |||
938 | /* fill the chunk pm with addrs and nodes from user-space */ | ||
939 | for (j = 0; j < chunk_nr_pages; j++) { | ||
940 | const void __user *p; | ||
954 | int node; | 941 | int node; |
955 | 942 | ||
956 | if (get_user(node, nodes + i)) | 943 | err = -EFAULT; |
944 | if (get_user(p, pages + j + chunk_start)) | ||
945 | goto out_pm; | ||
946 | pm[j].addr = (unsigned long) p; | ||
947 | |||
948 | if (get_user(node, nodes + j + chunk_start)) | ||
957 | goto out_pm; | 949 | goto out_pm; |
958 | 950 | ||
959 | err = -ENODEV; | 951 | err = -ENODEV; |
@@ -964,22 +956,29 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, | |||
964 | if (!node_isset(node, task_nodes)) | 956 | if (!node_isset(node, task_nodes)) |
965 | goto out_pm; | 957 | goto out_pm; |
966 | 958 | ||
967 | pm[i].node = node; | 959 | pm[j].node = node; |
968 | } else | 960 | } |
969 | pm[i].node = 0; /* anything to not match MAX_NUMNODES */ | 961 | |
970 | } | 962 | /* End marker for this chunk */ |
971 | /* End marker */ | 963 | pm[chunk_nr_pages].node = MAX_NUMNODES; |
972 | pm[nr_pages].node = MAX_NUMNODES; | 964 | |
965 | /* Migrate this chunk */ | ||
966 | err = do_move_page_to_node_array(mm, pm, | ||
967 | flags & MPOL_MF_MOVE_ALL); | ||
968 | if (err < 0) | ||
969 | goto out_pm; | ||
973 | 970 | ||
974 | err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL); | ||
975 | if (err >= 0) | ||
976 | /* Return status information */ | 971 | /* Return status information */ |
977 | for (i = 0; i < nr_pages; i++) | 972 | for (j = 0; j < chunk_nr_pages; j++) |
978 | if (put_user(pm[i].status, status + i)) | 973 | if (put_user(pm[j].status, status + j + chunk_start)) { |
979 | err = -EFAULT; | 974 | err = -EFAULT; |
975 | goto out_pm; | ||
976 | } | ||
977 | } | ||
978 | err = 0; | ||
980 | 979 | ||
981 | out_pm: | 980 | out_pm: |
982 | vfree(pm); | 981 | free_page((unsigned long)pm); |
983 | out: | 982 | out: |
984 | return err; | 983 | return err; |
985 | } | 984 | } |
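do_pages_move() above stops allocating an array proportional to nr_pages and instead walks the request through a single page-sized buffer, keeping the last slot free as an end marker. A userspace sketch of that chunking pattern, assuming a 4096-byte buffer; process_all and handle_chunk are illustrative names, not kernel interfaces:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUF_SIZE 4096UL		/* stand-in for PAGE_SIZE */

struct item {
	unsigned long addr;
	int node;
	int status;
};

static void handle_chunk(struct item *chunk, unsigned long n)
{
	printf("handling %lu entries, end marker node=%d\n",
	       n, chunk[n].node);
}

/* Feed a large array through one fixed-size scratch buffer, reserving the
 * last slot for an end marker, as the reworked do_pages_move() does. */
static int process_all(const struct item *src, unsigned long nr)
{
	unsigned long chunk_nr = BUF_SIZE / sizeof(struct item) - 1;
	struct item *pm = malloc(BUF_SIZE);
	unsigned long start;

	if (!pm)
		return -1;
	for (start = 0; start < nr; start += chunk_nr) {
		unsigned long n = chunk_nr;

		if (start + n > nr)
			n = nr - start;
		memcpy(pm, src + start, n * sizeof(*pm));
		pm[n].node = -1;	/* end marker for this chunk */
		handle_chunk(pm, n);
	}
	free(pm);
	return 0;
}

int main(void)
{
	struct item items[1000] = { { 0 } };

	return process_all(items, 1000);
}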
diff --git a/mm/mlock.c b/mm/mlock.c index 3035a56e7616..e125156c664e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -173,12 +173,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, | |||
173 | (atomic_read(&mm->mm_users) != 0)); | 173 | (atomic_read(&mm->mm_users) != 0)); |
174 | 174 | ||
175 | /* | 175 | /* |
176 | * mlock: don't page populate if page has PROT_NONE permission. | 176 | * mlock: don't page populate if vma has PROT_NONE permission. |
177 | * munlock: the pages always do munlock althrough | 177 | * munlock: always do munlock although the vma has PROT_NONE |
178 | * its has PROT_NONE permission. | 178 | * permission, or SIGKILL is pending. |
179 | */ | 179 | */ |
180 | if (!mlock) | 180 | if (!mlock) |
181 | gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS; | 181 | gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS | |
182 | GUP_FLAGS_IGNORE_SIGKILL; | ||
182 | 183 | ||
183 | if (vma->vm_flags & VM_WRITE) | 184 | if (vma->vm_flags & VM_WRITE) |
184 | gup_flags |= GUP_FLAGS_WRITE; | 185 | gup_flags |= GUP_FLAGS_WRITE; |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -413,7 +413,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, | |||

413 | 413 | ||
414 | static void __vma_link_file(struct vm_area_struct *vma) | 414 | static void __vma_link_file(struct vm_area_struct *vma) |
415 | { | 415 | { |
416 | struct file * file; | 416 | struct file *file; |
417 | 417 | ||
418 | file = vma->vm_file; | 418 | file = vma->vm_file; |
419 | if (file) { | 419 | if (file) { |
@@ -474,11 +474,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
474 | * insert vm structure into list and rbtree and anon_vma, | 474 | * insert vm structure into list and rbtree and anon_vma, |
475 | * but it has already been inserted into prio_tree earlier. | 475 | * but it has already been inserted into prio_tree earlier. |
476 | */ | 476 | */ |
477 | static void | 477 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
478 | __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | ||
479 | { | 478 | { |
480 | struct vm_area_struct * __vma, * prev; | 479 | struct vm_area_struct *__vma, *prev; |
481 | struct rb_node ** rb_link, * rb_parent; | 480 | struct rb_node **rb_link, *rb_parent; |
482 | 481 | ||
483 | __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); | 482 | __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); |
484 | BUG_ON(__vma && __vma->vm_start < vma->vm_end); | 483 | BUG_ON(__vma && __vma->vm_start < vma->vm_end); |
@@ -908,7 +907,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, | |||
908 | * The caller must hold down_write(current->mm->mmap_sem). | 907 | * The caller must hold down_write(current->mm->mmap_sem). |
909 | */ | 908 | */ |
910 | 909 | ||
911 | unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | 910 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, |
912 | unsigned long len, unsigned long prot, | 911 | unsigned long len, unsigned long prot, |
913 | unsigned long flags, unsigned long pgoff) | 912 | unsigned long flags, unsigned long pgoff) |
914 | { | 913 | { |
@@ -1464,7 +1463,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | |||
1464 | EXPORT_SYMBOL(get_unmapped_area); | 1463 | EXPORT_SYMBOL(get_unmapped_area); |
1465 | 1464 | ||
1466 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ | 1465 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ |
1467 | struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) | 1466 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) |
1468 | { | 1467 | { |
1469 | struct vm_area_struct *vma = NULL; | 1468 | struct vm_area_struct *vma = NULL; |
1470 | 1469 | ||
@@ -1507,7 +1506,7 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, | |||
1507 | struct vm_area_struct **pprev) | 1506 | struct vm_area_struct **pprev) |
1508 | { | 1507 | { |
1509 | struct vm_area_struct *vma = NULL, *prev = NULL; | 1508 | struct vm_area_struct *vma = NULL, *prev = NULL; |
1510 | struct rb_node * rb_node; | 1509 | struct rb_node *rb_node; |
1511 | if (!mm) | 1510 | if (!mm) |
1512 | goto out; | 1511 | goto out; |
1513 | 1512 | ||
@@ -1541,7 +1540,7 @@ out: | |||
1541 | * update accounting. This is shared with both the | 1540 | * update accounting. This is shared with both the |
1542 | * grow-up and grow-down cases. | 1541 | * grow-up and grow-down cases. |
1543 | */ | 1542 | */ |
1544 | static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow) | 1543 | static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) |
1545 | { | 1544 | { |
1546 | struct mm_struct *mm = vma->vm_mm; | 1545 | struct mm_struct *mm = vma->vm_mm; |
1547 | struct rlimit *rlim = current->signal->rlim; | 1546 | struct rlimit *rlim = current->signal->rlim; |
@@ -2091,6 +2090,9 @@ void exit_mmap(struct mm_struct *mm) | |||
2091 | arch_exit_mmap(mm); | 2090 | arch_exit_mmap(mm); |
2092 | mmu_notifier_release(mm); | 2091 | mmu_notifier_release(mm); |
2093 | 2092 | ||
2093 | if (!mm->mmap) /* Can happen if dup_mmap() received an OOM */ | ||
2094 | return; | ||
2095 | |||
2094 | if (mm->locked_vm) { | 2096 | if (mm->locked_vm) { |
2095 | vma = mm->mmap; | 2097 | vma = mm->mmap; |
2096 | while (vma) { | 2098 | while (vma) { |
@@ -2103,7 +2105,7 @@ void exit_mmap(struct mm_struct *mm) | |||
2103 | lru_add_drain(); | 2105 | lru_add_drain(); |
2104 | flush_cache_mm(mm); | 2106 | flush_cache_mm(mm); |
2105 | tlb = tlb_gather_mmu(mm, 1); | 2107 | tlb = tlb_gather_mmu(mm, 1); |
2106 | /* Don't update_hiwater_rss(mm) here, do_exit already did */ | 2108 | /* update_hiwater_rss(mm) here? but nobody should be looking */ |
2107 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ | 2109 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
2108 | end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); | 2110 | end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); |
2109 | vm_unacct_memory(nr_accounted); | 2111 | vm_unacct_memory(nr_accounted); |
diff --git a/mm/mprotect.c b/mm/mprotect.c index cfb4c4852062..d0f6e7ce09f1 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/swap.h> | 22 | #include <linux/swap.h> |
23 | #include <linux/swapops.h> | 23 | #include <linux/swapops.h> |
24 | #include <linux/mmu_notifier.h> | 24 | #include <linux/mmu_notifier.h> |
25 | #include <linux/migrate.h> | ||
25 | #include <asm/uaccess.h> | 26 | #include <asm/uaccess.h> |
26 | #include <asm/pgtable.h> | 27 | #include <asm/pgtable.h> |
27 | #include <asm/cacheflush.h> | 28 | #include <asm/cacheflush.h> |
@@ -59,8 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
59 | ptent = pte_mkwrite(ptent); | 60 | ptent = pte_mkwrite(ptent); |
60 | 61 | ||
61 | ptep_modify_prot_commit(mm, addr, pte, ptent); | 62 | ptep_modify_prot_commit(mm, addr, pte, ptent); |
62 | #ifdef CONFIG_MIGRATION | 63 | } else if (PAGE_MIGRATION && !pte_file(oldpte)) { |
63 | } else if (!pte_file(oldpte)) { | ||
64 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 64 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
65 | 65 | ||
66 | if (is_write_migration_entry(entry)) { | 66 | if (is_write_migration_entry(entry)) { |
@@ -72,9 +72,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
72 | set_pte_at(mm, addr, pte, | 72 | set_pte_at(mm, addr, pte, |
73 | swp_entry_to_pte(entry)); | 73 | swp_entry_to_pte(entry)); |
74 | } | 74 | } |
75 | #endif | ||
76 | } | 75 | } |
77 | |||
78 | } while (pte++, addr += PAGE_SIZE, addr != end); | 76 | } while (pte++, addr += PAGE_SIZE, addr != end); |
79 | arch_leave_lazy_mmu_mode(); | 77 | arch_leave_lazy_mmu_mode(); |
80 | pte_unmap_unlock(pte - 1, ptl); | 78 | pte_unmap_unlock(pte - 1, ptl); |
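The mprotect.c hunk above drops an #ifdef CONFIG_MIGRATION block in favour of testing the compile-time constant PAGE_MIGRATION inside an ordinary if. When the constant is 0 the compiler discards the branch, but unlike #ifdef the code is still parsed and type-checked in every configuration. A compact illustration; FEATURE_MIGRATION and fix_up_entry are stand-ins, not kernel symbols:

#include <stdio.h>

#define FEATURE_MIGRATION 0	/* would come from the build configuration */

static void fix_up_entry(int entry)
{
	printf("fixing up migration entry %d\n", entry);
}

int main(void)
{
	int entry = 42;

	/* Dead-code eliminated when FEATURE_MIGRATION is 0, yet still
	 * compiled, so breakage is caught in every configuration. */
	if (FEATURE_MIGRATION && entry != 0)
		fix_up_entry(entry);
	return 0;
}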
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 558f9afe6e4e..6b9e758c98a5 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -31,7 +31,7 @@ | |||
31 | int sysctl_panic_on_oom; | 31 | int sysctl_panic_on_oom; |
32 | int sysctl_oom_kill_allocating_task; | 32 | int sysctl_oom_kill_allocating_task; |
33 | int sysctl_oom_dump_tasks; | 33 | int sysctl_oom_dump_tasks; |
34 | static DEFINE_SPINLOCK(zone_scan_mutex); | 34 | static DEFINE_SPINLOCK(zone_scan_lock); |
35 | /* #define DEBUG */ | 35 | /* #define DEBUG */ |
36 | 36 | ||
37 | /** | 37 | /** |
@@ -392,6 +392,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
392 | printk(KERN_WARNING "%s invoked oom-killer: " | 392 | printk(KERN_WARNING "%s invoked oom-killer: " |
393 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | 393 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", |
394 | current->comm, gfp_mask, order, current->oomkilladj); | 394 | current->comm, gfp_mask, order, current->oomkilladj); |
395 | task_lock(current); | ||
396 | cpuset_print_task_mems_allowed(current); | ||
397 | task_unlock(current); | ||
395 | dump_stack(); | 398 | dump_stack(); |
396 | show_mem(); | 399 | show_mem(); |
397 | if (sysctl_oom_dump_tasks) | 400 | if (sysctl_oom_dump_tasks) |
@@ -470,7 +473,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
470 | struct zone *zone; | 473 | struct zone *zone; |
471 | int ret = 1; | 474 | int ret = 1; |
472 | 475 | ||
473 | spin_lock(&zone_scan_mutex); | 476 | spin_lock(&zone_scan_lock); |
474 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { | 477 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { |
475 | if (zone_is_oom_locked(zone)) { | 478 | if (zone_is_oom_locked(zone)) { |
476 | ret = 0; | 479 | ret = 0; |
@@ -480,7 +483,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
480 | 483 | ||
481 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { | 484 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { |
482 | /* | 485 | /* |
483 | * Lock each zone in the zonelist under zone_scan_mutex so a | 486 | * Lock each zone in the zonelist under zone_scan_lock so a |
484 | * parallel invocation of try_set_zone_oom() doesn't succeed | 487 | * parallel invocation of try_set_zone_oom() doesn't succeed |
485 | * when it shouldn't. | 488 | * when it shouldn't. |
486 | */ | 489 | */ |
@@ -488,7 +491,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
488 | } | 491 | } |
489 | 492 | ||
490 | out: | 493 | out: |
491 | spin_unlock(&zone_scan_mutex); | 494 | spin_unlock(&zone_scan_lock); |
492 | return ret; | 495 | return ret; |
493 | } | 496 | } |
494 | 497 | ||
@@ -502,11 +505,74 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
502 | struct zoneref *z; | 505 | struct zoneref *z; |
503 | struct zone *zone; | 506 | struct zone *zone; |
504 | 507 | ||
505 | spin_lock(&zone_scan_mutex); | 508 | spin_lock(&zone_scan_lock); |
506 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { | 509 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { |
507 | zone_clear_flag(zone, ZONE_OOM_LOCKED); | 510 | zone_clear_flag(zone, ZONE_OOM_LOCKED); |
508 | } | 511 | } |
509 | spin_unlock(&zone_scan_mutex); | 512 | spin_unlock(&zone_scan_lock); |
513 | } | ||
514 | |||
515 | /* | ||
516 | * Must be called with tasklist_lock held for read. | ||
517 | */ | ||
518 | static void __out_of_memory(gfp_t gfp_mask, int order) | ||
519 | { | ||
520 | if (sysctl_oom_kill_allocating_task) { | ||
521 | oom_kill_process(current, gfp_mask, order, 0, NULL, | ||
522 | "Out of memory (oom_kill_allocating_task)"); | ||
523 | |||
524 | } else { | ||
525 | unsigned long points; | ||
526 | struct task_struct *p; | ||
527 | |||
528 | retry: | ||
529 | /* | ||
530 | * Rambo mode: Shoot down a process and hope it solves whatever | ||
531 | * issues we may have. | ||
532 | */ | ||
533 | p = select_bad_process(&points, NULL); | ||
534 | |||
535 | if (PTR_ERR(p) == -1UL) | ||
536 | return; | ||
537 | |||
538 | /* Found nothing?!?! Either we hang forever, or we panic. */ | ||
539 | if (!p) { | ||
540 | read_unlock(&tasklist_lock); | ||
541 | panic("Out of memory and no killable processes...\n"); | ||
542 | } | ||
543 | |||
544 | if (oom_kill_process(p, gfp_mask, order, points, NULL, | ||
545 | "Out of memory")) | ||
546 | goto retry; | ||
547 | } | ||
548 | } | ||
549 | |||
550 | /* | ||
551 | * pagefault handler calls into here because it is out of memory but | ||
552 | * doesn't know exactly how or why. | ||
553 | */ | ||
554 | void pagefault_out_of_memory(void) | ||
555 | { | ||
556 | unsigned long freed = 0; | ||
557 | |||
558 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); | ||
559 | if (freed > 0) | ||
560 | /* Got some memory back in the last second. */ | ||
561 | return; | ||
562 | |||
563 | if (sysctl_panic_on_oom) | ||
564 | panic("out of memory from page fault. panic_on_oom is selected.\n"); | ||
565 | |||
566 | read_lock(&tasklist_lock); | ||
567 | __out_of_memory(0, 0); /* unknown gfp_mask and order */ | ||
568 | read_unlock(&tasklist_lock); | ||
569 | |||
570 | /* | ||
571 | * Give "p" a good chance of killing itself before we | ||
572 | * retry to allocate memory. | ||
573 | */ | ||
574 | if (!test_thread_flag(TIF_MEMDIE)) | ||
575 | schedule_timeout_uninterruptible(1); | ||
510 | } | 576 | } |
511 | 577 | ||
512 | /** | 578 | /** |
@@ -522,8 +588,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
522 | */ | 588 | */ |
523 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | 589 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) |
524 | { | 590 | { |
525 | struct task_struct *p; | ||
526 | unsigned long points = 0; | ||
527 | unsigned long freed = 0; | 591 | unsigned long freed = 0; |
528 | enum oom_constraint constraint; | 592 | enum oom_constraint constraint; |
529 | 593 | ||
@@ -544,7 +608,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
544 | 608 | ||
545 | switch (constraint) { | 609 | switch (constraint) { |
546 | case CONSTRAINT_MEMORY_POLICY: | 610 | case CONSTRAINT_MEMORY_POLICY: |
547 | oom_kill_process(current, gfp_mask, order, points, NULL, | 611 | oom_kill_process(current, gfp_mask, order, 0, NULL, |
548 | "No available memory (MPOL_BIND)"); | 612 | "No available memory (MPOL_BIND)"); |
549 | break; | 613 | break; |
550 | 614 | ||
@@ -553,35 +617,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
553 | panic("out of memory. panic_on_oom is selected\n"); | 617 | panic("out of memory. panic_on_oom is selected\n"); |
554 | /* Fall-through */ | 618 | /* Fall-through */ |
555 | case CONSTRAINT_CPUSET: | 619 | case CONSTRAINT_CPUSET: |
556 | if (sysctl_oom_kill_allocating_task) { | 620 | __out_of_memory(gfp_mask, order); |
557 | oom_kill_process(current, gfp_mask, order, points, NULL, | ||
558 | "Out of memory (oom_kill_allocating_task)"); | ||
559 | break; | ||
560 | } | ||
561 | retry: | ||
562 | /* | ||
563 | * Rambo mode: Shoot down a process and hope it solves whatever | ||
564 | * issues we may have. | ||
565 | */ | ||
566 | p = select_bad_process(&points, NULL); | ||
567 | |||
568 | if (PTR_ERR(p) == -1UL) | ||
569 | goto out; | ||
570 | |||
571 | /* Found nothing?!?! Either we hang forever, or we panic. */ | ||
572 | if (!p) { | ||
573 | read_unlock(&tasklist_lock); | ||
574 | panic("Out of memory and no killable processes...\n"); | ||
575 | } | ||
576 | |||
577 | if (oom_kill_process(p, gfp_mask, order, points, NULL, | ||
578 | "Out of memory")) | ||
579 | goto retry; | ||
580 | |||
581 | break; | 621 | break; |
582 | } | 622 | } |
583 | 623 | ||
584 | out: | ||
585 | read_unlock(&tasklist_lock); | 624 | read_unlock(&tasklist_lock); |
586 | 625 | ||
587 | /* | 626 | /* |
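The new __out_of_memory() helper above centralises the select-and-retry loop that out_of_memory() and pagefault_out_of_memory() now share. Below is a minimal userspace sketch of that pattern; the task table, badness scores and the failing kill are invented for illustration and stand in for select_bad_process() and oom_kill_process() -- this is not kernel code.

/*
 * Pick the "worst" candidate, try to kill it, and retry with the next
 * candidate if that attempt fails.
 */
#include <stdio.h>
#include <stdbool.h>

struct fake_task {
	int pid;
	unsigned long badness;	/* analogous to select_bad_process() points */
	bool unkillable;	/* stands in for oom_kill_process() failing */
	bool killed;
};

static struct fake_task tasks[] = {
	{ .pid = 101, .badness = 5000, .unkillable = true },
	{ .pid = 202, .badness = 1200 },
	{ .pid = 303, .badness = 300 },
};

static struct fake_task *select_bad_task(void)
{
	struct fake_task *victim = NULL;
	unsigned int i;

	for (i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++) {
		if (tasks[i].killed)
			continue;
		if (!victim || tasks[i].badness > victim->badness)
			victim = &tasks[i];
	}
	return victim;		/* NULL means "no killable processes" */
}

int main(void)
{
	struct fake_task *p;

retry:
	p = select_bad_task();
	if (!p) {
		fprintf(stderr, "no killable processes left\n");
		return 1;
	}
	if (p->unkillable) {
		printf("pid %d could not be killed, trying the next candidate\n", p->pid);
		p->killed = true;	/* drop it from further selection (simplification) */
		goto retry;
	}
	p->killed = true;
	printf("killed pid %d (badness %lu)\n", p->pid, p->badness);
	return 0;
}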
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2970e35fd03f..b493db7841dc 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void) | |||
69 | int dirty_background_ratio = 5; | 69 | int dirty_background_ratio = 5; |
70 | 70 | ||
71 | /* | 71 | /* |
72 | * dirty_background_bytes starts at 0 (disabled) so that it is a function of | ||
73 | * dirty_background_ratio * the amount of dirtyable memory | ||
74 | */ | ||
75 | unsigned long dirty_background_bytes; | ||
76 | |||
77 | /* | ||
72 | * free highmem will not be subtracted from the total free memory | 78 | * free highmem will not be subtracted from the total free memory |
73 | * for calculating free ratios if vm_highmem_is_dirtyable is true | 79 | * for calculating free ratios if vm_highmem_is_dirtyable is true |
74 | */ | 80 | */ |
@@ -80,6 +86,12 @@ int vm_highmem_is_dirtyable; | |||
80 | int vm_dirty_ratio = 10; | 86 | int vm_dirty_ratio = 10; |
81 | 87 | ||
82 | /* | 88 | /* |
89 | * vm_dirty_bytes starts at 0 (disabled) so that it is a function of | ||
90 | * vm_dirty_ratio * the amount of dirtyable memory | ||
91 | */ | ||
92 | unsigned long vm_dirty_bytes; | ||
93 | |||
94 | /* | ||
83 | * The interval between `kupdate'-style writebacks, in jiffies | 95 | * The interval between `kupdate'-style writebacks, in jiffies |
84 | */ | 96 | */ |
85 | int dirty_writeback_interval = 5 * HZ; | 97 | int dirty_writeback_interval = 5 * HZ; |
@@ -135,23 +147,75 @@ static int calc_period_shift(void) | |||
135 | { | 147 | { |
136 | unsigned long dirty_total; | 148 | unsigned long dirty_total; |
137 | 149 | ||
138 | dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; | 150 | if (vm_dirty_bytes) |
151 | dirty_total = vm_dirty_bytes / PAGE_SIZE; | ||
152 | else | ||
153 | dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / | ||
154 | 100; | ||
139 | return 2 + ilog2(dirty_total - 1); | 155 | return 2 + ilog2(dirty_total - 1); |
140 | } | 156 | } |
141 | 157 | ||
142 | /* | 158 | /* |
143 | * update the period when the dirty ratio changes. | 159 | * update the period when the dirty threshold changes. |
144 | */ | 160 | */ |
161 | static void update_completion_period(void) | ||
162 | { | ||
163 | int shift = calc_period_shift(); | ||
164 | prop_change_shift(&vm_completions, shift); | ||
165 | prop_change_shift(&vm_dirties, shift); | ||
166 | } | ||
167 | |||
168 | int dirty_background_ratio_handler(struct ctl_table *table, int write, | ||
169 | struct file *filp, void __user *buffer, size_t *lenp, | ||
170 | loff_t *ppos) | ||
171 | { | ||
172 | int ret; | ||
173 | |||
174 | ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | ||
175 | if (ret == 0 && write) | ||
176 | dirty_background_bytes = 0; | ||
177 | return ret; | ||
178 | } | ||
179 | |||
180 | int dirty_background_bytes_handler(struct ctl_table *table, int write, | ||
181 | struct file *filp, void __user *buffer, size_t *lenp, | ||
182 | loff_t *ppos) | ||
183 | { | ||
184 | int ret; | ||
185 | |||
186 | ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); | ||
187 | if (ret == 0 && write) | ||
188 | dirty_background_ratio = 0; | ||
189 | return ret; | ||
190 | } | ||
191 | |||
145 | int dirty_ratio_handler(struct ctl_table *table, int write, | 192 | int dirty_ratio_handler(struct ctl_table *table, int write, |
146 | struct file *filp, void __user *buffer, size_t *lenp, | 193 | struct file *filp, void __user *buffer, size_t *lenp, |
147 | loff_t *ppos) | 194 | loff_t *ppos) |
148 | { | 195 | { |
149 | int old_ratio = vm_dirty_ratio; | 196 | int old_ratio = vm_dirty_ratio; |
150 | int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | 197 | int ret; |
198 | |||
199 | ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | ||
151 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { | 200 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { |
152 | int shift = calc_period_shift(); | 201 | update_completion_period(); |
153 | prop_change_shift(&vm_completions, shift); | 202 | vm_dirty_bytes = 0; |
154 | prop_change_shift(&vm_dirties, shift); | 203 | } |
204 | return ret; | ||
205 | } | ||
206 | |||
207 | |||
208 | int dirty_bytes_handler(struct ctl_table *table, int write, | ||
209 | struct file *filp, void __user *buffer, size_t *lenp, | ||
210 | loff_t *ppos) | ||
211 | { | ||
212 | int old_bytes = vm_dirty_bytes; | ||
213 | int ret; | ||
214 | |||
215 | ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); | ||
216 | if (ret == 0 && write && vm_dirty_bytes != old_bytes) { | ||
217 | update_completion_period(); | ||
218 | vm_dirty_ratio = 0; | ||
155 | } | 219 | } |
156 | return ret; | 220 | return ret; |
157 | } | 221 | } |
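The four sysctl handlers above enforce that only one of the ratio/bytes pair is active at a time: writing one knob clears its sibling, so whichever was written last wins. A small userspace sketch of that convention, with an invented struct and setters in place of the sysctl machinery:

#include <stdio.h>

struct dirty_knobs {
	unsigned long vm_dirty_bytes;
	int vm_dirty_ratio;
};

static void set_dirty_ratio(struct dirty_knobs *k, int ratio)
{
	k->vm_dirty_ratio = ratio;
	k->vm_dirty_bytes = 0;		/* byte-based limit is now disabled */
}

static void set_dirty_bytes(struct dirty_knobs *k, unsigned long bytes)
{
	k->vm_dirty_bytes = bytes;
	k->vm_dirty_ratio = 0;		/* ratio-based limit is now disabled */
}

int main(void)
{
	struct dirty_knobs k = { .vm_dirty_ratio = 10 };

	set_dirty_bytes(&k, 64UL << 20);
	printf("bytes=%lu ratio=%d\n", k.vm_dirty_bytes, k.vm_dirty_ratio);

	set_dirty_ratio(&k, 20);
	printf("bytes=%lu ratio=%d\n", k.vm_dirty_bytes, k.vm_dirty_ratio);
	return 0;
}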
@@ -362,26 +426,32 @@ unsigned long determine_dirtyable_memory(void) | |||
362 | } | 426 | } |
363 | 427 | ||
364 | void | 428 | void |
365 | get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, | 429 | get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, |
366 | struct backing_dev_info *bdi) | 430 | unsigned long *pbdi_dirty, struct backing_dev_info *bdi) |
367 | { | 431 | { |
368 | int background_ratio; /* Percentages */ | 432 | unsigned long background; |
369 | int dirty_ratio; | 433 | unsigned long dirty; |
370 | long background; | ||
371 | long dirty; | ||
372 | unsigned long available_memory = determine_dirtyable_memory(); | 434 | unsigned long available_memory = determine_dirtyable_memory(); |
373 | struct task_struct *tsk; | 435 | struct task_struct *tsk; |
374 | 436 | ||
375 | dirty_ratio = vm_dirty_ratio; | 437 | if (vm_dirty_bytes) |
376 | if (dirty_ratio < 5) | 438 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); |
377 | dirty_ratio = 5; | 439 | else { |
440 | int dirty_ratio; | ||
378 | 441 | ||
379 | background_ratio = dirty_background_ratio; | 442 | dirty_ratio = vm_dirty_ratio; |
380 | if (background_ratio >= dirty_ratio) | 443 | if (dirty_ratio < 5) |
381 | background_ratio = dirty_ratio / 2; | 444 | dirty_ratio = 5; |
445 | dirty = (dirty_ratio * available_memory) / 100; | ||
446 | } | ||
447 | |||
448 | if (dirty_background_bytes) | ||
449 | background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); | ||
450 | else | ||
451 | background = (dirty_background_ratio * available_memory) / 100; | ||
382 | 452 | ||
383 | background = (background_ratio * available_memory) / 100; | 453 | if (background >= dirty) |
384 | dirty = (dirty_ratio * available_memory) / 100; | 454 | background = dirty / 2; |
385 | tsk = current; | 455 | tsk = current; |
386 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { | 456 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { |
387 | background += background / 4; | 457 | background += background / 4; |
@@ -423,9 +493,9 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
423 | { | 493 | { |
424 | long nr_reclaimable, bdi_nr_reclaimable; | 494 | long nr_reclaimable, bdi_nr_reclaimable; |
425 | long nr_writeback, bdi_nr_writeback; | 495 | long nr_writeback, bdi_nr_writeback; |
426 | long background_thresh; | 496 | unsigned long background_thresh; |
427 | long dirty_thresh; | 497 | unsigned long dirty_thresh; |
428 | long bdi_thresh; | 498 | unsigned long bdi_thresh; |
429 | unsigned long pages_written = 0; | 499 | unsigned long pages_written = 0; |
430 | unsigned long write_chunk = sync_writeback_pages(); | 500 | unsigned long write_chunk = sync_writeback_pages(); |
431 | 501 | ||
@@ -580,8 +650,8 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); | |||
580 | 650 | ||
581 | void throttle_vm_writeout(gfp_t gfp_mask) | 651 | void throttle_vm_writeout(gfp_t gfp_mask) |
582 | { | 652 | { |
583 | long background_thresh; | 653 | unsigned long background_thresh; |
584 | long dirty_thresh; | 654 | unsigned long dirty_thresh; |
585 | 655 | ||
586 | for ( ; ; ) { | 656 | for ( ; ; ) { |
587 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); | 657 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); |
@@ -624,8 +694,8 @@ static void background_writeout(unsigned long _min_pages) | |||
624 | }; | 694 | }; |
625 | 695 | ||
626 | for ( ; ; ) { | 696 | for ( ; ; ) { |
627 | long background_thresh; | 697 | unsigned long background_thresh; |
628 | long dirty_thresh; | 698 | unsigned long dirty_thresh; |
629 | 699 | ||
630 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); | 700 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); |
631 | if (global_page_state(NR_FILE_DIRTY) + | 701 | if (global_page_state(NR_FILE_DIRTY) + |
@@ -868,9 +938,11 @@ int write_cache_pages(struct address_space *mapping, | |||
868 | int done = 0; | 938 | int done = 0; |
869 | struct pagevec pvec; | 939 | struct pagevec pvec; |
870 | int nr_pages; | 940 | int nr_pages; |
941 | pgoff_t uninitialized_var(writeback_index); | ||
871 | pgoff_t index; | 942 | pgoff_t index; |
872 | pgoff_t end; /* Inclusive */ | 943 | pgoff_t end; /* Inclusive */ |
873 | int scanned = 0; | 944 | pgoff_t done_index; |
945 | int cycled; | ||
874 | int range_whole = 0; | 946 | int range_whole = 0; |
875 | long nr_to_write = wbc->nr_to_write; | 947 | long nr_to_write = wbc->nr_to_write; |
876 | 948 | ||
@@ -881,83 +953,134 @@ int write_cache_pages(struct address_space *mapping, | |||
881 | 953 | ||
882 | pagevec_init(&pvec, 0); | 954 | pagevec_init(&pvec, 0); |
883 | if (wbc->range_cyclic) { | 955 | if (wbc->range_cyclic) { |
884 | index = mapping->writeback_index; /* Start from prev offset */ | 956 | writeback_index = mapping->writeback_index; /* prev offset */ |
957 | index = writeback_index; | ||
958 | if (index == 0) | ||
959 | cycled = 1; | ||
960 | else | ||
961 | cycled = 0; | ||
885 | end = -1; | 962 | end = -1; |
886 | } else { | 963 | } else { |
887 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | 964 | index = wbc->range_start >> PAGE_CACHE_SHIFT; |
888 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | 965 | end = wbc->range_end >> PAGE_CACHE_SHIFT; |
889 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) | 966 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) |
890 | range_whole = 1; | 967 | range_whole = 1; |
891 | scanned = 1; | 968 | cycled = 1; /* ignore range_cyclic tests */ |
892 | } | 969 | } |
893 | retry: | 970 | retry: |
894 | while (!done && (index <= end) && | 971 | done_index = index; |
895 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | 972 | while (!done && (index <= end)) { |
896 | PAGECACHE_TAG_DIRTY, | 973 | int i; |
897 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { | 974 | |
898 | unsigned i; | 975 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, |
976 | PAGECACHE_TAG_DIRTY, | ||
977 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); | ||
978 | if (nr_pages == 0) | ||
979 | break; | ||
899 | 980 | ||
900 | scanned = 1; | ||
901 | for (i = 0; i < nr_pages; i++) { | 981 | for (i = 0; i < nr_pages; i++) { |
902 | struct page *page = pvec.pages[i]; | 982 | struct page *page = pvec.pages[i]; |
903 | 983 | ||
904 | /* | 984 | /* |
905 | * At this point we hold neither mapping->tree_lock nor | 985 | * At this point, the page may be truncated or |
906 | * lock on the page itself: the page may be truncated or | 986 | * invalidated (changing page->mapping to NULL), or |
907 | * invalidated (changing page->mapping to NULL), or even | 987 | * even swizzled back from swapper_space to tmpfs file |
908 | * swizzled back from swapper_space to tmpfs file | 988 | * mapping. However, page->index will not change |
909 | * mapping | 989 | * because we have a reference on the page. |
910 | */ | 990 | */ |
991 | if (page->index > end) { | ||
992 | /* | ||
993 | * can't be range_cyclic (1st pass) because | ||
994 | * end == -1 in that case. | ||
995 | */ | ||
996 | done = 1; | ||
997 | break; | ||
998 | } | ||
999 | |||
1000 | done_index = page->index + 1; | ||
1001 | |||
911 | lock_page(page); | 1002 | lock_page(page); |
912 | 1003 | ||
1004 | /* | ||
1005 | * Page truncated or invalidated. We can freely skip it | ||
1006 | * then, even for data integrity operations: the page | ||
1007 | * has disappeared concurrently, so there could be no | ||
1008 | * real expectation of this data integrity operation | ||
1009 | * even if there is now a new, dirty page at the same | ||
1010 | * pagecache address. | ||
1011 | */ | ||
913 | if (unlikely(page->mapping != mapping)) { | 1012 | if (unlikely(page->mapping != mapping)) { |
1013 | continue_unlock: | ||
914 | unlock_page(page); | 1014 | unlock_page(page); |
915 | continue; | 1015 | continue; |
916 | } | 1016 | } |
917 | 1017 | ||
918 | if (!wbc->range_cyclic && page->index > end) { | 1018 | if (!PageDirty(page)) { |
919 | done = 1; | 1019 | /* someone wrote it for us */ |
920 | unlock_page(page); | 1020 | goto continue_unlock; |
921 | continue; | ||
922 | } | 1021 | } |
923 | 1022 | ||
924 | if (wbc->sync_mode != WB_SYNC_NONE) | 1023 | if (PageWriteback(page)) { |
925 | wait_on_page_writeback(page); | 1024 | if (wbc->sync_mode != WB_SYNC_NONE) |
926 | 1025 | wait_on_page_writeback(page); | |
927 | if (PageWriteback(page) || | 1026 | else |
928 | !clear_page_dirty_for_io(page)) { | 1027 | goto continue_unlock; |
929 | unlock_page(page); | ||
930 | continue; | ||
931 | } | 1028 | } |
932 | 1029 | ||
933 | ret = (*writepage)(page, wbc, data); | 1030 | BUG_ON(PageWriteback(page)); |
1031 | if (!clear_page_dirty_for_io(page)) | ||
1032 | goto continue_unlock; | ||
934 | 1033 | ||
935 | if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { | 1034 | ret = (*writepage)(page, wbc, data); |
936 | unlock_page(page); | 1035 | if (unlikely(ret)) { |
937 | ret = 0; | 1036 | if (ret == AOP_WRITEPAGE_ACTIVATE) { |
1037 | unlock_page(page); | ||
1038 | ret = 0; | ||
1039 | } else { | ||
1040 | /* | ||
1041 | * done_index is set past this page, | ||
1042 | * so media errors will not choke | ||
1043 | * background writeout for the entire | ||
1044 | * file. This has consequences for | ||
1045 | * range_cyclic semantics (ie. it may | ||
1046 | * not be suitable for data integrity | ||
1047 | * writeout). | ||
1048 | */ | ||
1049 | done = 1; | ||
1050 | break; | ||
1051 | } | ||
1052 | } | ||
1053 | |||
1054 | if (wbc->sync_mode == WB_SYNC_NONE) { | ||
1055 | wbc->nr_to_write--; | ||
1056 | if (wbc->nr_to_write <= 0) { | ||
1057 | done = 1; | ||
1058 | break; | ||
1059 | } | ||
938 | } | 1060 | } |
939 | if (ret || (--nr_to_write <= 0)) | ||
940 | done = 1; | ||
941 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | 1061 | if (wbc->nonblocking && bdi_write_congested(bdi)) { |
942 | wbc->encountered_congestion = 1; | 1062 | wbc->encountered_congestion = 1; |
943 | done = 1; | 1063 | done = 1; |
1064 | break; | ||
944 | } | 1065 | } |
945 | } | 1066 | } |
946 | pagevec_release(&pvec); | 1067 | pagevec_release(&pvec); |
947 | cond_resched(); | 1068 | cond_resched(); |
948 | } | 1069 | } |
949 | if (!scanned && !done) { | 1070 | if (!cycled) { |
950 | /* | 1071 | /* |
1072 | * range_cyclic: | ||
951 | * We hit the last page and there is more work to be done: wrap | 1073 | * We hit the last page and there is more work to be done: wrap |
952 | * back to the start of the file | 1074 | * back to the start of the file |
953 | */ | 1075 | */ |
954 | scanned = 1; | 1076 | cycled = 1; |
955 | index = 0; | 1077 | index = 0; |
1078 | end = writeback_index - 1; | ||
956 | goto retry; | 1079 | goto retry; |
957 | } | 1080 | } |
958 | if (!wbc->no_nrwrite_index_update) { | 1081 | if (!wbc->no_nrwrite_index_update) { |
959 | if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) | 1082 | if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) |
960 | mapping->writeback_index = index; | 1083 | mapping->writeback_index = done_index; |
961 | wbc->nr_to_write = nr_to_write; | 1084 | wbc->nr_to_write = nr_to_write; |
962 | } | 1085 | } |
963 | 1086 | ||
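The rewritten write_cache_pages() above replaces the old scanned flag with cycled and done_index, so a range_cyclic scan makes one pass from the saved writeback_index to the end of the file and exactly one wrapped pass over the remainder. A toy simulation of that scan order over an invented array of dirty flags:

#include <stdio.h>
#include <stdbool.h>

#define NR_PAGES 8

int main(void)
{
	bool dirty[NR_PAGES] = { true, false, true, true, false, true, false, true };
	unsigned long writeback_index = 5;	/* assumed previous stopping point */
	unsigned long index = writeback_index;
	unsigned long end = NR_PAGES - 1;
	unsigned long done_index = index;
	int cycled = (index == 0);

retry:
	while (index <= end) {
		if (dirty[index]) {
			printf("writing page %lu\n", index);
			dirty[index] = false;
		}
		done_index = index + 1;
		index++;
	}
	if (!cycled) {
		/* wrap back, but stop where the first pass started */
		cycled = 1;
		index = 0;
		end = writeback_index - 1;
		goto retry;
	}
	printf("next scan resumes at %lu\n", done_index);
	return 0;
}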
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d8ac01474563..7bf22e045318 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -69,7 +69,7 @@ EXPORT_SYMBOL(node_states); | |||
69 | 69 | ||
70 | unsigned long totalram_pages __read_mostly; | 70 | unsigned long totalram_pages __read_mostly; |
71 | unsigned long totalreserve_pages __read_mostly; | 71 | unsigned long totalreserve_pages __read_mostly; |
72 | long nr_swap_pages; | 72 | unsigned long highest_memmap_pfn __read_mostly; |
73 | int percpu_pagelist_fraction; | 73 | int percpu_pagelist_fraction; |
74 | 74 | ||
75 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 75 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
@@ -223,19 +223,41 @@ static inline int bad_range(struct zone *zone, struct page *page) | |||
223 | 223 | ||
224 | static void bad_page(struct page *page) | 224 | static void bad_page(struct page *page) |
225 | { | 225 | { |
226 | printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG | 226 | static unsigned long resume; |
227 | "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", | 227 | static unsigned long nr_shown; |
228 | current->comm, page, (int)(2*sizeof(unsigned long)), | 228 | static unsigned long nr_unshown; |
229 | (unsigned long)page->flags, page->mapping, | 229 | |
230 | page_mapcount(page), page_count(page)); | 230 | /* |
231 | * Allow a burst of 60 reports, then keep quiet for that minute; | ||
232 | * or allow a steady drip of one report per second. | ||
233 | */ | ||
234 | if (nr_shown == 60) { | ||
235 | if (time_before(jiffies, resume)) { | ||
236 | nr_unshown++; | ||
237 | goto out; | ||
238 | } | ||
239 | if (nr_unshown) { | ||
240 | printk(KERN_ALERT | ||
241 | "BUG: Bad page state: %lu messages suppressed\n", | ||
242 | nr_unshown); | ||
243 | nr_unshown = 0; | ||
244 | } | ||
245 | nr_shown = 0; | ||
246 | } | ||
247 | if (nr_shown++ == 0) | ||
248 | resume = jiffies + 60 * HZ; | ||
249 | |||
250 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", | ||
251 | current->comm, page_to_pfn(page)); | ||
252 | printk(KERN_ALERT | ||
253 | "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", | ||
254 | page, (void *)page->flags, page_count(page), | ||
255 | page_mapcount(page), page->mapping, page->index); | ||
231 | 256 | ||
232 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" | ||
233 | KERN_EMERG "Backtrace:\n"); | ||
234 | dump_stack(); | 257 | dump_stack(); |
235 | page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; | 258 | out: |
236 | set_page_count(page, 0); | 259 | /* Leave bad fields for debug, except PageBuddy could make trouble */ |
237 | reset_page_mapcount(page); | 260 | __ClearPageBuddy(page); |
238 | page->mapping = NULL; | ||
239 | add_taint(TAINT_BAD_PAGE); | 261 | add_taint(TAINT_BAD_PAGE); |
240 | } | 262 | } |
241 | 263 | ||
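bad_page() now throttles its own output: a burst of 60 reports, then silence (with a suppressed-message count) until a minute after the burst began. A userspace sketch of the same throttle, with time(NULL) standing in for jiffies and a printf standing in for the report:

#include <stdio.h>
#include <time.h>

static time_t resume;
static unsigned long nr_shown;
static unsigned long nr_unshown;

static void report_bad_page(unsigned long pfn)
{
	if (nr_shown == 60) {
		if (time(NULL) < resume) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			printf("BUG: Bad page state: %lu messages suppressed\n",
			       nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = time(NULL) + 60;

	printf("BUG: Bad page state pfn:%05lx\n", pfn);
}

int main(void)
{
	/* 100 back-to-back reports: 60 shown, the rest counted as suppressed */
	unsigned long pfn;

	for (pfn = 0; pfn < 100; pfn++)
		report_bad_page(pfn);
	return 0;
}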
@@ -292,25 +314,31 @@ void prep_compound_gigantic_page(struct page *page, unsigned long order) | |||
292 | } | 314 | } |
293 | #endif | 315 | #endif |
294 | 316 | ||
295 | static void destroy_compound_page(struct page *page, unsigned long order) | 317 | static int destroy_compound_page(struct page *page, unsigned long order) |
296 | { | 318 | { |
297 | int i; | 319 | int i; |
298 | int nr_pages = 1 << order; | 320 | int nr_pages = 1 << order; |
321 | int bad = 0; | ||
299 | 322 | ||
300 | if (unlikely(compound_order(page) != order)) | 323 | if (unlikely(compound_order(page) != order) || |
324 | unlikely(!PageHead(page))) { | ||
301 | bad_page(page); | 325 | bad_page(page); |
326 | bad++; | ||
327 | } | ||
302 | 328 | ||
303 | if (unlikely(!PageHead(page))) | ||
304 | bad_page(page); | ||
305 | __ClearPageHead(page); | 329 | __ClearPageHead(page); |
330 | |||
306 | for (i = 1; i < nr_pages; i++) { | 331 | for (i = 1; i < nr_pages; i++) { |
307 | struct page *p = page + i; | 332 | struct page *p = page + i; |
308 | 333 | ||
309 | if (unlikely(!PageTail(p) | | 334 | if (unlikely(!PageTail(p) | (p->first_page != page))) { |
310 | (p->first_page != page))) | ||
311 | bad_page(page); | 335 | bad_page(page); |
336 | bad++; | ||
337 | } | ||
312 | __ClearPageTail(p); | 338 | __ClearPageTail(p); |
313 | } | 339 | } |
340 | |||
341 | return bad; | ||
314 | } | 342 | } |
315 | 343 | ||
316 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | 344 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) |
@@ -430,7 +458,8 @@ static inline void __free_one_page(struct page *page, | |||
430 | int migratetype = get_pageblock_migratetype(page); | 458 | int migratetype = get_pageblock_migratetype(page); |
431 | 459 | ||
432 | if (unlikely(PageCompound(page))) | 460 | if (unlikely(PageCompound(page))) |
433 | destroy_compound_page(page, order); | 461 | if (unlikely(destroy_compound_page(page, order))) |
462 | return; | ||
434 | 463 | ||
435 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 464 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
436 | 465 | ||
@@ -467,18 +496,13 @@ static inline int free_pages_check(struct page *page) | |||
467 | if (unlikely(page_mapcount(page) | | 496 | if (unlikely(page_mapcount(page) | |
468 | (page->mapping != NULL) | | 497 | (page->mapping != NULL) | |
469 | (page_count(page) != 0) | | 498 | (page_count(page) != 0) | |
470 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) | 499 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { |
471 | bad_page(page); | 500 | bad_page(page); |
472 | if (PageDirty(page)) | 501 | return 1; |
473 | __ClearPageDirty(page); | 502 | } |
474 | if (PageSwapBacked(page)) | 503 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
475 | __ClearPageSwapBacked(page); | 504 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
476 | /* | 505 | return 0; |
477 | * For now, we report if PG_reserved was found set, but do not | ||
478 | * clear it, and do not free the page. But we shall soon need | ||
479 | * to do more, for when the ZERO_PAGE count wraps negative. | ||
480 | */ | ||
481 | return PageReserved(page); | ||
482 | } | 506 | } |
483 | 507 | ||
484 | /* | 508 | /* |
@@ -523,11 +547,11 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
523 | { | 547 | { |
524 | unsigned long flags; | 548 | unsigned long flags; |
525 | int i; | 549 | int i; |
526 | int reserved = 0; | 550 | int bad = 0; |
527 | 551 | ||
528 | for (i = 0 ; i < (1 << order) ; ++i) | 552 | for (i = 0 ; i < (1 << order) ; ++i) |
529 | reserved += free_pages_check(page + i); | 553 | bad += free_pages_check(page + i); |
530 | if (reserved) | 554 | if (bad) |
531 | return; | 555 | return; |
532 | 556 | ||
533 | if (!PageHighMem(page)) { | 557 | if (!PageHighMem(page)) { |
@@ -612,23 +636,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
612 | if (unlikely(page_mapcount(page) | | 636 | if (unlikely(page_mapcount(page) | |
613 | (page->mapping != NULL) | | 637 | (page->mapping != NULL) | |
614 | (page_count(page) != 0) | | 638 | (page_count(page) != 0) | |
615 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) | 639 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { |
616 | bad_page(page); | 640 | bad_page(page); |
617 | |||
618 | /* | ||
619 | * For now, we report if PG_reserved was found set, but do not | ||
620 | * clear it, and do not allocate the page: as a safety net. | ||
621 | */ | ||
622 | if (PageReserved(page)) | ||
623 | return 1; | 641 | return 1; |
642 | } | ||
624 | 643 | ||
625 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | | ||
626 | 1 << PG_referenced | 1 << PG_arch_1 | | ||
627 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk | ||
628 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
629 | | 1 << PG_mlocked | ||
630 | #endif | ||
631 | ); | ||
632 | set_page_private(page, 0); | 644 | set_page_private(page, 0); |
633 | set_page_refcounted(page); | 645 | set_page_refcounted(page); |
634 | 646 | ||
@@ -2609,6 +2621,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
2609 | unsigned long pfn; | 2621 | unsigned long pfn; |
2610 | struct zone *z; | 2622 | struct zone *z; |
2611 | 2623 | ||
2624 | if (highest_memmap_pfn < end_pfn - 1) | ||
2625 | highest_memmap_pfn = end_pfn - 1; | ||
2626 | |||
2612 | z = &NODE_DATA(nid)->node_zones[zone]; | 2627 | z = &NODE_DATA(nid)->node_zones[zone]; |
2613 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 2628 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
2614 | /* | 2629 | /* |
@@ -3381,10 +3396,8 @@ static void __init setup_usemap(struct pglist_data *pgdat, | |||
3381 | { | 3396 | { |
3382 | unsigned long usemapsize = usemap_size(zonesize); | 3397 | unsigned long usemapsize = usemap_size(zonesize); |
3383 | zone->pageblock_flags = NULL; | 3398 | zone->pageblock_flags = NULL; |
3384 | if (usemapsize) { | 3399 | if (usemapsize) |
3385 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); | 3400 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); |
3386 | memset(zone->pageblock_flags, 0, usemapsize); | ||
3387 | } | ||
3388 | } | 3401 | } |
3389 | #else | 3402 | #else |
3390 | static void inline setup_usemap(struct pglist_data *pgdat, | 3403 | static void inline setup_usemap(struct pglist_data *pgdat, |
@@ -3469,9 +3482,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
3469 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; | 3482 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; |
3470 | if (realsize >= memmap_pages) { | 3483 | if (realsize >= memmap_pages) { |
3471 | realsize -= memmap_pages; | 3484 | realsize -= memmap_pages; |
3472 | printk(KERN_DEBUG | 3485 | if (memmap_pages) |
3473 | " %s zone: %lu pages used for memmap\n", | 3486 | printk(KERN_DEBUG |
3474 | zone_names[j], memmap_pages); | 3487 | " %s zone: %lu pages used for memmap\n", |
3488 | zone_names[j], memmap_pages); | ||
3475 | } else | 3489 | } else |
3476 | printk(KERN_WARNING | 3490 | printk(KERN_WARNING |
3477 | " %s zone: %lu pages exceeds realsize %lu\n", | 3491 | " %s zone: %lu pages exceeds realsize %lu\n", |
@@ -4316,7 +4330,7 @@ void setup_per_zone_pages_min(void) | |||
4316 | * 1TB 101 10GB | 4330 | * 1TB 101 10GB |
4317 | * 10TB 320 32GB | 4331 | * 10TB 320 32GB |
4318 | */ | 4332 | */ |
4319 | void setup_per_zone_inactive_ratio(void) | 4333 | static void setup_per_zone_inactive_ratio(void) |
4320 | { | 4334 | { |
4321 | struct zone *zone; | 4335 | struct zone *zone; |
4322 | 4336 | ||
@@ -4573,19 +4587,6 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
4573 | return table; | 4587 | return table; |
4574 | } | 4588 | } |
4575 | 4589 | ||
4576 | #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE | ||
4577 | struct page *pfn_to_page(unsigned long pfn) | ||
4578 | { | ||
4579 | return __pfn_to_page(pfn); | ||
4580 | } | ||
4581 | unsigned long page_to_pfn(struct page *page) | ||
4582 | { | ||
4583 | return __page_to_pfn(page); | ||
4584 | } | ||
4585 | EXPORT_SYMBOL(pfn_to_page); | ||
4586 | EXPORT_SYMBOL(page_to_pfn); | ||
4587 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ | ||
4588 | |||
4589 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ | 4590 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ |
4590 | static inline unsigned long *get_pageblock_bitmap(struct zone *zone, | 4591 | static inline unsigned long *get_pageblock_bitmap(struct zone *zone, |
4591 | unsigned long pfn) | 4592 | unsigned long pfn) |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index ab27ff750519..d6507a660ed6 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -101,7 +101,7 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) | |||
101 | } | 101 | } |
102 | 102 | ||
103 | /* __alloc_bootmem...() is protected by !slab_available() */ | 103 | /* __alloc_bootmem...() is protected by !slab_available() */ |
104 | int __init_refok init_section_page_cgroup(unsigned long pfn) | 104 | static int __init_refok init_section_page_cgroup(unsigned long pfn) |
105 | { | 105 | { |
106 | struct mem_section *section; | 106 | struct mem_section *section; |
107 | struct page_cgroup *base, *pc; | 107 | struct page_cgroup *base, *pc; |
diff --git a/mm/page_io.c b/mm/page_io.c index 065c4480eaf0..dc6ce0afbded 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -98,7 +98,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
98 | struct bio *bio; | 98 | struct bio *bio; |
99 | int ret = 0, rw = WRITE; | 99 | int ret = 0, rw = WRITE; |
100 | 100 | ||
101 | if (remove_exclusive_swap_page(page)) { | 101 | if (try_to_free_swap(page)) { |
102 | unlock_page(page); | 102 | unlock_page(page); |
103 | goto out; | 103 | goto out; |
104 | } | 104 | } |
@@ -125,8 +125,8 @@ int swap_readpage(struct file *file, struct page *page) | |||
125 | struct bio *bio; | 125 | struct bio *bio; |
126 | int ret = 0; | 126 | int ret = 0; |
127 | 127 | ||
128 | BUG_ON(!PageLocked(page)); | 128 | VM_BUG_ON(!PageLocked(page)); |
129 | BUG_ON(PageUptodate(page)); | 129 | VM_BUG_ON(PageUptodate(page)); |
130 | bio = get_swap_bio(GFP_KERNEL, page_private(page), page, | 130 | bio = get_swap_bio(GFP_KERNEL, page_private(page), page, |
131 | end_swap_bio_read); | 131 | end_swap_bio_read); |
132 | if (bio == NULL) { | 132 | if (bio == NULL) { |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -47,9 +47,9 @@ | |||
47 | #include <linux/rmap.h> | 47 | #include <linux/rmap.h> |
48 | #include <linux/rcupdate.h> | 48 | #include <linux/rcupdate.h> |
49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
50 | #include <linux/kallsyms.h> | ||
51 | #include <linux/memcontrol.h> | 50 | #include <linux/memcontrol.h> |
52 | #include <linux/mmu_notifier.h> | 51 | #include <linux/mmu_notifier.h> |
52 | #include <linux/migrate.h> | ||
53 | 53 | ||
54 | #include <asm/tlbflush.h> | 54 | #include <asm/tlbflush.h> |
55 | 55 | ||
@@ -191,7 +191,7 @@ void __init anon_vma_init(void) | |||
191 | * Getting a lock on a stable anon_vma from a page off the LRU is | 191 | * Getting a lock on a stable anon_vma from a page off the LRU is |
192 | * tricky: page_lock_anon_vma rely on RCU to guard against the races. | 192 | * tricky: page_lock_anon_vma rely on RCU to guard against the races. |
193 | */ | 193 | */ |
194 | struct anon_vma *page_lock_anon_vma(struct page *page) | 194 | static struct anon_vma *page_lock_anon_vma(struct page *page) |
195 | { | 195 | { |
196 | struct anon_vma *anon_vma; | 196 | struct anon_vma *anon_vma; |
197 | unsigned long anon_mapping; | 197 | unsigned long anon_mapping; |
@@ -211,7 +211,7 @@ out: | |||
211 | return NULL; | 211 | return NULL; |
212 | } | 212 | } |
213 | 213 | ||
214 | void page_unlock_anon_vma(struct anon_vma *anon_vma) | 214 | static void page_unlock_anon_vma(struct anon_vma *anon_vma) |
215 | { | 215 | { |
216 | spin_unlock(&anon_vma->lock); | 216 | spin_unlock(&anon_vma->lock); |
217 | rcu_read_unlock(); | 217 | rcu_read_unlock(); |
@@ -359,8 +359,17 @@ static int page_referenced_one(struct page *page, | |||
359 | goto out_unmap; | 359 | goto out_unmap; |
360 | } | 360 | } |
361 | 361 | ||
362 | if (ptep_clear_flush_young_notify(vma, address, pte)) | 362 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
363 | referenced++; | 363 | /* |
364 | * Don't treat a reference through a sequentially read | ||
365 | * mapping as such. If the page has been used in | ||
366 | * another mapping, we will catch it; if this other | ||
367 | * mapping is already gone, the unmap path will have | ||
368 | * set PG_referenced or activated the page. | ||
369 | */ | ||
370 | if (likely(!VM_SequentialReadHint(vma))) | ||
371 | referenced++; | ||
372 | } | ||
364 | 373 | ||
365 | /* Pretend the page is referenced if the task has the | 374 | /* Pretend the page is referenced if the task has the |
366 | swap token and is in the middle of a page fault. */ | 375 | swap token and is in the middle of a page fault. */ |
@@ -661,9 +670,14 @@ void page_add_anon_rmap(struct page *page, | |||
661 | void page_add_new_anon_rmap(struct page *page, | 670 | void page_add_new_anon_rmap(struct page *page, |
662 | struct vm_area_struct *vma, unsigned long address) | 671 | struct vm_area_struct *vma, unsigned long address) |
663 | { | 672 | { |
664 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 673 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
665 | atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ | 674 | SetPageSwapBacked(page); |
675 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | ||
666 | __page_set_anon_rmap(page, vma, address); | 676 | __page_set_anon_rmap(page, vma, address); |
677 | if (page_evictable(page, vma)) | ||
678 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | ||
679 | else | ||
680 | add_page_to_unevictable_list(page); | ||
667 | } | 681 | } |
668 | 682 | ||
669 | /** | 683 | /** |
@@ -693,7 +707,6 @@ void page_add_file_rmap(struct page *page) | |||
693 | */ | 707 | */ |
694 | void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) | 708 | void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) |
695 | { | 709 | { |
696 | BUG_ON(page_mapcount(page) == 0); | ||
697 | if (PageAnon(page)) | 710 | if (PageAnon(page)) |
698 | __page_check_anon_rmap(page, vma, address); | 711 | __page_check_anon_rmap(page, vma, address); |
699 | atomic_inc(&page->_mapcount); | 712 | atomic_inc(&page->_mapcount); |
@@ -703,28 +716,12 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long | |||
703 | /** | 716 | /** |
704 | * page_remove_rmap - take down pte mapping from a page | 717 | * page_remove_rmap - take down pte mapping from a page |
705 | * @page: page to remove mapping from | 718 | * @page: page to remove mapping from |
706 | * @vma: the vm area in which the mapping is removed | ||
707 | * | 719 | * |
708 | * The caller needs to hold the pte lock. | 720 | * The caller needs to hold the pte lock. |
709 | */ | 721 | */ |
710 | void page_remove_rmap(struct page *page, struct vm_area_struct *vma) | 722 | void page_remove_rmap(struct page *page) |
711 | { | 723 | { |
712 | if (atomic_add_negative(-1, &page->_mapcount)) { | 724 | if (atomic_add_negative(-1, &page->_mapcount)) { |
713 | if (unlikely(page_mapcount(page) < 0)) { | ||
714 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); | ||
715 | printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page)); | ||
716 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); | ||
717 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); | ||
718 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); | ||
719 | print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); | ||
720 | if (vma->vm_ops) { | ||
721 | print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault); | ||
722 | } | ||
723 | if (vma->vm_file && vma->vm_file->f_op) | ||
724 | print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap); | ||
725 | BUG(); | ||
726 | } | ||
727 | |||
728 | /* | 725 | /* |
729 | * Now that the last pte has gone, s390 must transfer dirty | 726 | * Now that the last pte has gone, s390 must transfer dirty |
730 | * flag from storage key to struct page. We can usually skip | 727 | * flag from storage key to struct page. We can usually skip |
@@ -818,8 +815,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
818 | spin_unlock(&mmlist_lock); | 815 | spin_unlock(&mmlist_lock); |
819 | } | 816 | } |
820 | dec_mm_counter(mm, anon_rss); | 817 | dec_mm_counter(mm, anon_rss); |
821 | #ifdef CONFIG_MIGRATION | 818 | } else if (PAGE_MIGRATION) { |
822 | } else { | ||
823 | /* | 819 | /* |
824 | * Store the pfn of the page in a special migration | 820 | * Store the pfn of the page in a special migration |
825 | * pte. do_swap_page() will wait until the migration | 821 | * pte. do_swap_page() will wait until the migration |
@@ -827,23 +823,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
827 | */ | 823 | */ |
828 | BUG_ON(!migration); | 824 | BUG_ON(!migration); |
829 | entry = make_migration_entry(page, pte_write(pteval)); | 825 | entry = make_migration_entry(page, pte_write(pteval)); |
830 | #endif | ||
831 | } | 826 | } |
832 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 827 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
833 | BUG_ON(pte_file(*pte)); | 828 | BUG_ON(pte_file(*pte)); |
834 | } else | 829 | } else if (PAGE_MIGRATION && migration) { |
835 | #ifdef CONFIG_MIGRATION | ||
836 | if (migration) { | ||
837 | /* Establish migration entry for a file page */ | 830 | /* Establish migration entry for a file page */ |
838 | swp_entry_t entry; | 831 | swp_entry_t entry; |
839 | entry = make_migration_entry(page, pte_write(pteval)); | 832 | entry = make_migration_entry(page, pte_write(pteval)); |
840 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 833 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
841 | } else | 834 | } else |
842 | #endif | ||
843 | dec_mm_counter(mm, file_rss); | 835 | dec_mm_counter(mm, file_rss); |
844 | 836 | ||
845 | 837 | ||
846 | page_remove_rmap(page, vma); | 838 | page_remove_rmap(page); |
847 | page_cache_release(page); | 839 | page_cache_release(page); |
848 | 840 | ||
849 | out_unmap: | 841 | out_unmap: |
@@ -958,7 +950,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
958 | if (pte_dirty(pteval)) | 950 | if (pte_dirty(pteval)) |
959 | set_page_dirty(page); | 951 | set_page_dirty(page); |
960 | 952 | ||
961 | page_remove_rmap(page, vma); | 953 | page_remove_rmap(page); |
962 | page_cache_release(page); | 954 | page_cache_release(page); |
963 | dec_mm_counter(mm, file_rss); | 955 | dec_mm_counter(mm, file_rss); |
964 | (*mapcount)--; | 956 | (*mapcount)--; |
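Several of the rmap.c hunks above replace #ifdef CONFIG_MIGRATION blocks with ordinary if (PAGE_MIGRATION && ...) tests on a compile-time constant, so the disabled branch is still type-checked but optimised away. A small sketch of that pattern, with an invented feature switch and helper:

#include <stdio.h>

/* toggle at build time, e.g. with -DCONFIG_FEATURE */
#ifdef CONFIG_FEATURE
#define FEATURE_ENABLED 1
#else
#define FEATURE_ENABLED 0
#endif

static void handle_feature(int value)
{
	printf("feature path taken: %d\n", value);
}

static void do_work(int value, int want_feature)
{
	if (FEATURE_ENABLED && want_feature) {
		/* dead code when FEATURE_ENABLED is 0, but still compiled */
		handle_feature(value);
	} else {
		printf("common path: %d\n", value);
	}
}

int main(void)
{
	do_work(42, 1);
	return 0;
}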
diff --git a/mm/shmem.c b/mm/shmem.c index f1b0d4871f3a..5941f9801363 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -14,31 +14,39 @@ | |||
14 | * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> | 14 | * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> |
15 | * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> | 15 | * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> |
16 | * | 16 | * |
17 | * tiny-shmem: | ||
18 | * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com> | ||
19 | * | ||
17 | * This file is released under the GPL. | 20 | * This file is released under the GPL. |
18 | */ | 21 | */ |
19 | 22 | ||
23 | #include <linux/fs.h> | ||
24 | #include <linux/init.h> | ||
25 | #include <linux/vfs.h> | ||
26 | #include <linux/mount.h> | ||
27 | #include <linux/file.h> | ||
28 | #include <linux/mm.h> | ||
29 | #include <linux/module.h> | ||
30 | #include <linux/swap.h> | ||
31 | |||
32 | static struct vfsmount *shm_mnt; | ||
33 | |||
34 | #ifdef CONFIG_SHMEM | ||
20 | /* | 35 | /* |
21 | * This virtual memory filesystem is heavily based on the ramfs. It | 36 | * This virtual memory filesystem is heavily based on the ramfs. It |
22 | * extends ramfs by the ability to use swap and honor resource limits | 37 | * extends ramfs by the ability to use swap and honor resource limits |
23 | * which makes it a completely usable filesystem. | 38 | * which makes it a completely usable filesystem. |
24 | */ | 39 | */ |
25 | 40 | ||
26 | #include <linux/module.h> | ||
27 | #include <linux/init.h> | ||
28 | #include <linux/fs.h> | ||
29 | #include <linux/xattr.h> | 41 | #include <linux/xattr.h> |
30 | #include <linux/exportfs.h> | 42 | #include <linux/exportfs.h> |
31 | #include <linux/generic_acl.h> | 43 | #include <linux/generic_acl.h> |
32 | #include <linux/mm.h> | ||
33 | #include <linux/mman.h> | 44 | #include <linux/mman.h> |
34 | #include <linux/file.h> | ||
35 | #include <linux/swap.h> | ||
36 | #include <linux/pagemap.h> | 45 | #include <linux/pagemap.h> |
37 | #include <linux/string.h> | 46 | #include <linux/string.h> |
38 | #include <linux/slab.h> | 47 | #include <linux/slab.h> |
39 | #include <linux/backing-dev.h> | 48 | #include <linux/backing-dev.h> |
40 | #include <linux/shmem_fs.h> | 49 | #include <linux/shmem_fs.h> |
41 | #include <linux/mount.h> | ||
42 | #include <linux/writeback.h> | 50 | #include <linux/writeback.h> |
43 | #include <linux/vfs.h> | 51 | #include <linux/vfs.h> |
44 | #include <linux/blkdev.h> | 52 | #include <linux/blkdev.h> |
@@ -1444,7 +1452,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1444 | if (error) | 1452 | if (error) |
1445 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); | 1453 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); |
1446 | 1454 | ||
1447 | mark_page_accessed(vmf->page); | ||
1448 | return ret | VM_FAULT_LOCKED; | 1455 | return ret | VM_FAULT_LOCKED; |
1449 | } | 1456 | } |
1450 | 1457 | ||
@@ -2486,7 +2493,6 @@ static struct file_system_type tmpfs_fs_type = { | |||
2486 | .get_sb = shmem_get_sb, | 2493 | .get_sb = shmem_get_sb, |
2487 | .kill_sb = kill_litter_super, | 2494 | .kill_sb = kill_litter_super, |
2488 | }; | 2495 | }; |
2489 | static struct vfsmount *shm_mnt; | ||
2490 | 2496 | ||
2491 | static int __init init_tmpfs(void) | 2497 | static int __init init_tmpfs(void) |
2492 | { | 2498 | { |
@@ -2525,7 +2531,51 @@ out4: | |||
2525 | shm_mnt = ERR_PTR(error); | 2531 | shm_mnt = ERR_PTR(error); |
2526 | return error; | 2532 | return error; |
2527 | } | 2533 | } |
2528 | module_init(init_tmpfs) | 2534 | |
2535 | #else /* !CONFIG_SHMEM */ | ||
2536 | |||
2537 | /* | ||
2538 | * tiny-shmem: simple shmemfs and tmpfs using ramfs code | ||
2539 | * | ||
2540 | * This is intended for small systems where the benefits of the full | ||
2541 | * shmem code (swap-backed and resource-limited) are outweighed by | ||
2542 | * their complexity. On systems without swap this code should be | ||
2543 | * effectively equivalent, but much lighter weight. | ||
2544 | */ | ||
2545 | |||
2546 | #include <linux/ramfs.h> | ||
2547 | |||
2548 | static struct file_system_type tmpfs_fs_type = { | ||
2549 | .name = "tmpfs", | ||
2550 | .get_sb = ramfs_get_sb, | ||
2551 | .kill_sb = kill_litter_super, | ||
2552 | }; | ||
2553 | |||
2554 | static int __init init_tmpfs(void) | ||
2555 | { | ||
2556 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); | ||
2557 | |||
2558 | shm_mnt = kern_mount(&tmpfs_fs_type); | ||
2559 | BUG_ON(IS_ERR(shm_mnt)); | ||
2560 | |||
2561 | return 0; | ||
2562 | } | ||
2563 | |||
2564 | int shmem_unuse(swp_entry_t entry, struct page *page) | ||
2565 | { | ||
2566 | return 0; | ||
2567 | } | ||
2568 | |||
2569 | #define shmem_file_operations ramfs_file_operations | ||
2570 | #define shmem_vm_ops generic_file_vm_ops | ||
2571 | #define shmem_get_inode ramfs_get_inode | ||
2572 | #define shmem_acct_size(a, b) 0 | ||
2573 | #define shmem_unacct_size(a, b) do {} while (0) | ||
2574 | #define SHMEM_MAX_BYTES LLONG_MAX | ||
2575 | |||
2576 | #endif /* CONFIG_SHMEM */ | ||
2577 | |||
2578 | /* common code */ | ||
2529 | 2579 | ||
2530 | /** | 2580 | /** |
2531 | * shmem_file_setup - get an unlinked file living in tmpfs | 2581 | * shmem_file_setup - get an unlinked file living in tmpfs |
@@ -2569,12 +2619,20 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
2569 | if (!inode) | 2619 | if (!inode) |
2570 | goto close_file; | 2620 | goto close_file; |
2571 | 2621 | ||
2622 | #ifdef CONFIG_SHMEM | ||
2572 | SHMEM_I(inode)->flags = flags & VM_ACCOUNT; | 2623 | SHMEM_I(inode)->flags = flags & VM_ACCOUNT; |
2624 | #endif | ||
2573 | d_instantiate(dentry, inode); | 2625 | d_instantiate(dentry, inode); |
2574 | inode->i_size = size; | 2626 | inode->i_size = size; |
2575 | inode->i_nlink = 0; /* It is unlinked */ | 2627 | inode->i_nlink = 0; /* It is unlinked */ |
2576 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | 2628 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, |
2577 | &shmem_file_operations); | 2629 | &shmem_file_operations); |
2630 | |||
2631 | #ifndef CONFIG_MMU | ||
2632 | error = ramfs_nommu_expand_for_mapping(inode, size); | ||
2633 | if (error) | ||
2634 | goto close_file; | ||
2635 | #endif | ||
2578 | return file; | 2636 | return file; |
2579 | 2637 | ||
2580 | close_file: | 2638 | close_file: |
@@ -2606,3 +2664,5 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
2606 | vma->vm_ops = &shmem_vm_ops; | 2664 | vma->vm_ops = &shmem_vm_ops; |
2607 | return 0; | 2665 | return 0; |
2608 | } | 2666 | } |
2667 | |||
2668 | module_init(init_tmpfs) | ||
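With tiny-shmem folded in, the !CONFIG_SHMEM branch above supplies ramfs-backed stubs and macro aliases under the same names, so the "common code" that follows compiles unchanged against either variant. A sketch of that arrangement, with invented names and an invented config switch:

#include <stdio.h>

#ifdef CONFIG_FULL_CACHE
/* full implementation: real bookkeeping */
static int cache_init(void)
{
	printf("full cache initialised\n");
	return 0;
}
static int cache_account(long size)
{
	printf("accounting %ld bytes\n", size);
	return 0;
}
#else
/* lightweight fallback: same interface, no bookkeeping */
static int cache_init(void)
{
	printf("tiny cache initialised\n");
	return 0;
}
#define cache_account(size) 0	/* accounting compiled away */
#endif

/* common code: works with either variant */
int main(void)
{
	cache_init();
	if (cache_account(4096L))
		fprintf(stderr, "accounting failed\n");
	return 0;
}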
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -246,25 +246,6 @@ void add_page_to_unevictable_list(struct page *page) | |||
246 | spin_unlock_irq(&zone->lru_lock); | 246 | spin_unlock_irq(&zone->lru_lock); |
247 | } | 247 | } |
248 | 248 | ||
249 | /** | ||
250 | * lru_cache_add_active_or_unevictable | ||
251 | * @page: the page to be added to LRU | ||
252 | * @vma: vma in which page is mapped for determining reclaimability | ||
253 | * | ||
254 | * place @page on active or unevictable LRU list, depending on | ||
255 | * page_evictable(). Note that if the page is not evictable, | ||
256 | * it goes directly back onto it's zone's unevictable list. It does | ||
257 | * NOT use a per cpu pagevec. | ||
258 | */ | ||
259 | void lru_cache_add_active_or_unevictable(struct page *page, | ||
260 | struct vm_area_struct *vma) | ||
261 | { | ||
262 | if (page_evictable(page, vma)) | ||
263 | lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page)); | ||
264 | else | ||
265 | add_page_to_unevictable_list(page); | ||
266 | } | ||
267 | |||
268 | /* | 249 | /* |
269 | * Drain pages out of the cpu's pagevecs. | 250 | * Drain pages out of the cpu's pagevecs. |
270 | * Either "cpu" is the current CPU, and preemption has already been | 251 | * Either "cpu" is the current CPU, and preemption has already been |
@@ -398,28 +379,6 @@ void __pagevec_release(struct pagevec *pvec) | |||
398 | EXPORT_SYMBOL(__pagevec_release); | 379 | EXPORT_SYMBOL(__pagevec_release); |
399 | 380 | ||
400 | /* | 381 | /* |
401 | * pagevec_release() for pages which are known to not be on the LRU | ||
402 | * | ||
403 | * This function reinitialises the caller's pagevec. | ||
404 | */ | ||
405 | void __pagevec_release_nonlru(struct pagevec *pvec) | ||
406 | { | ||
407 | int i; | ||
408 | struct pagevec pages_to_free; | ||
409 | |||
410 | pagevec_init(&pages_to_free, pvec->cold); | ||
411 | for (i = 0; i < pagevec_count(pvec); i++) { | ||
412 | struct page *page = pvec->pages[i]; | ||
413 | |||
414 | VM_BUG_ON(PageLRU(page)); | ||
415 | if (put_page_testzero(page)) | ||
416 | pagevec_add(&pages_to_free, page); | ||
417 | } | ||
418 | pagevec_free(&pages_to_free); | ||
419 | pagevec_reinit(pvec); | ||
420 | } | ||
421 | |||
422 | /* | ||
423 | * Add the passed pages to the LRU, then drop the caller's refcount | 382 | * Add the passed pages to the LRU, then drop the caller's refcount |
424 | * on them. Reinitialises the caller's pagevec. | 383 | * on them. Reinitialises the caller's pagevec. |
425 | */ | 384 | */ |
@@ -495,8 +454,7 @@ void pagevec_swap_free(struct pagevec *pvec) | |||
495 | struct page *page = pvec->pages[i]; | 454 | struct page *page = pvec->pages[i]; |
496 | 455 | ||
497 | if (PageSwapCache(page) && trylock_page(page)) { | 456 | if (PageSwapCache(page) && trylock_page(page)) { |
498 | if (PageSwapCache(page)) | 457 | try_to_free_swap(page); |
499 | remove_exclusive_swap_page_ref(page); | ||
500 | unlock_page(page); | 458 | unlock_page(page); |
501 | } | 459 | } |
502 | } | 460 | } |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 3353c9029cef..81c825f67a7f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -72,10 +72,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | |||
72 | { | 72 | { |
73 | int error; | 73 | int error; |
74 | 74 | ||
75 | BUG_ON(!PageLocked(page)); | 75 | VM_BUG_ON(!PageLocked(page)); |
76 | BUG_ON(PageSwapCache(page)); | 76 | VM_BUG_ON(PageSwapCache(page)); |
77 | BUG_ON(PagePrivate(page)); | 77 | VM_BUG_ON(!PageSwapBacked(page)); |
78 | BUG_ON(!PageSwapBacked(page)); | 78 | |
79 | error = radix_tree_preload(gfp_mask); | 79 | error = radix_tree_preload(gfp_mask); |
80 | if (!error) { | 80 | if (!error) { |
81 | page_cache_get(page); | 81 | page_cache_get(page); |
@@ -108,10 +108,9 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | |||
108 | */ | 108 | */ |
109 | void __delete_from_swap_cache(struct page *page) | 109 | void __delete_from_swap_cache(struct page *page) |
110 | { | 110 | { |
111 | BUG_ON(!PageLocked(page)); | 111 | VM_BUG_ON(!PageLocked(page)); |
112 | BUG_ON(!PageSwapCache(page)); | 112 | VM_BUG_ON(!PageSwapCache(page)); |
113 | BUG_ON(PageWriteback(page)); | 113 | VM_BUG_ON(PageWriteback(page)); |
114 | BUG_ON(PagePrivate(page)); | ||
115 | 114 | ||
116 | radix_tree_delete(&swapper_space.page_tree, page_private(page)); | 115 | radix_tree_delete(&swapper_space.page_tree, page_private(page)); |
117 | set_page_private(page, 0); | 116 | set_page_private(page, 0); |
@@ -129,13 +128,13 @@ void __delete_from_swap_cache(struct page *page) | |||
129 | * Allocate swap space for the page and add the page to the | 128 | * Allocate swap space for the page and add the page to the |
130 | * swap cache. Caller needs to hold the page lock. | 129 | * swap cache. Caller needs to hold the page lock. |
131 | */ | 130 | */ |
132 | int add_to_swap(struct page * page, gfp_t gfp_mask) | 131 | int add_to_swap(struct page *page) |
133 | { | 132 | { |
134 | swp_entry_t entry; | 133 | swp_entry_t entry; |
135 | int err; | 134 | int err; |
136 | 135 | ||
137 | BUG_ON(!PageLocked(page)); | 136 | VM_BUG_ON(!PageLocked(page)); |
138 | BUG_ON(!PageUptodate(page)); | 137 | VM_BUG_ON(!PageUptodate(page)); |
139 | 138 | ||
140 | for (;;) { | 139 | for (;;) { |
141 | entry = get_swap_page(); | 140 | entry = get_swap_page(); |
@@ -154,7 +153,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask) | |||
154 | * Add it to the swap cache and mark it dirty | 153 | * Add it to the swap cache and mark it dirty |
155 | */ | 154 | */ |
156 | err = add_to_swap_cache(page, entry, | 155 | err = add_to_swap_cache(page, entry, |
157 | gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); | 156 | __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); |
158 | 157 | ||
159 | switch (err) { | 158 | switch (err) { |
160 | case 0: /* Success */ | 159 | case 0: /* Success */ |
@@ -196,14 +195,14 @@ void delete_from_swap_cache(struct page *page) | |||
196 | * If we are the only user, then try to free up the swap cache. | 195 | * If we are the only user, then try to free up the swap cache. |
197 | * | 196 | * |
198 | * Its ok to check for PageSwapCache without the page lock | 197 | * Its ok to check for PageSwapCache without the page lock |
199 | * here because we are going to recheck again inside | 198 | * here because we are going to recheck again inside |
200 | * exclusive_swap_page() _with_ the lock. | 199 | * try_to_free_swap() _with_ the lock. |
201 | * - Marcelo | 200 | * - Marcelo |
202 | */ | 201 | */ |
203 | static inline void free_swap_cache(struct page *page) | 202 | static inline void free_swap_cache(struct page *page) |
204 | { | 203 | { |
205 | if (PageSwapCache(page) && trylock_page(page)) { | 204 | if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) { |
206 | remove_exclusive_swap_page(page); | 205 | try_to_free_swap(page); |
207 | unlock_page(page); | 206 | unlock_page(page); |
208 | } | 207 | } |
209 | } | 208 | } |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 54a9f87e5162..eec5ca758a23 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/namei.h> | 16 | #include <linux/namei.h> |
17 | #include <linux/shm.h> | 17 | #include <linux/shm.h> |
18 | #include <linux/blkdev.h> | 18 | #include <linux/blkdev.h> |
19 | #include <linux/random.h> | ||
19 | #include <linux/writeback.h> | 20 | #include <linux/writeback.h> |
20 | #include <linux/proc_fs.h> | 21 | #include <linux/proc_fs.h> |
21 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
@@ -35,6 +36,7 @@ | |||
35 | 36 | ||
36 | static DEFINE_SPINLOCK(swap_lock); | 37 | static DEFINE_SPINLOCK(swap_lock); |
37 | static unsigned int nr_swapfiles; | 38 | static unsigned int nr_swapfiles; |
39 | long nr_swap_pages; | ||
38 | long total_swap_pages; | 40 | long total_swap_pages; |
39 | static int swap_overflow; | 41 | static int swap_overflow; |
40 | static int least_priority; | 42 | static int least_priority; |
@@ -83,15 +85,96 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | |||
83 | up_read(&swap_unplug_sem); | 85 | up_read(&swap_unplug_sem); |
84 | } | 86 | } |
85 | 87 | ||
88 | /* | ||
89 | * swapon tells the device that all the old swap contents can be discarded, | ||
90 | * to allow the swap device to optimize its wear-levelling. | ||
91 | */ | ||
92 | static int discard_swap(struct swap_info_struct *si) | ||
93 | { | ||
94 | struct swap_extent *se; | ||
95 | int err = 0; | ||
96 | |||
97 | list_for_each_entry(se, &si->extent_list, list) { | ||
98 | sector_t start_block = se->start_block << (PAGE_SHIFT - 9); | ||
99 | sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); | ||
100 | |||
101 | if (se->start_page == 0) { | ||
102 | /* Do not discard the swap header page! */ | ||
103 | start_block += 1 << (PAGE_SHIFT - 9); | ||
104 | nr_blocks -= 1 << (PAGE_SHIFT - 9); | ||
105 | if (!nr_blocks) | ||
106 | continue; | ||
107 | } | ||
108 | |||
109 | err = blkdev_issue_discard(si->bdev, start_block, | ||
110 | nr_blocks, GFP_KERNEL); | ||
111 | if (err) | ||
112 | break; | ||
113 | |||
114 | cond_resched(); | ||
115 | } | ||
116 | return err; /* That will often be -EOPNOTSUPP */ | ||
117 | } | ||
118 | |||
119 | /* | ||
120 | * swap allocation tells the device that a cluster of swap can now be discarded, | ||
121 | * to allow the swap device to optimize its wear-levelling. | ||
122 | */ | ||
123 | static void discard_swap_cluster(struct swap_info_struct *si, | ||
124 | pgoff_t start_page, pgoff_t nr_pages) | ||
125 | { | ||
126 | struct swap_extent *se = si->curr_swap_extent; | ||
127 | int found_extent = 0; | ||
128 | |||
129 | while (nr_pages) { | ||
130 | struct list_head *lh; | ||
131 | |||
132 | if (se->start_page <= start_page && | ||
133 | start_page < se->start_page + se->nr_pages) { | ||
134 | pgoff_t offset = start_page - se->start_page; | ||
135 | sector_t start_block = se->start_block + offset; | ||
136 | sector_t nr_blocks = se->nr_pages - offset; | ||
137 | |||
138 | if (nr_blocks > nr_pages) | ||
139 | nr_blocks = nr_pages; | ||
140 | start_page += nr_blocks; | ||
141 | nr_pages -= nr_blocks; | ||
142 | |||
143 | if (!found_extent++) | ||
144 | si->curr_swap_extent = se; | ||
145 | |||
146 | start_block <<= PAGE_SHIFT - 9; | ||
147 | nr_blocks <<= PAGE_SHIFT - 9; | ||
148 | if (blkdev_issue_discard(si->bdev, start_block, | ||
149 | nr_blocks, GFP_NOIO)) | ||
150 | break; | ||
151 | } | ||
152 | |||
153 | lh = se->list.next; | ||
154 | if (lh == &si->extent_list) | ||
155 | lh = lh->next; | ||
156 | se = list_entry(lh, struct swap_extent, list); | ||
157 | } | ||
158 | } | ||
159 | |||
160 | static int wait_for_discard(void *word) | ||
161 | { | ||
162 | schedule(); | ||
163 | return 0; | ||
164 | } | ||
165 | |||
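
Both discard helpers convert page-granular swap extents into 512-byte sector ranges before handing them to blkdev_issue_discard(), and the swapon-time pass deliberately skips the header page. A compilable sketch of just that arithmetic, with a printf standing in for the block-layer call (PAGE_SHIFT of 12 assumed; not kernel code):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT   12
#define SECTOR_SHIFT 9

static void issue_discard(uint64_t start_page, uint64_t nr_pages,
                          uint64_t extent_start_block)
{
    /* Convert pages to 512-byte sectors, as the block layer expects. */
    uint64_t start = extent_start_block << (PAGE_SHIFT - SECTOR_SHIFT);
    uint64_t nr    = nr_pages << (PAGE_SHIFT - SECTOR_SHIFT);

    if (start_page == 0) {
        /* Never discard the swap header page. */
        start += 1 << (PAGE_SHIFT - SECTOR_SHIFT);
        nr    -= 1 << (PAGE_SHIFT - SECTOR_SHIFT);
        if (!nr)
            return;
    }
    printf("discard sectors [%llu, +%llu)\n",
           (unsigned long long)start, (unsigned long long)nr);
}

int main(void)
{
    issue_discard(0, 256, 0);      /* first extent: header skipped */
    issue_discard(256, 256, 256);  /* later extent: discarded whole */
    return 0;
}
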
86 | #define SWAPFILE_CLUSTER 256 | 166 | #define SWAPFILE_CLUSTER 256 |
87 | #define LATENCY_LIMIT 256 | 167 | #define LATENCY_LIMIT 256 |
88 | 168 | ||
89 | static inline unsigned long scan_swap_map(struct swap_info_struct *si) | 169 | static inline unsigned long scan_swap_map(struct swap_info_struct *si) |
90 | { | 170 | { |
91 | unsigned long offset, last_in_cluster; | 171 | unsigned long offset; |
172 | unsigned long scan_base; | ||
173 | unsigned long last_in_cluster = 0; | ||
92 | int latency_ration = LATENCY_LIMIT; | 174 | int latency_ration = LATENCY_LIMIT; |
175 | int found_free_cluster = 0; | ||
93 | 176 | ||
94 | /* | 177 | /* |
95 | * We try to cluster swap pages by allocating them sequentially | 178 | * We try to cluster swap pages by allocating them sequentially |
96 | * in swap. Once we've allocated SWAPFILE_CLUSTER pages this | 179 | * in swap. Once we've allocated SWAPFILE_CLUSTER pages this |
97 | * way, however, we resort to first-free allocation, starting | 180 | * way, however, we resort to first-free allocation, starting |
@@ -99,16 +182,42 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) | |||
99 | * all over the entire swap partition, so that we reduce | 182 | * all over the entire swap partition, so that we reduce |
100 | * overall disk seek times between swap pages. -- sct | 183 | * overall disk seek times between swap pages. -- sct |
101 | * But we do now try to find an empty cluster. -Andrea | 184 | * But we do now try to find an empty cluster. -Andrea |
185 | * And we let swap pages go all over an SSD partition. Hugh | ||
102 | */ | 186 | */ |
103 | 187 | ||
104 | si->flags += SWP_SCANNING; | 188 | si->flags += SWP_SCANNING; |
105 | if (unlikely(!si->cluster_nr)) { | 189 | scan_base = offset = si->cluster_next; |
106 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 190 | |
107 | if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) | 191 | if (unlikely(!si->cluster_nr--)) { |
108 | goto lowest; | 192 | if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { |
193 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | ||
194 | goto checks; | ||
195 | } | ||
196 | if (si->flags & SWP_DISCARDABLE) { | ||
197 | /* | ||
198 | * Start range check on racing allocations, in case | ||
199 | * they overlap the cluster we eventually decide on | ||
200 | * (we scan without swap_lock to allow preemption). | ||
201 | * It's hardly conceivable that cluster_nr could be | ||
202 | * wrapped during our scan, but don't depend on it. | ||
203 | */ | ||
204 | if (si->lowest_alloc) | ||
205 | goto checks; | ||
206 | si->lowest_alloc = si->max; | ||
207 | si->highest_alloc = 0; | ||
208 | } | ||
109 | spin_unlock(&swap_lock); | 209 | spin_unlock(&swap_lock); |
110 | 210 | ||
111 | offset = si->lowest_bit; | 211 | /* |
212 | * If seek is expensive, start searching for new cluster from | ||
213 | * start of partition, to minimize the span of allocated swap. | ||
214 | * But if seek is cheap, search from our current position, so | ||
215 | * that swap is allocated from all over the partition: if the | ||
216 | * Flash Translation Layer only remaps within limited zones, | ||
217 | * we don't want to wear out the first zone too quickly. | ||
218 | */ | ||
219 | if (!(si->flags & SWP_SOLIDSTATE)) | ||
220 | scan_base = offset = si->lowest_bit; | ||
112 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; | 221 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; |
113 | 222 | ||
114 | /* Locate the first empty (unaligned) cluster */ | 223 | /* Locate the first empty (unaligned) cluster */ |
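
This hunk changes where a fresh free-cluster search begins: rotational devices still restart from lowest_bit to keep the allocated span (and seek distance) small, while SSDs continue from cluster_next so writes spread over the whole partition. A tiny model of that choice (the SWP_SOLIDSTATE value here is illustrative, not the kernel's):

#include <stdio.h>

#define SWP_SOLIDSTATE 0x0400   /* illustrative value only */

static unsigned long scan_start(unsigned int flags,
                                unsigned long cluster_next,
                                unsigned long lowest_bit)
{
    /*
     * Rotational disk: keep allocations dense near the start to
     * minimise seek span.  SSD: continue from the last position so
     * wear is spread over the whole partition.
     */
    return (flags & SWP_SOLIDSTATE) ? cluster_next : lowest_bit;
}

int main(void)
{
    printf("hdd starts at %lu\n", scan_start(0, 5000, 1));
    printf("ssd starts at %lu\n", scan_start(SWP_SOLIDSTATE, 5000, 1));
    return 0;
}
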
@@ -117,43 +226,124 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) | |||
117 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 226 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
118 | else if (offset == last_in_cluster) { | 227 | else if (offset == last_in_cluster) { |
119 | spin_lock(&swap_lock); | 228 | spin_lock(&swap_lock); |
120 | si->cluster_next = offset-SWAPFILE_CLUSTER+1; | 229 | offset -= SWAPFILE_CLUSTER - 1; |
121 | goto cluster; | 230 | si->cluster_next = offset; |
231 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | ||
232 | found_free_cluster = 1; | ||
233 | goto checks; | ||
122 | } | 234 | } |
123 | if (unlikely(--latency_ration < 0)) { | 235 | if (unlikely(--latency_ration < 0)) { |
124 | cond_resched(); | 236 | cond_resched(); |
125 | latency_ration = LATENCY_LIMIT; | 237 | latency_ration = LATENCY_LIMIT; |
126 | } | 238 | } |
127 | } | 239 | } |
240 | |||
241 | offset = si->lowest_bit; | ||
242 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; | ||
243 | |||
244 | /* Locate the first empty (unaligned) cluster */ | ||
245 | for (; last_in_cluster < scan_base; offset++) { | ||
246 | if (si->swap_map[offset]) | ||
247 | last_in_cluster = offset + SWAPFILE_CLUSTER; | ||
248 | else if (offset == last_in_cluster) { | ||
249 | spin_lock(&swap_lock); | ||
250 | offset -= SWAPFILE_CLUSTER - 1; | ||
251 | si->cluster_next = offset; | ||
252 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | ||
253 | found_free_cluster = 1; | ||
254 | goto checks; | ||
255 | } | ||
256 | if (unlikely(--latency_ration < 0)) { | ||
257 | cond_resched(); | ||
258 | latency_ration = LATENCY_LIMIT; | ||
259 | } | ||
260 | } | ||
261 | |||
262 | offset = scan_base; | ||
128 | spin_lock(&swap_lock); | 263 | spin_lock(&swap_lock); |
129 | goto lowest; | 264 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
265 | si->lowest_alloc = 0; | ||
130 | } | 266 | } |
131 | 267 | ||
132 | si->cluster_nr--; | 268 | checks: |
133 | cluster: | 269 | if (!(si->flags & SWP_WRITEOK)) |
134 | offset = si->cluster_next; | ||
135 | if (offset > si->highest_bit) | ||
136 | lowest: offset = si->lowest_bit; | ||
137 | checks: if (!(si->flags & SWP_WRITEOK)) | ||
138 | goto no_page; | 270 | goto no_page; |
139 | if (!si->highest_bit) | 271 | if (!si->highest_bit) |
140 | goto no_page; | 272 | goto no_page; |
141 | if (!si->swap_map[offset]) { | 273 | if (offset > si->highest_bit) |
142 | if (offset == si->lowest_bit) | 274 | scan_base = offset = si->lowest_bit; |
143 | si->lowest_bit++; | 275 | if (si->swap_map[offset]) |
144 | if (offset == si->highest_bit) | 276 | goto scan; |
145 | si->highest_bit--; | 277 | |
146 | si->inuse_pages++; | 278 | if (offset == si->lowest_bit) |
147 | if (si->inuse_pages == si->pages) { | 279 | si->lowest_bit++; |
148 | si->lowest_bit = si->max; | 280 | if (offset == si->highest_bit) |
149 | si->highest_bit = 0; | 281 | si->highest_bit--; |
282 | si->inuse_pages++; | ||
283 | if (si->inuse_pages == si->pages) { | ||
284 | si->lowest_bit = si->max; | ||
285 | si->highest_bit = 0; | ||
286 | } | ||
287 | si->swap_map[offset] = 1; | ||
288 | si->cluster_next = offset + 1; | ||
289 | si->flags -= SWP_SCANNING; | ||
290 | |||
291 | if (si->lowest_alloc) { | ||
292 | /* | ||
293 | * Only set when SWP_DISCARDABLE, and there's a scan | ||
294 | * for a free cluster in progress or just completed. | ||
295 | */ | ||
296 | if (found_free_cluster) { | ||
297 | /* | ||
298 | * To optimize wear-levelling, discard the | ||
299 | * old data of the cluster, taking care not to | ||
300 | * discard any of its pages that have already | ||
301 | * been allocated by racing tasks (offset has | ||
302 | * already stepped over any at the beginning). | ||
303 | */ | ||
304 | if (offset < si->highest_alloc && | ||
305 | si->lowest_alloc <= last_in_cluster) | ||
306 | last_in_cluster = si->lowest_alloc - 1; | ||
307 | si->flags |= SWP_DISCARDING; | ||
308 | spin_unlock(&swap_lock); | ||
309 | |||
310 | if (offset < last_in_cluster) | ||
311 | discard_swap_cluster(si, offset, | ||
312 | last_in_cluster - offset + 1); | ||
313 | |||
314 | spin_lock(&swap_lock); | ||
315 | si->lowest_alloc = 0; | ||
316 | si->flags &= ~SWP_DISCARDING; | ||
317 | |||
318 | smp_mb(); /* wake_up_bit advises this */ | ||
319 | wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); | ||
320 | |||
321 | } else if (si->flags & SWP_DISCARDING) { | ||
322 | /* | ||
323 | * Delay using pages allocated by racing tasks | ||
324 | * until the whole discard has been issued. We | ||
325 | * could defer that delay until swap_writepage, | ||
326 | * but it's easier to keep this self-contained. | ||
327 | */ | ||
328 | spin_unlock(&swap_lock); | ||
329 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), | ||
330 | wait_for_discard, TASK_UNINTERRUPTIBLE); | ||
331 | spin_lock(&swap_lock); | ||
332 | } else { | ||
333 | /* | ||
334 | * Note pages allocated by racing tasks while | ||
335 | * the scan for a free cluster is in progress, so | ||
336 | * that its final discard can exclude them. | ||
337 | */ | ||
338 | if (offset < si->lowest_alloc) | ||
339 | si->lowest_alloc = offset; | ||
340 | if (offset > si->highest_alloc) | ||
341 | si->highest_alloc = offset; | ||
150 | } | 342 | } |
151 | si->swap_map[offset] = 1; | ||
152 | si->cluster_next = offset + 1; | ||
153 | si->flags -= SWP_SCANNING; | ||
154 | return offset; | ||
155 | } | 343 | } |
344 | return offset; | ||
156 | 345 | ||
346 | scan: | ||
157 | spin_unlock(&swap_lock); | 347 | spin_unlock(&swap_lock); |
158 | while (++offset <= si->highest_bit) { | 348 | while (++offset <= si->highest_bit) { |
159 | if (!si->swap_map[offset]) { | 349 | if (!si->swap_map[offset]) { |
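
While the free-cluster scan runs without swap_lock, racing allocations are recorded in lowest_alloc/highest_alloc so the eventual discard of the chosen cluster can stop short of anything already handed out. A single-threaded userspace model of that range bookkeeping (variable names mirror the patch; not kernel code):

#include <stdio.h>

static unsigned long lowest_alloc, highest_alloc;

/* A racing allocator records the offsets it hands out during the scan. */
static void note_racing_alloc(unsigned long offset)
{
    if (!lowest_alloc)          /* no unlocked scan in progress */
        return;
    if (offset < lowest_alloc)
        lowest_alloc = offset;
    if (offset > highest_alloc)
        highest_alloc = offset;
}

/* The task that found the free cluster [offset, last] issues the discard,
 * trimmed so it cannot cover pages racing tasks have already taken. */
static void discard_range(unsigned long offset, unsigned long last)
{
    if (offset < highest_alloc && lowest_alloc <= last)
        last = lowest_alloc - 1;
    if (offset < last)
        printf("discard swap offsets [%lu, %lu]\n", offset, last);
    lowest_alloc = 0;           /* scan finished */
}

int main(void)
{
    lowest_alloc = 1UL << 20;   /* arm the range at scan start */
    highest_alloc = 0;
    note_racing_alloc(300);     /* another task grabbed offset 300 */
    discard_range(256, 511);    /* prints [256, 299] */
    return 0;
}
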
@@ -165,8 +355,18 @@ checks: if (!(si->flags & SWP_WRITEOK)) | |||
165 | latency_ration = LATENCY_LIMIT; | 355 | latency_ration = LATENCY_LIMIT; |
166 | } | 356 | } |
167 | } | 357 | } |
358 | offset = si->lowest_bit; | ||
359 | while (++offset < scan_base) { | ||
360 | if (!si->swap_map[offset]) { | ||
361 | spin_lock(&swap_lock); | ||
362 | goto checks; | ||
363 | } | ||
364 | if (unlikely(--latency_ration < 0)) { | ||
365 | cond_resched(); | ||
366 | latency_ration = LATENCY_LIMIT; | ||
367 | } | ||
368 | } | ||
168 | spin_lock(&swap_lock); | 369 | spin_lock(&swap_lock); |
169 | goto lowest; | ||
170 | 370 | ||
171 | no_page: | 371 | no_page: |
172 | si->flags -= SWP_SCANNING; | 372 | si->flags -= SWP_SCANNING; |
@@ -268,7 +468,7 @@ bad_nofile: | |||
268 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); | 468 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); |
269 | out: | 469 | out: |
270 | return NULL; | 470 | return NULL; |
271 | } | 471 | } |
272 | 472 | ||
273 | static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) | 473 | static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) |
274 | { | 474 | { |
@@ -326,97 +526,58 @@ static inline int page_swapcount(struct page *page) | |||
326 | } | 526 | } |
327 | 527 | ||
328 | /* | 528 | /* |
329 | * We can use this swap cache entry directly | 529 | * We can write to an anon page without COW if there are no other references |
330 | * if there are no other references to it. | 530 | * to it. And as a side-effect, free up its swap: because the old content |
531 | * on disk will never be read, and seeking back there to write new content | ||
532 | * later would only waste time away from clustering. | ||
331 | */ | 533 | */ |
332 | int can_share_swap_page(struct page *page) | 534 | int reuse_swap_page(struct page *page) |
333 | { | 535 | { |
334 | int count; | 536 | int count; |
335 | 537 | ||
336 | BUG_ON(!PageLocked(page)); | 538 | VM_BUG_ON(!PageLocked(page)); |
337 | count = page_mapcount(page); | 539 | count = page_mapcount(page); |
338 | if (count <= 1 && PageSwapCache(page)) | 540 | if (count <= 1 && PageSwapCache(page)) { |
339 | count += page_swapcount(page); | 541 | count += page_swapcount(page); |
542 | if (count == 1 && !PageWriteback(page)) { | ||
543 | delete_from_swap_cache(page); | ||
544 | SetPageDirty(page); | ||
545 | } | ||
546 | } | ||
340 | return count == 1; | 547 | return count == 1; |
341 | } | 548 | } |
342 | 549 | ||
343 | /* | 550 | /* |
344 | * Work out if there are any other processes sharing this | 551 | * If swap is getting full, or if there are no more mappings of this page, |
345 | * swap cache page. Free it if you can. Return success. | 552 | * then try_to_free_swap is called to free its swap space. |
346 | */ | 553 | */ |
347 | static int remove_exclusive_swap_page_count(struct page *page, int count) | 554 | int try_to_free_swap(struct page *page) |
348 | { | 555 | { |
349 | int retval; | 556 | VM_BUG_ON(!PageLocked(page)); |
350 | struct swap_info_struct * p; | ||
351 | swp_entry_t entry; | ||
352 | |||
353 | BUG_ON(PagePrivate(page)); | ||
354 | BUG_ON(!PageLocked(page)); | ||
355 | 557 | ||
356 | if (!PageSwapCache(page)) | 558 | if (!PageSwapCache(page)) |
357 | return 0; | 559 | return 0; |
358 | if (PageWriteback(page)) | 560 | if (PageWriteback(page)) |
359 | return 0; | 561 | return 0; |
360 | if (page_count(page) != count) /* us + cache + ptes */ | 562 | if (page_swapcount(page)) |
361 | return 0; | ||
362 | |||
363 | entry.val = page_private(page); | ||
364 | p = swap_info_get(entry); | ||
365 | if (!p) | ||
366 | return 0; | 563 | return 0; |
367 | 564 | ||
368 | /* Is the only swap cache user the cache itself? */ | 565 | delete_from_swap_cache(page); |
369 | retval = 0; | 566 | SetPageDirty(page); |
370 | if (p->swap_map[swp_offset(entry)] == 1) { | 567 | return 1; |
371 | /* Recheck the page count with the swapcache lock held.. */ | ||
372 | spin_lock_irq(&swapper_space.tree_lock); | ||
373 | if ((page_count(page) == count) && !PageWriteback(page)) { | ||
374 | __delete_from_swap_cache(page); | ||
375 | SetPageDirty(page); | ||
376 | retval = 1; | ||
377 | } | ||
378 | spin_unlock_irq(&swapper_space.tree_lock); | ||
379 | } | ||
380 | spin_unlock(&swap_lock); | ||
381 | |||
382 | if (retval) { | ||
383 | swap_free(entry); | ||
384 | page_cache_release(page); | ||
385 | } | ||
386 | |||
387 | return retval; | ||
388 | } | ||
389 | |||
390 | /* | ||
391 | * Most of the time the page should have two references: one for the | ||
392 | * process and one for the swap cache. | ||
393 | */ | ||
394 | int remove_exclusive_swap_page(struct page *page) | ||
395 | { | ||
396 | return remove_exclusive_swap_page_count(page, 2); | ||
397 | } | ||
398 | |||
399 | /* | ||
400 | * The pageout code holds an extra reference to the page. That raises | ||
401 | * the reference count to test for to 2 for a page that is only in the | ||
402 | * swap cache plus 1 for each process that maps the page. | ||
403 | */ | ||
404 | int remove_exclusive_swap_page_ref(struct page *page) | ||
405 | { | ||
406 | return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page)); | ||
407 | } | 568 | } |
408 | 569 | ||
409 | /* | 570 | /* |
410 | * Free the swap entry like above, but also try to | 571 | * Free the swap entry like above, but also try to |
411 | * free the page cache entry if it is the last user. | 572 | * free the page cache entry if it is the last user. |
412 | */ | 573 | */ |
413 | void free_swap_and_cache(swp_entry_t entry) | 574 | int free_swap_and_cache(swp_entry_t entry) |
414 | { | 575 | { |
415 | struct swap_info_struct * p; | 576 | struct swap_info_struct *p; |
416 | struct page *page = NULL; | 577 | struct page *page = NULL; |
417 | 578 | ||
418 | if (is_migration_entry(entry)) | 579 | if (is_migration_entry(entry)) |
419 | return; | 580 | return 1; |
420 | 581 | ||
421 | p = swap_info_get(entry); | 582 | p = swap_info_get(entry); |
422 | if (p) { | 583 | if (p) { |
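
reuse_swap_page() lets an anon page be written in place (no copy-on-write) when the sum of its map count and swap count is exactly one, and in that case it also drops the now-redundant swap copy. A reference-count model of that decision, with plain ints standing in for page_mapcount()/page_swapcount() (not kernel code):

#include <stdio.h>
#include <stdbool.h>

struct model_page {
    int mapcount;       /* ptes mapping the page */
    int swapcount;      /* swap references still outstanding */
    bool swap_cached;
    bool writeback;
    bool dirty;
};

static bool model_reuse_swap_page(struct model_page *p)
{
    int count = p->mapcount;

    if (count <= 1 && p->swap_cached) {
        count += p->swapcount;
        if (count == 1 && !p->writeback) {
            /* Sole owner: free the stale swap slot now. */
            p->swap_cached = false;
            p->swapcount = 0;
            p->dirty = true;
        }
    }
    return count == 1;
}

int main(void)
{
    struct model_page solo   = { .mapcount = 1, .swapcount = 0,
                                 .swap_cached = true };
    struct model_page shared = { .mapcount = 1, .swapcount = 1,
                                 .swap_cached = true };

    printf("solo reuse:   %d\n", model_reuse_swap_page(&solo));
    printf("shared reuse: %d\n", model_reuse_swap_page(&shared));
    return 0;
}
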
@@ -430,20 +591,19 @@ void free_swap_and_cache(swp_entry_t entry) | |||
430 | spin_unlock(&swap_lock); | 591 | spin_unlock(&swap_lock); |
431 | } | 592 | } |
432 | if (page) { | 593 | if (page) { |
433 | int one_user; | 594 | /* |
434 | 595 | * Not mapped elsewhere, or swap space full? Free it! | |
435 | BUG_ON(PagePrivate(page)); | 596 | * Also recheck PageSwapCache now page is locked (above). |
436 | one_user = (page_count(page) == 2); | 597 | */ |
437 | /* Only cache user (+us), or swap space full? Free it! */ | ||
438 | /* Also recheck PageSwapCache after page is locked (above) */ | ||
439 | if (PageSwapCache(page) && !PageWriteback(page) && | 598 | if (PageSwapCache(page) && !PageWriteback(page) && |
440 | (one_user || vm_swap_full())) { | 599 | (!page_mapped(page) || vm_swap_full())) { |
441 | delete_from_swap_cache(page); | 600 | delete_from_swap_cache(page); |
442 | SetPageDirty(page); | 601 | SetPageDirty(page); |
443 | } | 602 | } |
444 | unlock_page(page); | 603 | unlock_page(page); |
445 | page_cache_release(page); | 604 | page_cache_release(page); |
446 | } | 605 | } |
606 | return p != NULL; | ||
447 | } | 607 | } |
448 | 608 | ||
449 | #ifdef CONFIG_HIBERNATION | 609 | #ifdef CONFIG_HIBERNATION |
@@ -776,10 +936,10 @@ static int try_to_unuse(unsigned int type) | |||
776 | break; | 936 | break; |
777 | } | 937 | } |
778 | 938 | ||
779 | /* | 939 | /* |
780 | * Get a page for the entry, using the existing swap | 940 | * Get a page for the entry, using the existing swap |
781 | * cache page if there is one. Otherwise, get a clean | 941 | * cache page if there is one. Otherwise, get a clean |
782 | * page and read the swap into it. | 942 | * page and read the swap into it. |
783 | */ | 943 | */ |
784 | swap_map = &si->swap_map[i]; | 944 | swap_map = &si->swap_map[i]; |
785 | entry = swp_entry(type, i); | 945 | entry = swp_entry(type, i); |
@@ -930,7 +1090,16 @@ static int try_to_unuse(unsigned int type) | |||
930 | lock_page(page); | 1090 | lock_page(page); |
931 | wait_on_page_writeback(page); | 1091 | wait_on_page_writeback(page); |
932 | } | 1092 | } |
933 | if (PageSwapCache(page)) | 1093 | |
1094 | /* | ||
1095 | * It is conceivable that a racing task removed this page from | ||
1096 | * swap cache just before we acquired the page lock at the top, | ||
1097 | * or while we dropped it in unuse_mm(). The page might even | ||
1098 | * be back in swap cache on another swap area: that we must not | ||
1099 | * delete, since it may not have been written out to swap yet. | ||
1100 | */ | ||
1101 | if (PageSwapCache(page) && | ||
1102 | likely(page_private(page) == entry.val)) | ||
934 | delete_from_swap_cache(page); | 1103 | delete_from_swap_cache(page); |
935 | 1104 | ||
936 | /* | 1105 | /* |
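
The added check in try_to_unuse() only removes the page from swap cache when page_private() still matches the entry being unused, since the page may meanwhile have been freed from this swap area and re-added to another. A small model of that guard (not kernel code):

#include <stdio.h>
#include <stdbool.h>

struct model_page {
    bool swap_cached;
    unsigned long private;      /* swp_entry_t value when swap-cached */
};

static void model_unuse_cleanup(struct model_page *p, unsigned long entry)
{
    /* Only drop the swap-cache copy that belongs to our entry. */
    if (p->swap_cached && p->private == entry)
        p->swap_cached = false; /* delete_from_swap_cache() */
}

int main(void)
{
    struct model_page p = { .swap_cached = true, .private = 42 };

    model_unuse_cleanup(&p, 7);     /* stale entry: left alone */
    printf("%d\n", p.swap_cached);  /* 1 */
    model_unuse_cleanup(&p, 42);    /* matching entry: removed */
    printf("%d\n", p.swap_cached);  /* 0 */
    return 0;
}
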
@@ -1203,26 +1372,6 @@ out: | |||
1203 | return ret; | 1372 | return ret; |
1204 | } | 1373 | } |
1205 | 1374 | ||
1206 | #if 0 /* We don't need this yet */ | ||
1207 | #include <linux/backing-dev.h> | ||
1208 | int page_queue_congested(struct page *page) | ||
1209 | { | ||
1210 | struct backing_dev_info *bdi; | ||
1211 | |||
1212 | BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ | ||
1213 | |||
1214 | if (PageSwapCache(page)) { | ||
1215 | swp_entry_t entry = { .val = page_private(page) }; | ||
1216 | struct swap_info_struct *sis; | ||
1217 | |||
1218 | sis = get_swap_info_struct(swp_type(entry)); | ||
1219 | bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info; | ||
1220 | } else | ||
1221 | bdi = page->mapping->backing_dev_info; | ||
1222 | return bdi_write_congested(bdi); | ||
1223 | } | ||
1224 | #endif | ||
1225 | |||
1226 | asmlinkage long sys_swapoff(const char __user * specialfile) | 1375 | asmlinkage long sys_swapoff(const char __user * specialfile) |
1227 | { | 1376 | { |
1228 | struct swap_info_struct * p = NULL; | 1377 | struct swap_info_struct * p = NULL; |
@@ -1233,7 +1382,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) | |||
1233 | char * pathname; | 1382 | char * pathname; |
1234 | int i, type, prev; | 1383 | int i, type, prev; |
1235 | int err; | 1384 | int err; |
1236 | 1385 | ||
1237 | if (!capable(CAP_SYS_ADMIN)) | 1386 | if (!capable(CAP_SYS_ADMIN)) |
1238 | return -EPERM; | 1387 | return -EPERM; |
1239 | 1388 | ||
@@ -1253,7 +1402,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) | |||
1253 | spin_lock(&swap_lock); | 1402 | spin_lock(&swap_lock); |
1254 | for (type = swap_list.head; type >= 0; type = swap_info[type].next) { | 1403 | for (type = swap_list.head; type >= 0; type = swap_info[type].next) { |
1255 | p = swap_info + type; | 1404 | p = swap_info + type; |
1256 | if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { | 1405 | if (p->flags & SWP_WRITEOK) { |
1257 | if (p->swap_file->f_mapping == mapping) | 1406 | if (p->swap_file->f_mapping == mapping) |
1258 | break; | 1407 | break; |
1259 | } | 1408 | } |
@@ -1426,12 +1575,12 @@ static int swap_show(struct seq_file *swap, void *v) | |||
1426 | file = ptr->swap_file; | 1575 | file = ptr->swap_file; |
1427 | len = seq_path(swap, &file->f_path, " \t\n\\"); | 1576 | len = seq_path(swap, &file->f_path, " \t\n\\"); |
1428 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", | 1577 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", |
1429 | len < 40 ? 40 - len : 1, " ", | 1578 | len < 40 ? 40 - len : 1, " ", |
1430 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? | 1579 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? |
1431 | "partition" : "file\t", | 1580 | "partition" : "file\t", |
1432 | ptr->pages << (PAGE_SHIFT - 10), | 1581 | ptr->pages << (PAGE_SHIFT - 10), |
1433 | ptr->inuse_pages << (PAGE_SHIFT - 10), | 1582 | ptr->inuse_pages << (PAGE_SHIFT - 10), |
1434 | ptr->prio); | 1583 | ptr->prio); |
1435 | return 0; | 1584 | return 0; |
1436 | } | 1585 | } |
1437 | 1586 | ||
@@ -1487,12 +1636,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1487 | int i, prev; | 1636 | int i, prev; |
1488 | int error; | 1637 | int error; |
1489 | union swap_header *swap_header = NULL; | 1638 | union swap_header *swap_header = NULL; |
1490 | int swap_header_version; | ||
1491 | unsigned int nr_good_pages = 0; | 1639 | unsigned int nr_good_pages = 0; |
1492 | int nr_extents = 0; | 1640 | int nr_extents = 0; |
1493 | sector_t span; | 1641 | sector_t span; |
1494 | unsigned long maxpages = 1; | 1642 | unsigned long maxpages = 1; |
1495 | int swapfilesize; | 1643 | unsigned long swapfilepages; |
1496 | unsigned short *swap_map = NULL; | 1644 | unsigned short *swap_map = NULL; |
1497 | struct page *page = NULL; | 1645 | struct page *page = NULL; |
1498 | struct inode *inode = NULL; | 1646 | struct inode *inode = NULL; |
@@ -1570,7 +1718,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1570 | goto bad_swap; | 1718 | goto bad_swap; |
1571 | } | 1719 | } |
1572 | 1720 | ||
1573 | swapfilesize = i_size_read(inode) >> PAGE_SHIFT; | 1721 | swapfilepages = i_size_read(inode) >> PAGE_SHIFT; |
1574 | 1722 | ||
1575 | /* | 1723 | /* |
1576 | * Read the swap header. | 1724 | * Read the swap header. |
@@ -1584,101 +1732,86 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1584 | error = PTR_ERR(page); | 1732 | error = PTR_ERR(page); |
1585 | goto bad_swap; | 1733 | goto bad_swap; |
1586 | } | 1734 | } |
1587 | kmap(page); | 1735 | swap_header = kmap(page); |
1588 | swap_header = page_address(page); | ||
1589 | 1736 | ||
1590 | if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) | 1737 | if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { |
1591 | swap_header_version = 1; | ||
1592 | else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) | ||
1593 | swap_header_version = 2; | ||
1594 | else { | ||
1595 | printk(KERN_ERR "Unable to find swap-space signature\n"); | 1738 | printk(KERN_ERR "Unable to find swap-space signature\n"); |
1596 | error = -EINVAL; | 1739 | error = -EINVAL; |
1597 | goto bad_swap; | 1740 | goto bad_swap; |
1598 | } | 1741 | } |
1599 | 1742 | ||
1600 | switch (swap_header_version) { | 1743 | /* swap partition endianess hack... */ |
1601 | case 1: | 1744 | if (swab32(swap_header->info.version) == 1) { |
1602 | printk(KERN_ERR "version 0 swap is no longer supported. " | 1745 | swab32s(&swap_header->info.version); |
1603 | "Use mkswap -v1 %s\n", name); | 1746 | swab32s(&swap_header->info.last_page); |
1747 | swab32s(&swap_header->info.nr_badpages); | ||
1748 | for (i = 0; i < swap_header->info.nr_badpages; i++) | ||
1749 | swab32s(&swap_header->info.badpages[i]); | ||
1750 | } | ||
1751 | /* Check the swap header's sub-version */ | ||
1752 | if (swap_header->info.version != 1) { | ||
1753 | printk(KERN_WARNING | ||
1754 | "Unable to handle swap header version %d\n", | ||
1755 | swap_header->info.version); | ||
1604 | error = -EINVAL; | 1756 | error = -EINVAL; |
1605 | goto bad_swap; | 1757 | goto bad_swap; |
1606 | case 2: | 1758 | } |
1607 | /* swap partition endianess hack... */ | ||
1608 | if (swab32(swap_header->info.version) == 1) { | ||
1609 | swab32s(&swap_header->info.version); | ||
1610 | swab32s(&swap_header->info.last_page); | ||
1611 | swab32s(&swap_header->info.nr_badpages); | ||
1612 | for (i = 0; i < swap_header->info.nr_badpages; i++) | ||
1613 | swab32s(&swap_header->info.badpages[i]); | ||
1614 | } | ||
1615 | /* Check the swap header's sub-version and the size of | ||
1616 | the swap file and bad block lists */ | ||
1617 | if (swap_header->info.version != 1) { | ||
1618 | printk(KERN_WARNING | ||
1619 | "Unable to handle swap header version %d\n", | ||
1620 | swap_header->info.version); | ||
1621 | error = -EINVAL; | ||
1622 | goto bad_swap; | ||
1623 | } | ||
1624 | 1759 | ||
1625 | p->lowest_bit = 1; | 1760 | p->lowest_bit = 1; |
1626 | p->cluster_next = 1; | 1761 | p->cluster_next = 1; |
1627 | 1762 | ||
1628 | /* | 1763 | /* |
1629 | * Find out how many pages are allowed for a single swap | 1764 | * Find out how many pages are allowed for a single swap |
1630 | * device. There are two limiting factors: 1) the number of | 1765 | * device. There are two limiting factors: 1) the number of |
1631 | * bits for the swap offset in the swp_entry_t type and | 1766 | * bits for the swap offset in the swp_entry_t type and |
1632 | * 2) the number of bits in a swap pte as defined by | 1767 | * 2) the number of bits in a swap pte as defined by |
1633 | * the different architectures. In order to find the | 1768 | * the different architectures. In order to find the |
1634 | * largest possible bit mask a swap entry with swap type 0 | 1769 | * largest possible bit mask a swap entry with swap type 0 |
1635 | * and swap offset ~0UL is created, encoded to a swap pte, | 1770 | * and swap offset ~0UL is created, encoded to a swap pte, |
1636 | * decoded to a swp_entry_t again and finally the swap | 1771 | * decoded to a swp_entry_t again and finally the swap |
1637 | * offset is extracted. This will mask all the bits from | 1772 | * offset is extracted. This will mask all the bits from |
1638 | * the initial ~0UL mask that can't be encoded in either | 1773 | * the initial ~0UL mask that can't be encoded in either |
1639 | * the swp_entry_t or the architecture definition of a | 1774 | * the swp_entry_t or the architecture definition of a |
1640 | * swap pte. | 1775 | * swap pte. |
1641 | */ | 1776 | */ |
1642 | maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; | 1777 | maxpages = swp_offset(pte_to_swp_entry( |
1643 | if (maxpages > swap_header->info.last_page) | 1778 | swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; |
1644 | maxpages = swap_header->info.last_page; | 1779 | if (maxpages > swap_header->info.last_page) |
1645 | p->highest_bit = maxpages - 1; | 1780 | maxpages = swap_header->info.last_page; |
1781 | p->highest_bit = maxpages - 1; | ||
1646 | 1782 | ||
1647 | error = -EINVAL; | 1783 | error = -EINVAL; |
1648 | if (!maxpages) | 1784 | if (!maxpages) |
1649 | goto bad_swap; | 1785 | goto bad_swap; |
1650 | if (swapfilesize && maxpages > swapfilesize) { | 1786 | if (swapfilepages && maxpages > swapfilepages) { |
1651 | printk(KERN_WARNING | 1787 | printk(KERN_WARNING |
1652 | "Swap area shorter than signature indicates\n"); | 1788 | "Swap area shorter than signature indicates\n"); |
1653 | goto bad_swap; | 1789 | goto bad_swap; |
1654 | } | 1790 | } |
1655 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) | 1791 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) |
1656 | goto bad_swap; | 1792 | goto bad_swap; |
1657 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) | 1793 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) |
1658 | goto bad_swap; | 1794 | goto bad_swap; |
1659 | 1795 | ||
1660 | /* OK, set up the swap map and apply the bad block list */ | 1796 | /* OK, set up the swap map and apply the bad block list */ |
1661 | swap_map = vmalloc(maxpages * sizeof(short)); | 1797 | swap_map = vmalloc(maxpages * sizeof(short)); |
1662 | if (!swap_map) { | 1798 | if (!swap_map) { |
1663 | error = -ENOMEM; | 1799 | error = -ENOMEM; |
1664 | goto bad_swap; | 1800 | goto bad_swap; |
1665 | } | 1801 | } |
1666 | 1802 | ||
1667 | error = 0; | 1803 | memset(swap_map, 0, maxpages * sizeof(short)); |
1668 | memset(swap_map, 0, maxpages * sizeof(short)); | 1804 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
1669 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 1805 | int page_nr = swap_header->info.badpages[i]; |
1670 | int page_nr = swap_header->info.badpages[i]; | 1806 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { |
1671 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) | 1807 | error = -EINVAL; |
1672 | error = -EINVAL; | ||
1673 | else | ||
1674 | swap_map[page_nr] = SWAP_MAP_BAD; | ||
1675 | } | ||
1676 | nr_good_pages = swap_header->info.last_page - | ||
1677 | swap_header->info.nr_badpages - | ||
1678 | 1 /* header page */; | ||
1679 | if (error) | ||
1680 | goto bad_swap; | 1808 | goto bad_swap; |
1809 | } | ||
1810 | swap_map[page_nr] = SWAP_MAP_BAD; | ||
1681 | } | 1811 | } |
1812 | nr_good_pages = swap_header->info.last_page - | ||
1813 | swap_header->info.nr_badpages - | ||
1814 | 1 /* header page */; | ||
1682 | 1815 | ||
1683 | if (nr_good_pages) { | 1816 | if (nr_good_pages) { |
1684 | swap_map[0] = SWAP_MAP_BAD; | 1817 | swap_map[0] = SWAP_MAP_BAD; |
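
The rewritten header check accepts only the "SWAPSPACE2" signature (the old v0 "SWAP-SPACE" path is gone), and a byte-swapped version field is taken as a swap area made on an opposite-endian machine, so the info fields are swabbed before validation. A standalone sketch of that parse; the struct here is abbreviated (the real union swap_header also carries bootbits, a UUID, a label and the badpages array):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct swap_header_info {
    uint32_t version;
    uint32_t last_page;
    uint32_t nr_badpages;
    /* badpages[] would follow and need swabbing too */
};

static uint32_t swab32_u(uint32_t x)
{
    return (x >> 24) | ((x >> 8) & 0xff00) |
           ((x << 8) & 0xff0000) | (x << 24);
}

static int parse_header(const char magic[10], struct swap_header_info *info)
{
    if (memcmp("SWAPSPACE2", magic, 10))
        return -1;                      /* no signature */

    if (swab32_u(info->version) == 1) { /* made on other-endian machine */
        info->version = swab32_u(info->version);
        info->last_page = swab32_u(info->last_page);
        info->nr_badpages = swab32_u(info->nr_badpages);
    }
    return info->version == 1 ? 0 : -1; /* only sub-version 1 handled */
}

int main(void)
{
    struct swap_header_info native  = { 1, 1 << 18, 0 };
    struct swap_header_info swapped = { swab32_u(1), 0, 0 };

    printf("native:  %d\n", parse_header("SWAPSPACE2", &native));
    printf("swapped: %d\n", parse_header("SWAPSPACE2", &swapped));
    return 0;
}
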
@@ -1697,6 +1830,13 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1697 | goto bad_swap; | 1830 | goto bad_swap; |
1698 | } | 1831 | } |
1699 | 1832 | ||
1833 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | ||
1834 | p->flags |= SWP_SOLIDSTATE; | ||
1835 | p->cluster_next = 1 + (random32() % p->highest_bit); | ||
1836 | } | ||
1837 | if (discard_swap(p) == 0) | ||
1838 | p->flags |= SWP_DISCARDABLE; | ||
1839 | |||
1700 | mutex_lock(&swapon_mutex); | 1840 | mutex_lock(&swapon_mutex); |
1701 | spin_lock(&swap_lock); | 1841 | spin_lock(&swap_lock); |
1702 | if (swap_flags & SWAP_FLAG_PREFER) | 1842 | if (swap_flags & SWAP_FLAG_PREFER) |
@@ -1705,14 +1845,16 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1705 | else | 1845 | else |
1706 | p->prio = --least_priority; | 1846 | p->prio = --least_priority; |
1707 | p->swap_map = swap_map; | 1847 | p->swap_map = swap_map; |
1708 | p->flags = SWP_ACTIVE; | 1848 | p->flags |= SWP_WRITEOK; |
1709 | nr_swap_pages += nr_good_pages; | 1849 | nr_swap_pages += nr_good_pages; |
1710 | total_swap_pages += nr_good_pages; | 1850 | total_swap_pages += nr_good_pages; |
1711 | 1851 | ||
1712 | printk(KERN_INFO "Adding %uk swap on %s. " | 1852 | printk(KERN_INFO "Adding %uk swap on %s. " |
1713 | "Priority:%d extents:%d across:%lluk\n", | 1853 | "Priority:%d extents:%d across:%lluk %s%s\n", |
1714 | nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, | 1854 | nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, |
1715 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); | 1855 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
1856 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", | ||
1857 | (p->flags & SWP_DISCARDABLE) ? "D" : ""); | ||
1716 | 1858 | ||
1717 | /* insert swap space into swap_list: */ | 1859 | /* insert swap space into swap_list: */ |
1718 | prev = -1; | 1860 | prev = -1; |
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c deleted file mode 100644 index 3e67d575ee6e..000000000000 --- a/mm/tiny-shmem.c +++ /dev/null | |||
@@ -1,134 +0,0 @@ | |||
1 | /* | ||
2 | * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code | ||
3 | * | ||
4 | * Matt Mackall <mpm@selenic.com> January, 2004 | ||
5 | * derived from mm/shmem.c and fs/ramfs/inode.c | ||
6 | * | ||
7 | * This is intended for small system where the benefits of the full | ||
8 | * shmem code (swap-backed and resource-limited) are outweighed by | ||
9 | * their complexity. On systems without swap this code should be | ||
10 | * effectively equivalent, but much lighter weight. | ||
11 | */ | ||
12 | |||
13 | #include <linux/fs.h> | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/vfs.h> | ||
16 | #include <linux/mount.h> | ||
17 | #include <linux/file.h> | ||
18 | #include <linux/mm.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/swap.h> | ||
21 | #include <linux/ramfs.h> | ||
22 | |||
23 | static struct file_system_type tmpfs_fs_type = { | ||
24 | .name = "tmpfs", | ||
25 | .get_sb = ramfs_get_sb, | ||
26 | .kill_sb = kill_litter_super, | ||
27 | }; | ||
28 | |||
29 | static struct vfsmount *shm_mnt; | ||
30 | |||
31 | static int __init init_tmpfs(void) | ||
32 | { | ||
33 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); | ||
34 | |||
35 | shm_mnt = kern_mount(&tmpfs_fs_type); | ||
36 | BUG_ON(IS_ERR(shm_mnt)); | ||
37 | |||
38 | return 0; | ||
39 | } | ||
40 | module_init(init_tmpfs) | ||
41 | |||
42 | /** | ||
43 | * shmem_file_setup - get an unlinked file living in tmpfs | ||
44 | * @name: name for dentry (to be seen in /proc/<pid>/maps | ||
45 | * @size: size to be set for the file | ||
46 | * @flags: vm_flags | ||
47 | */ | ||
48 | struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | ||
49 | { | ||
50 | int error; | ||
51 | struct file *file; | ||
52 | struct inode *inode; | ||
53 | struct dentry *dentry, *root; | ||
54 | struct qstr this; | ||
55 | |||
56 | if (IS_ERR(shm_mnt)) | ||
57 | return (void *)shm_mnt; | ||
58 | |||
59 | error = -ENOMEM; | ||
60 | this.name = name; | ||
61 | this.len = strlen(name); | ||
62 | this.hash = 0; /* will go */ | ||
63 | root = shm_mnt->mnt_root; | ||
64 | dentry = d_alloc(root, &this); | ||
65 | if (!dentry) | ||
66 | goto put_memory; | ||
67 | |||
68 | error = -ENFILE; | ||
69 | file = get_empty_filp(); | ||
70 | if (!file) | ||
71 | goto put_dentry; | ||
72 | |||
73 | error = -ENOSPC; | ||
74 | inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); | ||
75 | if (!inode) | ||
76 | goto close_file; | ||
77 | |||
78 | d_instantiate(dentry, inode); | ||
79 | inode->i_size = size; | ||
80 | inode->i_nlink = 0; /* It is unlinked */ | ||
81 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | ||
82 | &ramfs_file_operations); | ||
83 | |||
84 | #ifndef CONFIG_MMU | ||
85 | error = ramfs_nommu_expand_for_mapping(inode, size); | ||
86 | if (error) | ||
87 | goto close_file; | ||
88 | #endif | ||
89 | return file; | ||
90 | |||
91 | close_file: | ||
92 | put_filp(file); | ||
93 | put_dentry: | ||
94 | dput(dentry); | ||
95 | put_memory: | ||
96 | return ERR_PTR(error); | ||
97 | } | ||
98 | EXPORT_SYMBOL_GPL(shmem_file_setup); | ||
99 | |||
100 | /** | ||
101 | * shmem_zero_setup - setup a shared anonymous mapping | ||
102 | * @vma: the vma to be mmapped is prepared by do_mmap_pgoff | ||
103 | */ | ||
104 | int shmem_zero_setup(struct vm_area_struct *vma) | ||
105 | { | ||
106 | struct file *file; | ||
107 | loff_t size = vma->vm_end - vma->vm_start; | ||
108 | |||
109 | file = shmem_file_setup("dev/zero", size, vma->vm_flags); | ||
110 | if (IS_ERR(file)) | ||
111 | return PTR_ERR(file); | ||
112 | |||
113 | if (vma->vm_file) | ||
114 | fput(vma->vm_file); | ||
115 | vma->vm_file = file; | ||
116 | vma->vm_ops = &generic_file_vm_ops; | ||
117 | return 0; | ||
118 | } | ||
119 | |||
120 | int shmem_unuse(swp_entry_t entry, struct page *page) | ||
121 | { | ||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | #ifndef CONFIG_MMU | ||
126 | unsigned long shmem_get_unmapped_area(struct file *file, | ||
127 | unsigned long addr, | ||
128 | unsigned long len, | ||
129 | unsigned long pgoff, | ||
130 | unsigned long flags) | ||
131 | { | ||
132 | return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags); | ||
133 | } | ||
134 | #endif | ||
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7465f22fec0c..c5db9a7264d9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/highmem.h> | 14 | #include <linux/highmem.h> |
15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
16 | #include <linux/spinlock.h> | 16 | #include <linux/spinlock.h> |
17 | #include <linux/mutex.h> | ||
17 | #include <linux/interrupt.h> | 18 | #include <linux/interrupt.h> |
18 | #include <linux/proc_fs.h> | 19 | #include <linux/proc_fs.h> |
19 | #include <linux/seq_file.h> | 20 | #include <linux/seq_file.h> |
@@ -381,8 +382,9 @@ found: | |||
381 | goto retry; | 382 | goto retry; |
382 | } | 383 | } |
383 | if (printk_ratelimit()) | 384 | if (printk_ratelimit()) |
384 | printk(KERN_WARNING "vmap allocation failed: " | 385 | printk(KERN_WARNING |
385 | "use vmalloc=<size> to increase size.\n"); | 386 | "vmap allocation for size %lu failed: " |
387 | "use vmalloc=<size> to increase size.\n", size); | ||
386 | return ERR_PTR(-EBUSY); | 388 | return ERR_PTR(-EBUSY); |
387 | } | 389 | } |
388 | 390 | ||
@@ -432,6 +434,27 @@ static void unmap_vmap_area(struct vmap_area *va) | |||
432 | vunmap_page_range(va->va_start, va->va_end); | 434 | vunmap_page_range(va->va_start, va->va_end); |
433 | } | 435 | } |
434 | 436 | ||
437 | static void vmap_debug_free_range(unsigned long start, unsigned long end) | ||
438 | { | ||
439 | /* | ||
440 | * Unmap page tables and force a TLB flush immediately if | ||
441 | * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free | ||
442 | * bugs similarly to those in linear kernel virtual address | ||
443 | * space after a page has been freed. | ||
444 | * | ||
445 | * All the lazy freeing logic is still retained, in order to | ||
446 | * minimise intrusiveness of this debugging feature. | ||
447 | * | ||
448 | * This is going to be *slow* (linear kernel virtual address | ||
449 | * debugging doesn't do a broadcast TLB flush so it is a lot | ||
450 | * faster). | ||
451 | */ | ||
452 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
453 | vunmap_page_range(start, end); | ||
454 | flush_tlb_kernel_range(start, end); | ||
455 | #endif | ||
456 | } | ||
457 | |||
435 | /* | 458 | /* |
436 | * lazy_max_pages is the maximum amount of virtual address space we gather up | 459 | * lazy_max_pages is the maximum amount of virtual address space we gather up |
437 | * before attempting to purge with a TLB flush. | 460 | * before attempting to purge with a TLB flush. |
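
vmap_debug_free_range() makes stale vmap pointers fault immediately under CONFIG_DEBUG_PAGEALLOC by unmapping the range and flushing the TLB at free time rather than waiting for the lazy purge. The userspace analogy below uses mprotect(PROT_NONE) to revoke access to a "freed" buffer; it illustrates the idea only, not the kernel mechanism:

#include <stddef.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 4096;
    char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED)
        return 1;

    buf[0] = 'x';               /* valid use */

    /* "Free" the buffer by revoking access instead of recycling it,
     * so a later stray dereference faults immediately. */
    mprotect(buf, len, PROT_NONE);

    /* buf[0] = 'y';   would SIGSEGV here, catching the use-after-free */

    munmap(buf, len);
    return 0;
}
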
@@ -472,7 +495,7 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); | |||
472 | static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | 495 | static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, |
473 | int sync, int force_flush) | 496 | int sync, int force_flush) |
474 | { | 497 | { |
475 | static DEFINE_SPINLOCK(purge_lock); | 498 | static DEFINE_MUTEX(purge_lock); |
476 | LIST_HEAD(valist); | 499 | LIST_HEAD(valist); |
477 | struct vmap_area *va; | 500 | struct vmap_area *va; |
478 | int nr = 0; | 501 | int nr = 0; |
@@ -483,10 +506,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | |||
483 | * the case that isn't actually used at the moment anyway. | 506 | * the case that isn't actually used at the moment anyway. |
484 | */ | 507 | */ |
485 | if (!sync && !force_flush) { | 508 | if (!sync && !force_flush) { |
486 | if (!spin_trylock(&purge_lock)) | 509 | if (!mutex_trylock(&purge_lock)) |
487 | return; | 510 | return; |
488 | } else | 511 | } else |
489 | spin_lock(&purge_lock); | 512 | mutex_lock(&purge_lock); |
490 | 513 | ||
491 | rcu_read_lock(); | 514 | rcu_read_lock(); |
492 | list_for_each_entry_rcu(va, &vmap_area_list, list) { | 515 | list_for_each_entry_rcu(va, &vmap_area_list, list) { |
@@ -518,7 +541,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | |||
518 | __free_vmap_area(va); | 541 | __free_vmap_area(va); |
519 | spin_unlock(&vmap_area_lock); | 542 | spin_unlock(&vmap_area_lock); |
520 | } | 543 | } |
521 | spin_unlock(&purge_lock); | 544 | mutex_unlock(&purge_lock); |
522 | } | 545 | } |
523 | 546 | ||
524 | /* | 547 | /* |
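
purge_lock becomes a mutex, presumably because the purge path may now sleep, while callers that purge only opportunistically still use a trylock and back off instead of waiting. A pthread model of that trylock-or-block pattern (not the kernel locking API):

#include <pthread.h>
#include <stdio.h>
#include <stdbool.h>

static pthread_mutex_t purge_lock = PTHREAD_MUTEX_INITIALIZER;

static void purge(bool sync)
{
    if (!sync) {
        /* Opportunistic caller: skip if someone else is already purging. */
        if (pthread_mutex_trylock(&purge_lock) != 0)
            return;
    } else {
        pthread_mutex_lock(&purge_lock);
    }

    /* ... walk the lazy-free list; this section may block ... */
    printf("purging (sync=%d)\n", sync);

    pthread_mutex_unlock(&purge_lock);
}

int main(void)
{
    purge(false);   /* would back off if contended */
    purge(true);    /* always waits its turn */
    return 0;
}
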
@@ -912,6 +935,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) | |||
912 | BUG_ON(addr & (PAGE_SIZE-1)); | 935 | BUG_ON(addr & (PAGE_SIZE-1)); |
913 | 936 | ||
914 | debug_check_no_locks_freed(mem, size); | 937 | debug_check_no_locks_freed(mem, size); |
938 | vmap_debug_free_range(addr, addr+size); | ||
915 | 939 | ||
916 | if (likely(count <= VMAP_MAX_ALLOC)) | 940 | if (likely(count <= VMAP_MAX_ALLOC)) |
917 | vb_free(mem, size); | 941 | vb_free(mem, size); |
@@ -1128,6 +1152,8 @@ struct vm_struct *remove_vm_area(const void *addr) | |||
1128 | if (va && va->flags & VM_VM_AREA) { | 1152 | if (va && va->flags & VM_VM_AREA) { |
1129 | struct vm_struct *vm = va->private; | 1153 | struct vm_struct *vm = va->private; |
1130 | struct vm_struct *tmp, **p; | 1154 | struct vm_struct *tmp, **p; |
1155 | |||
1156 | vmap_debug_free_range(va->va_start, va->va_end); | ||
1131 | free_unmap_vmap_area(va); | 1157 | free_unmap_vmap_area(va); |
1132 | vm->size -= PAGE_SIZE; | 1158 | vm->size -= PAGE_SIZE; |
1133 | 1159 | ||
@@ -1375,7 +1401,8 @@ void *vmalloc_user(unsigned long size) | |||
1375 | struct vm_struct *area; | 1401 | struct vm_struct *area; |
1376 | void *ret; | 1402 | void *ret; |
1377 | 1403 | ||
1378 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); | 1404 | ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, |
1405 | PAGE_KERNEL, -1, __builtin_return_address(0)); | ||
1379 | if (ret) { | 1406 | if (ret) { |
1380 | area = find_vm_area(ret); | 1407 | area = find_vm_area(ret); |
1381 | area->flags |= VM_USERMAP; | 1408 | area->flags |= VM_USERMAP; |
@@ -1420,7 +1447,8 @@ EXPORT_SYMBOL(vmalloc_node); | |||
1420 | 1447 | ||
1421 | void *vmalloc_exec(unsigned long size) | 1448 | void *vmalloc_exec(unsigned long size) |
1422 | { | 1449 | { |
1423 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); | 1450 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, |
1451 | -1, __builtin_return_address(0)); | ||
1424 | } | 1452 | } |
1425 | 1453 | ||
1426 | #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) | 1454 | #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) |
@@ -1440,7 +1468,8 @@ void *vmalloc_exec(unsigned long size) | |||
1440 | */ | 1468 | */ |
1441 | void *vmalloc_32(unsigned long size) | 1469 | void *vmalloc_32(unsigned long size) |
1442 | { | 1470 | { |
1443 | return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL); | 1471 | return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, |
1472 | -1, __builtin_return_address(0)); | ||
1444 | } | 1473 | } |
1445 | EXPORT_SYMBOL(vmalloc_32); | 1474 | EXPORT_SYMBOL(vmalloc_32); |
1446 | 1475 | ||
@@ -1456,7 +1485,8 @@ void *vmalloc_32_user(unsigned long size) | |||
1456 | struct vm_struct *area; | 1485 | struct vm_struct *area; |
1457 | void *ret; | 1486 | void *ret; |
1458 | 1487 | ||
1459 | ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); | 1488 | ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, |
1489 | -1, __builtin_return_address(0)); | ||
1460 | if (ret) { | 1490 | if (ret) { |
1461 | area = find_vm_area(ret); | 1491 | area = find_vm_area(ret); |
1462 | area->flags |= VM_USERMAP; | 1492 | area->flags |= VM_USERMAP; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index d196f46c8808..b07c48b09a93 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -52,6 +52,9 @@ struct scan_control { | |||
52 | /* Incremented by the number of inactive pages that were scanned */ | 52 | /* Incremented by the number of inactive pages that were scanned */ |
53 | unsigned long nr_scanned; | 53 | unsigned long nr_scanned; |
54 | 54 | ||
55 | /* Number of pages freed so far during a call to shrink_zones() */ | ||
56 | unsigned long nr_reclaimed; | ||
57 | |||
55 | /* This context's GFP mask */ | 58 | /* This context's GFP mask */ |
56 | gfp_t gfp_mask; | 59 | gfp_t gfp_mask; |
57 | 60 | ||
@@ -617,7 +620,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
617 | referenced && page_mapping_inuse(page)) | 620 | referenced && page_mapping_inuse(page)) |
618 | goto activate_locked; | 621 | goto activate_locked; |
619 | 622 | ||
620 | #ifdef CONFIG_SWAP | ||
621 | /* | 623 | /* |
622 | * Anonymous process memory has backing store? | 624 | * Anonymous process memory has backing store? |
623 | * Try to allocate it some swap space here. | 625 | * Try to allocate it some swap space here. |
@@ -625,20 +627,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
625 | if (PageAnon(page) && !PageSwapCache(page)) { | 627 | if (PageAnon(page) && !PageSwapCache(page)) { |
626 | if (!(sc->gfp_mask & __GFP_IO)) | 628 | if (!(sc->gfp_mask & __GFP_IO)) |
627 | goto keep_locked; | 629 | goto keep_locked; |
628 | switch (try_to_munlock(page)) { | 630 | if (!add_to_swap(page)) |
629 | case SWAP_FAIL: /* shouldn't happen */ | ||
630 | case SWAP_AGAIN: | ||
631 | goto keep_locked; | ||
632 | case SWAP_MLOCK: | ||
633 | goto cull_mlocked; | ||
634 | case SWAP_SUCCESS: | ||
635 | ; /* fall thru'; add to swap cache */ | ||
636 | } | ||
637 | if (!add_to_swap(page, GFP_ATOMIC)) | ||
638 | goto activate_locked; | 631 | goto activate_locked; |
639 | may_enter_fs = 1; | 632 | may_enter_fs = 1; |
640 | } | 633 | } |
641 | #endif /* CONFIG_SWAP */ | ||
642 | 634 | ||
643 | mapping = page_mapping(page); | 635 | mapping = page_mapping(page); |
644 | 636 | ||
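
With try_to_munlock() gone from this path and the CONFIG_SWAP ifdef dropped (add_to_swap() presumably now has a no-swap stub), the anon-page branch reduces to: no __GFP_IO means keep the page locked in place, and a failed swap allocation means reactivate it. A compact model of that decision (not kernel code):

#include <stdio.h>
#include <stdbool.h>

enum verdict { KEEP_LOCKED, ACTIVATE, CONTINUE_RECLAIM };

static enum verdict anon_page_verdict(bool gfp_allows_io,
                                      bool add_to_swap_ok)
{
    if (!gfp_allows_io)
        return KEEP_LOCKED;      /* cannot start swap I/O here */
    if (!add_to_swap_ok)
        return ACTIVATE;         /* no swap slot: keep the page active */
    return CONTINUE_RECLAIM;     /* now swap-backed, may be written out */
}

int main(void)
{
    printf("%d %d %d\n",
           anon_page_verdict(false, false),
           anon_page_verdict(true,  false),
           anon_page_verdict(true,  true));
    return 0;
}
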
@@ -752,6 +744,8 @@ free_it: | |||
752 | continue; | 744 | continue; |
753 | 745 | ||
754 | cull_mlocked: | 746 | cull_mlocked: |
747 | if (PageSwapCache(page)) | ||
748 | try_to_free_swap(page); | ||
755 | unlock_page(page); | 749 | unlock_page(page); |
756 | putback_lru_page(page); | 750 | putback_lru_page(page); |
757 | continue; | 751 | continue; |
@@ -759,7 +753,7 @@ cull_mlocked: | |||
759 | activate_locked: | 753 | activate_locked: |
760 | /* Not a candidate for swapping, so reclaim swap space. */ | 754 | /* Not a candidate for swapping, so reclaim swap space. */ |
761 | if (PageSwapCache(page) && vm_swap_full()) | 755 | if (PageSwapCache(page) && vm_swap_full()) |
762 | remove_exclusive_swap_page_ref(page); | 756 | try_to_free_swap(page); |
763 | VM_BUG_ON(PageActive(page)); | 757 | VM_BUG_ON(PageActive(page)); |
764 | SetPageActive(page); | 758 | SetPageActive(page); |
765 | pgactivate++; | 759 | pgactivate++; |
@@ -1173,11 +1167,6 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) | |||
1173 | zone->prev_priority = priority; | 1167 | zone->prev_priority = priority; |
1174 | } | 1168 | } |
1175 | 1169 | ||
1176 | static inline int zone_is_near_oom(struct zone *zone) | ||
1177 | { | ||
1178 | return zone->pages_scanned >= (zone_lru_pages(zone) * 3); | ||
1179 | } | ||
1180 | |||
1181 | /* | 1170 | /* |
1182 | * This moves pages from the active list to the inactive list. | 1171 | * This moves pages from the active list to the inactive list. |
1183 | * | 1172 | * |
@@ -1248,6 +1237,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1248 | list_add(&page->lru, &l_inactive); | 1237 | list_add(&page->lru, &l_inactive); |
1249 | } | 1238 | } |
1250 | 1239 | ||
1240 | /* | ||
1241 | * Move the pages to the [file or anon] inactive list. | ||
1242 | */ | ||
1243 | pagevec_init(&pvec, 1); | ||
1244 | pgmoved = 0; | ||
1245 | lru = LRU_BASE + file * LRU_FILE; | ||
1246 | |||
1251 | spin_lock_irq(&zone->lru_lock); | 1247 | spin_lock_irq(&zone->lru_lock); |
1252 | /* | 1248 | /* |
1253 | * Count referenced pages from currently used mappings as | 1249 | * Count referenced pages from currently used mappings as |
@@ -1255,15 +1251,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1255 | * This helps balance scan pressure between file and anonymous | 1251 | * This helps balance scan pressure between file and anonymous |
1256 | * pages in get_scan_ratio. | 1252 | * pages in get_scan_ratio. |
1257 | */ | 1253 | */ |
1258 | zone->recent_rotated[!!file] += pgmoved; | 1254 | if (scan_global_lru(sc)) |
1259 | 1255 | zone->recent_rotated[!!file] += pgmoved; | |
1260 | /* | ||
1261 | * Move the pages to the [file or anon] inactive list. | ||
1262 | */ | ||
1263 | pagevec_init(&pvec, 1); | ||
1264 | 1256 | ||
1265 | pgmoved = 0; | ||
1266 | lru = LRU_BASE + file * LRU_FILE; | ||
1267 | while (!list_empty(&l_inactive)) { | 1257 | while (!list_empty(&l_inactive)) { |
1268 | page = lru_to_page(&l_inactive); | 1258 | page = lru_to_page(&l_inactive); |
1269 | prefetchw_prev_lru_page(page, &l_inactive, flags); | 1259 | prefetchw_prev_lru_page(page, &l_inactive, flags); |
@@ -1336,12 +1326,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1336 | unsigned long anon_prio, file_prio; | 1326 | unsigned long anon_prio, file_prio; |
1337 | unsigned long ap, fp; | 1327 | unsigned long ap, fp; |
1338 | 1328 | ||
1339 | anon = zone_page_state(zone, NR_ACTIVE_ANON) + | ||
1340 | zone_page_state(zone, NR_INACTIVE_ANON); | ||
1341 | file = zone_page_state(zone, NR_ACTIVE_FILE) + | ||
1342 | zone_page_state(zone, NR_INACTIVE_FILE); | ||
1343 | free = zone_page_state(zone, NR_FREE_PAGES); | ||
1344 | |||
1345 | /* If we have no swap space, do not bother scanning anon pages. */ | 1329 | /* If we have no swap space, do not bother scanning anon pages. */ |
1346 | if (nr_swap_pages <= 0) { | 1330 | if (nr_swap_pages <= 0) { |
1347 | percent[0] = 0; | 1331 | percent[0] = 0; |
@@ -1349,6 +1333,12 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1349 | return; | 1333 | return; |
1350 | } | 1334 | } |
1351 | 1335 | ||
1336 | anon = zone_page_state(zone, NR_ACTIVE_ANON) + | ||
1337 | zone_page_state(zone, NR_INACTIVE_ANON); | ||
1338 | file = zone_page_state(zone, NR_ACTIVE_FILE) + | ||
1339 | zone_page_state(zone, NR_INACTIVE_FILE); | ||
1340 | free = zone_page_state(zone, NR_FREE_PAGES); | ||
1341 | |||
1352 | /* If we have very few page cache pages, force-scan anon pages. */ | 1342 | /* If we have very few page cache pages, force-scan anon pages. */ |
1353 | if (unlikely(file + free <= zone->pages_high)) { | 1343 | if (unlikely(file + free <= zone->pages_high)) { |
1354 | percent[0] = 100; | 1344 | percent[0] = 100; |
@@ -1408,14 +1398,15 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1408 | /* | 1398 | /* |
1409 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1399 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
1410 | */ | 1400 | */ |
1411 | static unsigned long shrink_zone(int priority, struct zone *zone, | 1401 | static void shrink_zone(int priority, struct zone *zone, |
1412 | struct scan_control *sc) | 1402 | struct scan_control *sc) |
1413 | { | 1403 | { |
1414 | unsigned long nr[NR_LRU_LISTS]; | 1404 | unsigned long nr[NR_LRU_LISTS]; |
1415 | unsigned long nr_to_scan; | 1405 | unsigned long nr_to_scan; |
1416 | unsigned long nr_reclaimed = 0; | ||
1417 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ | 1406 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ |
1418 | enum lru_list l; | 1407 | enum lru_list l; |
1408 | unsigned long nr_reclaimed = sc->nr_reclaimed; | ||
1409 | unsigned long swap_cluster_max = sc->swap_cluster_max; | ||
1419 | 1410 | ||
1420 | get_scan_ratio(zone, sc, percent); | 1411 | get_scan_ratio(zone, sc, percent); |
1421 | 1412 | ||
@@ -1431,7 +1422,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone, | |||
1431 | } | 1422 | } |
1432 | zone->lru[l].nr_scan += scan; | 1423 | zone->lru[l].nr_scan += scan; |
1433 | nr[l] = zone->lru[l].nr_scan; | 1424 | nr[l] = zone->lru[l].nr_scan; |
1434 | if (nr[l] >= sc->swap_cluster_max) | 1425 | if (nr[l] >= swap_cluster_max) |
1435 | zone->lru[l].nr_scan = 0; | 1426 | zone->lru[l].nr_scan = 0; |
1436 | else | 1427 | else |
1437 | nr[l] = 0; | 1428 | nr[l] = 0; |
@@ -1450,16 +1441,28 @@ static unsigned long shrink_zone(int priority, struct zone *zone, | |||
1450 | nr[LRU_INACTIVE_FILE]) { | 1441 | nr[LRU_INACTIVE_FILE]) { |
1451 | for_each_evictable_lru(l) { | 1442 | for_each_evictable_lru(l) { |
1452 | if (nr[l]) { | 1443 | if (nr[l]) { |
1453 | nr_to_scan = min(nr[l], | 1444 | nr_to_scan = min(nr[l], swap_cluster_max); |
1454 | (unsigned long)sc->swap_cluster_max); | ||
1455 | nr[l] -= nr_to_scan; | 1445 | nr[l] -= nr_to_scan; |
1456 | 1446 | ||
1457 | nr_reclaimed += shrink_list(l, nr_to_scan, | 1447 | nr_reclaimed += shrink_list(l, nr_to_scan, |
1458 | zone, sc, priority); | 1448 | zone, sc, priority); |
1459 | } | 1449 | } |
1460 | } | 1450 | } |
1451 | /* | ||
1452 | * On large memory systems, scan >> priority can become | ||
1453 | * really large. This is fine for the starting priority; | ||
1454 | * we want to put equal scanning pressure on each zone. | ||
1455 | * However, if the VM has a harder time of freeing pages, | ||
1456 | * with multiple processes reclaiming pages, the total | ||
1457 | * freeing target can get unreasonably large. | ||
1458 | */ | ||
1459 | if (nr_reclaimed > swap_cluster_max && | ||
1460 | priority < DEF_PRIORITY && !current_is_kswapd()) | ||
1461 | break; | ||
1461 | } | 1462 | } |
1462 | 1463 | ||
1464 | sc->nr_reclaimed = nr_reclaimed; | ||
1465 | |||
1463 | /* | 1466 | /* |
1464 | * Even if we did not try to evict anon pages at all, we want to | 1467 | * Even if we did not try to evict anon pages at all, we want to |
1465 | * rebalance the anon lru active/inactive ratio. | 1468 | * rebalance the anon lru active/inactive ratio. |
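
shrink_zone() now accumulates into sc->nr_reclaimed and, for a direct reclaimer scanning below DEF_PRIORITY, breaks out once roughly swap_cluster_max pages have been freed, so many concurrent reclaimers do not collectively overshoot the target; kswapd keeps going to balance the whole zone. A loop model of that bail-out (the per-batch page count is invented):

#include <stdio.h>
#include <stdbool.h>

#define DEF_PRIORITY 12

static unsigned long model_shrink_zone(int priority, bool is_kswapd,
                                       unsigned long swap_cluster_max)
{
    unsigned long nr_reclaimed = 0;
    int batch;

    for (batch = 0; batch < 16; batch++) {
        nr_reclaimed += 13;     /* pretend each batch frees 13 pages */

        /*
         * Direct reclaim at elevated pressure: stop once enough has
         * been freed; kswapd keeps scanning the whole zone.
         */
        if (nr_reclaimed > swap_cluster_max &&
            priority < DEF_PRIORITY && !is_kswapd)
            break;
    }
    return nr_reclaimed;
}

int main(void)
{
    printf("direct: %lu\n", model_shrink_zone(10, false, 32));
    printf("kswapd: %lu\n", model_shrink_zone(10, true, 32));
    return 0;
}
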
@@ -1470,7 +1473,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone, | |||
1470 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | 1473 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); |
1471 | 1474 | ||
1472 | throttle_vm_writeout(sc->gfp_mask); | 1475 | throttle_vm_writeout(sc->gfp_mask); |
1473 | return nr_reclaimed; | ||
1474 | } | 1476 | } |
1475 | 1477 | ||
1476 | /* | 1478 | /* |
@@ -1484,16 +1486,13 @@ static unsigned long shrink_zone(int priority, struct zone *zone, | |||
1484 | * b) The zones may be over pages_high but they must go *over* pages_high to | 1486 | * b) The zones may be over pages_high but they must go *over* pages_high to |
1485 | * satisfy the `incremental min' zone defense algorithm. | 1487 | * satisfy the `incremental min' zone defense algorithm. |
1486 | * | 1488 | * |
1487 | * Returns the number of reclaimed pages. | ||
1488 | * | ||
1489 | * If a zone is deemed to be full of pinned pages then just give it a light | 1489 | * If a zone is deemed to be full of pinned pages then just give it a light |
1490 | * scan then give up on it. | 1490 | * scan then give up on it. |
1491 | */ | 1491 | */ |
1492 | static unsigned long shrink_zones(int priority, struct zonelist *zonelist, | 1492 | static void shrink_zones(int priority, struct zonelist *zonelist, |
1493 | struct scan_control *sc) | 1493 | struct scan_control *sc) |
1494 | { | 1494 | { |
1495 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); | 1495 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); |
1496 | unsigned long nr_reclaimed = 0; | ||
1497 | struct zoneref *z; | 1496 | struct zoneref *z; |
1498 | struct zone *zone; | 1497 | struct zone *zone; |
1499 | 1498 | ||
@@ -1524,10 +1523,8 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, | |||
1524 | priority); | 1523 | priority); |
1525 | } | 1524 | } |
1526 | 1525 | ||
1527 | nr_reclaimed += shrink_zone(priority, zone, sc); | 1526 | shrink_zone(priority, zone, sc); |
1528 | } | 1527 | } |
1529 | |||
1530 | return nr_reclaimed; | ||
1531 | } | 1528 | } |
1532 | 1529 | ||
1533 | /* | 1530 | /* |
@@ -1552,7 +1549,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1552 | int priority; | 1549 | int priority; |
1553 | unsigned long ret = 0; | 1550 | unsigned long ret = 0; |
1554 | unsigned long total_scanned = 0; | 1551 | unsigned long total_scanned = 0; |
1555 | unsigned long nr_reclaimed = 0; | ||
1556 | struct reclaim_state *reclaim_state = current->reclaim_state; | 1552 | struct reclaim_state *reclaim_state = current->reclaim_state; |
1557 | unsigned long lru_pages = 0; | 1553 | unsigned long lru_pages = 0; |
1558 | struct zoneref *z; | 1554 | struct zoneref *z; |
@@ -1580,7 +1576,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1580 | sc->nr_scanned = 0; | 1576 | sc->nr_scanned = 0; |
1581 | if (!priority) | 1577 | if (!priority) |
1582 | disable_swap_token(); | 1578 | disable_swap_token(); |
1583 | nr_reclaimed += shrink_zones(priority, zonelist, sc); | 1579 | shrink_zones(priority, zonelist, sc); |
1584 | /* | 1580 | /* |
1585 | * Don't shrink slabs when reclaiming memory from | 1581 | * Don't shrink slabs when reclaiming memory from |
1586 | * over limit cgroups | 1582 | * over limit cgroups |
@@ -1588,13 +1584,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1588 | if (scan_global_lru(sc)) { | 1584 | if (scan_global_lru(sc)) { |
1589 | shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); | 1585 | shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); |
1590 | if (reclaim_state) { | 1586 | if (reclaim_state) { |
1591 | nr_reclaimed += reclaim_state->reclaimed_slab; | 1587 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; |
1592 | reclaim_state->reclaimed_slab = 0; | 1588 | reclaim_state->reclaimed_slab = 0; |
1593 | } | 1589 | } |
1594 | } | 1590 | } |
1595 | total_scanned += sc->nr_scanned; | 1591 | total_scanned += sc->nr_scanned; |
1596 | if (nr_reclaimed >= sc->swap_cluster_max) { | 1592 | if (sc->nr_reclaimed >= sc->swap_cluster_max) { |
1597 | ret = nr_reclaimed; | 1593 | ret = sc->nr_reclaimed; |
1598 | goto out; | 1594 | goto out; |
1599 | } | 1595 | } |
1600 | 1596 | ||
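
The effect on do_try_to_free_pages() is easier to see as a stripped-down priority loop: slab pages freed by the shrinkers are folded into the same running total, and the loop exits as soon as one batch has been reclaimed. The sketch below is a standalone model under those assumptions; reclaim_loop_model() is hypothetical and scan_pass stands in for one shrink_zones() plus shrink_slab() pass.

#define DEF_PRIORITY 12

/*
 * Standalone model of the priority loop in do_try_to_free_pages()
 * after this change.  scan_pass returns the pages freed by one pass,
 * including slab pages credited by the shrinkers.
 */
static unsigned long reclaim_loop_model(unsigned long swap_cluster_max,
					unsigned long (*scan_pass)(int priority))
{
	unsigned long nr_reclaimed = 0;
	int priority;

	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		nr_reclaimed += scan_pass(priority);
		if (nr_reclaimed >= swap_cluster_max)
			break;	/* one batch reclaimed: stop raising pressure */
	}
	return nr_reclaimed;
}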
@@ -1617,7 +1613,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1617 | } | 1613 | } |
1618 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 1614 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
1619 | if (!sc->all_unreclaimable && scan_global_lru(sc)) | 1615 | if (!sc->all_unreclaimable && scan_global_lru(sc)) |
1620 | ret = nr_reclaimed; | 1616 | ret = sc->nr_reclaimed; |
1621 | out: | 1617 | out: |
1622 | /* | 1618 | /* |
1623 | * Now that we've scanned all the zones at this priority level, note | 1619 | * Now that we've scanned all the zones at this priority level, note |
@@ -1712,7 +1708,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
1712 | int priority; | 1708 | int priority; |
1713 | int i; | 1709 | int i; |
1714 | unsigned long total_scanned; | 1710 | unsigned long total_scanned; |
1715 | unsigned long nr_reclaimed; | ||
1716 | struct reclaim_state *reclaim_state = current->reclaim_state; | 1711 | struct reclaim_state *reclaim_state = current->reclaim_state; |
1717 | struct scan_control sc = { | 1712 | struct scan_control sc = { |
1718 | .gfp_mask = GFP_KERNEL, | 1713 | .gfp_mask = GFP_KERNEL, |
@@ -1731,7 +1726,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
1731 | 1726 | ||
1732 | loop_again: | 1727 | loop_again: |
1733 | total_scanned = 0; | 1728 | total_scanned = 0; |
1734 | nr_reclaimed = 0; | 1729 | sc.nr_reclaimed = 0; |
1735 | sc.may_writepage = !laptop_mode; | 1730 | sc.may_writepage = !laptop_mode; |
1736 | count_vm_event(PAGEOUTRUN); | 1731 | count_vm_event(PAGEOUTRUN); |
1737 | 1732 | ||
@@ -1817,11 +1812,11 @@ loop_again: | |||
1817 | */ | 1812 | */ |
1818 | if (!zone_watermark_ok(zone, order, 8*zone->pages_high, | 1813 | if (!zone_watermark_ok(zone, order, 8*zone->pages_high, |
1819 | end_zone, 0)) | 1814 | end_zone, 0)) |
1820 | nr_reclaimed += shrink_zone(priority, zone, &sc); | 1815 | shrink_zone(priority, zone, &sc); |
1821 | reclaim_state->reclaimed_slab = 0; | 1816 | reclaim_state->reclaimed_slab = 0; |
1822 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 1817 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
1823 | lru_pages); | 1818 | lru_pages); |
1824 | nr_reclaimed += reclaim_state->reclaimed_slab; | 1819 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
1825 | total_scanned += sc.nr_scanned; | 1820 | total_scanned += sc.nr_scanned; |
1826 | if (zone_is_all_unreclaimable(zone)) | 1821 | if (zone_is_all_unreclaimable(zone)) |
1827 | continue; | 1822 | continue; |
@@ -1835,7 +1830,7 @@ loop_again: | |||
1835 | * even in laptop mode | 1830 | * even in laptop mode |
1836 | */ | 1831 | */ |
1837 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && | 1832 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && |
1838 | total_scanned > nr_reclaimed + nr_reclaimed / 2) | 1833 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
1839 | sc.may_writepage = 1; | 1834 | sc.may_writepage = 1; |
1840 | } | 1835 | } |
1841 | if (all_zones_ok) | 1836 | if (all_zones_ok) |
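
The laptop-mode condition touched in the hunk above is a cost/benefit test: once scanning has cost noticeably more than it has returned, kswapd starts writing dirty pages even in laptop mode. A minimal sketch of that test, with should_force_writepage() as a hypothetical name and SWAP_CLUSTER_MAX reproduced with its usual value of 32:

#include <stdbool.h>

#define SWAP_CLUSTER_MAX 32UL	/* kernel reclaim batch size */

/* Hypothetical helper mirroring the laptop-mode writeback test above. */
static bool should_force_writepage(unsigned long total_scanned,
				   unsigned long nr_reclaimed)
{
	/* Scanning exceeded two batches and 1.5x the pages actually reclaimed. */
	return total_scanned > SWAP_CLUSTER_MAX * 2 &&
	       total_scanned > nr_reclaimed + nr_reclaimed / 2;
}

For example, a pass that scanned 600 pages but reclaimed only 300 satisfies both tests (600 > 64 and 600 > 450), so writeback is enabled.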
@@ -1853,7 +1848,7 @@ loop_again: | |||
1853 | * matches the direct reclaim path behaviour in terms of impact | 1848 | * matches the direct reclaim path behaviour in terms of impact |
1854 | * on zone->*_priority. | 1849 | * on zone->*_priority. |
1855 | */ | 1850 | */ |
1856 | if (nr_reclaimed >= SWAP_CLUSTER_MAX) | 1851 | if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) |
1857 | break; | 1852 | break; |
1858 | } | 1853 | } |
1859 | out: | 1854 | out: |
@@ -1872,10 +1867,27 @@ out: | |||
1872 | 1867 | ||
1873 | try_to_freeze(); | 1868 | try_to_freeze(); |
1874 | 1869 | ||
1870 | /* | ||
1871 | * Fragmentation may mean that the system cannot be | ||
1872 | * rebalanced for high-order allocations in all zones. | ||
1873 | * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, | ||
1874 | * it means the zones have been fully scanned and are still | ||
1875 | * not balanced. For high-order allocations, there is | ||
1876 | * little point in trying all over again, as kswapd may | ||
1877 | * loop indefinitely. | ||
1878 | * | ||
1879 | * Instead, recheck all watermarks at order-0 as they | ||
1880 | * are the most important. If watermarks are ok, kswapd will go | ||
1881 | * back to sleep. High-order users can still perform direct | ||
1882 | * reclaim if they wish. | ||
1883 | */ | ||
1884 | if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) | ||
1885 | order = sc.order = 0; | ||
1886 | |||
1875 | goto loop_again; | 1887 | goto loop_again; |
1876 | } | 1888 | } |
1877 | 1889 | ||
1878 | return nr_reclaimed; | 1890 | return sc.nr_reclaimed; |
1879 | } | 1891 | } |
1880 | 1892 | ||
1881 | /* | 1893 | /* |
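
The fallback added to balance_pgdat() above boils down to one decision: if a full pass for a high-order request reclaimed less than a single batch, drop back to order-0 before looping again rather than chase an unreachable high-order target. A minimal sketch under that reading; next_balance_order() is a hypothetical name.

#define SWAP_CLUSTER_MAX 32UL

/*
 * Hypothetical helper mirroring the new kswapd fallback: returns the
 * order to use for the next balancing pass.
 */
static int next_balance_order(unsigned long nr_reclaimed, int order)
{
	if (nr_reclaimed < SWAP_CLUSTER_MAX)
		return 0;	/* recheck only the order-0 watermarks */
	return order;
}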
@@ -2227,7 +2239,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2227 | struct task_struct *p = current; | 2239 | struct task_struct *p = current; |
2228 | struct reclaim_state reclaim_state; | 2240 | struct reclaim_state reclaim_state; |
2229 | int priority; | 2241 | int priority; |
2230 | unsigned long nr_reclaimed = 0; | ||
2231 | struct scan_control sc = { | 2242 | struct scan_control sc = { |
2232 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 2243 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
2233 | .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 2244 | .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
@@ -2260,9 +2271,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2260 | priority = ZONE_RECLAIM_PRIORITY; | 2271 | priority = ZONE_RECLAIM_PRIORITY; |
2261 | do { | 2272 | do { |
2262 | note_zone_scanning_priority(zone, priority); | 2273 | note_zone_scanning_priority(zone, priority); |
2263 | nr_reclaimed += shrink_zone(priority, zone, &sc); | 2274 | shrink_zone(priority, zone, &sc); |
2264 | priority--; | 2275 | priority--; |
2265 | } while (priority >= 0 && nr_reclaimed < nr_pages); | 2276 | } while (priority >= 0 && sc.nr_reclaimed < nr_pages); |
2266 | } | 2277 | } |
2267 | 2278 | ||
2268 | slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); | 2279 | slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); |
@@ -2286,13 +2297,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2286 | * Update nr_reclaimed by the number of slab pages we | 2297 | * Update nr_reclaimed by the number of slab pages we |
2287 | * reclaimed from this zone. | 2298 | * reclaimed from this zone. |
2288 | */ | 2299 | */ |
2289 | nr_reclaimed += slab_reclaimable - | 2300 | sc.nr_reclaimed += slab_reclaimable - |
2290 | zone_page_state(zone, NR_SLAB_RECLAIMABLE); | 2301 | zone_page_state(zone, NR_SLAB_RECLAIMABLE); |
2291 | } | 2302 | } |
2292 | 2303 | ||
2293 | p->reclaim_state = NULL; | 2304 | p->reclaim_state = NULL; |
2294 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 2305 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
2295 | return nr_reclaimed >= nr_pages; | 2306 | return sc.nr_reclaimed >= nr_pages; |
2296 | } | 2307 | } |
2297 | 2308 | ||
2298 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 2309 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) |
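
In __zone_reclaim(), slab progress is measured indirectly: the reclaimable-slab counter is sampled before and after the shrinkers run, and the drop is credited to the cumulative count now carried in scan_control. A standalone model of that accounting; credit_slab_delta() is a hypothetical name and the underflow guard exists only in this model.

/*
 * Standalone model of the slab accounting in __zone_reclaim(): credit
 * the decrease in the reclaimable-slab page count to the running
 * reclaim total.
 */
static unsigned long credit_slab_delta(unsigned long slab_before,
				       unsigned long slab_after,
				       unsigned long nr_reclaimed)
{
	if (slab_after < slab_before)
		nr_reclaimed += slab_before - slab_after;
	return nr_reclaimed;
}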
@@ -2472,7 +2483,7 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) | |||
2472 | * back onto @zone's unevictable list. | 2483 | * back onto @zone's unevictable list. |
2473 | */ | 2484 | */ |
2474 | #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ | 2485 | #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ |
2475 | void scan_zone_unevictable_pages(struct zone *zone) | 2486 | static void scan_zone_unevictable_pages(struct zone *zone) |
2476 | { | 2487 | { |
2477 | struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; | 2488 | struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; |
2478 | unsigned long scan; | 2489 | unsigned long scan; |
@@ -2514,7 +2525,7 @@ void scan_zone_unevictable_pages(struct zone *zone) | |||
2514 | * that has possibly/probably made some previously unevictable pages | 2525 | * that has possibly/probably made some previously unevictable pages |
2515 | * evictable. | 2526 | * evictable. |
2516 | */ | 2527 | */ |
2517 | void scan_all_zones_unevictable_pages(void) | 2528 | static void scan_all_zones_unevictable_pages(void) |
2518 | { | 2529 | { |
2519 | struct zone *zone; | 2530 | struct zone *zone; |
2520 | 2531 | ||