path: root/mm
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |    6
-rw-r--r--  mm/Makefile           |    4
-rw-r--r--  mm/backing-dev.c      |    6
-rw-r--r--  mm/bootmem.c          |    8
-rw-r--r--  mm/filemap.c          |   30
-rw-r--r--  mm/filemap_xip.c      |    2
-rw-r--r--  mm/fremap.c           |    2
-rw-r--r--  mm/hugetlb.c          |   46
-rw-r--r--  mm/internal.h         |    2
-rw-r--r--  mm/memcontrol.c       |    3
-rw-r--r--  mm/memory.c           |  176
-rw-r--r--  mm/memory_hotplug.c   |   20
-rw-r--r--  mm/migrate.c          |   89
-rw-r--r--  mm/mlock.c            |    9
-rw-r--r--  mm/mmap.c             |   22
-rw-r--r--  mm/mprotect.c         |    6
-rw-r--r--  mm/oom_kill.c         |  109
-rw-r--r--  mm/page-writeback.c   |  245
-rw-r--r--  mm/page_alloc.c       |  135
-rw-r--r--  mm/page_cgroup.c      |    2
-rw-r--r--  mm/page_io.c          |    6
-rw-r--r--  mm/rmap.c             |   60
-rw-r--r--  mm/shmem.c            |   82
-rw-r--r--  mm/swap.c             |   44
-rw-r--r--  mm/swap_state.c       |   31
-rw-r--r--  mm/swapfile.c         |  576
-rw-r--r--  mm/tiny-shmem.c       |  134
-rw-r--r--  mm/vmalloc.c          |   50
-rw-r--r--  mm/vmscan.c           |  143
29 files changed, 1183 insertions, 865 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 5b5790f8a816..a5b77811fdf2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -181,12 +181,6 @@ config MIGRATION
 	  example on NUMA systems to put pages nearer to the processors accessing
 	  the page.
 
-config RESOURCES_64BIT
-	bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
-	default 64BIT
-	help
-	  This option allows memory and IO resources to be 64 bit.
-
 config PHYS_ADDR_T_64BIT
 	def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
 
diff --git a/mm/Makefile b/mm/Makefile
index 51c27709cc7c..72255be57f89 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o pdflush.o \
-			   readahead.o swap.o truncate.o vmscan.o \
+			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   page_isolation.o mm_init.o $(mmu-y)
 
@@ -21,9 +21,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
-obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
-obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_SLAB) += slab.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index a7c6c5613ec9..8e8587444132 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -24,9 +24,9 @@ static void bdi_debug_init(void)
 static int bdi_debug_stats_show(struct seq_file *m, void *v)
 {
 	struct backing_dev_info *bdi = m->private;
-	long background_thresh;
-	long dirty_thresh;
-	long bdi_thresh;
+	unsigned long background_thresh;
+	unsigned long dirty_thresh;
+	unsigned long bdi_thresh;
 
 	get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
 
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ac5a891f142a..51a0ccf61e0e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -435,6 +435,10 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 	unsigned long fallback = 0;
 	unsigned long min, max, start, sidx, midx, step;
 
+	bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
+		bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
+		align, goal, limit);
+
 	BUG_ON(!size);
 	BUG_ON(align & (align - 1));
 	BUG_ON(limit && goal + size > limit);
@@ -442,10 +446,6 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 	if (!bdata->node_bootmem_map)
 		return NULL;
 
-	bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
-		bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
-		align, goal, limit);
-
 	min = bdata->node_min_pfn;
 	max = bdata->node_low_pfn;
 
diff --git a/mm/filemap.c b/mm/filemap.c
index f5769b4dc075..2f55a1e2baf7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -210,7 +210,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 	int ret;
 	struct writeback_control wbc = {
 		.sync_mode = sync_mode,
-		.nr_to_write = mapping->nrpages * 2,
+		.nr_to_write = LONG_MAX,
 		.range_start = start,
 		.range_end = end,
 	};
@@ -741,7 +741,14 @@ repeat:
 		page = __page_cache_alloc(gfp_mask);
 		if (!page)
 			return NULL;
-		err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
+		/*
+		 * We want a regular kernel memory (not highmem or DMA etc)
+		 * allocation for the radix tree nodes, but we need to honour
+		 * the context-specific requirements the caller has asked for.
+		 * GFP_RECLAIM_MASK collects those requirements.
+		 */
+		err = add_to_page_cache_lru(page, mapping, index,
+						(gfp_mask & GFP_RECLAIM_MASK));
 		if (unlikely(err)) {
 			page_cache_release(page);
 			page = NULL;
@@ -950,7 +957,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
 		return NULL;
 	}
 	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
-	if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
+	if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
 		page_cache_release(page);
 		page = NULL;
 	}
@@ -1317,7 +1324,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		goto out; /* skip atime */
 	size = i_size_read(inode);
 	if (pos < size) {
-		retval = filemap_write_and_wait(mapping);
+		retval = filemap_write_and_wait_range(mapping, pos,
+					pos + iov_length(iov, nr_segs) - 1);
 		if (!retval) {
 			retval = mapping->a_ops->direct_IO(READ, iocb,
 						iov, pos, nr_segs);
@@ -1530,7 +1538,6 @@ retry_find:
 	/*
 	 * Found the page and have a reference on it.
 	 */
-	mark_page_accessed(page);
 	ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
 	vmf->page = page;
 	return ret | VM_FAULT_LOCKED;
@@ -2060,18 +2067,10 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	if (count != ocount)
 		*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
 
-	/*
-	 * Unmap all mmappings of the file up-front.
-	 *
-	 * This will cause any pte dirty bits to be propagated into the
-	 * pageframes for the subsequent filemap_write_and_wait().
-	 */
 	write_len = iov_length(iov, *nr_segs);
 	end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
-	if (mapping_mapped(mapping))
-		unmap_mapping_range(mapping, pos, write_len, 0);
 
-	written = filemap_write_and_wait(mapping);
+	written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
 	if (written)
 		goto out;
 
@@ -2291,7 +2290,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	 * the file data here, to try to honour O_DIRECT expectations.
 	 */
 	if (unlikely(file->f_flags & O_DIRECT) && written)
-		status = filemap_write_and_wait(mapping);
+		status = filemap_write_and_wait_range(mapping,
+					pos, pos + written - 1);
 
 	return written ? written : status;
 }
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b5167dfb2f2d..0c04615651b7 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -193,7 +193,7 @@ retry:
 		/* Nuke the page table entry. */
 		flush_cache_page(vma, address, pte_pfn(*pte));
 		pteval = ptep_clear_flush_notify(vma, address, pte);
-		page_remove_rmap(page, vma);
+		page_remove_rmap(page);
 		dec_mm_counter(mm, file_rss);
 		BUG_ON(pte_dirty(pteval));
 		pte_unmap_unlock(pte, ptl);
diff --git a/mm/fremap.c b/mm/fremap.c
index 7d12ca70ef7b..62d5bbda921a 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (page) {
 		if (pte_dirty(pte))
 			set_page_dirty(page);
-		page_remove_rmap(page, vma);
+		page_remove_rmap(page);
 		page_cache_release(page);
 		update_hiwater_rss(mm);
 		dec_mm_counter(mm, file_rss);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6058b53dcb89..618e98304080 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -220,6 +220,35 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
 }
 
 /*
+ * Return the size of the pages allocated when backing a VMA. In the majority
+ * cases this will be same size as used by the page table entries.
+ */
+unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
+{
+	struct hstate *hstate;
+
+	if (!is_vm_hugetlb_page(vma))
+		return PAGE_SIZE;
+
+	hstate = hstate_vma(vma);
+
+	return 1UL << (hstate->order + PAGE_SHIFT);
+}
+
+/*
+ * Return the page size being used by the MMU to back a VMA. In the majority
+ * of cases, the page size used by the kernel matches the MMU size. On
+ * architectures where it differs, an architecture-specific version of this
+ * function is required.
+ */
+#ifndef vma_mmu_pagesize
+unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
+{
+	return vma_kernel_pagesize(vma);
+}
+#endif
+
+/*
  * Flags for MAP_PRIVATE reservations. These are stored in the bottom
  * bits of the reservation map pointer, which are always clear due to
  * alignment.
@@ -371,8 +400,10 @@ static void clear_huge_page(struct page *page,
 {
 	int i;
 
-	if (unlikely(sz > MAX_ORDER_NR_PAGES))
-		return clear_gigantic_page(page, addr, sz);
+	if (unlikely(sz > MAX_ORDER_NR_PAGES)) {
+		clear_gigantic_page(page, addr, sz);
+		return;
+	}
 
 	might_sleep();
 	for (i = 0; i < sz/PAGE_SIZE; i++) {
@@ -404,8 +435,10 @@ static void copy_huge_page(struct page *dst, struct page *src,
 	int i;
 	struct hstate *h = hstate_vma(vma);
 
-	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES))
-		return copy_gigantic_page(dst, src, addr, vma);
+	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+		copy_gigantic_page(dst, src, addr, vma);
+		return;
+	}
 
 	might_sleep();
 	for (i = 0; i < pages_per_huge_page(h); i++) {
@@ -972,7 +1005,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	return page;
 }
 
-__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
+int __weak alloc_bootmem_huge_page(struct hstate *h)
 {
 	struct huge_bootmem_page *m;
 	int nr_nodes = nodes_weight(node_online_map);
@@ -991,8 +1024,7 @@ __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
 		 * puts them into the mem_map).
 		 */
 		m = addr;
-		if (m)
-			goto found;
+		goto found;
 	}
 	hstate_next_node(h);
 	nr_nodes--;
diff --git a/mm/internal.h b/mm/internal.h
index 13333bc2eb68..478223b73a2a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -49,6 +49,7 @@ extern void putback_lru_page(struct page *page);
 /*
  * in mm/page_alloc.c
  */
+extern unsigned long highest_memmap_pfn;
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
 
 /*
@@ -275,6 +276,7 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 #define GUP_FLAGS_WRITE                  0x1
 #define GUP_FLAGS_FORCE                  0x2
 #define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
+#define GUP_FLAGS_IGNORE_SIGKILL         0x8
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		     unsigned long start, int len, int flags,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 866dcc7eeb0c..51ee96545579 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -779,7 +779,8 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 	return 0;
 }
 
-int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
+static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
+				unsigned long long val)
 {
 
 	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
diff --git a/mm/memory.c b/mm/memory.c
index 7b9db658aca2..3f8fa06b963b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -52,6 +52,9 @@
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/swapops.h>
+#include <linux/elf.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -59,9 +62,6 @@
 #include <asm/tlbflush.h>
 #include <asm/pgtable.h>
 
-#include <linux/swapops.h>
-#include <linux/elf.h>
-
 #include "internal.h"
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -375,15 +375,65 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
  *
  * The calling function must still handle the error.
  */
-static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
-			  unsigned long vaddr)
+static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
+			  pte_t pte, struct page *page)
 {
-	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
-			"vm_flags = %lx, vaddr = %lx\n",
-		(long long)pte_val(pte),
-		(vma->vm_mm == current->mm ? current->comm : "???"),
-		vma->vm_flags, vaddr);
+	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
+	pud_t *pud = pud_offset(pgd, addr);
+	pmd_t *pmd = pmd_offset(pud, addr);
+	struct address_space *mapping;
+	pgoff_t index;
+	static unsigned long resume;
+	static unsigned long nr_shown;
+	static unsigned long nr_unshown;
+
+	/*
+	 * Allow a burst of 60 reports, then keep quiet for that minute;
+	 * or allow a steady drip of one report per second.
+	 */
+	if (nr_shown == 60) {
+		if (time_before(jiffies, resume)) {
+			nr_unshown++;
+			return;
+		}
+		if (nr_unshown) {
+			printk(KERN_ALERT
+				"BUG: Bad page map: %lu messages suppressed\n",
+				nr_unshown);
+			nr_unshown = 0;
+		}
+		nr_shown = 0;
+	}
+	if (nr_shown++ == 0)
+		resume = jiffies + 60 * HZ;
+
+	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
+	index = linear_page_index(vma, addr);
+
+	printk(KERN_ALERT
+		"BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
+		current->comm,
+		(long long)pte_val(pte), (long long)pmd_val(*pmd));
+	if (page) {
+		printk(KERN_ALERT
+		"page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
+		page, (void *)page->flags, page_count(page),
+		page_mapcount(page), page->mapping, page->index);
+	}
+	printk(KERN_ALERT
+		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
+		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
+	/*
+	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
+	 */
+	if (vma->vm_ops)
+		print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
+				(unsigned long)vma->vm_ops->fault);
+	if (vma->vm_file && vma->vm_file->f_op)
+		print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
+				(unsigned long)vma->vm_file->f_op->mmap);
 	dump_stack();
+	add_taint(TAINT_BAD_PAGE);
 }
 
 static inline int is_cow_mapping(unsigned int flags)
@@ -441,21 +491,18 @@ static inline int is_cow_mapping(unsigned int flags)
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 				pte_t pte)
 {
-	unsigned long pfn;
+	unsigned long pfn = pte_pfn(pte);
 
 	if (HAVE_PTE_SPECIAL) {
-		if (likely(!pte_special(pte))) {
-			VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-			return pte_page(pte);
-		}
-		VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
+		if (likely(!pte_special(pte)))
+			goto check_pfn;
+		if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
+			print_bad_pte(vma, addr, pte, NULL);
 		return NULL;
 	}
 
 	/* !HAVE_PTE_SPECIAL case follows: */
 
-	pfn = pte_pfn(pte);
-
 	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
 		if (vma->vm_flags & VM_MIXEDMAP) {
 			if (!pfn_valid(pfn))
@@ -471,11 +518,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		}
 	}
 
-	VM_BUG_ON(!pfn_valid(pfn));
+check_pfn:
+	if (unlikely(pfn > highest_memmap_pfn)) {
+		print_bad_pte(vma, addr, pte, NULL);
+		return NULL;
+	}
 
 	/*
 	 * NOTE! We still have PageReserved() pages in the page tables.
-	 *
 	 * eg. VDSO mappings can cause them to exist.
 	 */
 out:
@@ -767,11 +817,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			else {
 				if (pte_dirty(ptent))
 					set_page_dirty(page);
-				if (pte_young(ptent))
-					SetPageReferenced(page);
+				if (pte_young(ptent) &&
+				    likely(!VM_SequentialReadHint(vma)))
+					mark_page_accessed(page);
 				file_rss--;
 			}
-			page_remove_rmap(page, vma);
+			page_remove_rmap(page);
+			if (unlikely(page_mapcount(page) < 0))
+				print_bad_pte(vma, addr, ptent, page);
 			tlb_remove_page(tlb, page);
 			continue;
 		}
@@ -781,8 +834,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		 */
 		if (unlikely(details))
 			continue;
-		if (!pte_file(ptent))
-			free_swap_and_cache(pte_to_swp_entry(ptent));
+		if (pte_file(ptent)) {
+			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
+				print_bad_pte(vma, addr, ptent, NULL);
+		} else if
+		  (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
+			print_bad_pte(vma, addr, ptent, NULL);
 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
 
@@ -1153,6 +1210,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	int write = !!(flags & GUP_FLAGS_WRITE);
 	int force = !!(flags & GUP_FLAGS_FORCE);
 	int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
+	int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
 
 	if (len <= 0)
 		return 0;
@@ -1231,12 +1289,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		struct page *page;
 
 		/*
-		 * If tsk is ooming, cut off its access to large memory
-		 * allocations. It has a pending SIGKILL, but it can't
-		 * be processed until returning to user space.
+		 * If we have a pending SIGKILL, don't keep faulting
+		 * pages and potentially allocating memory, unless
+		 * current is handling munlock--e.g., on exit. In
+		 * that case, we are not allocating memory. Rather,
+		 * we're only unlocking already resident/mapped pages.
 		 */
-		if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
-			return i ? i : -ENOMEM;
+		if (unlikely(!ignore_sigkill &&
+				fatal_signal_pending(current)))
+			return i ? i : -ERESTARTSYS;
 
 		if (write)
 			foll_flags |= FOLL_WRITE;
@@ -1263,9 +1324,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			 * do_wp_page has broken COW when necessary,
 			 * even if maybe_mkwrite decided not to set
 			 * pte_write. We can thus safely do subsequent
-			 * page lookups as if they were reads.
+			 * page lookups as if they were reads. But only
+			 * do so when looping for pte_write is futile:
+			 * in some cases userspace may also be wanting
+			 * to write to the gotten user page, which a
+			 * read fault here might prevent (a readonly
+			 * page might get reCOWed by userspace write).
 			 */
-			if (ret & VM_FAULT_WRITE)
+			if ((ret & VM_FAULT_WRITE) &&
+			    !(vma->vm_flags & VM_WRITE))
 				foll_flags &= ~FOLL_WRITE;
 
 			cond_resched();
@@ -1644,6 +1711,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 
 	BUG_ON(pmd_huge(*pmd));
 
+	arch_enter_lazy_mmu_mode();
+
 	token = pmd_pgtable(*pmd);
 
 	do {
@@ -1652,6 +1721,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
+	arch_leave_lazy_mmu_mode();
+
 	if (mm != &init_mm)
 		pte_unmap_unlock(pte-1, ptl);
 	return err;
@@ -1837,10 +1908,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * not dirty accountable.
 	 */
 	if (PageAnon(old_page)) {
-		if (trylock_page(old_page)) {
-			reuse = can_share_swap_page(old_page);
-			unlock_page(old_page);
+		if (!trylock_page(old_page)) {
+			page_cache_get(old_page);
+			pte_unmap_unlock(page_table, ptl);
+			lock_page(old_page);
+			page_table = pte_offset_map_lock(mm, pmd, address,
+							 &ptl);
+			if (!pte_same(*page_table, orig_pte)) {
+				unlock_page(old_page);
+				page_cache_release(old_page);
+				goto unlock;
+			}
+			page_cache_release(old_page);
 		}
+		reuse = reuse_swap_page(old_page);
+		unlock_page(old_page);
 	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
 					(VM_WRITE|VM_SHARED))) {
 		/*
@@ -1943,11 +2025,7 @@ gotten:
 	 * thread doing COW.
 	 */
 	ptep_clear_flush_notify(vma, address, page_table);
-	SetPageSwapBacked(new_page);
-	lru_cache_add_active_or_unevictable(new_page, vma);
 	page_add_new_anon_rmap(new_page, vma, address);
-
-//TODO: is this safe? do_anonymous_page() does it this way.
 	set_pte_at(mm, address, page_table, entry);
 	update_mmu_cache(vma, address, entry);
 	if (old_page) {
@@ -1973,7 +2051,7 @@ gotten:
 	 * mapcount is visible. So transitively, TLBs to
 	 * old page will be flushed before it can be reused.
 	 */
-	page_remove_rmap(old_page, vma);
+	page_remove_rmap(old_page);
 	}
 
 	/* Free the old page.. */
@@ -2374,7 +2452,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	inc_mm_counter(mm, anon_rss);
 	pte = mk_pte(page, vma->vm_page_prot);
-	if (write_access && can_share_swap_page(page)) {
+	if (write_access && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
 		write_access = 0;
 	}
@@ -2385,7 +2463,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	swap_free(entry);
 	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
-		remove_exclusive_swap_page(page);
+		try_to_free_swap(page);
 	unlock_page(page);
 
 	if (write_access) {
@@ -2442,8 +2520,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte_none(*page_table))
 		goto release;
 	inc_mm_counter(mm, anon_rss);
-	SetPageSwapBacked(page);
-	lru_cache_add_active_or_unevictable(page, vma);
 	page_add_new_anon_rmap(page, vma, address);
 	set_pte_at(mm, address, page_table, entry);
 
@@ -2591,8 +2667,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		if (anon) {
 			inc_mm_counter(mm, anon_rss);
-			SetPageSwapBacked(page);
-			lru_cache_add_active_or_unevictable(page, vma);
 			page_add_new_anon_rmap(page, vma, address);
 		} else {
 			inc_mm_counter(mm, file_rss);
@@ -2602,7 +2676,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 				get_page(dirty_page);
 			}
 		}
-//TODO: is this safe? do_anonymous_page() does it this way.
 		set_pte_at(mm, address, page_table, entry);
 
 		/* no need to invalidate: a not-present page won't be cached */
@@ -2666,12 +2739,11 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		return 0;
 
-	if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
-			!(vma->vm_flags & VM_CAN_NONLINEAR))) {
+	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
 		/*
 		 * Page table corrupted: show pte and kill process.
 		 */
-		print_bad_pte(vma, orig_pte, address);
+		print_bad_pte(vma, address, orig_pte, NULL);
 		return VM_FAULT_OOM;
 	}
 
@@ -2953,7 +3025,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 {
 	resource_size_t phys_addr;
 	unsigned long prot = 0;
-	void *maddr;
+	void __iomem *maddr;
 	int offset = addr & (PAGE_SIZE-1);
 
 	if (follow_phys(vma, addr, write, &prot, &phys_addr))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b17371185468..c083cf5fd6df 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -216,7 +216,8 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 	return 0;
 }
 
-static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn)
+static int __meminit __add_section(int nid, struct zone *zone,
+					unsigned long phys_start_pfn)
 {
 	int nr_pages = PAGES_PER_SECTION;
 	int ret;
@@ -234,7 +235,7 @@ static int __meminit __add_section(struct zone *zone, unsigned long phys_start_p
 	if (ret < 0)
 		return ret;
 
-	return register_new_memory(__pfn_to_section(phys_start_pfn));
+	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
 }
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
@@ -273,8 +274,8 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
  * call this function after deciding the zone to which to
  * add the new pages.
  */
-int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn,
+int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
 			unsigned long nr_pages)
 {
 	unsigned long i;
 	int err = 0;
@@ -284,7 +285,7 @@ int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn,
 	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
 
 	for (i = start_sec; i <= end_sec; i++) {
-		err = __add_section(zone, i << PFN_SECTION_SHIFT);
+		err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
 
 		/*
 		 * EEXIST is finally dealt with by ioresource collision
@@ -626,15 +627,12 @@ int scan_lru_pages(unsigned long start, unsigned long end)
 }
 
 static struct page *
-hotremove_migrate_alloc(struct page *page,
-			unsigned long private,
-			int **x)
+hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
 {
-	/* This should be improoooooved!! */
-	return alloc_page(GFP_HIGHUSER_PAGECACHE);
+	/* This should be improooooved!! */
+	return alloc_page(GFP_HIGHUSER_MOVABLE);
 }
 
-
 #define NR_OFFLINE_AT_ONCE_PAGES	(256)
 static int
 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
diff --git a/mm/migrate.c b/mm/migrate.c
index 21631ab8c08b..55373983c9c6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -300,12 +300,10 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	 * Now we know that no one else is looking at the page.
 	 */
 	get_page(newpage);	/* add cache reference */
-#ifdef CONFIG_SWAP
 	if (PageSwapCache(page)) {
 		SetPageSwapCache(newpage);
 		set_page_private(newpage, page_private(page));
 	}
-#endif
 
 	radix_tree_replace_slot(pslot, newpage);
 
@@ -373,9 +371,7 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
 
 	mlock_migrate_page(newpage, page);
 
-#ifdef CONFIG_SWAP
 	ClearPageSwapCache(page);
-#endif
 	ClearPagePrivate(page);
 	set_page_private(page, 0);
 	/* page->mapping contains a flag for PageAnon() */
@@ -848,12 +844,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
 		struct vm_area_struct *vma;
 		struct page *page;
 
-		/*
-		 * A valid page pointer that will not match any of the
-		 * pages that will be moved.
-		 */
-		pp->page = ZERO_PAGE(0);
-
 		err = -EFAULT;
 		vma = find_vma(mm, pp->addr);
 		if (!vma || !vma_migratable(vma))
@@ -919,41 +909,43 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
 				const int __user *nodes,
 				int __user *status, int flags)
 {
-	struct page_to_node *pm = NULL;
+	struct page_to_node *pm;
 	nodemask_t task_nodes;
-	int err = 0;
-	int i;
+	unsigned long chunk_nr_pages;
+	unsigned long chunk_start;
+	int err;
 
 	task_nodes = cpuset_mems_allowed(task);
 
-	/* Limit nr_pages so that the multiplication may not overflow */
-	if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
-		err = -E2BIG;
-		goto out;
-	}
-
-	pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
-	if (!pm) {
-		err = -ENOMEM;
+	err = -ENOMEM;
+	pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
+	if (!pm)
 		goto out;
-	}
-
 	/*
-	 * Get parameters from user space and initialize the pm
-	 * array. Return various errors if the user did something wrong.
+	 * Store a chunk of page_to_node array in a page,
+	 * but keep the last one as a marker
 	 */
-	for (i = 0; i < nr_pages; i++) {
-		const void __user *p;
+	chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
 
-		err = -EFAULT;
-		if (get_user(p, pages + i))
-			goto out_pm;
+	for (chunk_start = 0;
+	     chunk_start < nr_pages;
+	     chunk_start += chunk_nr_pages) {
+		int j;
 
-		pm[i].addr = (unsigned long)p;
-		if (nodes) {
+		if (chunk_start + chunk_nr_pages > nr_pages)
+			chunk_nr_pages = nr_pages - chunk_start;
+
+		/* fill the chunk pm with addrs and nodes from user-space */
+		for (j = 0; j < chunk_nr_pages; j++) {
+			const void __user *p;
 			int node;
 
-			if (get_user(node, nodes + i))
+			err = -EFAULT;
+			if (get_user(p, pages + j + chunk_start))
+				goto out_pm;
+			pm[j].addr = (unsigned long) p;
+
+			if (get_user(node, nodes + j + chunk_start))
 				goto out_pm;
 
 			err = -ENODEV;
@@ -964,22 +956,29 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
 			if (!node_isset(node, task_nodes))
 				goto out_pm;
 
-			pm[i].node = node;
-		} else
-			pm[i].node = 0; /* anything to not match MAX_NUMNODES */
-	}
-	/* End marker */
-	pm[nr_pages].node = MAX_NUMNODES;
+			pm[j].node = node;
+		}
+
+		/* End marker for this chunk */
+		pm[chunk_nr_pages].node = MAX_NUMNODES;
+
+		/* Migrate this chunk */
+		err = do_move_page_to_node_array(mm, pm,
+						 flags & MPOL_MF_MOVE_ALL);
+		if (err < 0)
+			goto out_pm;
 
-	err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL);
-	if (err >= 0)
 		/* Return status information */
-		for (i = 0; i < nr_pages; i++)
-			if (put_user(pm[i].status, status + i))
+		for (j = 0; j < chunk_nr_pages; j++)
+			if (put_user(pm[j].status, status + j + chunk_start)) {
 				err = -EFAULT;
+				goto out_pm;
+			}
+	}
+	err = 0;
 
 out_pm:
-	vfree(pm);
+	free_page((unsigned long)pm);
 out:
 	return err;
 }
diff --git a/mm/mlock.c b/mm/mlock.c
index 3035a56e7616..e125156c664e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -173,12 +173,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 			(atomic_read(&mm->mm_users) != 0));
 
 	/*
-	 * mlock: don't page populate if page has PROT_NONE permission.
-	 * munlock: the pages always do munlock althrough
-	 *          its has PROT_NONE permission.
+	 * mlock: don't page populate if vma has PROT_NONE permission.
+	 * munlock: always do munlock although the vma has PROT_NONE
+	 *          permission, or SIGKILL is pending.
 	 */
 	if (!mlock)
-		gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS;
+		gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
+			     GUP_FLAGS_IGNORE_SIGKILL;
 
 	if (vma->vm_flags & VM_WRITE)
 		gup_flags |= GUP_FLAGS_WRITE;
diff --git a/mm/mmap.c b/mm/mmap.c
index 2c778fcfd9bd..a910c045cfd4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -413,7 +413,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 
 static void __vma_link_file(struct vm_area_struct *vma)
 {
-	struct file * file;
+	struct file *file;
 
 	file = vma->vm_file;
 	if (file) {
@@ -474,11 +474,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
  * insert vm structure into list and rbtree and anon_vma,
  * but it has already been inserted into prio_tree earlier.
  */
-static void
-__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
+static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 {
-	struct vm_area_struct * __vma, * prev;
-	struct rb_node ** rb_link, * rb_parent;
+	struct vm_area_struct *__vma, *prev;
+	struct rb_node **rb_link, *rb_parent;
 
 	__vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
 	BUG_ON(__vma && __vma->vm_start < vma->vm_end);
@@ -908,7 +907,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
  * The caller must hold down_write(current->mm->mmap_sem).
  */
 
-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
+unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 			unsigned long len, unsigned long prot,
 			unsigned long flags, unsigned long pgoff)
 {
@@ -1464,7 +1463,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 EXPORT_SYMBOL(get_unmapped_area);
 
 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 {
 	struct vm_area_struct *vma = NULL;
 
@@ -1507,7 +1506,7 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
 			struct vm_area_struct **pprev)
 {
 	struct vm_area_struct *vma = NULL, *prev = NULL;
-	struct rb_node * rb_node;
+	struct rb_node *rb_node;
 	if (!mm)
 		goto out;
 
@@ -1541,7 +1540,7 @@ out:
  * update accounting. This is shared with both the
  * grow-up and grow-down cases.
  */
-static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow)
+static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct rlimit *rlim = current->signal->rlim;
@@ -2091,6 +2090,9 @@ void exit_mmap(struct mm_struct *mm)
 	arch_exit_mmap(mm);
 	mmu_notifier_release(mm);
 
+	if (!mm->mmap)	/* Can happen if dup_mmap() received an OOM */
+		return;
+
 	if (mm->locked_vm) {
 		vma = mm->mmap;
 		while (vma) {
@@ -2103,7 +2105,7 @@ void exit_mmap(struct mm_struct *mm)
 	lru_add_drain();
 	flush_cache_mm(mm);
 	tlb = tlb_gather_mmu(mm, 1);
-	/* Don't update_hiwater_rss(mm) here, do_exit already did */
+	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index cfb4c4852062..d0f6e7ce09f1 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -22,6 +22,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/mmu_notifier.h>
+#include <linux/migrate.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -59,8 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			ptent = pte_mkwrite(ptent);
 
 		ptep_modify_prot_commit(mm, addr, pte, ptent);
-#ifdef CONFIG_MIGRATION
-		} else if (!pte_file(oldpte)) {
+		} else if (PAGE_MIGRATION && !pte_file(oldpte)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
 
 			if (is_write_migration_entry(entry)) {
@@ -72,9 +72,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 				set_pte_at(mm, addr, pte,
 					swp_entry_to_pte(entry));
 			}
-#endif
 		}
-
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 558f9afe6e4e..6b9e758c98a5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -31,7 +31,7 @@
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks;
-static DEFINE_SPINLOCK(zone_scan_mutex);
+static DEFINE_SPINLOCK(zone_scan_lock);
 /* #define DEBUG */
 
 /**
@@ -392,6 +392,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 		printk(KERN_WARNING "%s invoked oom-killer: "
 			"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
 			current->comm, gfp_mask, order, current->oomkilladj);
+		task_lock(current);
+		cpuset_print_task_mems_allowed(current);
+		task_unlock(current);
 		dump_stack();
 		show_mem();
 		if (sysctl_oom_dump_tasks)
@@ -470,7 +473,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
 	struct zone *zone;
 	int ret = 1;
 
-	spin_lock(&zone_scan_mutex);
+	spin_lock(&zone_scan_lock);
 	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
 		if (zone_is_oom_locked(zone)) {
 			ret = 0;
@@ -480,7 +483,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
 
 	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
 		/*
-		 * Lock each zone in the zonelist under zone_scan_mutex so a
+		 * Lock each zone in the zonelist under zone_scan_lock so a
 		 * parallel invocation of try_set_zone_oom() doesn't succeed
 		 * when it shouldn't.
 		 */
@@ -488,7 +491,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
 	}
 
 out:
-	spin_unlock(&zone_scan_mutex);
+	spin_unlock(&zone_scan_lock);
 	return ret;
 }
 
@@ -502,11 +505,74 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
 	struct zoneref *z;
 	struct zone *zone;
 
-	spin_lock(&zone_scan_mutex);
+	spin_lock(&zone_scan_lock);
 	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
 		zone_clear_flag(zone, ZONE_OOM_LOCKED);
 	}
-	spin_unlock(&zone_scan_mutex);
+	spin_unlock(&zone_scan_lock);
+}
+
+/*
+ * Must be called with tasklist_lock held for read.
+ */
+static void __out_of_memory(gfp_t gfp_mask, int order)
+{
+	if (sysctl_oom_kill_allocating_task) {
+		oom_kill_process(current, gfp_mask, order, 0, NULL,
+				"Out of memory (oom_kill_allocating_task)");
+
+	} else {
+		unsigned long points;
+		struct task_struct *p;
+
+retry:
+		/*
+		 * Rambo mode: Shoot down a process and hope it solves whatever
+		 * issues we may have.
+		 */
+		p = select_bad_process(&points, NULL);
+
+		if (PTR_ERR(p) == -1UL)
+			return;
+
+		/* Found nothing?!?! Either we hang forever, or we panic. */
+		if (!p) {
+			read_unlock(&tasklist_lock);
+			panic("Out of memory and no killable processes...\n");
+		}
+
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
+				     "Out of memory"))
+			goto retry;
+	}
+}
+
+/*
+ * pagefault handler calls into here because it is out of memory but
+ * doesn't know exactly how or why.
+ */
+void pagefault_out_of_memory(void)
+{
+	unsigned long freed = 0;
+
+	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
+	if (freed > 0)
+		/* Got some memory back in the last second. */
+		return;
+
+	if (sysctl_panic_on_oom)
+		panic("out of memory from page fault. panic_on_oom is selected.\n");
+
+	read_lock(&tasklist_lock);
+	__out_of_memory(0, 0); /* unknown gfp_mask and order */
+	read_unlock(&tasklist_lock);
+
+	/*
+	 * Give "p" a good chance of killing itself before we
+	 * retry to allocate memory.
+	 */
+	if (!test_thread_flag(TIF_MEMDIE))
+		schedule_timeout_uninterruptible(1);
 }
 
 /**
@@ -522,8 +588,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
  */
 void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 {
-	struct task_struct *p;
-	unsigned long points = 0;
 	unsigned long freed = 0;
 	enum oom_constraint constraint;
 
@@ -544,7 +608,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 
 	switch (constraint) {
 	case CONSTRAINT_MEMORY_POLICY:
-		oom_kill_process(current, gfp_mask, order, points, NULL,
+		oom_kill_process(current, gfp_mask, order, 0, NULL,
 				"No available memory (MPOL_BIND)");
 		break;
 
@@ -553,35 +617,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 		panic("out of memory. panic_on_oom is selected\n");
 		/* Fall-through */
 	case CONSTRAINT_CPUSET:
-		if (sysctl_oom_kill_allocating_task) {
-			oom_kill_process(current, gfp_mask, order, points, NULL,
-					"Out of memory (oom_kill_allocating_task)");
-			break;
-		}
-retry:
-		/*
-		 * Rambo mode: Shoot down a process and hope it solves whatever
-		 * issues we may have.
-		 */
-		p = select_bad_process(&points, NULL);
-
-		if (PTR_ERR(p) == -1UL)
-			goto out;
-
-		/* Found nothing?!?! Either we hang forever, or we panic. */
-		if (!p) {
-			read_unlock(&tasklist_lock);
-			panic("Out of memory and no killable processes...\n");
-		}
-
-		if (oom_kill_process(p, gfp_mask, order, points, NULL,
-					"Out of memory"))
-			goto retry;
-
+		__out_of_memory(gfp_mask, order);
 		break;
 	}
 
-out:
 	read_unlock(&tasklist_lock);
 
 	/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2970e35fd03f..b493db7841dc 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void)
 int dirty_background_ratio = 5;
 
 /*
+ * dirty_background_bytes starts at 0 (disabled) so that it is a function of
+ * dirty_background_ratio * the amount of dirtyable memory
+ */
+unsigned long dirty_background_bytes;
+
+/*
  * free highmem will not be subtracted from the total free memory
  * for calculating free ratios if vm_highmem_is_dirtyable is true
  */
@@ -80,6 +86,12 @@ int vm_highmem_is_dirtyable;
 int vm_dirty_ratio = 10;
 
 /*
+ * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
+ * vm_dirty_ratio * the amount of dirtyable memory
+ */
+unsigned long vm_dirty_bytes;
+
+/*
  * The interval between `kupdate'-style writebacks, in jiffies
  */
 int dirty_writeback_interval = 5 * HZ;
@@ -135,23 +147,75 @@ static int calc_period_shift(void)
135{ 147{
136 unsigned long dirty_total; 148 unsigned long dirty_total;
137 149
138 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; 150 if (vm_dirty_bytes)
151 dirty_total = vm_dirty_bytes / PAGE_SIZE;
152 else
153 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
154 100;
139 return 2 + ilog2(dirty_total - 1); 155 return 2 + ilog2(dirty_total - 1);
140} 156}
141 157
142/* 158/*
143 * update the period when the dirty ratio changes. 159 * update the period when the dirty threshold changes.
144 */ 160 */
161static void update_completion_period(void)
162{
163 int shift = calc_period_shift();
164 prop_change_shift(&vm_completions, shift);
165 prop_change_shift(&vm_dirties, shift);
166}
167
168int dirty_background_ratio_handler(struct ctl_table *table, int write,
169 struct file *filp, void __user *buffer, size_t *lenp,
170 loff_t *ppos)
171{
172 int ret;
173
174 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
175 if (ret == 0 && write)
176 dirty_background_bytes = 0;
177 return ret;
178}
179
180int dirty_background_bytes_handler(struct ctl_table *table, int write,
181 struct file *filp, void __user *buffer, size_t *lenp,
182 loff_t *ppos)
183{
184 int ret;
185
186 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
187 if (ret == 0 && write)
188 dirty_background_ratio = 0;
189 return ret;
190}
191
145int dirty_ratio_handler(struct ctl_table *table, int write, 192int dirty_ratio_handler(struct ctl_table *table, int write,
146 struct file *filp, void __user *buffer, size_t *lenp, 193 struct file *filp, void __user *buffer, size_t *lenp,
147 loff_t *ppos) 194 loff_t *ppos)
148{ 195{
149 int old_ratio = vm_dirty_ratio; 196 int old_ratio = vm_dirty_ratio;
150 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 197 int ret;
198
199 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
151 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 200 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
152 int shift = calc_period_shift(); 201 update_completion_period();
153 prop_change_shift(&vm_completions, shift); 202 vm_dirty_bytes = 0;
154 prop_change_shift(&vm_dirties, shift); 203 }
204 return ret;
205}
206
207
208int dirty_bytes_handler(struct ctl_table *table, int write,
209 struct file *filp, void __user *buffer, size_t *lenp,
210 loff_t *ppos)
211{
212 int old_bytes = vm_dirty_bytes;
213 int ret;
214
215 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
216 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
217 update_completion_period();
218 vm_dirty_ratio = 0;
155 } 219 }
156 return ret; 220 return ret;
157} 221}
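
The two pairs of handlers above make the ratio and bytes controls mutually exclusive: whichever sysctl was written last takes effect and its counterpart is zeroed. A minimal userspace sketch of that policy follows; set_dirty_ratio()/set_dirty_bytes() are made-up names, not kernel functions.

/*
 * Minimal userspace sketch (not kernel code) of the mutual exclusion the
 * handlers above implement: whichever of vm_dirty_ratio / vm_dirty_bytes
 * was written last wins, and the other is zeroed so only one is in effect.
 */
#include <stdio.h>

static int vm_dirty_ratio = 10;		/* percent of dirtyable memory */
static unsigned long vm_dirty_bytes;	/* 0 means "use the ratio"     */

static void set_dirty_ratio(int ratio)
{
	vm_dirty_ratio = ratio;
	vm_dirty_bytes = 0;		/* bytes limit no longer applies */
}

static void set_dirty_bytes(unsigned long bytes)
{
	vm_dirty_bytes = bytes;
	vm_dirty_ratio = 0;		/* ratio limit no longer applies */
}

int main(void)
{
	set_dirty_ratio(20);
	printf("ratio=%d bytes=%lu\n", vm_dirty_ratio, vm_dirty_bytes);
	set_dirty_bytes(256UL << 20);	/* 256 MB hard limit */
	printf("ratio=%d bytes=%lu\n", vm_dirty_ratio, vm_dirty_bytes);
	return 0;
}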
@@ -362,26 +426,32 @@ unsigned long determine_dirtyable_memory(void)
362} 426}
363 427
364void 428void
365get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, 429get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
366 struct backing_dev_info *bdi) 430 unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
367{ 431{
368 int background_ratio; /* Percentages */ 432 unsigned long background;
369 int dirty_ratio; 433 unsigned long dirty;
370 long background;
371 long dirty;
372 unsigned long available_memory = determine_dirtyable_memory(); 434 unsigned long available_memory = determine_dirtyable_memory();
373 struct task_struct *tsk; 435 struct task_struct *tsk;
374 436
375 dirty_ratio = vm_dirty_ratio; 437 if (vm_dirty_bytes)
376 if (dirty_ratio < 5) 438 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
377 dirty_ratio = 5; 439 else {
440 int dirty_ratio;
378 441
379 background_ratio = dirty_background_ratio; 442 dirty_ratio = vm_dirty_ratio;
380 if (background_ratio >= dirty_ratio) 443 if (dirty_ratio < 5)
381 background_ratio = dirty_ratio / 2; 444 dirty_ratio = 5;
445 dirty = (dirty_ratio * available_memory) / 100;
446 }
447
448 if (dirty_background_bytes)
449 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
450 else
451 background = (dirty_background_ratio * available_memory) / 100;
382 452
383 background = (background_ratio * available_memory) / 100; 453 if (background >= dirty)
384 dirty = (dirty_ratio * available_memory) / 100; 454 background = dirty / 2;
385 tsk = current; 455 tsk = current;
386 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { 456 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
387 background += background / 4; 457 background += background / 4;
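
Roughly, the reworked get_dirty_limits() derives each threshold from the byte limit when one is set, falls back to the percentage otherwise, and then forces the background threshold below the dirty threshold. A simplified userspace sketch of that arithmetic, assuming 4 KB pages and ignoring the per-bdi and per-task adjustments:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static void dirty_limits(unsigned long available_memory, /* in pages */
			 int dirty_ratio, unsigned long dirty_bytes,
			 int background_ratio, unsigned long background_bytes,
			 unsigned long *pdirty, unsigned long *pbackground)
{
	unsigned long dirty, background;

	if (dirty_bytes)
		dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
	else {
		if (dirty_ratio < 5)		/* the kernel enforces a 5% floor */
			dirty_ratio = 5;
		dirty = dirty_ratio * available_memory / 100;
	}

	if (background_bytes)
		background = DIV_ROUND_UP(background_bytes, PAGE_SIZE);
	else
		background = background_ratio * available_memory / 100;

	if (background >= dirty)		/* keep background strictly lower */
		background = dirty / 2;

	*pdirty = dirty;
	*pbackground = background;
}

int main(void)
{
	unsigned long dirty, background;

	dirty_limits(1 << 20, 10, 0, 5, 0, &dirty, &background);
	printf("ratio mode: dirty=%lu background=%lu pages\n", dirty, background);

	dirty_limits(1 << 20, 0, 512UL << 20, 0, 0, &dirty, &background);
	printf("bytes mode: dirty=%lu background=%lu pages\n", dirty, background);
	return 0;
}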
@@ -423,9 +493,9 @@ static void balance_dirty_pages(struct address_space *mapping)
423{ 493{
424 long nr_reclaimable, bdi_nr_reclaimable; 494 long nr_reclaimable, bdi_nr_reclaimable;
425 long nr_writeback, bdi_nr_writeback; 495 long nr_writeback, bdi_nr_writeback;
426 long background_thresh; 496 unsigned long background_thresh;
427 long dirty_thresh; 497 unsigned long dirty_thresh;
428 long bdi_thresh; 498 unsigned long bdi_thresh;
429 unsigned long pages_written = 0; 499 unsigned long pages_written = 0;
430 unsigned long write_chunk = sync_writeback_pages(); 500 unsigned long write_chunk = sync_writeback_pages();
431 501
@@ -580,8 +650,8 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
580 650
581void throttle_vm_writeout(gfp_t gfp_mask) 651void throttle_vm_writeout(gfp_t gfp_mask)
582{ 652{
583 long background_thresh; 653 unsigned long background_thresh;
584 long dirty_thresh; 654 unsigned long dirty_thresh;
585 655
586 for ( ; ; ) { 656 for ( ; ; ) {
587 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); 657 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
@@ -624,8 +694,8 @@ static void background_writeout(unsigned long _min_pages)
624 }; 694 };
625 695
626 for ( ; ; ) { 696 for ( ; ; ) {
627 long background_thresh; 697 unsigned long background_thresh;
628 long dirty_thresh; 698 unsigned long dirty_thresh;
629 699
630 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); 700 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
631 if (global_page_state(NR_FILE_DIRTY) + 701 if (global_page_state(NR_FILE_DIRTY) +
@@ -868,9 +938,11 @@ int write_cache_pages(struct address_space *mapping,
868 int done = 0; 938 int done = 0;
869 struct pagevec pvec; 939 struct pagevec pvec;
870 int nr_pages; 940 int nr_pages;
941 pgoff_t uninitialized_var(writeback_index);
871 pgoff_t index; 942 pgoff_t index;
872 pgoff_t end; /* Inclusive */ 943 pgoff_t end; /* Inclusive */
873 int scanned = 0; 944 pgoff_t done_index;
945 int cycled;
874 int range_whole = 0; 946 int range_whole = 0;
875 long nr_to_write = wbc->nr_to_write; 947 long nr_to_write = wbc->nr_to_write;
876 948
@@ -881,83 +953,134 @@ int write_cache_pages(struct address_space *mapping,
881 953
882 pagevec_init(&pvec, 0); 954 pagevec_init(&pvec, 0);
883 if (wbc->range_cyclic) { 955 if (wbc->range_cyclic) {
884 index = mapping->writeback_index; /* Start from prev offset */ 956 writeback_index = mapping->writeback_index; /* prev offset */
957 index = writeback_index;
958 if (index == 0)
959 cycled = 1;
960 else
961 cycled = 0;
885 end = -1; 962 end = -1;
886 } else { 963 } else {
887 index = wbc->range_start >> PAGE_CACHE_SHIFT; 964 index = wbc->range_start >> PAGE_CACHE_SHIFT;
888 end = wbc->range_end >> PAGE_CACHE_SHIFT; 965 end = wbc->range_end >> PAGE_CACHE_SHIFT;
889 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 966 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
890 range_whole = 1; 967 range_whole = 1;
891 scanned = 1; 968 cycled = 1; /* ignore range_cyclic tests */
892 } 969 }
893retry: 970retry:
894 while (!done && (index <= end) && 971 done_index = index;
895 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 972 while (!done && (index <= end)) {
896 PAGECACHE_TAG_DIRTY, 973 int i;
897 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 974
898 unsigned i; 975 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
976 PAGECACHE_TAG_DIRTY,
977 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
978 if (nr_pages == 0)
979 break;
899 980
900 scanned = 1;
901 for (i = 0; i < nr_pages; i++) { 981 for (i = 0; i < nr_pages; i++) {
902 struct page *page = pvec.pages[i]; 982 struct page *page = pvec.pages[i];
903 983
904 /* 984 /*
905 * At this point we hold neither mapping->tree_lock nor 985 * At this point, the page may be truncated or
906 * lock on the page itself: the page may be truncated or 986 * invalidated (changing page->mapping to NULL), or
907 * invalidated (changing page->mapping to NULL), or even 987 * even swizzled back from swapper_space to tmpfs file
908 * swizzled back from swapper_space to tmpfs file 988 * mapping. However, page->index will not change
909 * mapping 989 * because we have a reference on the page.
910 */ 990 */
991 if (page->index > end) {
992 /*
993 * can't be range_cyclic (1st pass) because
994 * end == -1 in that case.
995 */
996 done = 1;
997 break;
998 }
999
1000 done_index = page->index + 1;
1001
911 lock_page(page); 1002 lock_page(page);
912 1003
1004 /*
1005 * Page truncated or invalidated. We can freely skip it
1006 * then, even for data integrity operations: the page
1007 * has disappeared concurrently, so there could be no
 1008 * real expectation of this data integrity operation
1009 * even if there is now a new, dirty page at the same
1010 * pagecache address.
1011 */
913 if (unlikely(page->mapping != mapping)) { 1012 if (unlikely(page->mapping != mapping)) {
1013continue_unlock:
914 unlock_page(page); 1014 unlock_page(page);
915 continue; 1015 continue;
916 } 1016 }
917 1017
918 if (!wbc->range_cyclic && page->index > end) { 1018 if (!PageDirty(page)) {
919 done = 1; 1019 /* someone wrote it for us */
920 unlock_page(page); 1020 goto continue_unlock;
921 continue;
922 } 1021 }
923 1022
924 if (wbc->sync_mode != WB_SYNC_NONE) 1023 if (PageWriteback(page)) {
925 wait_on_page_writeback(page); 1024 if (wbc->sync_mode != WB_SYNC_NONE)
926 1025 wait_on_page_writeback(page);
927 if (PageWriteback(page) || 1026 else
928 !clear_page_dirty_for_io(page)) { 1027 goto continue_unlock;
929 unlock_page(page);
930 continue;
931 } 1028 }
932 1029
933 ret = (*writepage)(page, wbc, data); 1030 BUG_ON(PageWriteback(page));
1031 if (!clear_page_dirty_for_io(page))
1032 goto continue_unlock;
934 1033
935 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 1034 ret = (*writepage)(page, wbc, data);
936 unlock_page(page); 1035 if (unlikely(ret)) {
937 ret = 0; 1036 if (ret == AOP_WRITEPAGE_ACTIVATE) {
1037 unlock_page(page);
1038 ret = 0;
1039 } else {
1040 /*
1041 * done_index is set past this page,
1042 * so media errors will not choke
1043 * background writeout for the entire
1044 * file. This has consequences for
1045 * range_cyclic semantics (ie. it may
1046 * not be suitable for data integrity
1047 * writeout).
1048 */
1049 done = 1;
1050 break;
1051 }
1052 }
1053
1054 if (wbc->sync_mode == WB_SYNC_NONE) {
1055 wbc->nr_to_write--;
1056 if (wbc->nr_to_write <= 0) {
1057 done = 1;
1058 break;
1059 }
938 } 1060 }
939 if (ret || (--nr_to_write <= 0))
940 done = 1;
941 if (wbc->nonblocking && bdi_write_congested(bdi)) { 1061 if (wbc->nonblocking && bdi_write_congested(bdi)) {
942 wbc->encountered_congestion = 1; 1062 wbc->encountered_congestion = 1;
943 done = 1; 1063 done = 1;
1064 break;
944 } 1065 }
945 } 1066 }
946 pagevec_release(&pvec); 1067 pagevec_release(&pvec);
947 cond_resched(); 1068 cond_resched();
948 } 1069 }
949 if (!scanned && !done) { 1070 if (!cycled) {
950 /* 1071 /*
1072 * range_cyclic:
951 * We hit the last page and there is more work to be done: wrap 1073 * We hit the last page and there is more work to be done: wrap
952 * back to the start of the file 1074 * back to the start of the file
953 */ 1075 */
954 scanned = 1; 1076 cycled = 1;
955 index = 0; 1077 index = 0;
1078 end = writeback_index - 1;
956 goto retry; 1079 goto retry;
957 } 1080 }
958 if (!wbc->no_nrwrite_index_update) { 1081 if (!wbc->no_nrwrite_index_update) {
959 if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) 1082 if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
960 mapping->writeback_index = index; 1083 mapping->writeback_index = done_index;
961 wbc->nr_to_write = nr_to_write; 1084 wbc->nr_to_write = nr_to_write;
962 } 1085 }
963 1086
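
The rewritten write_cache_pages() handles range_cyclic in at most two passes: from writeback_index to the end of the file, then once more over [0, writeback_index - 1], while done_index records where the next scan should resume. A toy userspace sketch of just that control flow (an int array stands in for dirty pages; this is not the pagevec-based kernel loop):

#include <stdio.h>

#define NR_PAGES 16

int main(void)
{
	int dirty[NR_PAGES] = { [1] = 1, [4] = 1, [9] = 1, [14] = 1 };
	unsigned long writeback_index = 8;	/* where the last pass stopped */
	unsigned long index = writeback_index;
	unsigned long end = NR_PAGES - 1;	/* stands in for end = -1 */
	unsigned long done_index;
	int cycled = (index == 0);

retry:
	done_index = index;
	for (; index <= end; index++) {
		if (!dirty[index])
			continue;
		done_index = index + 1;
		dirty[index] = 0;		/* stand-in for ->writepage() */
		printf("wrote page %lu\n", index);
	}
	if (!cycled) {
		cycled = 1;			/* second and final pass */
		index = 0;
		end = writeback_index - 1;
		goto retry;
	}
	printf("next scan resumes at page %lu\n", done_index);
	return 0;
}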
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d8ac01474563..7bf22e045318 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,7 +69,7 @@ EXPORT_SYMBOL(node_states);
69 69
70unsigned long totalram_pages __read_mostly; 70unsigned long totalram_pages __read_mostly;
71unsigned long totalreserve_pages __read_mostly; 71unsigned long totalreserve_pages __read_mostly;
72long nr_swap_pages; 72unsigned long highest_memmap_pfn __read_mostly;
73int percpu_pagelist_fraction; 73int percpu_pagelist_fraction;
74 74
75#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 75#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -223,19 +223,41 @@ static inline int bad_range(struct zone *zone, struct page *page)
223 223
224static void bad_page(struct page *page) 224static void bad_page(struct page *page)
225{ 225{
226 printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG 226 static unsigned long resume;
227 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 227 static unsigned long nr_shown;
228 current->comm, page, (int)(2*sizeof(unsigned long)), 228 static unsigned long nr_unshown;
229 (unsigned long)page->flags, page->mapping, 229
230 page_mapcount(page), page_count(page)); 230 /*
231 * Allow a burst of 60 reports, then keep quiet for that minute;
232 * or allow a steady drip of one report per second.
233 */
234 if (nr_shown == 60) {
235 if (time_before(jiffies, resume)) {
236 nr_unshown++;
237 goto out;
238 }
239 if (nr_unshown) {
240 printk(KERN_ALERT
241 "BUG: Bad page state: %lu messages suppressed\n",
242 nr_unshown);
243 nr_unshown = 0;
244 }
245 nr_shown = 0;
246 }
247 if (nr_shown++ == 0)
248 resume = jiffies + 60 * HZ;
249
250 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
251 current->comm, page_to_pfn(page));
252 printk(KERN_ALERT
253 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
254 page, (void *)page->flags, page_count(page),
255 page_mapcount(page), page->mapping, page->index);
231 256
232 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
233 KERN_EMERG "Backtrace:\n");
234 dump_stack(); 257 dump_stack();
235 page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; 258out:
236 set_page_count(page, 0); 259 /* Leave bad fields for debug, except PageBuddy could make trouble */
237 reset_page_mapcount(page); 260 __ClearPageBuddy(page);
238 page->mapping = NULL;
239 add_taint(TAINT_BAD_PAGE); 261 add_taint(TAINT_BAD_PAGE);
240} 262}
241 263
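
bad_page() now throttles its own output: a burst of up to 60 reports, then silence until a minute has passed since the burst began, with a count of how many reports were suppressed. A standalone userspace sketch of the same throttling, using time() in place of jiffies and a made-up report() helper:

#include <stdio.h>
#include <time.h>

static void report(int i)
{
	static time_t resume;
	static unsigned long nr_shown, nr_unshown;
	time_t now = time(NULL);

	if (nr_shown == 60) {
		if (now < resume) {		/* still inside the quiet minute */
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			printf("%lu messages suppressed\n", nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = now + 60;		/* quiet window: 60 seconds */

	printf("bad page report %d\n", i);
}

int main(void)
{
	for (int i = 0; i < 200; i++)
		report(i);			/* only the first 60 print in a quick run */
	return 0;
}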
@@ -292,25 +314,31 @@ void prep_compound_gigantic_page(struct page *page, unsigned long order)
292} 314}
293#endif 315#endif
294 316
295static void destroy_compound_page(struct page *page, unsigned long order) 317static int destroy_compound_page(struct page *page, unsigned long order)
296{ 318{
297 int i; 319 int i;
298 int nr_pages = 1 << order; 320 int nr_pages = 1 << order;
321 int bad = 0;
299 322
300 if (unlikely(compound_order(page) != order)) 323 if (unlikely(compound_order(page) != order) ||
324 unlikely(!PageHead(page))) {
301 bad_page(page); 325 bad_page(page);
326 bad++;
327 }
302 328
303 if (unlikely(!PageHead(page)))
304 bad_page(page);
305 __ClearPageHead(page); 329 __ClearPageHead(page);
330
306 for (i = 1; i < nr_pages; i++) { 331 for (i = 1; i < nr_pages; i++) {
307 struct page *p = page + i; 332 struct page *p = page + i;
308 333
309 if (unlikely(!PageTail(p) | 334 if (unlikely(!PageTail(p) | (p->first_page != page))) {
310 (p->first_page != page)))
311 bad_page(page); 335 bad_page(page);
336 bad++;
337 }
312 __ClearPageTail(p); 338 __ClearPageTail(p);
313 } 339 }
340
341 return bad;
314} 342}
315 343
316static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 344static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
@@ -430,7 +458,8 @@ static inline void __free_one_page(struct page *page,
430 int migratetype = get_pageblock_migratetype(page); 458 int migratetype = get_pageblock_migratetype(page);
431 459
432 if (unlikely(PageCompound(page))) 460 if (unlikely(PageCompound(page)))
433 destroy_compound_page(page, order); 461 if (unlikely(destroy_compound_page(page, order)))
462 return;
434 463
435 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 464 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
436 465
@@ -467,18 +496,13 @@ static inline int free_pages_check(struct page *page)
467 if (unlikely(page_mapcount(page) | 496 if (unlikely(page_mapcount(page) |
468 (page->mapping != NULL) | 497 (page->mapping != NULL) |
469 (page_count(page) != 0) | 498 (page_count(page) != 0) |
470 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) 499 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
471 bad_page(page); 500 bad_page(page);
472 if (PageDirty(page)) 501 return 1;
473 __ClearPageDirty(page); 502 }
474 if (PageSwapBacked(page)) 503 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
475 __ClearPageSwapBacked(page); 504 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
476 /* 505 return 0;
477 * For now, we report if PG_reserved was found set, but do not
478 * clear it, and do not free the page. But we shall soon need
479 * to do more, for when the ZERO_PAGE count wraps negative.
480 */
481 return PageReserved(page);
482} 506}
483 507
484/* 508/*
@@ -523,11 +547,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
523{ 547{
524 unsigned long flags; 548 unsigned long flags;
525 int i; 549 int i;
526 int reserved = 0; 550 int bad = 0;
527 551
528 for (i = 0 ; i < (1 << order) ; ++i) 552 for (i = 0 ; i < (1 << order) ; ++i)
529 reserved += free_pages_check(page + i); 553 bad += free_pages_check(page + i);
530 if (reserved) 554 if (bad)
531 return; 555 return;
532 556
533 if (!PageHighMem(page)) { 557 if (!PageHighMem(page)) {
@@ -612,23 +636,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
612 if (unlikely(page_mapcount(page) | 636 if (unlikely(page_mapcount(page) |
613 (page->mapping != NULL) | 637 (page->mapping != NULL) |
614 (page_count(page) != 0) | 638 (page_count(page) != 0) |
615 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) 639 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
616 bad_page(page); 640 bad_page(page);
617
618 /*
619 * For now, we report if PG_reserved was found set, but do not
620 * clear it, and do not allocate the page: as a safety net.
621 */
622 if (PageReserved(page))
623 return 1; 641 return 1;
642 }
624 643
625 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
626 1 << PG_referenced | 1 << PG_arch_1 |
627 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
628#ifdef CONFIG_UNEVICTABLE_LRU
629 | 1 << PG_mlocked
630#endif
631 );
632 set_page_private(page, 0); 644 set_page_private(page, 0);
633 set_page_refcounted(page); 645 set_page_refcounted(page);
634 646
@@ -2609,6 +2621,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2609 unsigned long pfn; 2621 unsigned long pfn;
2610 struct zone *z; 2622 struct zone *z;
2611 2623
2624 if (highest_memmap_pfn < end_pfn - 1)
2625 highest_memmap_pfn = end_pfn - 1;
2626
2612 z = &NODE_DATA(nid)->node_zones[zone]; 2627 z = &NODE_DATA(nid)->node_zones[zone];
2613 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 2628 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
2614 /* 2629 /*
@@ -3381,10 +3396,8 @@ static void __init setup_usemap(struct pglist_data *pgdat,
3381{ 3396{
3382 unsigned long usemapsize = usemap_size(zonesize); 3397 unsigned long usemapsize = usemap_size(zonesize);
3383 zone->pageblock_flags = NULL; 3398 zone->pageblock_flags = NULL;
3384 if (usemapsize) { 3399 if (usemapsize)
3385 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); 3400 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
3386 memset(zone->pageblock_flags, 0, usemapsize);
3387 }
3388} 3401}
3389#else 3402#else
3390static void inline setup_usemap(struct pglist_data *pgdat, 3403static void inline setup_usemap(struct pglist_data *pgdat,
@@ -3469,9 +3482,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3469 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 3482 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
3470 if (realsize >= memmap_pages) { 3483 if (realsize >= memmap_pages) {
3471 realsize -= memmap_pages; 3484 realsize -= memmap_pages;
3472 printk(KERN_DEBUG 3485 if (memmap_pages)
3473 " %s zone: %lu pages used for memmap\n", 3486 printk(KERN_DEBUG
3474 zone_names[j], memmap_pages); 3487 " %s zone: %lu pages used for memmap\n",
3488 zone_names[j], memmap_pages);
3475 } else 3489 } else
3476 printk(KERN_WARNING 3490 printk(KERN_WARNING
3477 " %s zone: %lu pages exceeds realsize %lu\n", 3491 " %s zone: %lu pages exceeds realsize %lu\n",
@@ -4316,7 +4330,7 @@ void setup_per_zone_pages_min(void)
4316 * 1TB 101 10GB 4330 * 1TB 101 10GB
4317 * 10TB 320 32GB 4331 * 10TB 320 32GB
4318 */ 4332 */
4319void setup_per_zone_inactive_ratio(void) 4333static void setup_per_zone_inactive_ratio(void)
4320{ 4334{
4321 struct zone *zone; 4335 struct zone *zone;
4322 4336
@@ -4573,19 +4587,6 @@ void *__init alloc_large_system_hash(const char *tablename,
4573 return table; 4587 return table;
4574} 4588}
4575 4589
4576#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
4577struct page *pfn_to_page(unsigned long pfn)
4578{
4579 return __pfn_to_page(pfn);
4580}
4581unsigned long page_to_pfn(struct page *page)
4582{
4583 return __page_to_pfn(page);
4584}
4585EXPORT_SYMBOL(pfn_to_page);
4586EXPORT_SYMBOL(page_to_pfn);
4587#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
4588
4589/* Return a pointer to the bitmap storing bits affecting a block of pages */ 4590/* Return a pointer to the bitmap storing bits affecting a block of pages */
4590static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 4591static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
4591 unsigned long pfn) 4592 unsigned long pfn)
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index ab27ff750519..d6507a660ed6 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -101,7 +101,7 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
101} 101}
102 102
103/* __alloc_bootmem...() is protected by !slab_available() */ 103/* __alloc_bootmem...() is protected by !slab_available() */
104int __init_refok init_section_page_cgroup(unsigned long pfn) 104static int __init_refok init_section_page_cgroup(unsigned long pfn)
105{ 105{
106 struct mem_section *section; 106 struct mem_section *section;
107 struct page_cgroup *base, *pc; 107 struct page_cgroup *base, *pc;
diff --git a/mm/page_io.c b/mm/page_io.c
index 065c4480eaf0..dc6ce0afbded 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -98,7 +98,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
98 struct bio *bio; 98 struct bio *bio;
99 int ret = 0, rw = WRITE; 99 int ret = 0, rw = WRITE;
100 100
101 if (remove_exclusive_swap_page(page)) { 101 if (try_to_free_swap(page)) {
102 unlock_page(page); 102 unlock_page(page);
103 goto out; 103 goto out;
104 } 104 }
@@ -125,8 +125,8 @@ int swap_readpage(struct file *file, struct page *page)
125 struct bio *bio; 125 struct bio *bio;
126 int ret = 0; 126 int ret = 0;
127 127
128 BUG_ON(!PageLocked(page)); 128 VM_BUG_ON(!PageLocked(page));
129 BUG_ON(PageUptodate(page)); 129 VM_BUG_ON(PageUptodate(page));
130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page, 130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
131 end_swap_bio_read); 131 end_swap_bio_read);
132 if (bio == NULL) { 132 if (bio == NULL) {
diff --git a/mm/rmap.c b/mm/rmap.c
index 10993942d6c9..ac4af8cffbf9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -47,9 +47,9 @@
47#include <linux/rmap.h> 47#include <linux/rmap.h>
48#include <linux/rcupdate.h> 48#include <linux/rcupdate.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kallsyms.h>
51#include <linux/memcontrol.h> 50#include <linux/memcontrol.h>
52#include <linux/mmu_notifier.h> 51#include <linux/mmu_notifier.h>
52#include <linux/migrate.h>
53 53
54#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
55 55
@@ -191,7 +191,7 @@ void __init anon_vma_init(void)
191 * Getting a lock on a stable anon_vma from a page off the LRU is 191 * Getting a lock on a stable anon_vma from a page off the LRU is
192 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 192 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
193 */ 193 */
194struct anon_vma *page_lock_anon_vma(struct page *page) 194static struct anon_vma *page_lock_anon_vma(struct page *page)
195{ 195{
196 struct anon_vma *anon_vma; 196 struct anon_vma *anon_vma;
197 unsigned long anon_mapping; 197 unsigned long anon_mapping;
@@ -211,7 +211,7 @@ out:
211 return NULL; 211 return NULL;
212} 212}
213 213
214void page_unlock_anon_vma(struct anon_vma *anon_vma) 214static void page_unlock_anon_vma(struct anon_vma *anon_vma)
215{ 215{
216 spin_unlock(&anon_vma->lock); 216 spin_unlock(&anon_vma->lock);
217 rcu_read_unlock(); 217 rcu_read_unlock();
@@ -359,8 +359,17 @@ static int page_referenced_one(struct page *page,
359 goto out_unmap; 359 goto out_unmap;
360 } 360 }
361 361
362 if (ptep_clear_flush_young_notify(vma, address, pte)) 362 if (ptep_clear_flush_young_notify(vma, address, pte)) {
363 referenced++; 363 /*
364 * Don't treat a reference through a sequentially read
365 * mapping as such. If the page has been used in
366 * another mapping, we will catch it; if this other
367 * mapping is already gone, the unmap path will have
368 * set PG_referenced or activated the page.
369 */
370 if (likely(!VM_SequentialReadHint(vma)))
371 referenced++;
372 }
364 373
365 /* Pretend the page is referenced if the task has the 374 /* Pretend the page is referenced if the task has the
366 swap token and is in the middle of a page fault. */ 375 swap token and is in the middle of a page fault. */
@@ -661,9 +670,14 @@ void page_add_anon_rmap(struct page *page,
661void page_add_new_anon_rmap(struct page *page, 670void page_add_new_anon_rmap(struct page *page,
662 struct vm_area_struct *vma, unsigned long address) 671 struct vm_area_struct *vma, unsigned long address)
663{ 672{
664 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 673 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
665 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ 674 SetPageSwapBacked(page);
675 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
666 __page_set_anon_rmap(page, vma, address); 676 __page_set_anon_rmap(page, vma, address);
677 if (page_evictable(page, vma))
678 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
679 else
680 add_page_to_unevictable_list(page);
667} 681}
668 682
669/** 683/**
@@ -693,7 +707,6 @@ void page_add_file_rmap(struct page *page)
693 */ 707 */
694void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) 708void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
695{ 709{
696 BUG_ON(page_mapcount(page) == 0);
697 if (PageAnon(page)) 710 if (PageAnon(page))
698 __page_check_anon_rmap(page, vma, address); 711 __page_check_anon_rmap(page, vma, address);
699 atomic_inc(&page->_mapcount); 712 atomic_inc(&page->_mapcount);
@@ -703,28 +716,12 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long
703/** 716/**
704 * page_remove_rmap - take down pte mapping from a page 717 * page_remove_rmap - take down pte mapping from a page
705 * @page: page to remove mapping from 718 * @page: page to remove mapping from
706 * @vma: the vm area in which the mapping is removed
707 * 719 *
708 * The caller needs to hold the pte lock. 720 * The caller needs to hold the pte lock.
709 */ 721 */
710void page_remove_rmap(struct page *page, struct vm_area_struct *vma) 722void page_remove_rmap(struct page *page)
711{ 723{
712 if (atomic_add_negative(-1, &page->_mapcount)) { 724 if (atomic_add_negative(-1, &page->_mapcount)) {
713 if (unlikely(page_mapcount(page) < 0)) {
714 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
715 printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page));
716 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
717 printk (KERN_EMERG " page->count = %x\n", page_count(page));
718 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
719 print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
720 if (vma->vm_ops) {
721 print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
722 }
723 if (vma->vm_file && vma->vm_file->f_op)
724 print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
725 BUG();
726 }
727
728 /* 725 /*
729 * Now that the last pte has gone, s390 must transfer dirty 726 * Now that the last pte has gone, s390 must transfer dirty
730 * flag from storage key to struct page. We can usually skip 727 * flag from storage key to struct page. We can usually skip
@@ -818,8 +815,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
818 spin_unlock(&mmlist_lock); 815 spin_unlock(&mmlist_lock);
819 } 816 }
820 dec_mm_counter(mm, anon_rss); 817 dec_mm_counter(mm, anon_rss);
821#ifdef CONFIG_MIGRATION 818 } else if (PAGE_MIGRATION) {
822 } else {
823 /* 819 /*
824 * Store the pfn of the page in a special migration 820 * Store the pfn of the page in a special migration
825 * pte. do_swap_page() will wait until the migration 821 * pte. do_swap_page() will wait until the migration
@@ -827,23 +823,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
827 */ 823 */
828 BUG_ON(!migration); 824 BUG_ON(!migration);
829 entry = make_migration_entry(page, pte_write(pteval)); 825 entry = make_migration_entry(page, pte_write(pteval));
830#endif
831 } 826 }
832 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 827 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
833 BUG_ON(pte_file(*pte)); 828 BUG_ON(pte_file(*pte));
834 } else 829 } else if (PAGE_MIGRATION && migration) {
835#ifdef CONFIG_MIGRATION
836 if (migration) {
837 /* Establish migration entry for a file page */ 830 /* Establish migration entry for a file page */
838 swp_entry_t entry; 831 swp_entry_t entry;
839 entry = make_migration_entry(page, pte_write(pteval)); 832 entry = make_migration_entry(page, pte_write(pteval));
840 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 833 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
841 } else 834 } else
842#endif
843 dec_mm_counter(mm, file_rss); 835 dec_mm_counter(mm, file_rss);
844 836
845 837
846 page_remove_rmap(page, vma); 838 page_remove_rmap(page);
847 page_cache_release(page); 839 page_cache_release(page);
848 840
849out_unmap: 841out_unmap:
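
try_to_unmap_one() drops its #ifdef CONFIG_MIGRATION blocks in favour of testing the PAGE_MIGRATION constant, so the migration branches stay visible to the compiler and are simply optimized away when migration is disabled. A generic sketch of that pattern with invented names (CONFIG_FOO, FEATURE_FOO, handle_foo()):

#include <stdio.h>

#ifdef CONFIG_FOO
#define FEATURE_FOO 1
#else
#define FEATURE_FOO 0
#endif

static void handle_foo(int v)
{
	printf("foo path: %d\n", v);
}

static void process(int v, int want_foo)
{
	if (FEATURE_FOO && want_foo)	/* dead-code eliminated when FOO is off */
		handle_foo(v);
	else
		printf("common path: %d\n", v);
}

int main(void)
{
	process(1, 1);
	process(2, 0);
	return 0;
}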
@@ -958,7 +950,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
958 if (pte_dirty(pteval)) 950 if (pte_dirty(pteval))
959 set_page_dirty(page); 951 set_page_dirty(page);
960 952
961 page_remove_rmap(page, vma); 953 page_remove_rmap(page);
962 page_cache_release(page); 954 page_cache_release(page);
963 dec_mm_counter(mm, file_rss); 955 dec_mm_counter(mm, file_rss);
964 (*mapcount)--; 956 (*mapcount)--;
diff --git a/mm/shmem.c b/mm/shmem.c
index f1b0d4871f3a..5941f9801363 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -14,31 +14,39 @@
14 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> 14 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
15 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> 15 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
16 * 16 *
17 * tiny-shmem:
18 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
19 *
17 * This file is released under the GPL. 20 * This file is released under the GPL.
18 */ 21 */
19 22
23#include <linux/fs.h>
24#include <linux/init.h>
25#include <linux/vfs.h>
26#include <linux/mount.h>
27#include <linux/file.h>
28#include <linux/mm.h>
29#include <linux/module.h>
30#include <linux/swap.h>
31
32static struct vfsmount *shm_mnt;
33
34#ifdef CONFIG_SHMEM
20/* 35/*
21 * This virtual memory filesystem is heavily based on the ramfs. It 36 * This virtual memory filesystem is heavily based on the ramfs. It
22 * extends ramfs by the ability to use swap and honor resource limits 37 * extends ramfs by the ability to use swap and honor resource limits
23 * which makes it a completely usable filesystem. 38 * which makes it a completely usable filesystem.
24 */ 39 */
25 40
26#include <linux/module.h>
27#include <linux/init.h>
28#include <linux/fs.h>
29#include <linux/xattr.h> 41#include <linux/xattr.h>
30#include <linux/exportfs.h> 42#include <linux/exportfs.h>
31#include <linux/generic_acl.h> 43#include <linux/generic_acl.h>
32#include <linux/mm.h>
33#include <linux/mman.h> 44#include <linux/mman.h>
34#include <linux/file.h>
35#include <linux/swap.h>
36#include <linux/pagemap.h> 45#include <linux/pagemap.h>
37#include <linux/string.h> 46#include <linux/string.h>
38#include <linux/slab.h> 47#include <linux/slab.h>
39#include <linux/backing-dev.h> 48#include <linux/backing-dev.h>
40#include <linux/shmem_fs.h> 49#include <linux/shmem_fs.h>
41#include <linux/mount.h>
42#include <linux/writeback.h> 50#include <linux/writeback.h>
43#include <linux/vfs.h> 51#include <linux/vfs.h>
44#include <linux/blkdev.h> 52#include <linux/blkdev.h>
@@ -1444,7 +1452,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1444 if (error) 1452 if (error)
1445 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1453 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1446 1454
1447 mark_page_accessed(vmf->page);
1448 return ret | VM_FAULT_LOCKED; 1455 return ret | VM_FAULT_LOCKED;
1449} 1456}
1450 1457
@@ -2486,7 +2493,6 @@ static struct file_system_type tmpfs_fs_type = {
2486 .get_sb = shmem_get_sb, 2493 .get_sb = shmem_get_sb,
2487 .kill_sb = kill_litter_super, 2494 .kill_sb = kill_litter_super,
2488}; 2495};
2489static struct vfsmount *shm_mnt;
2490 2496
2491static int __init init_tmpfs(void) 2497static int __init init_tmpfs(void)
2492{ 2498{
@@ -2525,7 +2531,51 @@ out4:
2525 shm_mnt = ERR_PTR(error); 2531 shm_mnt = ERR_PTR(error);
2526 return error; 2532 return error;
2527} 2533}
2528module_init(init_tmpfs) 2534
2535#else /* !CONFIG_SHMEM */
2536
2537/*
2538 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
2539 *
2540 * This is intended for small systems where the benefits of the full
2541 * shmem code (swap-backed and resource-limited) are outweighed by
2542 * their complexity. On systems without swap this code should be
2543 * effectively equivalent, but much lighter weight.
2544 */
2545
2546#include <linux/ramfs.h>
2547
2548static struct file_system_type tmpfs_fs_type = {
2549 .name = "tmpfs",
2550 .get_sb = ramfs_get_sb,
2551 .kill_sb = kill_litter_super,
2552};
2553
2554static int __init init_tmpfs(void)
2555{
2556 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
2557
2558 shm_mnt = kern_mount(&tmpfs_fs_type);
2559 BUG_ON(IS_ERR(shm_mnt));
2560
2561 return 0;
2562}
2563
2564int shmem_unuse(swp_entry_t entry, struct page *page)
2565{
2566 return 0;
2567}
2568
2569#define shmem_file_operations ramfs_file_operations
2570#define shmem_vm_ops generic_file_vm_ops
2571#define shmem_get_inode ramfs_get_inode
2572#define shmem_acct_size(a, b) 0
2573#define shmem_unacct_size(a, b) do {} while (0)
2574#define SHMEM_MAX_BYTES LLONG_MAX
2575
2576#endif /* CONFIG_SHMEM */
2577
2578/* common code */
2529 2579
2530/** 2580/**
2531 * shmem_file_setup - get an unlinked file living in tmpfs 2581 * shmem_file_setup - get an unlinked file living in tmpfs
@@ -2569,12 +2619,20 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2569 if (!inode) 2619 if (!inode)
2570 goto close_file; 2620 goto close_file;
2571 2621
2622#ifdef CONFIG_SHMEM
2572 SHMEM_I(inode)->flags = flags & VM_ACCOUNT; 2623 SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
2624#endif
2573 d_instantiate(dentry, inode); 2625 d_instantiate(dentry, inode);
2574 inode->i_size = size; 2626 inode->i_size = size;
2575 inode->i_nlink = 0; /* It is unlinked */ 2627 inode->i_nlink = 0; /* It is unlinked */
2576 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, 2628 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
2577 &shmem_file_operations); 2629 &shmem_file_operations);
2630
2631#ifndef CONFIG_MMU
2632 error = ramfs_nommu_expand_for_mapping(inode, size);
2633 if (error)
2634 goto close_file;
2635#endif
2578 return file; 2636 return file;
2579 2637
2580close_file: 2638close_file:
@@ -2606,3 +2664,5 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2606 vma->vm_ops = &shmem_vm_ops; 2664 vma->vm_ops = &shmem_vm_ops;
2607 return 0; 2665 return 0;
2608} 2666}
2667
2668module_init(init_tmpfs)
diff --git a/mm/swap.c b/mm/swap.c
index b135ec90cdeb..ba2c0e8b8b54 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -246,25 +246,6 @@ void add_page_to_unevictable_list(struct page *page)
246 spin_unlock_irq(&zone->lru_lock); 246 spin_unlock_irq(&zone->lru_lock);
247} 247}
248 248
249/**
250 * lru_cache_add_active_or_unevictable
251 * @page: the page to be added to LRU
252 * @vma: vma in which page is mapped for determining reclaimability
253 *
254 * place @page on active or unevictable LRU list, depending on
255 * page_evictable(). Note that if the page is not evictable,
256 * it goes directly back onto it's zone's unevictable list. It does
257 * NOT use a per cpu pagevec.
258 */
259void lru_cache_add_active_or_unevictable(struct page *page,
260 struct vm_area_struct *vma)
261{
262 if (page_evictable(page, vma))
263 lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
264 else
265 add_page_to_unevictable_list(page);
266}
267
268/* 249/*
269 * Drain pages out of the cpu's pagevecs. 250 * Drain pages out of the cpu's pagevecs.
270 * Either "cpu" is the current CPU, and preemption has already been 251 * Either "cpu" is the current CPU, and preemption has already been
@@ -398,28 +379,6 @@ void __pagevec_release(struct pagevec *pvec)
398EXPORT_SYMBOL(__pagevec_release); 379EXPORT_SYMBOL(__pagevec_release);
399 380
400/* 381/*
401 * pagevec_release() for pages which are known to not be on the LRU
402 *
403 * This function reinitialises the caller's pagevec.
404 */
405void __pagevec_release_nonlru(struct pagevec *pvec)
406{
407 int i;
408 struct pagevec pages_to_free;
409
410 pagevec_init(&pages_to_free, pvec->cold);
411 for (i = 0; i < pagevec_count(pvec); i++) {
412 struct page *page = pvec->pages[i];
413
414 VM_BUG_ON(PageLRU(page));
415 if (put_page_testzero(page))
416 pagevec_add(&pages_to_free, page);
417 }
418 pagevec_free(&pages_to_free);
419 pagevec_reinit(pvec);
420}
421
422/*
423 * Add the passed pages to the LRU, then drop the caller's refcount 382 * Add the passed pages to the LRU, then drop the caller's refcount
424 * on them. Reinitialises the caller's pagevec. 383 * on them. Reinitialises the caller's pagevec.
425 */ 384 */
@@ -495,8 +454,7 @@ void pagevec_swap_free(struct pagevec *pvec)
495 struct page *page = pvec->pages[i]; 454 struct page *page = pvec->pages[i];
496 455
497 if (PageSwapCache(page) && trylock_page(page)) { 456 if (PageSwapCache(page) && trylock_page(page)) {
498 if (PageSwapCache(page)) 457 try_to_free_swap(page);
499 remove_exclusive_swap_page_ref(page);
500 unlock_page(page); 458 unlock_page(page);
501 } 459 }
502 } 460 }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3353c9029cef..81c825f67a7f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -72,10 +72,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
72{ 72{
73 int error; 73 int error;
74 74
75 BUG_ON(!PageLocked(page)); 75 VM_BUG_ON(!PageLocked(page));
76 BUG_ON(PageSwapCache(page)); 76 VM_BUG_ON(PageSwapCache(page));
77 BUG_ON(PagePrivate(page)); 77 VM_BUG_ON(!PageSwapBacked(page));
78 BUG_ON(!PageSwapBacked(page)); 78
79 error = radix_tree_preload(gfp_mask); 79 error = radix_tree_preload(gfp_mask);
80 if (!error) { 80 if (!error) {
81 page_cache_get(page); 81 page_cache_get(page);
@@ -108,10 +108,9 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
108 */ 108 */
109void __delete_from_swap_cache(struct page *page) 109void __delete_from_swap_cache(struct page *page)
110{ 110{
111 BUG_ON(!PageLocked(page)); 111 VM_BUG_ON(!PageLocked(page));
112 BUG_ON(!PageSwapCache(page)); 112 VM_BUG_ON(!PageSwapCache(page));
113 BUG_ON(PageWriteback(page)); 113 VM_BUG_ON(PageWriteback(page));
114 BUG_ON(PagePrivate(page));
115 114
116 radix_tree_delete(&swapper_space.page_tree, page_private(page)); 115 radix_tree_delete(&swapper_space.page_tree, page_private(page));
117 set_page_private(page, 0); 116 set_page_private(page, 0);
@@ -129,13 +128,13 @@ void __delete_from_swap_cache(struct page *page)
129 * Allocate swap space for the page and add the page to the 128 * Allocate swap space for the page and add the page to the
130 * swap cache. Caller needs to hold the page lock. 129 * swap cache. Caller needs to hold the page lock.
131 */ 130 */
132int add_to_swap(struct page * page, gfp_t gfp_mask) 131int add_to_swap(struct page *page)
133{ 132{
134 swp_entry_t entry; 133 swp_entry_t entry;
135 int err; 134 int err;
136 135
137 BUG_ON(!PageLocked(page)); 136 VM_BUG_ON(!PageLocked(page));
138 BUG_ON(!PageUptodate(page)); 137 VM_BUG_ON(!PageUptodate(page));
139 138
140 for (;;) { 139 for (;;) {
141 entry = get_swap_page(); 140 entry = get_swap_page();
@@ -154,7 +153,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
154 * Add it to the swap cache and mark it dirty 153 * Add it to the swap cache and mark it dirty
155 */ 154 */
156 err = add_to_swap_cache(page, entry, 155 err = add_to_swap_cache(page, entry,
157 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); 156 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
158 157
159 switch (err) { 158 switch (err) {
160 case 0: /* Success */ 159 case 0: /* Success */
@@ -196,14 +195,14 @@ void delete_from_swap_cache(struct page *page)
196 * If we are the only user, then try to free up the swap cache. 195 * If we are the only user, then try to free up the swap cache.
197 * 196 *
198 * Its ok to check for PageSwapCache without the page lock 197 * Its ok to check for PageSwapCache without the page lock
199 * here because we are going to recheck again inside 198 * here because we are going to recheck again inside
200 * exclusive_swap_page() _with_ the lock. 199 * try_to_free_swap() _with_ the lock.
201 * - Marcelo 200 * - Marcelo
202 */ 201 */
203static inline void free_swap_cache(struct page *page) 202static inline void free_swap_cache(struct page *page)
204{ 203{
205 if (PageSwapCache(page) && trylock_page(page)) { 204 if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
206 remove_exclusive_swap_page(page); 205 try_to_free_swap(page);
207 unlock_page(page); 206 unlock_page(page);
208 } 207 }
209} 208}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 54a9f87e5162..eec5ca758a23 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -16,6 +16,7 @@
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/shm.h> 17#include <linux/shm.h>
18#include <linux/blkdev.h> 18#include <linux/blkdev.h>
19#include <linux/random.h>
19#include <linux/writeback.h> 20#include <linux/writeback.h>
20#include <linux/proc_fs.h> 21#include <linux/proc_fs.h>
21#include <linux/seq_file.h> 22#include <linux/seq_file.h>
@@ -35,6 +36,7 @@
35 36
36static DEFINE_SPINLOCK(swap_lock); 37static DEFINE_SPINLOCK(swap_lock);
37static unsigned int nr_swapfiles; 38static unsigned int nr_swapfiles;
39long nr_swap_pages;
38long total_swap_pages; 40long total_swap_pages;
39static int swap_overflow; 41static int swap_overflow;
40static int least_priority; 42static int least_priority;
@@ -83,15 +85,96 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
83 up_read(&swap_unplug_sem); 85 up_read(&swap_unplug_sem);
84} 86}
85 87
88/*
89 * swapon tells the device that all the old swap contents can be discarded,
90 * to allow the swap device to optimize its wear-levelling.
91 */
92static int discard_swap(struct swap_info_struct *si)
93{
94 struct swap_extent *se;
95 int err = 0;
96
97 list_for_each_entry(se, &si->extent_list, list) {
98 sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
99 sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
100
101 if (se->start_page == 0) {
102 /* Do not discard the swap header page! */
103 start_block += 1 << (PAGE_SHIFT - 9);
104 nr_blocks -= 1 << (PAGE_SHIFT - 9);
105 if (!nr_blocks)
106 continue;
107 }
108
109 err = blkdev_issue_discard(si->bdev, start_block,
110 nr_blocks, GFP_KERNEL);
111 if (err)
112 break;
113
114 cond_resched();
115 }
116 return err; /* That will often be -EOPNOTSUPP */
117}
118
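
From userspace, the closest analogue to what discard_swap() requests is the BLKDISCARD ioctl, which tells a block device that a byte range is unused. A hedged sketch follows; it needs root, a device that actually supports discard, and it destroys the data in the given range, so treat it as illustration only:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	uint64_t range[2];
	int fd;

	if (argc != 4) {
		fprintf(stderr, "usage: %s <blockdev> <offset-bytes> <length-bytes>\n",
			argv[0]);
		return 1;
	}

	fd = open(argv[1], O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	range[0] = strtoull(argv[2], NULL, 0);	/* start, in bytes */
	range[1] = strtoull(argv[3], NULL, 0);	/* length, in bytes */

	if (ioctl(fd, BLKDISCARD, range) < 0)	/* often fails: not supported */
		perror("BLKDISCARD");
	else
		printf("discarded %llu bytes at offset %llu\n",
		       (unsigned long long)range[1], (unsigned long long)range[0]);

	close(fd);
	return 0;
}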
119/*
120 * swap allocation tells the device that a cluster of swap can now be discarded,
121 * to allow the swap device to optimize its wear-levelling.
122 */
123static void discard_swap_cluster(struct swap_info_struct *si,
124 pgoff_t start_page, pgoff_t nr_pages)
125{
126 struct swap_extent *se = si->curr_swap_extent;
127 int found_extent = 0;
128
129 while (nr_pages) {
130 struct list_head *lh;
131
132 if (se->start_page <= start_page &&
133 start_page < se->start_page + se->nr_pages) {
134 pgoff_t offset = start_page - se->start_page;
135 sector_t start_block = se->start_block + offset;
136 sector_t nr_blocks = se->nr_pages - offset;
137
138 if (nr_blocks > nr_pages)
139 nr_blocks = nr_pages;
140 start_page += nr_blocks;
141 nr_pages -= nr_blocks;
142
143 if (!found_extent++)
144 si->curr_swap_extent = se;
145
146 start_block <<= PAGE_SHIFT - 9;
147 nr_blocks <<= PAGE_SHIFT - 9;
148 if (blkdev_issue_discard(si->bdev, start_block,
149 nr_blocks, GFP_NOIO))
150 break;
151 }
152
153 lh = se->list.next;
154 if (lh == &si->extent_list)
155 lh = lh->next;
156 se = list_entry(lh, struct swap_extent, list);
157 }
158}
159
160static int wait_for_discard(void *word)
161{
162 schedule();
163 return 0;
164}
165
86#define SWAPFILE_CLUSTER 256 166#define SWAPFILE_CLUSTER 256
87#define LATENCY_LIMIT 256 167#define LATENCY_LIMIT 256
88 168
89static inline unsigned long scan_swap_map(struct swap_info_struct *si) 169static inline unsigned long scan_swap_map(struct swap_info_struct *si)
90{ 170{
91 unsigned long offset, last_in_cluster; 171 unsigned long offset;
172 unsigned long scan_base;
173 unsigned long last_in_cluster = 0;
92 int latency_ration = LATENCY_LIMIT; 174 int latency_ration = LATENCY_LIMIT;
175 int found_free_cluster = 0;
93 176
94 /* 177 /*
95 * We try to cluster swap pages by allocating them sequentially 178 * We try to cluster swap pages by allocating them sequentially
96 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this 179 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this
97 * way, however, we resort to first-free allocation, starting 180 * way, however, we resort to first-free allocation, starting
@@ -99,16 +182,42 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
99 * all over the entire swap partition, so that we reduce 182 * all over the entire swap partition, so that we reduce
100 * overall disk seek times between swap pages. -- sct 183 * overall disk seek times between swap pages. -- sct
101 * But we do now try to find an empty cluster. -Andrea 184 * But we do now try to find an empty cluster. -Andrea
185 * And we let swap pages go all over an SSD partition. Hugh
102 */ 186 */
103 187
104 si->flags += SWP_SCANNING; 188 si->flags += SWP_SCANNING;
105 if (unlikely(!si->cluster_nr)) { 189 scan_base = offset = si->cluster_next;
106 si->cluster_nr = SWAPFILE_CLUSTER - 1; 190
107 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) 191 if (unlikely(!si->cluster_nr--)) {
108 goto lowest; 192 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
193 si->cluster_nr = SWAPFILE_CLUSTER - 1;
194 goto checks;
195 }
196 if (si->flags & SWP_DISCARDABLE) {
197 /*
198 * Start range check on racing allocations, in case
199 * they overlap the cluster we eventually decide on
200 * (we scan without swap_lock to allow preemption).
201 * It's hardly conceivable that cluster_nr could be
202 * wrapped during our scan, but don't depend on it.
203 */
204 if (si->lowest_alloc)
205 goto checks;
206 si->lowest_alloc = si->max;
207 si->highest_alloc = 0;
208 }
109 spin_unlock(&swap_lock); 209 spin_unlock(&swap_lock);
110 210
111 offset = si->lowest_bit; 211 /*
212 * If seek is expensive, start searching for new cluster from
213 * start of partition, to minimize the span of allocated swap.
214 * But if seek is cheap, search from our current position, so
215 * that swap is allocated from all over the partition: if the
216 * Flash Translation Layer only remaps within limited zones,
217 * we don't want to wear out the first zone too quickly.
218 */
219 if (!(si->flags & SWP_SOLIDSTATE))
220 scan_base = offset = si->lowest_bit;
112 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 221 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
113 222
114 /* Locate the first empty (unaligned) cluster */ 223 /* Locate the first empty (unaligned) cluster */
@@ -117,43 +226,124 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
117 last_in_cluster = offset + SWAPFILE_CLUSTER; 226 last_in_cluster = offset + SWAPFILE_CLUSTER;
118 else if (offset == last_in_cluster) { 227 else if (offset == last_in_cluster) {
119 spin_lock(&swap_lock); 228 spin_lock(&swap_lock);
120 si->cluster_next = offset-SWAPFILE_CLUSTER+1; 229 offset -= SWAPFILE_CLUSTER - 1;
121 goto cluster; 230 si->cluster_next = offset;
231 si->cluster_nr = SWAPFILE_CLUSTER - 1;
232 found_free_cluster = 1;
233 goto checks;
122 } 234 }
123 if (unlikely(--latency_ration < 0)) { 235 if (unlikely(--latency_ration < 0)) {
124 cond_resched(); 236 cond_resched();
125 latency_ration = LATENCY_LIMIT; 237 latency_ration = LATENCY_LIMIT;
126 } 238 }
127 } 239 }
240
241 offset = si->lowest_bit;
242 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
243
244 /* Locate the first empty (unaligned) cluster */
245 for (; last_in_cluster < scan_base; offset++) {
246 if (si->swap_map[offset])
247 last_in_cluster = offset + SWAPFILE_CLUSTER;
248 else if (offset == last_in_cluster) {
249 spin_lock(&swap_lock);
250 offset -= SWAPFILE_CLUSTER - 1;
251 si->cluster_next = offset;
252 si->cluster_nr = SWAPFILE_CLUSTER - 1;
253 found_free_cluster = 1;
254 goto checks;
255 }
256 if (unlikely(--latency_ration < 0)) {
257 cond_resched();
258 latency_ration = LATENCY_LIMIT;
259 }
260 }
261
262 offset = scan_base;
128 spin_lock(&swap_lock); 263 spin_lock(&swap_lock);
129 goto lowest; 264 si->cluster_nr = SWAPFILE_CLUSTER - 1;
265 si->lowest_alloc = 0;
130 } 266 }
131 267
132 si->cluster_nr--; 268checks:
133cluster: 269 if (!(si->flags & SWP_WRITEOK))
134 offset = si->cluster_next;
135 if (offset > si->highest_bit)
136lowest: offset = si->lowest_bit;
137checks: if (!(si->flags & SWP_WRITEOK))
138 goto no_page; 270 goto no_page;
139 if (!si->highest_bit) 271 if (!si->highest_bit)
140 goto no_page; 272 goto no_page;
141 if (!si->swap_map[offset]) { 273 if (offset > si->highest_bit)
142 if (offset == si->lowest_bit) 274 scan_base = offset = si->lowest_bit;
143 si->lowest_bit++; 275 if (si->swap_map[offset])
144 if (offset == si->highest_bit) 276 goto scan;
145 si->highest_bit--; 277
146 si->inuse_pages++; 278 if (offset == si->lowest_bit)
147 if (si->inuse_pages == si->pages) { 279 si->lowest_bit++;
148 si->lowest_bit = si->max; 280 if (offset == si->highest_bit)
149 si->highest_bit = 0; 281 si->highest_bit--;
282 si->inuse_pages++;
283 if (si->inuse_pages == si->pages) {
284 si->lowest_bit = si->max;
285 si->highest_bit = 0;
286 }
287 si->swap_map[offset] = 1;
288 si->cluster_next = offset + 1;
289 si->flags -= SWP_SCANNING;
290
291 if (si->lowest_alloc) {
292 /*
293 * Only set when SWP_DISCARDABLE, and there's a scan
294 * for a free cluster in progress or just completed.
295 */
296 if (found_free_cluster) {
297 /*
298 * To optimize wear-levelling, discard the
299 * old data of the cluster, taking care not to
300 * discard any of its pages that have already
301 * been allocated by racing tasks (offset has
302 * already stepped over any at the beginning).
303 */
304 if (offset < si->highest_alloc &&
305 si->lowest_alloc <= last_in_cluster)
306 last_in_cluster = si->lowest_alloc - 1;
307 si->flags |= SWP_DISCARDING;
308 spin_unlock(&swap_lock);
309
310 if (offset < last_in_cluster)
311 discard_swap_cluster(si, offset,
312 last_in_cluster - offset + 1);
313
314 spin_lock(&swap_lock);
315 si->lowest_alloc = 0;
316 si->flags &= ~SWP_DISCARDING;
317
318 smp_mb(); /* wake_up_bit advises this */
319 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
320
321 } else if (si->flags & SWP_DISCARDING) {
322 /*
323 * Delay using pages allocated by racing tasks
324 * until the whole discard has been issued. We
325 * could defer that delay until swap_writepage,
326 * but it's easier to keep this self-contained.
327 */
328 spin_unlock(&swap_lock);
329 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
330 wait_for_discard, TASK_UNINTERRUPTIBLE);
331 spin_lock(&swap_lock);
332 } else {
333 /*
334 * Note pages allocated by racing tasks while
335 * scan for a free cluster is in progress, so
336 * that its final discard can exclude them.
337 */
338 if (offset < si->lowest_alloc)
339 si->lowest_alloc = offset;
340 if (offset > si->highest_alloc)
341 si->highest_alloc = offset;
150 } 342 }
151 si->swap_map[offset] = 1;
152 si->cluster_next = offset + 1;
153 si->flags -= SWP_SCANNING;
154 return offset;
155 } 343 }
344 return offset;
156 345
346scan:
157 spin_unlock(&swap_lock); 347 spin_unlock(&swap_lock);
158 while (++offset <= si->highest_bit) { 348 while (++offset <= si->highest_bit) {
159 if (!si->swap_map[offset]) { 349 if (!si->swap_map[offset]) {
@@ -165,8 +355,18 @@ checks: if (!(si->flags & SWP_WRITEOK))
165 latency_ration = LATENCY_LIMIT; 355 latency_ration = LATENCY_LIMIT;
166 } 356 }
167 } 357 }
358 offset = si->lowest_bit;
359 while (++offset < scan_base) {
360 if (!si->swap_map[offset]) {
361 spin_lock(&swap_lock);
362 goto checks;
363 }
364 if (unlikely(--latency_ration < 0)) {
365 cond_resched();
366 latency_ration = LATENCY_LIMIT;
367 }
368 }
168 spin_lock(&swap_lock); 369 spin_lock(&swap_lock);
169 goto lowest;
170 370
171no_page: 371no_page:
172 si->flags -= SWP_SCANNING; 372 si->flags -= SWP_SCANNING;
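
On SWP_SOLIDSTATE devices the reworked scan_swap_map() keeps allocating from cluster_next rather than always restarting at lowest_bit, so writes spread over the whole device. A toy next-fit allocator over a small array captures just that ordering (none of the locking, clustering or discard logic above):

#include <stdio.h>

#define NSLOTS 16

static unsigned char swap_map[NSLOTS];	/* 0 = free, 1 = in use */
static unsigned int cluster_next;	/* where the next search starts */

static int alloc_slot(void)
{
	for (unsigned int n = 0; n < NSLOTS; n++) {
		unsigned int slot = (cluster_next + n) % NSLOTS;

		if (!swap_map[slot]) {
			swap_map[slot] = 1;
			cluster_next = slot + 1;	/* resume after it next time */
			return slot;
		}
	}
	return -1;				/* map is full */
}

int main(void)
{
	for (int i = 0; i < 6; i++)
		printf("allocated slot %d\n", alloc_slot());

	swap_map[2] = 0;			/* free an early slot */
	printf("after freeing slot 2, next allocation lands at %d\n", alloc_slot());
	return 0;
}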
@@ -268,7 +468,7 @@ bad_nofile:
268 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); 468 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
269out: 469out:
270 return NULL; 470 return NULL;
271} 471}
272 472
273static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) 473static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
274{ 474{
@@ -326,97 +526,58 @@ static inline int page_swapcount(struct page *page)
326} 526}
327 527
328/* 528/*
329 * We can use this swap cache entry directly 529 * We can write to an anon page without COW if there are no other references
330 * if there are no other references to it. 530 * to it. And as a side-effect, free up its swap: because the old content
531 * on disk will never be read, and seeking back there to write new content
532 * later would only waste time away from clustering.
331 */ 533 */
332int can_share_swap_page(struct page *page) 534int reuse_swap_page(struct page *page)
333{ 535{
334 int count; 536 int count;
335 537
336 BUG_ON(!PageLocked(page)); 538 VM_BUG_ON(!PageLocked(page));
337 count = page_mapcount(page); 539 count = page_mapcount(page);
338 if (count <= 1 && PageSwapCache(page)) 540 if (count <= 1 && PageSwapCache(page)) {
339 count += page_swapcount(page); 541 count += page_swapcount(page);
542 if (count == 1 && !PageWriteback(page)) {
543 delete_from_swap_cache(page);
544 SetPageDirty(page);
545 }
546 }
340 return count == 1; 547 return count == 1;
341} 548}
342 549
343/* 550/*
344 * Work out if there are any other processes sharing this 551 * If swap is getting full, or if there are no more mappings of this page,
345 * swap cache page. Free it if you can. Return success. 552 * then try_to_free_swap is called to free its swap space.
346 */ 553 */
347static int remove_exclusive_swap_page_count(struct page *page, int count) 554int try_to_free_swap(struct page *page)
348{ 555{
349 int retval; 556 VM_BUG_ON(!PageLocked(page));
350 struct swap_info_struct * p;
351 swp_entry_t entry;
352
353 BUG_ON(PagePrivate(page));
354 BUG_ON(!PageLocked(page));
355 557
356 if (!PageSwapCache(page)) 558 if (!PageSwapCache(page))
357 return 0; 559 return 0;
358 if (PageWriteback(page)) 560 if (PageWriteback(page))
359 return 0; 561 return 0;
360 if (page_count(page) != count) /* us + cache + ptes */ 562 if (page_swapcount(page))
361 return 0;
362
363 entry.val = page_private(page);
364 p = swap_info_get(entry);
365 if (!p)
366 return 0; 563 return 0;
367 564
368 /* Is the only swap cache user the cache itself? */ 565 delete_from_swap_cache(page);
369 retval = 0; 566 SetPageDirty(page);
370 if (p->swap_map[swp_offset(entry)] == 1) { 567 return 1;
371 /* Recheck the page count with the swapcache lock held.. */
372 spin_lock_irq(&swapper_space.tree_lock);
373 if ((page_count(page) == count) && !PageWriteback(page)) {
374 __delete_from_swap_cache(page);
375 SetPageDirty(page);
376 retval = 1;
377 }
378 spin_unlock_irq(&swapper_space.tree_lock);
379 }
380 spin_unlock(&swap_lock);
381
382 if (retval) {
383 swap_free(entry);
384 page_cache_release(page);
385 }
386
387 return retval;
388}
389
390/*
391 * Most of the time the page should have two references: one for the
392 * process and one for the swap cache.
393 */
394int remove_exclusive_swap_page(struct page *page)
395{
396 return remove_exclusive_swap_page_count(page, 2);
397}
398
399/*
400 * The pageout code holds an extra reference to the page. That raises
401 * the reference count to test for to 2 for a page that is only in the
402 * swap cache plus 1 for each process that maps the page.
403 */
404int remove_exclusive_swap_page_ref(struct page *page)
405{
406 return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page));
407} 568}
408 569
409/* 570/*
410 * Free the swap entry like above, but also try to 571 * Free the swap entry like above, but also try to
411 * free the page cache entry if it is the last user. 572 * free the page cache entry if it is the last user.
412 */ 573 */
413void free_swap_and_cache(swp_entry_t entry) 574int free_swap_and_cache(swp_entry_t entry)
414{ 575{
415 struct swap_info_struct * p; 576 struct swap_info_struct *p;
416 struct page *page = NULL; 577 struct page *page = NULL;
417 578
418 if (is_migration_entry(entry)) 579 if (is_migration_entry(entry))
419 return; 580 return 1;
420 581
421 p = swap_info_get(entry); 582 p = swap_info_get(entry);
422 if (p) { 583 if (p) {
@@ -430,20 +591,19 @@ void free_swap_and_cache(swp_entry_t entry)
430 spin_unlock(&swap_lock); 591 spin_unlock(&swap_lock);
431 } 592 }
432 if (page) { 593 if (page) {
433 int one_user; 594 /*
434 595 * Not mapped elsewhere, or swap space full? Free it!
435 BUG_ON(PagePrivate(page)); 596 * Also recheck PageSwapCache now page is locked (above).
436 one_user = (page_count(page) == 2); 597 */
437 /* Only cache user (+us), or swap space full? Free it! */
438 /* Also recheck PageSwapCache after page is locked (above) */
439 if (PageSwapCache(page) && !PageWriteback(page) && 598 if (PageSwapCache(page) && !PageWriteback(page) &&
440 (one_user || vm_swap_full())) { 599 (!page_mapped(page) || vm_swap_full())) {
441 delete_from_swap_cache(page); 600 delete_from_swap_cache(page);
442 SetPageDirty(page); 601 SetPageDirty(page);
443 } 602 }
444 unlock_page(page); 603 unlock_page(page);
445 page_cache_release(page); 604 page_cache_release(page);
446 } 605 }
606 return p != NULL;
447} 607}
448 608
449#ifdef CONFIG_HIBERNATION 609#ifdef CONFIG_HIBERNATION
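
For orientation, the two reworked helpers above are meant to be used roughly as follows. This is an illustrative sketch assuming kernel context (a locked page on the reclaim side, a swap entry taken from a pte on the unmap side), not code from the patch itself:

	/* Illustrative fragments only; names are those introduced in this diff. */

	/* Reclaim side (cf. the vmscan.c hunks later in this diff): the page
	 * is locked; if swap is getting tight, hand its slot back --
	 * try_to_free_swap() itself rechecks page_swapcount(). */
	if (PageSwapCache(page) && vm_swap_full())
		try_to_free_swap(page);

	/* Unmap side: drop this pte's reference on the swap slot; if only the
	 * swap cache is left holding it, the cached copy goes too.  A zero
	 * return now means the entry did not belong to any swap area. */
	if (!free_swap_and_cache(entry))
		printk(KERN_WARNING "bad swap entry in pte\n");	/* illustrative handling */
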
@@ -776,10 +936,10 @@ static int try_to_unuse(unsigned int type)
776 break; 936 break;
777 } 937 }
778 938
779 /* 939 /*
780 * Get a page for the entry, using the existing swap 940 * Get a page for the entry, using the existing swap
781 * cache page if there is one. Otherwise, get a clean 941 * cache page if there is one. Otherwise, get a clean
782 * page and read the swap into it. 942 * page and read the swap into it.
783 */ 943 */
784 swap_map = &si->swap_map[i]; 944 swap_map = &si->swap_map[i];
785 entry = swp_entry(type, i); 945 entry = swp_entry(type, i);
@@ -930,7 +1090,16 @@ static int try_to_unuse(unsigned int type)
930 lock_page(page); 1090 lock_page(page);
931 wait_on_page_writeback(page); 1091 wait_on_page_writeback(page);
932 } 1092 }
933 if (PageSwapCache(page)) 1093
1094 /*
1095 * It is conceivable that a racing task removed this page from
1096 * swap cache just before we acquired the page lock at the top,
1097 * or while we dropped it in unuse_mm(). The page might even
1098 * be back in swap cache on another swap area: that we must not
1099 * delete, since it may not have been written out to swap yet.
1100 */
1101 if (PageSwapCache(page) &&
1102 likely(page_private(page) == entry.val))
934 delete_from_swap_cache(page); 1103 delete_from_swap_cache(page);
935 1104
936 /* 1105 /*
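
The page_private(page) == entry.val guard above works because, for a page in swap cache, page_private() holds the encoded swp_entry_t of the slot backing it. A small illustrative helper spelling the check out; kernel context assumed, and the helper name is made up, not part of the patch:

	/* Illustrative helper, not in the tree: does this swap-cache page
	 * still belong to the entry being unused?  A mismatch means the page
	 * was re-added to swap cache for a different slot -- possibly on
	 * another swap area -- and must not be deleted on this area's behalf. */
	static inline int page_backs_entry(struct page *page, swp_entry_t entry)
	{
		return PageSwapCache(page) && page_private(page) == entry.val;
	}
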
@@ -1203,26 +1372,6 @@ out:
1203 return ret; 1372 return ret;
1204} 1373}
1205 1374
1206#if 0 /* We don't need this yet */
1207#include <linux/backing-dev.h>
1208int page_queue_congested(struct page *page)
1209{
1210 struct backing_dev_info *bdi;
1211
1212 BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
1213
1214 if (PageSwapCache(page)) {
1215 swp_entry_t entry = { .val = page_private(page) };
1216 struct swap_info_struct *sis;
1217
1218 sis = get_swap_info_struct(swp_type(entry));
1219 bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
1220 } else
1221 bdi = page->mapping->backing_dev_info;
1222 return bdi_write_congested(bdi);
1223}
1224#endif
1225
1226asmlinkage long sys_swapoff(const char __user * specialfile) 1375asmlinkage long sys_swapoff(const char __user * specialfile)
1227{ 1376{
1228 struct swap_info_struct * p = NULL; 1377 struct swap_info_struct * p = NULL;
@@ -1233,7 +1382,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1233 char * pathname; 1382 char * pathname;
1234 int i, type, prev; 1383 int i, type, prev;
1235 int err; 1384 int err;
1236 1385
1237 if (!capable(CAP_SYS_ADMIN)) 1386 if (!capable(CAP_SYS_ADMIN))
1238 return -EPERM; 1387 return -EPERM;
1239 1388
@@ -1253,7 +1402,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1253 spin_lock(&swap_lock); 1402 spin_lock(&swap_lock);
1254 for (type = swap_list.head; type >= 0; type = swap_info[type].next) { 1403 for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
1255 p = swap_info + type; 1404 p = swap_info + type;
1256 if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { 1405 if (p->flags & SWP_WRITEOK) {
1257 if (p->swap_file->f_mapping == mapping) 1406 if (p->swap_file->f_mapping == mapping)
1258 break; 1407 break;
1259 } 1408 }
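
The swapoff lookup above now tests SWP_WRITEOK instead of masking with SWP_ACTIVE. For reference, the flag bits involved look roughly like the definitions in include/linux/swap.h of this kernel generation (reproduced for illustration; the exact bit layout is the header's, not this hunk's):

	enum {
		SWP_USED	= (1 << 0),	/* slot in swap_info[] is in use */
		SWP_WRITEOK	= (1 << 1),	/* ok to write to this swap area */
		/* SWP_DISCARDABLE, SWP_SOLIDSTATE, ... occupy higher bits */
		SWP_ACTIVE	= (SWP_USED | SWP_WRITEOK),
	};

An area being swapped off keeps SWP_USED but has SWP_WRITEOK cleared early, so testing SWP_WRITEOK alone still selects exactly the areas that are online for writing.
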
@@ -1426,12 +1575,12 @@ static int swap_show(struct seq_file *swap, void *v)
1426 file = ptr->swap_file; 1575 file = ptr->swap_file;
1427 len = seq_path(swap, &file->f_path, " \t\n\\"); 1576 len = seq_path(swap, &file->f_path, " \t\n\\");
1428 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1577 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1429 len < 40 ? 40 - len : 1, " ", 1578 len < 40 ? 40 - len : 1, " ",
1430 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? 1579 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1431 "partition" : "file\t", 1580 "partition" : "file\t",
1432 ptr->pages << (PAGE_SHIFT - 10), 1581 ptr->pages << (PAGE_SHIFT - 10),
1433 ptr->inuse_pages << (PAGE_SHIFT - 10), 1582 ptr->inuse_pages << (PAGE_SHIFT - 10),
1434 ptr->prio); 1583 ptr->prio);
1435 return 0; 1584 return 0;
1436} 1585}
1437 1586
@@ -1487,12 +1636,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1487 int i, prev; 1636 int i, prev;
1488 int error; 1637 int error;
1489 union swap_header *swap_header = NULL; 1638 union swap_header *swap_header = NULL;
1490 int swap_header_version;
1491 unsigned int nr_good_pages = 0; 1639 unsigned int nr_good_pages = 0;
1492 int nr_extents = 0; 1640 int nr_extents = 0;
1493 sector_t span; 1641 sector_t span;
1494 unsigned long maxpages = 1; 1642 unsigned long maxpages = 1;
1495 int swapfilesize; 1643 unsigned long swapfilepages;
1496 unsigned short *swap_map = NULL; 1644 unsigned short *swap_map = NULL;
1497 struct page *page = NULL; 1645 struct page *page = NULL;
1498 struct inode *inode = NULL; 1646 struct inode *inode = NULL;
@@ -1570,7 +1718,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1570 goto bad_swap; 1718 goto bad_swap;
1571 } 1719 }
1572 1720
1573 swapfilesize = i_size_read(inode) >> PAGE_SHIFT; 1721 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1574 1722
1575 /* 1723 /*
1576 * Read the swap header. 1724 * Read the swap header.
@@ -1584,101 +1732,86 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1584 error = PTR_ERR(page); 1732 error = PTR_ERR(page);
1585 goto bad_swap; 1733 goto bad_swap;
1586 } 1734 }
1587 kmap(page); 1735 swap_header = kmap(page);
1588 swap_header = page_address(page);
1589 1736
1590 if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) 1737 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1591 swap_header_version = 1;
1592 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
1593 swap_header_version = 2;
1594 else {
1595 printk(KERN_ERR "Unable to find swap-space signature\n"); 1738 printk(KERN_ERR "Unable to find swap-space signature\n");
1596 error = -EINVAL; 1739 error = -EINVAL;
1597 goto bad_swap; 1740 goto bad_swap;
1598 } 1741 }
1599 1742
1600 switch (swap_header_version) { 1743 /* swap partition endianess hack... */
1601 case 1: 1744 if (swab32(swap_header->info.version) == 1) {
1602 printk(KERN_ERR "version 0 swap is no longer supported. " 1745 swab32s(&swap_header->info.version);
1603 "Use mkswap -v1 %s\n", name); 1746 swab32s(&swap_header->info.last_page);
1747 swab32s(&swap_header->info.nr_badpages);
1748 for (i = 0; i < swap_header->info.nr_badpages; i++)
1749 swab32s(&swap_header->info.badpages[i]);
1750 }
1751 /* Check the swap header's sub-version */
1752 if (swap_header->info.version != 1) {
1753 printk(KERN_WARNING
1754 "Unable to handle swap header version %d\n",
1755 swap_header->info.version);
1604 error = -EINVAL; 1756 error = -EINVAL;
1605 goto bad_swap; 1757 goto bad_swap;
1606 case 2: 1758 }
1607 /* swap partition endianess hack... */
1608 if (swab32(swap_header->info.version) == 1) {
1609 swab32s(&swap_header->info.version);
1610 swab32s(&swap_header->info.last_page);
1611 swab32s(&swap_header->info.nr_badpages);
1612 for (i = 0; i < swap_header->info.nr_badpages; i++)
1613 swab32s(&swap_header->info.badpages[i]);
1614 }
1615 /* Check the swap header's sub-version and the size of
1616 the swap file and bad block lists */
1617 if (swap_header->info.version != 1) {
1618 printk(KERN_WARNING
1619 "Unable to handle swap header version %d\n",
1620 swap_header->info.version);
1621 error = -EINVAL;
1622 goto bad_swap;
1623 }
1624 1759
1625 p->lowest_bit = 1; 1760 p->lowest_bit = 1;
1626 p->cluster_next = 1; 1761 p->cluster_next = 1;
1627 1762
1628 /* 1763 /*
1629 * Find out how many pages are allowed for a single swap 1764 * Find out how many pages are allowed for a single swap
1630 * device. There are two limiting factors: 1) the number of 1765 * device. There are two limiting factors: 1) the number of
1631 * bits for the swap offset in the swp_entry_t type and 1766 * bits for the swap offset in the swp_entry_t type and
1632 * 2) the number of bits in a swap pte as defined by 1767 * 2) the number of bits in a swap pte as defined by
1633 * the different architectures. In order to find the 1768 * the different architectures. In order to find the
1634 * largest possible bit mask a swap entry with swap type 0 1769 * largest possible bit mask a swap entry with swap type 0
1635 * and swap offset ~0UL is created, encoded to a swap pte, 1770 * and swap offset ~0UL is created, encoded to a swap pte,
1636 * decoded to a swp_entry_t again and finally the swap 1771 * decoded to a swp_entry_t again and finally the swap
1637 * offset is extracted. This will mask all the bits from 1772 * offset is extracted. This will mask all the bits from
1638 * the initial ~0UL mask that can't be encoded in either 1773 * the initial ~0UL mask that can't be encoded in either
1639 * the swp_entry_t or the architecture definition of a 1774 * the swp_entry_t or the architecture definition of a
1640 * swap pte. 1775 * swap pte.
1641 */ 1776 */
1642 maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; 1777 maxpages = swp_offset(pte_to_swp_entry(
1643 if (maxpages > swap_header->info.last_page) 1778 swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1;
1644 maxpages = swap_header->info.last_page; 1779 if (maxpages > swap_header->info.last_page)
1645 p->highest_bit = maxpages - 1; 1780 maxpages = swap_header->info.last_page;
1781 p->highest_bit = maxpages - 1;
1646 1782
1647 error = -EINVAL; 1783 error = -EINVAL;
1648 if (!maxpages) 1784 if (!maxpages)
1649 goto bad_swap; 1785 goto bad_swap;
1650 if (swapfilesize && maxpages > swapfilesize) { 1786 if (swapfilepages && maxpages > swapfilepages) {
1651 printk(KERN_WARNING 1787 printk(KERN_WARNING
1652 "Swap area shorter than signature indicates\n"); 1788 "Swap area shorter than signature indicates\n");
1653 goto bad_swap; 1789 goto bad_swap;
1654 } 1790 }
1655 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 1791 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1656 goto bad_swap; 1792 goto bad_swap;
1657 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1793 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1658 goto bad_swap; 1794 goto bad_swap;
1659 1795
1660 /* OK, set up the swap map and apply the bad block list */ 1796 /* OK, set up the swap map and apply the bad block list */
1661 swap_map = vmalloc(maxpages * sizeof(short)); 1797 swap_map = vmalloc(maxpages * sizeof(short));
1662 if (!swap_map) { 1798 if (!swap_map) {
1663 error = -ENOMEM; 1799 error = -ENOMEM;
1664 goto bad_swap; 1800 goto bad_swap;
1665 } 1801 }
1666 1802
1667 error = 0; 1803 memset(swap_map, 0, maxpages * sizeof(short));
1668 memset(swap_map, 0, maxpages * sizeof(short)); 1804 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1669 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1805 int page_nr = swap_header->info.badpages[i];
1670 int page_nr = swap_header->info.badpages[i]; 1806 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
1671 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) 1807 error = -EINVAL;
1672 error = -EINVAL;
1673 else
1674 swap_map[page_nr] = SWAP_MAP_BAD;
1675 }
1676 nr_good_pages = swap_header->info.last_page -
1677 swap_header->info.nr_badpages -
1678 1 /* header page */;
1679 if (error)
1680 goto bad_swap; 1808 goto bad_swap;
1809 }
1810 swap_map[page_nr] = SWAP_MAP_BAD;
1681 } 1811 }
1812 nr_good_pages = swap_header->info.last_page -
1813 swap_header->info.nr_badpages -
1814 1 /* header page */;
1682 1815
1683 if (nr_good_pages) { 1816 if (nr_good_pages) {
1684 swap_map[0] = SWAP_MAP_BAD; 1817 swap_map[0] = SWAP_MAP_BAD;
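
The maxpages calculation in the hunk above caps a swap area at whatever offset survives a round trip through the architecture's swap pte encoding. The same masking idea in standalone form, assuming purely for illustration an architecture whose swap pte can hold 27 offset bits (the helper is a stand-in for the arch encode/decode functions):

#include <stdio.h>

/* Hypothetical arch: only 27 offset bits fit in a swap pte. */
#define SWP_OFFSET_BITS 27

static unsigned long offset_roundtrip(unsigned long offset)
{
	/* "encode to pte": bits that don't fit are dropped ... */
	unsigned long in_pte = offset & ((1UL << SWP_OFFSET_BITS) - 1);
	/* ... "decode back": only what the pte could carry survives. */
	return in_pte;
}

int main(void)
{
	/* Mirrors: swp_offset(pte_to_swp_entry(swp_entry_to_pte(
	 *		swp_entry(0, ~0UL)))) - 1			*/
	unsigned long maxpages = offset_roundtrip(~0UL) - 1;

	printf("%lu pages (~%lu GB with 4K pages)\n", maxpages, maxpages >> 18);
	return 0;
}
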
@@ -1697,6 +1830,13 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1697 goto bad_swap; 1830 goto bad_swap;
1698 } 1831 }
1699 1832
1833 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
1834 p->flags |= SWP_SOLIDSTATE;
1835 p->cluster_next = 1 + (random32() % p->highest_bit);
1836 }
1837 if (discard_swap(p) == 0)
1838 p->flags |= SWP_DISCARDABLE;
1839
1700 mutex_lock(&swapon_mutex); 1840 mutex_lock(&swapon_mutex);
1701 spin_lock(&swap_lock); 1841 spin_lock(&swap_lock);
1702 if (swap_flags & SWAP_FLAG_PREFER) 1842 if (swap_flags & SWAP_FLAG_PREFER)
@@ -1705,14 +1845,16 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1705 else 1845 else
1706 p->prio = --least_priority; 1846 p->prio = --least_priority;
1707 p->swap_map = swap_map; 1847 p->swap_map = swap_map;
1708 p->flags = SWP_ACTIVE; 1848 p->flags |= SWP_WRITEOK;
1709 nr_swap_pages += nr_good_pages; 1849 nr_swap_pages += nr_good_pages;
1710 total_swap_pages += nr_good_pages; 1850 total_swap_pages += nr_good_pages;
1711 1851
1712 printk(KERN_INFO "Adding %uk swap on %s. " 1852 printk(KERN_INFO "Adding %uk swap on %s. "
1713 "Priority:%d extents:%d across:%lluk\n", 1853 "Priority:%d extents:%d across:%lluk %s%s\n",
1714 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, 1854 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
1715 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); 1855 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
1856 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
1857 (p->flags & SWP_DISCARDABLE) ? "D" : "");
1716 1858
1717 /* insert swap space into swap_list: */ 1859 /* insert swap space into swap_list: */
1718 prev = -1; 1860 prev = -1;
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
deleted file mode 100644
index 3e67d575ee6e..000000000000
--- a/mm/tiny-shmem.c
+++ /dev/null
@@ -1,134 +0,0 @@
1/*
2 * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code
3 *
4 * Matt Mackall <mpm@selenic.com> January, 2004
5 * derived from mm/shmem.c and fs/ramfs/inode.c
6 *
7 * This is intended for small systems where the benefits of the full
8 * shmem code (swap-backed and resource-limited) are outweighed by
9 * their complexity. On systems without swap this code should be
10 * effectively equivalent, but much lighter weight.
11 */
12
13#include <linux/fs.h>
14#include <linux/init.h>
15#include <linux/vfs.h>
16#include <linux/mount.h>
17#include <linux/file.h>
18#include <linux/mm.h>
19#include <linux/module.h>
20#include <linux/swap.h>
21#include <linux/ramfs.h>
22
23static struct file_system_type tmpfs_fs_type = {
24 .name = "tmpfs",
25 .get_sb = ramfs_get_sb,
26 .kill_sb = kill_litter_super,
27};
28
29static struct vfsmount *shm_mnt;
30
31static int __init init_tmpfs(void)
32{
33 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
34
35 shm_mnt = kern_mount(&tmpfs_fs_type);
36 BUG_ON(IS_ERR(shm_mnt));
37
38 return 0;
39}
40module_init(init_tmpfs)
41
42/**
43 * shmem_file_setup - get an unlinked file living in tmpfs
44 * @name: name for dentry (to be seen in /proc/<pid>/maps)
45 * @size: size to be set for the file
46 * @flags: vm_flags
47 */
48struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
49{
50 int error;
51 struct file *file;
52 struct inode *inode;
53 struct dentry *dentry, *root;
54 struct qstr this;
55
56 if (IS_ERR(shm_mnt))
57 return (void *)shm_mnt;
58
59 error = -ENOMEM;
60 this.name = name;
61 this.len = strlen(name);
62 this.hash = 0; /* will go */
63 root = shm_mnt->mnt_root;
64 dentry = d_alloc(root, &this);
65 if (!dentry)
66 goto put_memory;
67
68 error = -ENFILE;
69 file = get_empty_filp();
70 if (!file)
71 goto put_dentry;
72
73 error = -ENOSPC;
74 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
75 if (!inode)
76 goto close_file;
77
78 d_instantiate(dentry, inode);
79 inode->i_size = size;
80 inode->i_nlink = 0; /* It is unlinked */
81 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
82 &ramfs_file_operations);
83
84#ifndef CONFIG_MMU
85 error = ramfs_nommu_expand_for_mapping(inode, size);
86 if (error)
87 goto close_file;
88#endif
89 return file;
90
91close_file:
92 put_filp(file);
93put_dentry:
94 dput(dentry);
95put_memory:
96 return ERR_PTR(error);
97}
98EXPORT_SYMBOL_GPL(shmem_file_setup);
99
100/**
101 * shmem_zero_setup - setup a shared anonymous mapping
102 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
103 */
104int shmem_zero_setup(struct vm_area_struct *vma)
105{
106 struct file *file;
107 loff_t size = vma->vm_end - vma->vm_start;
108
109 file = shmem_file_setup("dev/zero", size, vma->vm_flags);
110 if (IS_ERR(file))
111 return PTR_ERR(file);
112
113 if (vma->vm_file)
114 fput(vma->vm_file);
115 vma->vm_file = file;
116 vma->vm_ops = &generic_file_vm_ops;
117 return 0;
118}
119
120int shmem_unuse(swp_entry_t entry, struct page *page)
121{
122 return 0;
123}
124
125#ifndef CONFIG_MMU
126unsigned long shmem_get_unmapped_area(struct file *file,
127 unsigned long addr,
128 unsigned long len,
129 unsigned long pgoff,
130 unsigned long flags)
131{
132 return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
133}
134#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7465f22fec0c..c5db9a7264d9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -14,6 +14,7 @@
14#include <linux/highmem.h> 14#include <linux/highmem.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/spinlock.h> 16#include <linux/spinlock.h>
17#include <linux/mutex.h>
17#include <linux/interrupt.h> 18#include <linux/interrupt.h>
18#include <linux/proc_fs.h> 19#include <linux/proc_fs.h>
19#include <linux/seq_file.h> 20#include <linux/seq_file.h>
@@ -381,8 +382,9 @@ found:
381 goto retry; 382 goto retry;
382 } 383 }
383 if (printk_ratelimit()) 384 if (printk_ratelimit())
384 printk(KERN_WARNING "vmap allocation failed: " 385 printk(KERN_WARNING
385 "use vmalloc=<size> to increase size.\n"); 386 "vmap allocation for size %lu failed: "
387 "use vmalloc=<size> to increase size.\n", size);
386 return ERR_PTR(-EBUSY); 388 return ERR_PTR(-EBUSY);
387 } 389 }
388 390
@@ -432,6 +434,27 @@ static void unmap_vmap_area(struct vmap_area *va)
432 vunmap_page_range(va->va_start, va->va_end); 434 vunmap_page_range(va->va_start, va->va_end);
433} 435}
434 436
437static void vmap_debug_free_range(unsigned long start, unsigned long end)
438{
439 /*
440 * Unmap page tables and force a TLB flush immediately if
441 * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free
442 * bugs similarly to those in linear kernel virtual address
443 * space after a page has been freed.
444 *
445 * All the lazy freeing logic is still retained, in order to
446 * minimise intrusiveness of this debugging feature.
447 *
448 * This is going to be *slow* (linear kernel virtual address
449 * debugging doesn't do a broadcast TLB flush so it is a lot
450 * faster).
451 */
452#ifdef CONFIG_DEBUG_PAGEALLOC
453 vunmap_page_range(start, end);
454 flush_tlb_kernel_range(start, end);
455#endif
456}
457
435/* 458/*
436 * lazy_max_pages is the maximum amount of virtual address space we gather up 459 * lazy_max_pages is the maximum amount of virtual address space we gather up
437 * before attempting to purge with a TLB flush. 460 * before attempting to purge with a TLB flush.
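
The comment in vmap_debug_free_range() describes the bug class being targeted: with lazy purging, a vmap area stays mapped for a while after it is freed, so a stale pointer may appear to keep working. A deliberately buggy kernel-style sketch of the access that now faults immediately under CONFIG_DEBUG_PAGEALLOC; illustrative only, and the function name is made up:

	/* Kernel context assumed; this is the bug pattern, do not copy. */
	static void example_use_after_vfree(void)
	{
		char *buf = vmalloc(PAGE_SIZE);

		if (!buf)
			return;
		vfree(buf);
		/* Without DEBUG_PAGEALLOC the lazily retained mapping can let
		 * this slip by until the next purge; with the hunk above,
		 * vfree() unmaps and flushes the TLB, so the stale write
		 * faults right here. */
		buf[0] = 0;
	}
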
@@ -472,7 +495,7 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
472static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, 495static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
473 int sync, int force_flush) 496 int sync, int force_flush)
474{ 497{
475 static DEFINE_SPINLOCK(purge_lock); 498 static DEFINE_MUTEX(purge_lock);
476 LIST_HEAD(valist); 499 LIST_HEAD(valist);
477 struct vmap_area *va; 500 struct vmap_area *va;
478 int nr = 0; 501 int nr = 0;
@@ -483,10 +506,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
483 * the case that isn't actually used at the moment anyway. 506 * the case that isn't actually used at the moment anyway.
484 */ 507 */
485 if (!sync && !force_flush) { 508 if (!sync && !force_flush) {
486 if (!spin_trylock(&purge_lock)) 509 if (!mutex_trylock(&purge_lock))
487 return; 510 return;
488 } else 511 } else
489 spin_lock(&purge_lock); 512 mutex_lock(&purge_lock);
490 513
491 rcu_read_lock(); 514 rcu_read_lock();
492 list_for_each_entry_rcu(va, &vmap_area_list, list) { 515 list_for_each_entry_rcu(va, &vmap_area_list, list) {
@@ -518,7 +541,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
518 __free_vmap_area(va); 541 __free_vmap_area(va);
519 spin_unlock(&vmap_area_lock); 542 spin_unlock(&vmap_area_lock);
520 } 543 }
521 spin_unlock(&purge_lock); 544 mutex_unlock(&purge_lock);
522} 545}
523 546
524/* 547/*
@@ -912,6 +935,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
912 BUG_ON(addr & (PAGE_SIZE-1)); 935 BUG_ON(addr & (PAGE_SIZE-1));
913 936
914 debug_check_no_locks_freed(mem, size); 937 debug_check_no_locks_freed(mem, size);
938 vmap_debug_free_range(addr, addr+size);
915 939
916 if (likely(count <= VMAP_MAX_ALLOC)) 940 if (likely(count <= VMAP_MAX_ALLOC))
917 vb_free(mem, size); 941 vb_free(mem, size);
@@ -1128,6 +1152,8 @@ struct vm_struct *remove_vm_area(const void *addr)
1128 if (va && va->flags & VM_VM_AREA) { 1152 if (va && va->flags & VM_VM_AREA) {
1129 struct vm_struct *vm = va->private; 1153 struct vm_struct *vm = va->private;
1130 struct vm_struct *tmp, **p; 1154 struct vm_struct *tmp, **p;
1155
1156 vmap_debug_free_range(va->va_start, va->va_end);
1131 free_unmap_vmap_area(va); 1157 free_unmap_vmap_area(va);
1132 vm->size -= PAGE_SIZE; 1158 vm->size -= PAGE_SIZE;
1133 1159
@@ -1375,7 +1401,8 @@ void *vmalloc_user(unsigned long size)
1375 struct vm_struct *area; 1401 struct vm_struct *area;
1376 void *ret; 1402 void *ret;
1377 1403
1378 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); 1404 ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1405 PAGE_KERNEL, -1, __builtin_return_address(0));
1379 if (ret) { 1406 if (ret) {
1380 area = find_vm_area(ret); 1407 area = find_vm_area(ret);
1381 area->flags |= VM_USERMAP; 1408 area->flags |= VM_USERMAP;
@@ -1420,7 +1447,8 @@ EXPORT_SYMBOL(vmalloc_node);
1420 1447
1421void *vmalloc_exec(unsigned long size) 1448void *vmalloc_exec(unsigned long size)
1422{ 1449{
1423 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); 1450 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
1451 -1, __builtin_return_address(0));
1424} 1452}
1425 1453
1426#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 1454#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
@@ -1440,7 +1468,8 @@ void *vmalloc_exec(unsigned long size)
1440 */ 1468 */
1441void *vmalloc_32(unsigned long size) 1469void *vmalloc_32(unsigned long size)
1442{ 1470{
1443 return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL); 1471 return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL,
1472 -1, __builtin_return_address(0));
1444} 1473}
1445EXPORT_SYMBOL(vmalloc_32); 1474EXPORT_SYMBOL(vmalloc_32);
1446 1475
@@ -1456,7 +1485,8 @@ void *vmalloc_32_user(unsigned long size)
1456 struct vm_struct *area; 1485 struct vm_struct *area;
1457 void *ret; 1486 void *ret;
1458 1487
1459 ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); 1488 ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1489 -1, __builtin_return_address(0));
1460 if (ret) { 1490 if (ret) {
1461 area = find_vm_area(ret); 1491 area = find_vm_area(ret);
1462 area->flags |= VM_USERMAP; 1492 area->flags |= VM_USERMAP;
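
These wrappers now go through __vmalloc_node() so they can pass __builtin_return_address(0), the return address into their own caller; that address is what gets recorded against the vm area and later resolved to a symbol in /proc/vmallocinfo, so allocations are attributed to the real caller rather than to the convenience wrapper. A tiny standalone illustration of what the GCC builtin yields (names here are made up):

#include <stdio.h>

/* Kept out of line so there is a genuine return address to report. */
static __attribute__((noinline)) const void *who_called_me(void)
{
	return __builtin_return_address(0);	/* address inside our caller */
}

int main(void)
{
	printf("allocation requested from %p\n", who_called_me());
	return 0;
}
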
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d196f46c8808..b07c48b09a93 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,6 +52,9 @@ struct scan_control {
52 /* Incremented by the number of inactive pages that were scanned */ 52 /* Incremented by the number of inactive pages that were scanned */
53 unsigned long nr_scanned; 53 unsigned long nr_scanned;
54 54
55 /* Number of pages freed so far during a call to shrink_zones() */
56 unsigned long nr_reclaimed;
57
55 /* This context's GFP mask */ 58 /* This context's GFP mask */
56 gfp_t gfp_mask; 59 gfp_t gfp_mask;
57 60
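
The new field changes the calling convention used through the rest of this file: rather than every shrink_* function returning a count that its caller re-sums, the running total lives in the scan_control that is already threaded down the call chain. Schematically (simplified fragments, not the real functions):

	/* Before: each level returns pages freed and the caller adds. */
	nr_reclaimed += shrink_zone(priority, zone, sc);

	/* After: shrink_zone() is void and accumulates into shared state,
	 * so any level can consult the total, e.g. to stop early. */
	shrink_zone(priority, zone, sc);
	if (sc->nr_reclaimed >= sc->swap_cluster_max)
		goto out;	/* freed enough for this allocation attempt */
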
@@ -617,7 +620,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
617 referenced && page_mapping_inuse(page)) 620 referenced && page_mapping_inuse(page))
618 goto activate_locked; 621 goto activate_locked;
619 622
620#ifdef CONFIG_SWAP
621 /* 623 /*
622 * Anonymous process memory has backing store? 624 * Anonymous process memory has backing store?
623 * Try to allocate it some swap space here. 625 * Try to allocate it some swap space here.
@@ -625,20 +627,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
625 if (PageAnon(page) && !PageSwapCache(page)) { 627 if (PageAnon(page) && !PageSwapCache(page)) {
626 if (!(sc->gfp_mask & __GFP_IO)) 628 if (!(sc->gfp_mask & __GFP_IO))
627 goto keep_locked; 629 goto keep_locked;
628 switch (try_to_munlock(page)) { 630 if (!add_to_swap(page))
629 case SWAP_FAIL: /* shouldn't happen */
630 case SWAP_AGAIN:
631 goto keep_locked;
632 case SWAP_MLOCK:
633 goto cull_mlocked;
634 case SWAP_SUCCESS:
635 ; /* fall thru'; add to swap cache */
636 }
637 if (!add_to_swap(page, GFP_ATOMIC))
638 goto activate_locked; 631 goto activate_locked;
639 may_enter_fs = 1; 632 may_enter_fs = 1;
640 } 633 }
641#endif /* CONFIG_SWAP */
642 634
643 mapping = page_mapping(page); 635 mapping = page_mapping(page);
644 636
@@ -752,6 +744,8 @@ free_it:
752 continue; 744 continue;
753 745
754cull_mlocked: 746cull_mlocked:
747 if (PageSwapCache(page))
748 try_to_free_swap(page);
755 unlock_page(page); 749 unlock_page(page);
756 putback_lru_page(page); 750 putback_lru_page(page);
757 continue; 751 continue;
@@ -759,7 +753,7 @@ cull_mlocked:
759activate_locked: 753activate_locked:
760 /* Not a candidate for swapping, so reclaim swap space. */ 754 /* Not a candidate for swapping, so reclaim swap space. */
761 if (PageSwapCache(page) && vm_swap_full()) 755 if (PageSwapCache(page) && vm_swap_full())
762 remove_exclusive_swap_page_ref(page); 756 try_to_free_swap(page);
763 VM_BUG_ON(PageActive(page)); 757 VM_BUG_ON(PageActive(page));
764 SetPageActive(page); 758 SetPageActive(page);
765 pgactivate++; 759 pgactivate++;
@@ -1173,11 +1167,6 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1173 zone->prev_priority = priority; 1167 zone->prev_priority = priority;
1174} 1168}
1175 1169
1176static inline int zone_is_near_oom(struct zone *zone)
1177{
1178 return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
1179}
1180
1181/* 1170/*
1182 * This moves pages from the active list to the inactive list. 1171 * This moves pages from the active list to the inactive list.
1183 * 1172 *
@@ -1248,6 +1237,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1248 list_add(&page->lru, &l_inactive); 1237 list_add(&page->lru, &l_inactive);
1249 } 1238 }
1250 1239
1240 /*
1241 * Move the pages to the [file or anon] inactive list.
1242 */
1243 pagevec_init(&pvec, 1);
1244 pgmoved = 0;
1245 lru = LRU_BASE + file * LRU_FILE;
1246
1251 spin_lock_irq(&zone->lru_lock); 1247 spin_lock_irq(&zone->lru_lock);
1252 /* 1248 /*
1253 * Count referenced pages from currently used mappings as 1249 * Count referenced pages from currently used mappings as
@@ -1255,15 +1251,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1255 * This helps balance scan pressure between file and anonymous 1251 * This helps balance scan pressure between file and anonymous
1256 * pages in get_scan_ratio. 1252 * pages in get_scan_ratio.
1257 */ 1253 */
1258 zone->recent_rotated[!!file] += pgmoved; 1254 if (scan_global_lru(sc))
1259 1255 zone->recent_rotated[!!file] += pgmoved;
1260 /*
1261 * Move the pages to the [file or anon] inactive list.
1262 */
1263 pagevec_init(&pvec, 1);
1264 1256
1265 pgmoved = 0;
1266 lru = LRU_BASE + file * LRU_FILE;
1267 while (!list_empty(&l_inactive)) { 1257 while (!list_empty(&l_inactive)) {
1268 page = lru_to_page(&l_inactive); 1258 page = lru_to_page(&l_inactive);
1269 prefetchw_prev_lru_page(page, &l_inactive, flags); 1259 prefetchw_prev_lru_page(page, &l_inactive, flags);
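
The hoisted lru = LRU_BASE + file * LRU_FILE selects the inactive list of the right type for the pages being demoted. For reference, the lru_list index arithmetic of this kernel generation works out as below (mirroring include/linux/mmzone.h; shown for illustration):

#define LRU_BASE	0
#define LRU_ACTIVE	1
#define LRU_FILE	2

enum lru_list {
	LRU_INACTIVE_ANON = LRU_BASE,				/* 0 */
	LRU_ACTIVE_ANON   = LRU_BASE + LRU_ACTIVE,		/* 1 */
	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,		/* 2 */
	LRU_ACTIVE_FILE   = LRU_BASE + LRU_FILE + LRU_ACTIVE,	/* 3 */
	LRU_UNEVICTABLE,					/* 4 */
	NR_LRU_LISTS
};

/* So LRU_BASE + file * LRU_FILE is LRU_INACTIVE_ANON for file == 0
 * and LRU_INACTIVE_FILE for file == 1. */
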
@@ -1336,12 +1326,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1336 unsigned long anon_prio, file_prio; 1326 unsigned long anon_prio, file_prio;
1337 unsigned long ap, fp; 1327 unsigned long ap, fp;
1338 1328
1339 anon = zone_page_state(zone, NR_ACTIVE_ANON) +
1340 zone_page_state(zone, NR_INACTIVE_ANON);
1341 file = zone_page_state(zone, NR_ACTIVE_FILE) +
1342 zone_page_state(zone, NR_INACTIVE_FILE);
1343 free = zone_page_state(zone, NR_FREE_PAGES);
1344
1345 /* If we have no swap space, do not bother scanning anon pages. */ 1329 /* If we have no swap space, do not bother scanning anon pages. */
1346 if (nr_swap_pages <= 0) { 1330 if (nr_swap_pages <= 0) {
1347 percent[0] = 0; 1331 percent[0] = 0;
@@ -1349,6 +1333,12 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1349 return; 1333 return;
1350 } 1334 }
1351 1335
1336 anon = zone_page_state(zone, NR_ACTIVE_ANON) +
1337 zone_page_state(zone, NR_INACTIVE_ANON);
1338 file = zone_page_state(zone, NR_ACTIVE_FILE) +
1339 zone_page_state(zone, NR_INACTIVE_FILE);
1340 free = zone_page_state(zone, NR_FREE_PAGES);
1341
1352 /* If we have very few page cache pages, force-scan anon pages. */ 1342 /* If we have very few page cache pages, force-scan anon pages. */
1353 if (unlikely(file + free <= zone->pages_high)) { 1343 if (unlikely(file + free <= zone->pages_high)) {
1354 percent[0] = 100; 1344 percent[0] = 100;
@@ -1408,14 +1398,15 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1408/* 1398/*
1409 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1399 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1410 */ 1400 */
1411static unsigned long shrink_zone(int priority, struct zone *zone, 1401static void shrink_zone(int priority, struct zone *zone,
1412 struct scan_control *sc) 1402 struct scan_control *sc)
1413{ 1403{
1414 unsigned long nr[NR_LRU_LISTS]; 1404 unsigned long nr[NR_LRU_LISTS];
1415 unsigned long nr_to_scan; 1405 unsigned long nr_to_scan;
1416 unsigned long nr_reclaimed = 0;
1417 unsigned long percent[2]; /* anon @ 0; file @ 1 */ 1406 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1418 enum lru_list l; 1407 enum lru_list l;
1408 unsigned long nr_reclaimed = sc->nr_reclaimed;
1409 unsigned long swap_cluster_max = sc->swap_cluster_max;
1419 1410
1420 get_scan_ratio(zone, sc, percent); 1411 get_scan_ratio(zone, sc, percent);
1421 1412
@@ -1431,7 +1422,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1431 } 1422 }
1432 zone->lru[l].nr_scan += scan; 1423 zone->lru[l].nr_scan += scan;
1433 nr[l] = zone->lru[l].nr_scan; 1424 nr[l] = zone->lru[l].nr_scan;
1434 if (nr[l] >= sc->swap_cluster_max) 1425 if (nr[l] >= swap_cluster_max)
1435 zone->lru[l].nr_scan = 0; 1426 zone->lru[l].nr_scan = 0;
1436 else 1427 else
1437 nr[l] = 0; 1428 nr[l] = 0;
@@ -1450,16 +1441,28 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1450 nr[LRU_INACTIVE_FILE]) { 1441 nr[LRU_INACTIVE_FILE]) {
1451 for_each_evictable_lru(l) { 1442 for_each_evictable_lru(l) {
1452 if (nr[l]) { 1443 if (nr[l]) {
1453 nr_to_scan = min(nr[l], 1444 nr_to_scan = min(nr[l], swap_cluster_max);
1454 (unsigned long)sc->swap_cluster_max);
1455 nr[l] -= nr_to_scan; 1445 nr[l] -= nr_to_scan;
1456 1446
1457 nr_reclaimed += shrink_list(l, nr_to_scan, 1447 nr_reclaimed += shrink_list(l, nr_to_scan,
1458 zone, sc, priority); 1448 zone, sc, priority);
1459 } 1449 }
1460 } 1450 }
1451 /*
1452 * On large memory systems, scan >> priority can become
1453 * really large. This is fine for the starting priority;
1454 * we want to put equal scanning pressure on each zone.
1455 * However, if the VM has a harder time of freeing pages,
1456 * with multiple processes reclaiming pages, the total
1457 * freeing target can get unreasonably large.
1458 */
1459 if (nr_reclaimed > swap_cluster_max &&
1460 priority < DEF_PRIORITY && !current_is_kswapd())
1461 break;
1461 } 1462 }
1462 1463
1464 sc->nr_reclaimed = nr_reclaimed;
1465
1463 /* 1466 /*
1464 * Even if we did not try to evict anon pages at all, we want to 1467 * Even if we did not try to evict anon pages at all, we want to
1465 * rebalance the anon lru active/inactive ratio. 1468 * rebalance the anon lru active/inactive ratio.
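
The reasoning in the "scan >> priority" comment above is easier to see with numbers. A standalone back-of-the-envelope program; the zone size is hypothetical, while SWAP_CLUSTER_MAX is 32 and DEF_PRIORITY is 12 in kernels of this era:

#include <stdio.h>

int main(void)
{
	/* Hypothetical zone with 16 GB of LRU pages, 4K each. */
	unsigned long lru_pages = (16UL << 30) / 4096;	/* 4194304 pages */

	/* Priority drops by one per failed pass (target doubles each time);
	 * sampled every third step here for brevity. */
	for (int priority = 12; priority >= 0; priority -= 3)
		printf("priority %2d: scan target %7lu pages\n",
		       priority, lru_pages >> priority);

	/* At priority 12 the target is 1024 pages; at priority 0 it is the
	 * whole zone.  The early break once nr_reclaimed exceeds
	 * swap_cluster_max (32) keeps a direct reclaimer from chasing that
	 * huge target when it only needed a few pages. */
	return 0;
}
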
@@ -1470,7 +1473,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1470 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1473 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1471 1474
1472 throttle_vm_writeout(sc->gfp_mask); 1475 throttle_vm_writeout(sc->gfp_mask);
1473 return nr_reclaimed;
1474} 1476}
1475 1477
1476/* 1478/*
@@ -1484,16 +1486,13 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1484 * b) The zones may be over pages_high but they must go *over* pages_high to 1486 * b) The zones may be over pages_high but they must go *over* pages_high to
1485 * satisfy the `incremental min' zone defense algorithm. 1487 * satisfy the `incremental min' zone defense algorithm.
1486 * 1488 *
1487 * Returns the number of reclaimed pages.
1488 *
1489 * If a zone is deemed to be full of pinned pages then just give it a light 1489 * If a zone is deemed to be full of pinned pages then just give it a light
1490 * scan then give up on it. 1490 * scan then give up on it.
1491 */ 1491 */
1492static unsigned long shrink_zones(int priority, struct zonelist *zonelist, 1492static void shrink_zones(int priority, struct zonelist *zonelist,
1493 struct scan_control *sc) 1493 struct scan_control *sc)
1494{ 1494{
1495 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1495 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1496 unsigned long nr_reclaimed = 0;
1497 struct zoneref *z; 1496 struct zoneref *z;
1498 struct zone *zone; 1497 struct zone *zone;
1499 1498
@@ -1524,10 +1523,8 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1524 priority); 1523 priority);
1525 } 1524 }
1526 1525
1527 nr_reclaimed += shrink_zone(priority, zone, sc); 1526 shrink_zone(priority, zone, sc);
1528 } 1527 }
1529
1530 return nr_reclaimed;
1531} 1528}
1532 1529
1533/* 1530/*
@@ -1552,7 +1549,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1552 int priority; 1549 int priority;
1553 unsigned long ret = 0; 1550 unsigned long ret = 0;
1554 unsigned long total_scanned = 0; 1551 unsigned long total_scanned = 0;
1555 unsigned long nr_reclaimed = 0;
1556 struct reclaim_state *reclaim_state = current->reclaim_state; 1552 struct reclaim_state *reclaim_state = current->reclaim_state;
1557 unsigned long lru_pages = 0; 1553 unsigned long lru_pages = 0;
1558 struct zoneref *z; 1554 struct zoneref *z;
@@ -1580,7 +1576,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1580 sc->nr_scanned = 0; 1576 sc->nr_scanned = 0;
1581 if (!priority) 1577 if (!priority)
1582 disable_swap_token(); 1578 disable_swap_token();
1583 nr_reclaimed += shrink_zones(priority, zonelist, sc); 1579 shrink_zones(priority, zonelist, sc);
1584 /* 1580 /*
1585 * Don't shrink slabs when reclaiming memory from 1581 * Don't shrink slabs when reclaiming memory from
1586 * over limit cgroups 1582 * over limit cgroups
@@ -1588,13 +1584,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1588 if (scan_global_lru(sc)) { 1584 if (scan_global_lru(sc)) {
1589 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 1585 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
1590 if (reclaim_state) { 1586 if (reclaim_state) {
1591 nr_reclaimed += reclaim_state->reclaimed_slab; 1587 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
1592 reclaim_state->reclaimed_slab = 0; 1588 reclaim_state->reclaimed_slab = 0;
1593 } 1589 }
1594 } 1590 }
1595 total_scanned += sc->nr_scanned; 1591 total_scanned += sc->nr_scanned;
1596 if (nr_reclaimed >= sc->swap_cluster_max) { 1592 if (sc->nr_reclaimed >= sc->swap_cluster_max) {
1597 ret = nr_reclaimed; 1593 ret = sc->nr_reclaimed;
1598 goto out; 1594 goto out;
1599 } 1595 }
1600 1596
@@ -1617,7 +1613,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1617 } 1613 }
1618 /* top priority shrink_zones still had more to do? don't OOM, then */ 1614 /* top priority shrink_zones still had more to do? don't OOM, then */
1619 if (!sc->all_unreclaimable && scan_global_lru(sc)) 1615 if (!sc->all_unreclaimable && scan_global_lru(sc))
1620 ret = nr_reclaimed; 1616 ret = sc->nr_reclaimed;
1621out: 1617out:
1622 /* 1618 /*
1623 * Now that we've scanned all the zones at this priority level, note 1619 * Now that we've scanned all the zones at this priority level, note
@@ -1712,7 +1708,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1712 int priority; 1708 int priority;
1713 int i; 1709 int i;
1714 unsigned long total_scanned; 1710 unsigned long total_scanned;
1715 unsigned long nr_reclaimed;
1716 struct reclaim_state *reclaim_state = current->reclaim_state; 1711 struct reclaim_state *reclaim_state = current->reclaim_state;
1717 struct scan_control sc = { 1712 struct scan_control sc = {
1718 .gfp_mask = GFP_KERNEL, 1713 .gfp_mask = GFP_KERNEL,
@@ -1731,7 +1726,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1731 1726
1732loop_again: 1727loop_again:
1733 total_scanned = 0; 1728 total_scanned = 0;
1734 nr_reclaimed = 0; 1729 sc.nr_reclaimed = 0;
1735 sc.may_writepage = !laptop_mode; 1730 sc.may_writepage = !laptop_mode;
1736 count_vm_event(PAGEOUTRUN); 1731 count_vm_event(PAGEOUTRUN);
1737 1732
@@ -1817,11 +1812,11 @@ loop_again:
1817 */ 1812 */
1818 if (!zone_watermark_ok(zone, order, 8*zone->pages_high, 1813 if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
1819 end_zone, 0)) 1814 end_zone, 0))
1820 nr_reclaimed += shrink_zone(priority, zone, &sc); 1815 shrink_zone(priority, zone, &sc);
1821 reclaim_state->reclaimed_slab = 0; 1816 reclaim_state->reclaimed_slab = 0;
1822 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1817 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1823 lru_pages); 1818 lru_pages);
1824 nr_reclaimed += reclaim_state->reclaimed_slab; 1819 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
1825 total_scanned += sc.nr_scanned; 1820 total_scanned += sc.nr_scanned;
1826 if (zone_is_all_unreclaimable(zone)) 1821 if (zone_is_all_unreclaimable(zone))
1827 continue; 1822 continue;
@@ -1835,7 +1830,7 @@ loop_again:
1835 * even in laptop mode 1830 * even in laptop mode
1836 */ 1831 */
1837 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 1832 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1838 total_scanned > nr_reclaimed + nr_reclaimed / 2) 1833 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
1839 sc.may_writepage = 1; 1834 sc.may_writepage = 1;
1840 } 1835 }
1841 if (all_zones_ok) 1836 if (all_zones_ok)
@@ -1853,7 +1848,7 @@ loop_again:
1853 * matches the direct reclaim path behaviour in terms of impact 1848 * matches the direct reclaim path behaviour in terms of impact
1854 * on zone->*_priority. 1849 * on zone->*_priority.
1855 */ 1850 */
1856 if (nr_reclaimed >= SWAP_CLUSTER_MAX) 1851 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
1857 break; 1852 break;
1858 } 1853 }
1859out: 1854out:
@@ -1872,10 +1867,27 @@ out:
1872 1867
1873 try_to_freeze(); 1868 try_to_freeze();
1874 1869
1870 /*
1871 * Fragmentation may mean that the system cannot be
1872 * rebalanced for high-order allocations in all zones.
1873 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
1874 * it means the zones have been fully scanned and are still
1875 * not balanced. For high-order allocations, there is
1876 * little point trying all over again as kswapd may
1877 * infinite loop.
1878 *
1879 * Instead, recheck all watermarks at order-0 as they
1880 * are the most important. If watermarks are ok, kswapd will go
1881 * back to sleep. High-order users can still perform direct
1882 * reclaim if they wish.
1883 */
1884 if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
1885 order = sc.order = 0;
1886
1875 goto loop_again; 1887 goto loop_again;
1876 } 1888 }
1877 1889
1878 return nr_reclaimed; 1890 return sc.nr_reclaimed;
1879} 1891}
1880 1892
1881/* 1893/*
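
The order-0 fallback above concerns high-order requests, i.e. allocations of 2^order physically contiguous pages, which fragmentation can make impossible to satisfy no matter how much kswapd frees. A standalone illustration of the sizes involved, with a 4K page size assumed:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;	/* assumed PAGE_SIZE */

	/* An order-n allocation needs 2^n contiguous pages. */
	for (int order = 0; order <= 4; order++)
		printf("order %d: %2d contiguous pages = %3lu KB\n",
		       order, 1 << order,
		       ((1UL << order) * page_size) >> 10);

	/* Once a full scan frees fewer than SWAP_CLUSTER_MAX pages, kswapd
	 * stops chasing such requests, rechecks only the order-0 watermarks,
	 * and leaves further high-order reclaim to the allocator's
	 * direct-reclaim path. */
	return 0;
}
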
@@ -2227,7 +2239,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2227 struct task_struct *p = current; 2239 struct task_struct *p = current;
2228 struct reclaim_state reclaim_state; 2240 struct reclaim_state reclaim_state;
2229 int priority; 2241 int priority;
2230 unsigned long nr_reclaimed = 0;
2231 struct scan_control sc = { 2242 struct scan_control sc = {
2232 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 2243 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
2233 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), 2244 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -2260,9 +2271,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2260 priority = ZONE_RECLAIM_PRIORITY; 2271 priority = ZONE_RECLAIM_PRIORITY;
2261 do { 2272 do {
2262 note_zone_scanning_priority(zone, priority); 2273 note_zone_scanning_priority(zone, priority);
2263 nr_reclaimed += shrink_zone(priority, zone, &sc); 2274 shrink_zone(priority, zone, &sc);
2264 priority--; 2275 priority--;
2265 } while (priority >= 0 && nr_reclaimed < nr_pages); 2276 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
2266 } 2277 }
2267 2278
2268 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2279 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -2286,13 +2297,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2286 * Update nr_reclaimed by the number of slab pages we 2297 * Update nr_reclaimed by the number of slab pages we
2287 * reclaimed from this zone. 2298 * reclaimed from this zone.
2288 */ 2299 */
2289 nr_reclaimed += slab_reclaimable - 2300 sc.nr_reclaimed += slab_reclaimable -
2290 zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2301 zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2291 } 2302 }
2292 2303
2293 p->reclaim_state = NULL; 2304 p->reclaim_state = NULL;
2294 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 2305 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
2295 return nr_reclaimed >= nr_pages; 2306 return sc.nr_reclaimed >= nr_pages;
2296} 2307}
2297 2308
2298int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 2309int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
@@ -2472,7 +2483,7 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
2472 * back onto @zone's unevictable list. 2483 * back onto @zone's unevictable list.
2473 */ 2484 */
2474#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ 2485#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
2475void scan_zone_unevictable_pages(struct zone *zone) 2486static void scan_zone_unevictable_pages(struct zone *zone)
2476{ 2487{
2477 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; 2488 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
2478 unsigned long scan; 2489 unsigned long scan;
@@ -2514,7 +2525,7 @@ void scan_zone_unevictable_pages(struct zone *zone)
2514 * that has possibly/probably made some previously unevictable pages 2525 * that has possibly/probably made some previously unevictable pages
2515 * evictable. 2526 * evictable.
2516 */ 2527 */
2517void scan_all_zones_unevictable_pages(void) 2528static void scan_all_zones_unevictable_pages(void)
2518{ 2529{
2519 struct zone *zone; 2530 struct zone *zone;
2520 2531