Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |    6
-rw-r--r--  mm/Makefile           |    1
-rw-r--r--  mm/filemap.c          |  118
-rw-r--r--  mm/hugetlb.c          |    9
-rw-r--r--  mm/hwpoison-inject.c  |  113
-rw-r--r--  mm/internal.h         |   12
-rw-r--r--  mm/kmemleak.c         |  188
-rw-r--r--  mm/maccess.c          |   11
-rw-r--r--  mm/madvise.c          |   21
-rw-r--r--  mm/memcontrol.c       |   27
-rw-r--r--  mm/memory-failure.c   |  569
-rw-r--r--  mm/memory.c           |    4
-rw-r--r--  mm/migrate.c          |   39
-rw-r--r--  mm/mmap.c             |   40
-rw-r--r--  mm/nommu.c            |  144
-rw-r--r--  mm/oom_kill.c         |    2
-rw-r--r--  mm/page_alloc.c       |  102
-rw-r--r--  mm/percpu.c           |    4
-rw-r--r--  mm/readahead.c        |   12
-rw-r--r--  mm/shmem.c            |   73
-rw-r--r--  mm/shmem_acl.c        |  171
-rw-r--r--  mm/slab.c             |   16
-rw-r--r--  mm/truncate.c         |   30
-rw-r--r--  mm/util.c             |   46
-rw-r--r--  mm/vmalloc.c          |  114
-rw-r--r--  mm/vmscan.c           |    3
26 files changed, 1273 insertions, 602 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 2310984591ed..17b8947aa7da 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -221,6 +221,7 @@ config KSM
221 221
222config DEFAULT_MMAP_MIN_ADDR 222config DEFAULT_MMAP_MIN_ADDR
223 int "Low address space to protect from user allocation" 223 int "Low address space to protect from user allocation"
224 depends on MMU
224 default 4096 225 default 4096
225 help 226 help
226 This is the portion of low virtual memory which should be protected 227 This is the portion of low virtual memory which should be protected
@@ -251,8 +252,9 @@ config MEMORY_FAILURE
251 special hardware support and typically ECC memory. 252 special hardware support and typically ECC memory.
252 253
253config HWPOISON_INJECT 254config HWPOISON_INJECT
254 tristate "Poison pages injector" 255 tristate "HWPoison pages injector"
255 depends on MEMORY_FAILURE && DEBUG_KERNEL 256 depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS
257 select PROC_PAGE_MONITOR
256 258
257config NOMMU_INITIAL_TRIM_EXCESS 259config NOMMU_INITIAL_TRIM_EXCESS
258 int "Turn on mmap() excess space trimming before booting" 260 int "Turn on mmap() excess space trimming before booting"
diff --git a/mm/Makefile b/mm/Makefile
index 82131d0f8d85..7a68d2ab5560 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
22obj-$(CONFIG_NUMA) += mempolicy.o 22obj-$(CONFIG_NUMA) += mempolicy.o
23obj-$(CONFIG_SPARSEMEM) += sparse.o 23obj-$(CONFIG_SPARSEMEM) += sparse.o
24obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o 24obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
26obj-$(CONFIG_SLOB) += slob.o 25obj-$(CONFIG_SLOB) += slob.o
27obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 26obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
28obj-$(CONFIG_KSM) += ksm.o 27obj-$(CONFIG_KSM) += ksm.o
diff --git a/mm/filemap.c b/mm/filemap.c
index 8b4d88f9249e..698ea80f2102 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1634,14 +1634,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
1634static struct page *__read_cache_page(struct address_space *mapping, 1634static struct page *__read_cache_page(struct address_space *mapping,
1635 pgoff_t index, 1635 pgoff_t index,
1636 int (*filler)(void *,struct page*), 1636 int (*filler)(void *,struct page*),
1637 void *data) 1637 void *data,
1638 gfp_t gfp)
1638{ 1639{
1639 struct page *page; 1640 struct page *page;
1640 int err; 1641 int err;
1641repeat: 1642repeat:
1642 page = find_get_page(mapping, index); 1643 page = find_get_page(mapping, index);
1643 if (!page) { 1644 if (!page) {
1644 page = page_cache_alloc_cold(mapping); 1645 page = __page_cache_alloc(gfp | __GFP_COLD);
1645 if (!page) 1646 if (!page)
1646 return ERR_PTR(-ENOMEM); 1647 return ERR_PTR(-ENOMEM);
1647 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); 1648 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
@@ -1661,31 +1662,18 @@ repeat:
1661 return page; 1662 return page;
1662} 1663}
1663 1664
1664/** 1665static struct page *do_read_cache_page(struct address_space *mapping,
1665 * read_cache_page_async - read into page cache, fill it if needed
1666 * @mapping: the page's address_space
1667 * @index: the page index
1668 * @filler: function to perform the read
1669 * @data: destination for read data
1670 *
1671 * Same as read_cache_page, but don't wait for page to become unlocked
1672 * after submitting it to the filler.
1673 *
1674 * Read into the page cache. If a page already exists, and PageUptodate() is
1675 * not set, try to fill the page but don't wait for it to become unlocked.
1676 *
1677 * If the page does not get brought uptodate, return -EIO.
1678 */
1679struct page *read_cache_page_async(struct address_space *mapping,
1680 pgoff_t index, 1666 pgoff_t index,
1681 int (*filler)(void *,struct page*), 1667 int (*filler)(void *,struct page*),
1682 void *data) 1668 void *data,
1669 gfp_t gfp)
1670
1683{ 1671{
1684 struct page *page; 1672 struct page *page;
1685 int err; 1673 int err;
1686 1674
1687retry: 1675retry:
1688 page = __read_cache_page(mapping, index, filler, data); 1676 page = __read_cache_page(mapping, index, filler, data, gfp);
1689 if (IS_ERR(page)) 1677 if (IS_ERR(page))
1690 return page; 1678 return page;
1691 if (PageUptodate(page)) 1679 if (PageUptodate(page))
@@ -1710,8 +1698,67 @@ out:
1710 mark_page_accessed(page); 1698 mark_page_accessed(page);
1711 return page; 1699 return page;
1712} 1700}
1701
1702/**
1703 * read_cache_page_async - read into page cache, fill it if needed
1704 * @mapping: the page's address_space
1705 * @index: the page index
1706 * @filler: function to perform the read
1707 * @data: destination for read data
1708 *
1709 * Same as read_cache_page, but don't wait for page to become unlocked
1710 * after submitting it to the filler.
1711 *
1712 * Read into the page cache. If a page already exists, and PageUptodate() is
1713 * not set, try to fill the page but don't wait for it to become unlocked.
1714 *
1715 * If the page does not get brought uptodate, return -EIO.
1716 */
1717struct page *read_cache_page_async(struct address_space *mapping,
1718 pgoff_t index,
1719 int (*filler)(void *,struct page*),
1720 void *data)
1721{
1722 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
1723}
1713EXPORT_SYMBOL(read_cache_page_async); 1724EXPORT_SYMBOL(read_cache_page_async);
1714 1725
1726static struct page *wait_on_page_read(struct page *page)
1727{
1728 if (!IS_ERR(page)) {
1729 wait_on_page_locked(page);
1730 if (!PageUptodate(page)) {
1731 page_cache_release(page);
1732 page = ERR_PTR(-EIO);
1733 }
1734 }
1735 return page;
1736}
1737
1738/**
1739 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
1740 * @mapping: the page's address_space
1741 * @index: the page index
1742 * @gfp: the page allocator flags to use if allocating
1743 *
1744 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
1745 * any new page allocations done using the specified allocation flags. Note
1746 * that the Radix tree operations will still use GFP_KERNEL, so you can't
1747 * expect to do this atomically or anything like that - but you can pass in
1748 * other page requirements.
1749 *
1750 * If the page does not get brought uptodate, return -EIO.
1751 */
1752struct page *read_cache_page_gfp(struct address_space *mapping,
1753 pgoff_t index,
1754 gfp_t gfp)
1755{
1756 filler_t *filler = (filler_t *)mapping->a_ops->readpage;
1757
1758 return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
1759}
1760EXPORT_SYMBOL(read_cache_page_gfp);
1761
1715/** 1762/**
1716 * read_cache_page - read into page cache, fill it if needed 1763 * read_cache_page - read into page cache, fill it if needed
1717 * @mapping: the page's address_space 1764 * @mapping: the page's address_space
@@ -1729,18 +1776,7 @@ struct page *read_cache_page(struct address_space *mapping,
1729 int (*filler)(void *,struct page*), 1776 int (*filler)(void *,struct page*),
1730 void *data) 1777 void *data)
1731{ 1778{
1732 struct page *page; 1779 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
1733
1734 page = read_cache_page_async(mapping, index, filler, data);
1735 if (IS_ERR(page))
1736 goto out;
1737 wait_on_page_locked(page);
1738 if (!PageUptodate(page)) {
1739 page_cache_release(page);
1740 page = ERR_PTR(-EIO);
1741 }
1742 out:
1743 return page;
1744} 1780}
1745EXPORT_SYMBOL(read_cache_page); 1781EXPORT_SYMBOL(read_cache_page);
1746 1782
@@ -2196,6 +2232,9 @@ again:
2196 if (unlikely(status)) 2232 if (unlikely(status))
2197 break; 2233 break;
2198 2234
2235 if (mapping_writably_mapped(mapping))
2236 flush_dcache_page(page);
2237
2199 pagefault_disable(); 2238 pagefault_disable();
2200 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 2239 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2201 pagefault_enable(); 2240 pagefault_enable();
@@ -2240,7 +2279,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2240 size_t count, ssize_t written) 2279 size_t count, ssize_t written)
2241{ 2280{
2242 struct file *file = iocb->ki_filp; 2281 struct file *file = iocb->ki_filp;
2243 struct address_space *mapping = file->f_mapping;
2244 ssize_t status; 2282 ssize_t status;
2245 struct iov_iter i; 2283 struct iov_iter i;
2246 2284
@@ -2252,15 +2290,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2252 *ppos = pos + status; 2290 *ppos = pos + status;
2253 } 2291 }
2254 2292
2255 /*
2256 * If we get here for O_DIRECT writes then we must have fallen through
2257 * to buffered writes (block instantiation inside i_size). So we sync
2258 * the file data here, to try to honour O_DIRECT expectations.
2259 */
2260 if (unlikely(file->f_flags & O_DIRECT) && written)
2261 status = filemap_write_and_wait_range(mapping,
2262 pos, pos + written - 1);
2263
2264 return written ? written : status; 2293 return written ? written : status;
2265} 2294}
2266EXPORT_SYMBOL(generic_file_buffered_write); 2295EXPORT_SYMBOL(generic_file_buffered_write);
@@ -2359,10 +2388,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2359 * semantics. 2388 * semantics.
2360 */ 2389 */
2361 endbyte = pos + written_buffered - written - 1; 2390 endbyte = pos + written_buffered - written - 1;
2362 err = do_sync_mapping_range(file->f_mapping, pos, endbyte, 2391 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
2363 SYNC_FILE_RANGE_WAIT_BEFORE|
2364 SYNC_FILE_RANGE_WRITE|
2365 SYNC_FILE_RANGE_WAIT_AFTER);
2366 if (err == 0) { 2392 if (err == 0) {
2367 written = written_buffered; 2393 written = written_buffered;
2368 invalidate_mapping_pages(mapping, 2394 invalidate_mapping_pages(mapping,
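The filemap.c hunks above add read_cache_page_gfp(), which behaves like read_mapping_page() but lets the caller choose the allocation flags for any newly allocated page-cache page. A minimal sketch of a hypothetical caller, assuming <linux/pagemap.h>; the function name and the GFP_NOFS choice are illustrative, not part of this patch:

/* Sketch: read page 'index' from 'mapping' without allowing the page
 * allocation to recurse back into the filesystem (GFP_NOFS). */
static int example_read_block(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

	page = read_cache_page_gfp(mapping, index, GFP_NOFS);
	if (IS_ERR(page))
		return PTR_ERR(page);	/* -ENOMEM or -EIO */

	/* The page is uptodate here with an elevated refcount. */
	page_cache_release(page);
	return 0;
}

As the comment in the patch notes, the radix-tree insertion still uses GFP_KERNEL; the gfp argument only governs the page allocation itself.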
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 65f38c218207..2d16fa6b8c2d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -402,7 +402,7 @@ static void clear_huge_page(struct page *page,
402{ 402{
403 int i; 403 int i;
404 404
405 if (unlikely(sz > MAX_ORDER_NR_PAGES)) { 405 if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
406 clear_gigantic_page(page, addr, sz); 406 clear_gigantic_page(page, addr, sz);
407 return; 407 return;
408 } 408 }
@@ -1515,10 +1515,9 @@ static struct attribute_group hstate_attr_group = {
1515 .attrs = hstate_attrs, 1515 .attrs = hstate_attrs,
1516}; 1516};
1517 1517
1518static int __init hugetlb_sysfs_add_hstate(struct hstate *h, 1518static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
1519 struct kobject *parent, 1519 struct kobject **hstate_kobjs,
1520 struct kobject **hstate_kobjs, 1520 struct attribute_group *hstate_attr_group)
1521 struct attribute_group *hstate_attr_group)
1522{ 1521{
1523 int retval; 1522 int retval;
1524 int hi = h - hstates; 1523 int hi = h - hstates;
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index e1d85137f086..10ea71905c1f 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -3,18 +3,68 @@
3#include <linux/debugfs.h> 3#include <linux/debugfs.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/swap.h>
7#include <linux/pagemap.h>
8#include "internal.h"
6 9
7static struct dentry *hwpoison_dir, *corrupt_pfn; 10static struct dentry *hwpoison_dir;
8 11
9static int hwpoison_inject(void *data, u64 val) 12static int hwpoison_inject(void *data, u64 val)
10{ 13{
14 unsigned long pfn = val;
15 struct page *p;
16 int err;
17
18 if (!capable(CAP_SYS_ADMIN))
19 return -EPERM;
20
21 if (!hwpoison_filter_enable)
22 goto inject;
23 if (!pfn_valid(pfn))
24 return -ENXIO;
25
26 p = pfn_to_page(pfn);
27 /*
28 * This implies unable to support free buddy pages.
29 */
30 if (!get_page_unless_zero(p))
31 return 0;
32
33 if (!PageLRU(p))
34 shake_page(p, 0);
35 /*
36 * This implies unable to support non-LRU pages.
37 */
38 if (!PageLRU(p))
39 return 0;
40
41 /*
42 * do a racy check with elevated page count, to make sure PG_hwpoison
43 * will only be set for the targeted owner (or on a free page).
44 * We temporarily take page lock for try_get_mem_cgroup_from_page().
45 * __memory_failure() will redo the check reliably inside page lock.
46 */
47 lock_page(p);
48 err = hwpoison_filter(p);
49 unlock_page(p);
50 if (err)
51 return 0;
52
53inject:
54 printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
55 return __memory_failure(pfn, 18, MF_COUNT_INCREASED);
56}
57
58static int hwpoison_unpoison(void *data, u64 val)
59{
11 if (!capable(CAP_SYS_ADMIN)) 60 if (!capable(CAP_SYS_ADMIN))
12 return -EPERM; 61 return -EPERM;
13 printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); 62
14 return __memory_failure(val, 18, 0); 63 return unpoison_memory(val);
15} 64}
16 65
17DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); 66DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
67DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
18 68
19static void pfn_inject_exit(void) 69static void pfn_inject_exit(void)
20{ 70{
@@ -24,16 +74,63 @@ static void pfn_inject_exit(void)
24 74
25static int pfn_inject_init(void) 75static int pfn_inject_init(void)
26{ 76{
77 struct dentry *dentry;
78
27 hwpoison_dir = debugfs_create_dir("hwpoison", NULL); 79 hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
28 if (hwpoison_dir == NULL) 80 if (hwpoison_dir == NULL)
29 return -ENOMEM; 81 return -ENOMEM;
30 corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, 82
83 /*
84 * Note that the below poison/unpoison interfaces do not involve
85 * hardware status change, hence do not require hardware support.
86 * They are mainly for testing hwpoison in software level.
87 */
88 dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
31 NULL, &hwpoison_fops); 89 NULL, &hwpoison_fops);
32 if (corrupt_pfn == NULL) { 90 if (!dentry)
33 pfn_inject_exit(); 91 goto fail;
34 return -ENOMEM; 92
35 } 93 dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir,
94 NULL, &unpoison_fops);
95 if (!dentry)
96 goto fail;
97
98 dentry = debugfs_create_u32("corrupt-filter-enable", 0600,
99 hwpoison_dir, &hwpoison_filter_enable);
100 if (!dentry)
101 goto fail;
102
103 dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600,
104 hwpoison_dir, &hwpoison_filter_dev_major);
105 if (!dentry)
106 goto fail;
107
108 dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600,
109 hwpoison_dir, &hwpoison_filter_dev_minor);
110 if (!dentry)
111 goto fail;
112
113 dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600,
114 hwpoison_dir, &hwpoison_filter_flags_mask);
115 if (!dentry)
116 goto fail;
117
118 dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600,
119 hwpoison_dir, &hwpoison_filter_flags_value);
120 if (!dentry)
121 goto fail;
122
123#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
124 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
125 hwpoison_dir, &hwpoison_filter_memcg);
126 if (!dentry)
127 goto fail;
128#endif
129
36 return 0; 130 return 0;
131fail:
132 pfn_inject_exit();
133 return -ENOMEM;
37} 134}
38 135
39module_init(pfn_inject_init); 136module_init(pfn_inject_init);
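With the extended injector above, poisoning and unpoisoning can be driven entirely from user space through debugfs. A user-space sketch, assuming debugfs is mounted at /sys/kernel/debug and the caller has CAP_SYS_ADMIN; the pfn value is a placeholder:

/* Inject a software poison at 'pfn' via corrupt-pfn, then undo it
 * via unpoison-pfn. */
#include <stdio.h>

static int write_pfn(const char *path, unsigned long pfn)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%lu\n", pfn);
	return fclose(f);
}

int main(void)
{
	unsigned long pfn = 0x12345;	/* placeholder pfn */

	write_pfn("/sys/kernel/debug/hwpoison/corrupt-pfn", pfn);
	write_pfn("/sys/kernel/debug/hwpoison/unpoison-pfn", pfn);
	return 0;
}

The corrupt-filter-* files only take effect when corrupt-filter-enable is set, matching the hwpoison_filter_enable check in hwpoison_inject() above.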
diff --git a/mm/internal.h b/mm/internal.h
index 4fe67a162cb4..6a697bb97fc5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -50,6 +50,9 @@ extern void putback_lru_page(struct page *page);
50 */ 50 */
51extern void __free_pages_bootmem(struct page *page, unsigned int order); 51extern void __free_pages_bootmem(struct page *page, unsigned int order);
52extern void prep_compound_page(struct page *page, unsigned long order); 52extern void prep_compound_page(struct page *page, unsigned long order);
53#ifdef CONFIG_MEMORY_FAILURE
54extern bool is_free_buddy_page(struct page *page);
55#endif
53 56
54 57
55/* 58/*
@@ -247,3 +250,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
247#define ZONE_RECLAIM_SOME 0 250#define ZONE_RECLAIM_SOME 0
248#define ZONE_RECLAIM_SUCCESS 1 251#define ZONE_RECLAIM_SUCCESS 1
249#endif 252#endif
253
254extern int hwpoison_filter(struct page *p);
255
256extern u32 hwpoison_filter_dev_major;
257extern u32 hwpoison_filter_dev_minor;
258extern u64 hwpoison_filter_flags_mask;
259extern u64 hwpoison_filter_flags_value;
260extern u64 hwpoison_filter_memcg;
261extern u32 hwpoison_filter_enable;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 13f33b3081ec..5b069e4f5e48 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -93,6 +93,7 @@
93#include <linux/nodemask.h> 93#include <linux/nodemask.h>
94#include <linux/mm.h> 94#include <linux/mm.h>
95#include <linux/workqueue.h> 95#include <linux/workqueue.h>
96#include <linux/crc32.h>
96 97
97#include <asm/sections.h> 98#include <asm/sections.h>
98#include <asm/processor.h> 99#include <asm/processor.h>
@@ -108,7 +109,6 @@
108#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ 109#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
109#define SECS_FIRST_SCAN 60 /* delay before the first scan */ 110#define SECS_FIRST_SCAN 60 /* delay before the first scan */
110#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ 111#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
111#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */
112#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ 112#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
113 113
114#define BYTES_PER_POINTER sizeof(void *) 114#define BYTES_PER_POINTER sizeof(void *)
@@ -119,8 +119,8 @@
119/* scanning area inside a memory block */ 119/* scanning area inside a memory block */
120struct kmemleak_scan_area { 120struct kmemleak_scan_area {
121 struct hlist_node node; 121 struct hlist_node node;
122 unsigned long offset; 122 unsigned long start;
123 size_t length; 123 size_t size;
124}; 124};
125 125
126#define KMEMLEAK_GREY 0 126#define KMEMLEAK_GREY 0
@@ -149,6 +149,8 @@ struct kmemleak_object {
149 int min_count; 149 int min_count;
150 /* the total number of pointers found pointing to this object */ 150 /* the total number of pointers found pointing to this object */
151 int count; 151 int count;
152 /* checksum for detecting modified objects */
153 u32 checksum;
152 /* memory ranges to be scanned inside an object (empty for all) */ 154 /* memory ranges to be scanned inside an object (empty for all) */
153 struct hlist_head area_list; 155 struct hlist_head area_list;
154 unsigned long trace[MAX_TRACE]; 156 unsigned long trace[MAX_TRACE];
@@ -164,8 +166,6 @@ struct kmemleak_object {
164#define OBJECT_REPORTED (1 << 1) 166#define OBJECT_REPORTED (1 << 1)
165/* flag set to not scan the object */ 167/* flag set to not scan the object */
166#define OBJECT_NO_SCAN (1 << 2) 168#define OBJECT_NO_SCAN (1 << 2)
167/* flag set on newly allocated objects */
168#define OBJECT_NEW (1 << 3)
169 169
170/* number of bytes to print per line; must be 16 or 32 */ 170/* number of bytes to print per line; must be 16 or 32 */
171#define HEX_ROW_SIZE 16 171#define HEX_ROW_SIZE 16
@@ -241,8 +241,6 @@ struct early_log {
241 const void *ptr; /* allocated/freed memory block */ 241 const void *ptr; /* allocated/freed memory block */
242 size_t size; /* memory block size */ 242 size_t size; /* memory block size */
243 int min_count; /* minimum reference count */ 243 int min_count; /* minimum reference count */
244 unsigned long offset; /* scan area offset */
245 size_t length; /* scan area length */
246 unsigned long trace[MAX_TRACE]; /* stack trace */ 244 unsigned long trace[MAX_TRACE]; /* stack trace */
247 unsigned int trace_len; /* stack trace length */ 245 unsigned int trace_len; /* stack trace length */
248}; 246};
@@ -323,11 +321,6 @@ static bool color_gray(const struct kmemleak_object *object)
323 object->count >= object->min_count; 321 object->count >= object->min_count;
324} 322}
325 323
326static bool color_black(const struct kmemleak_object *object)
327{
328 return object->min_count == KMEMLEAK_BLACK;
329}
330
331/* 324/*
332 * Objects are considered unreferenced only if their color is white, they have 325 * Objects are considered unreferenced only if their color is white, they have
333 * not be deleted and have a minimum age to avoid false positives caused by 326 * not be deleted and have a minimum age to avoid false positives caused by
@@ -335,7 +328,7 @@ static bool color_black(const struct kmemleak_object *object)
335 */ 328 */
336static bool unreferenced_object(struct kmemleak_object *object) 329static bool unreferenced_object(struct kmemleak_object *object)
337{ 330{
338 return (object->flags & OBJECT_ALLOCATED) && color_white(object) && 331 return (color_white(object) && object->flags & OBJECT_ALLOCATED) &&
339 time_before_eq(object->jiffies + jiffies_min_age, 332 time_before_eq(object->jiffies + jiffies_min_age,
340 jiffies_last_scan); 333 jiffies_last_scan);
341} 334}
@@ -348,11 +341,13 @@ static void print_unreferenced(struct seq_file *seq,
348 struct kmemleak_object *object) 341 struct kmemleak_object *object)
349{ 342{
350 int i; 343 int i;
344 unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);
351 345
352 seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", 346 seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
353 object->pointer, object->size); 347 object->pointer, object->size);
354 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", 348 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
355 object->comm, object->pid, object->jiffies); 349 object->comm, object->pid, object->jiffies,
350 msecs_age / 1000, msecs_age % 1000);
356 hex_dump_object(seq, object); 351 hex_dump_object(seq, object);
357 seq_printf(seq, " backtrace:\n"); 352 seq_printf(seq, " backtrace:\n");
358 353
@@ -381,6 +376,7 @@ static void dump_object_info(struct kmemleak_object *object)
381 pr_notice(" min_count = %d\n", object->min_count); 376 pr_notice(" min_count = %d\n", object->min_count);
382 pr_notice(" count = %d\n", object->count); 377 pr_notice(" count = %d\n", object->count);
383 pr_notice(" flags = 0x%lx\n", object->flags); 378 pr_notice(" flags = 0x%lx\n", object->flags);
379 pr_notice(" checksum = %d\n", object->checksum);
384 pr_notice(" backtrace:\n"); 380 pr_notice(" backtrace:\n");
385 print_stack_trace(&trace, 4); 381 print_stack_trace(&trace, 4);
386} 382}
@@ -522,12 +518,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
522 INIT_HLIST_HEAD(&object->area_list); 518 INIT_HLIST_HEAD(&object->area_list);
523 spin_lock_init(&object->lock); 519 spin_lock_init(&object->lock);
524 atomic_set(&object->use_count, 1); 520 atomic_set(&object->use_count, 1);
525 object->flags = OBJECT_ALLOCATED | OBJECT_NEW; 521 object->flags = OBJECT_ALLOCATED;
526 object->pointer = ptr; 522 object->pointer = ptr;
527 object->size = size; 523 object->size = size;
528 object->min_count = min_count; 524 object->min_count = min_count;
529 object->count = -1; /* no color initially */ 525 object->count = 0; /* white color initially */
530 object->jiffies = jiffies; 526 object->jiffies = jiffies;
527 object->checksum = 0;
531 528
532 /* task information */ 529 /* task information */
533 if (in_irq()) { 530 if (in_irq()) {
@@ -720,14 +717,13 @@ static void make_black_object(unsigned long ptr)
720 * Add a scanning area to the object. If at least one such area is added, 717 * Add a scanning area to the object. If at least one such area is added,
721 * kmemleak will only scan these ranges rather than the whole memory block. 718 * kmemleak will only scan these ranges rather than the whole memory block.
722 */ 719 */
723static void add_scan_area(unsigned long ptr, unsigned long offset, 720static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
724 size_t length, gfp_t gfp)
725{ 721{
726 unsigned long flags; 722 unsigned long flags;
727 struct kmemleak_object *object; 723 struct kmemleak_object *object;
728 struct kmemleak_scan_area *area; 724 struct kmemleak_scan_area *area;
729 725
730 object = find_and_get_object(ptr, 0); 726 object = find_and_get_object(ptr, 1);
731 if (!object) { 727 if (!object) {
732 kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", 728 kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n",
733 ptr); 729 ptr);
@@ -741,7 +737,7 @@ static void add_scan_area(unsigned long ptr, unsigned long offset,
741 } 737 }
742 738
743 spin_lock_irqsave(&object->lock, flags); 739 spin_lock_irqsave(&object->lock, flags);
744 if (offset + length > object->size) { 740 if (ptr + size > object->pointer + object->size) {
745 kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); 741 kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
746 dump_object_info(object); 742 dump_object_info(object);
747 kmem_cache_free(scan_area_cache, area); 743 kmem_cache_free(scan_area_cache, area);
@@ -749,8 +745,8 @@ static void add_scan_area(unsigned long ptr, unsigned long offset,
749 } 745 }
750 746
751 INIT_HLIST_NODE(&area->node); 747 INIT_HLIST_NODE(&area->node);
752 area->offset = offset; 748 area->start = ptr;
753 area->length = length; 749 area->size = size;
754 750
755 hlist_add_head(&area->node, &object->area_list); 751 hlist_add_head(&area->node, &object->area_list);
756out_unlock: 752out_unlock:
@@ -786,7 +782,7 @@ static void object_no_scan(unsigned long ptr)
786 * processed later once kmemleak is fully initialized. 782 * processed later once kmemleak is fully initialized.
787 */ 783 */
788static void __init log_early(int op_type, const void *ptr, size_t size, 784static void __init log_early(int op_type, const void *ptr, size_t size,
789 int min_count, unsigned long offset, size_t length) 785 int min_count)
790{ 786{
791 unsigned long flags; 787 unsigned long flags;
792 struct early_log *log; 788 struct early_log *log;
@@ -808,8 +804,6 @@ static void __init log_early(int op_type, const void *ptr, size_t size,
808 log->ptr = ptr; 804 log->ptr = ptr;
809 log->size = size; 805 log->size = size;
810 log->min_count = min_count; 806 log->min_count = min_count;
811 log->offset = offset;
812 log->length = length;
813 if (op_type == KMEMLEAK_ALLOC) 807 if (op_type == KMEMLEAK_ALLOC)
814 log->trace_len = __save_stack_trace(log->trace); 808 log->trace_len = __save_stack_trace(log->trace);
815 crt_early_log++; 809 crt_early_log++;
@@ -858,7 +852,7 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
858 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 852 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
859 create_object((unsigned long)ptr, size, min_count, gfp); 853 create_object((unsigned long)ptr, size, min_count, gfp);
860 else if (atomic_read(&kmemleak_early_log)) 854 else if (atomic_read(&kmemleak_early_log))
861 log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); 855 log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
862} 856}
863EXPORT_SYMBOL_GPL(kmemleak_alloc); 857EXPORT_SYMBOL_GPL(kmemleak_alloc);
864 858
@@ -873,7 +867,7 @@ void __ref kmemleak_free(const void *ptr)
873 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 867 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
874 delete_object_full((unsigned long)ptr); 868 delete_object_full((unsigned long)ptr);
875 else if (atomic_read(&kmemleak_early_log)) 869 else if (atomic_read(&kmemleak_early_log))
876 log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); 870 log_early(KMEMLEAK_FREE, ptr, 0, 0);
877} 871}
878EXPORT_SYMBOL_GPL(kmemleak_free); 872EXPORT_SYMBOL_GPL(kmemleak_free);
879 873
@@ -888,7 +882,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
888 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 882 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
889 delete_object_part((unsigned long)ptr, size); 883 delete_object_part((unsigned long)ptr, size);
890 else if (atomic_read(&kmemleak_early_log)) 884 else if (atomic_read(&kmemleak_early_log))
891 log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); 885 log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
892} 886}
893EXPORT_SYMBOL_GPL(kmemleak_free_part); 887EXPORT_SYMBOL_GPL(kmemleak_free_part);
894 888
@@ -903,7 +897,7 @@ void __ref kmemleak_not_leak(const void *ptr)
903 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 897 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
904 make_gray_object((unsigned long)ptr); 898 make_gray_object((unsigned long)ptr);
905 else if (atomic_read(&kmemleak_early_log)) 899 else if (atomic_read(&kmemleak_early_log))
906 log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); 900 log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
907} 901}
908EXPORT_SYMBOL(kmemleak_not_leak); 902EXPORT_SYMBOL(kmemleak_not_leak);
909 903
@@ -919,22 +913,21 @@ void __ref kmemleak_ignore(const void *ptr)
919 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 913 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
920 make_black_object((unsigned long)ptr); 914 make_black_object((unsigned long)ptr);
921 else if (atomic_read(&kmemleak_early_log)) 915 else if (atomic_read(&kmemleak_early_log))
922 log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); 916 log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
923} 917}
924EXPORT_SYMBOL(kmemleak_ignore); 918EXPORT_SYMBOL(kmemleak_ignore);
925 919
926/* 920/*
927 * Limit the range to be scanned in an allocated memory block. 921 * Limit the range to be scanned in an allocated memory block.
928 */ 922 */
929void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, 923void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
930 size_t length, gfp_t gfp)
931{ 924{
932 pr_debug("%s(0x%p)\n", __func__, ptr); 925 pr_debug("%s(0x%p)\n", __func__, ptr);
933 926
934 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 927 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
935 add_scan_area((unsigned long)ptr, offset, length, gfp); 928 add_scan_area((unsigned long)ptr, size, gfp);
936 else if (atomic_read(&kmemleak_early_log)) 929 else if (atomic_read(&kmemleak_early_log))
937 log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); 930 log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
938} 931}
939EXPORT_SYMBOL(kmemleak_scan_area); 932EXPORT_SYMBOL(kmemleak_scan_area);
940 933
@@ -948,11 +941,25 @@ void __ref kmemleak_no_scan(const void *ptr)
948 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 941 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
949 object_no_scan((unsigned long)ptr); 942 object_no_scan((unsigned long)ptr);
950 else if (atomic_read(&kmemleak_early_log)) 943 else if (atomic_read(&kmemleak_early_log))
951 log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); 944 log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
952} 945}
953EXPORT_SYMBOL(kmemleak_no_scan); 946EXPORT_SYMBOL(kmemleak_no_scan);
954 947
955/* 948/*
949 * Update an object's checksum and return true if it was modified.
950 */
951static bool update_checksum(struct kmemleak_object *object)
952{
953 u32 old_csum = object->checksum;
954
955 if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
956 return false;
957
958 object->checksum = crc32(0, (void *)object->pointer, object->size);
959 return object->checksum != old_csum;
960}
961
962/*
956 * Memory scanning is a long process and it needs to be interruptable. This 963 * Memory scanning is a long process and it needs to be interruptable. This
957 * function checks whether such interrupt condition occured. 964 * function checks whether such interrupt condition occured.
958 */ 965 */
@@ -1031,11 +1038,14 @@ static void scan_block(void *_start, void *_end,
1031 * added to the gray_list. 1038 * added to the gray_list.
1032 */ 1039 */
1033 object->count++; 1040 object->count++;
1034 if (color_gray(object)) 1041 if (color_gray(object)) {
1035 list_add_tail(&object->gray_list, &gray_list); 1042 list_add_tail(&object->gray_list, &gray_list);
1036 else 1043 spin_unlock_irqrestore(&object->lock, flags);
1037 put_object(object); 1044 continue;
1045 }
1046
1038 spin_unlock_irqrestore(&object->lock, flags); 1047 spin_unlock_irqrestore(&object->lock, flags);
1048 put_object(object);
1039 } 1049 }
1040} 1050}
1041 1051
@@ -1075,14 +1085,47 @@ static void scan_object(struct kmemleak_object *object)
1075 } 1085 }
1076 } else 1086 } else
1077 hlist_for_each_entry(area, elem, &object->area_list, node) 1087 hlist_for_each_entry(area, elem, &object->area_list, node)
1078 scan_block((void *)(object->pointer + area->offset), 1088 scan_block((void *)area->start,
1079 (void *)(object->pointer + area->offset 1089 (void *)(area->start + area->size),
1080 + area->length), object, 0); 1090 object, 0);
1081out: 1091out:
1082 spin_unlock_irqrestore(&object->lock, flags); 1092 spin_unlock_irqrestore(&object->lock, flags);
1083} 1093}
1084 1094
1085/* 1095/*
1096 * Scan the objects already referenced (gray objects). More objects will be
1097 * referenced and, if there are no memory leaks, all the objects are scanned.
1098 */
1099static void scan_gray_list(void)
1100{
1101 struct kmemleak_object *object, *tmp;
1102
1103 /*
1104 * The list traversal is safe for both tail additions and removals
1105 * from inside the loop. The kmemleak objects cannot be freed from
1106 * outside the loop because their use_count was incremented.
1107 */
1108 object = list_entry(gray_list.next, typeof(*object), gray_list);
1109 while (&object->gray_list != &gray_list) {
1110 cond_resched();
1111
1112 /* may add new objects to the list */
1113 if (!scan_should_stop())
1114 scan_object(object);
1115
1116 tmp = list_entry(object->gray_list.next, typeof(*object),
1117 gray_list);
1118
1119 /* remove the object from the list and release it */
1120 list_del(&object->gray_list);
1121 put_object(object);
1122
1123 object = tmp;
1124 }
1125 WARN_ON(!list_empty(&gray_list));
1126}
1127
1128/*
1086 * Scan data sections and all the referenced memory blocks allocated via the 1129 * Scan data sections and all the referenced memory blocks allocated via the
1087 * kernel's standard allocators. This function must be called with the 1130 * kernel's standard allocators. This function must be called with the
1088 * scan_mutex held. 1131 * scan_mutex held.
@@ -1090,10 +1133,9 @@ out:
1090static void kmemleak_scan(void) 1133static void kmemleak_scan(void)
1091{ 1134{
1092 unsigned long flags; 1135 unsigned long flags;
1093 struct kmemleak_object *object, *tmp; 1136 struct kmemleak_object *object;
1094 int i; 1137 int i;
1095 int new_leaks = 0; 1138 int new_leaks = 0;
1096 int gray_list_pass = 0;
1097 1139
1098 jiffies_last_scan = jiffies; 1140 jiffies_last_scan = jiffies;
1099 1141
@@ -1114,7 +1156,6 @@ static void kmemleak_scan(void)
1114#endif 1156#endif
1115 /* reset the reference count (whiten the object) */ 1157 /* reset the reference count (whiten the object) */
1116 object->count = 0; 1158 object->count = 0;
1117 object->flags &= ~OBJECT_NEW;
1118 if (color_gray(object) && get_object(object)) 1159 if (color_gray(object) && get_object(object))
1119 list_add_tail(&object->gray_list, &gray_list); 1160 list_add_tail(&object->gray_list, &gray_list);
1120 1161
@@ -1172,62 +1213,36 @@ static void kmemleak_scan(void)
1172 1213
1173 /* 1214 /*
1174 * Scan the objects already referenced from the sections scanned 1215 * Scan the objects already referenced from the sections scanned
1175 * above. More objects will be referenced and, if there are no memory 1216 * above.
1176 * leaks, all the objects will be scanned. The list traversal is safe
1177 * for both tail additions and removals from inside the loop. The
1178 * kmemleak objects cannot be freed from outside the loop because their
1179 * use_count was increased.
1180 */ 1217 */
1181repeat: 1218 scan_gray_list();
1182 object = list_entry(gray_list.next, typeof(*object), gray_list);
1183 while (&object->gray_list != &gray_list) {
1184 cond_resched();
1185
1186 /* may add new objects to the list */
1187 if (!scan_should_stop())
1188 scan_object(object);
1189
1190 tmp = list_entry(object->gray_list.next, typeof(*object),
1191 gray_list);
1192
1193 /* remove the object from the list and release it */
1194 list_del(&object->gray_list);
1195 put_object(object);
1196
1197 object = tmp;
1198 }
1199
1200 if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES)
1201 goto scan_end;
1202 1219
1203 /* 1220 /*
1204 * Check for new objects allocated during this scanning and add them 1221 * Check for new or unreferenced objects modified since the previous
1205 * to the gray list. 1222 * scan and color them gray until the next scan.
1206 */ 1223 */
1207 rcu_read_lock(); 1224 rcu_read_lock();
1208 list_for_each_entry_rcu(object, &object_list, object_list) { 1225 list_for_each_entry_rcu(object, &object_list, object_list) {
1209 spin_lock_irqsave(&object->lock, flags); 1226 spin_lock_irqsave(&object->lock, flags);
1210 if ((object->flags & OBJECT_NEW) && !color_black(object) && 1227 if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
1211 get_object(object)) { 1228 && update_checksum(object) && get_object(object)) {
1212 object->flags &= ~OBJECT_NEW; 1229 /* color it gray temporarily */
1230 object->count = object->min_count;
1213 list_add_tail(&object->gray_list, &gray_list); 1231 list_add_tail(&object->gray_list, &gray_list);
1214 } 1232 }
1215 spin_unlock_irqrestore(&object->lock, flags); 1233 spin_unlock_irqrestore(&object->lock, flags);
1216 } 1234 }
1217 rcu_read_unlock(); 1235 rcu_read_unlock();
1218 1236
1219 if (!list_empty(&gray_list)) 1237 /*
1220 goto repeat; 1238 * Re-scan the gray list for modified unreferenced objects.
1221 1239 */
1222scan_end: 1240 scan_gray_list();
1223 WARN_ON(!list_empty(&gray_list));
1224 1241
1225 /* 1242 /*
1226 * If scanning was stopped or new objects were being allocated at a 1243 * If scanning was stopped do not report any new unreferenced objects.
1227 * higher rate than gray list scanning, do not report any new
1228 * unreferenced objects.
1229 */ 1244 */
1230 if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) 1245 if (scan_should_stop())
1231 return; 1246 return;
1232 1247
1233 /* 1248 /*
@@ -1642,8 +1657,7 @@ void __init kmemleak_init(void)
1642 kmemleak_ignore(log->ptr); 1657 kmemleak_ignore(log->ptr);
1643 break; 1658 break;
1644 case KMEMLEAK_SCAN_AREA: 1659 case KMEMLEAK_SCAN_AREA:
1645 kmemleak_scan_area(log->ptr, log->offset, log->length, 1660 kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
1646 GFP_KERNEL);
1647 break; 1661 break;
1648 case KMEMLEAK_NO_SCAN: 1662 case KMEMLEAK_NO_SCAN:
1649 kmemleak_no_scan(log->ptr); 1663 kmemleak_no_scan(log->ptr);
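The kmemleak_scan_area() prototype above changes from an (offset, length) pair relative to the object start to a pointer inside the object plus a size. A sketch of how a caller adapts; struct foo and foo_alloc() are made up for illustration:

/* Only f->buf can hold references to other allocations, so restrict
 * kmemleak scanning to that member. */
struct foo {
	unsigned long cookie;		/* no pointers, not worth scanning */
	void *buf[16];			/* the only part holding references */
};

static struct foo *foo_alloc(gfp_t gfp)
{
	struct foo *f = kmalloc(sizeof(*f), gfp);

	if (f)
		/* old call: kmemleak_scan_area(f, offsetof(struct foo, buf),
		 *                              sizeof(f->buf), gfp); */
		kmemleak_scan_area(&f->buf, sizeof(f->buf), gfp);
	return f;
}

Because add_scan_area() now looks the object up by the interior pointer, the range must lie entirely inside the tracked allocation, as the size check in the hunk above enforces.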
diff --git a/mm/maccess.c b/mm/maccess.c
index 9073695ff25f..4e348dbaecd7 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -14,7 +14,11 @@
14 * Safely read from address @src to the buffer at @dst. If a kernel fault 14 * Safely read from address @src to the buffer at @dst. If a kernel fault
15 * happens, handle that and return -EFAULT. 15 * happens, handle that and return -EFAULT.
16 */ 16 */
17long probe_kernel_read(void *dst, void *src, size_t size) 17
18long __weak probe_kernel_read(void *dst, void *src, size_t size)
19 __attribute__((alias("__probe_kernel_read")));
20
21long __probe_kernel_read(void *dst, void *src, size_t size)
18{ 22{
19 long ret; 23 long ret;
20 mm_segment_t old_fs = get_fs(); 24 mm_segment_t old_fs = get_fs();
@@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
39 * Safely write to address @dst from the buffer at @src. If a kernel fault 43 * Safely write to address @dst from the buffer at @src. If a kernel fault
40 * happens, handle that and return -EFAULT. 44 * happens, handle that and return -EFAULT.
41 */ 45 */
42long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) 46long __weak probe_kernel_write(void *dst, void *src, size_t size)
47 __attribute__((alias("__probe_kernel_write")));
48
49long __probe_kernel_write(void *dst, void *src, size_t size)
43{ 50{
44 long ret; 51 long ret;
45 mm_segment_t old_fs = get_fs(); 52 mm_segment_t old_fs = get_fs();
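The maccess.c change above turns probe_kernel_read() and probe_kernel_write() into weak aliases of __probe_kernel_read()/__probe_kernel_write(), so an architecture can supply its own strong definition and still fall back to the generic copy. A sketch of such an override; example_addr_is_special() and example_special_read() are hypothetical helpers, not real kernel functions:

/* Hypothetical arch override: handle special address ranges locally,
 * defer everything else to the generic implementation. */
long probe_kernel_read(void *dst, void *src, size_t size)
{
	if (example_addr_is_special(src))
		return example_special_read(dst, src, size);

	return __probe_kernel_read(dst, src, size);
}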
diff --git a/mm/madvise.c b/mm/madvise.c
index 35b1479b7c9d..319528b8db74 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -9,6 +9,7 @@
9#include <linux/pagemap.h> 9#include <linux/pagemap.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/page-isolation.h>
12#include <linux/hugetlb.h> 13#include <linux/hugetlb.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/ksm.h> 15#include <linux/ksm.h>
@@ -222,7 +223,7 @@ static long madvise_remove(struct vm_area_struct *vma,
222/* 223/*
223 * Error injection support for memory error handling. 224 * Error injection support for memory error handling.
224 */ 225 */
225static int madvise_hwpoison(unsigned long start, unsigned long end) 226static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
226{ 227{
227 int ret = 0; 228 int ret = 0;
228 229
@@ -230,15 +231,21 @@ static int madvise_hwpoison(unsigned long start, unsigned long end)
230 return -EPERM; 231 return -EPERM;
231 for (; start < end; start += PAGE_SIZE) { 232 for (; start < end; start += PAGE_SIZE) {
232 struct page *p; 233 struct page *p;
233 int ret = get_user_pages(current, current->mm, start, 1, 234 int ret = get_user_pages_fast(start, 1, 0, &p);
234 0, 0, &p, NULL);
235 if (ret != 1) 235 if (ret != 1)
236 return ret; 236 return ret;
237 if (bhv == MADV_SOFT_OFFLINE) {
238 printk(KERN_INFO "Soft offlining page %lx at %lx\n",
239 page_to_pfn(p), start);
240 ret = soft_offline_page(p, MF_COUNT_INCREASED);
241 if (ret)
242 break;
243 continue;
244 }
237 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", 245 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
238 page_to_pfn(p), start); 246 page_to_pfn(p), start);
239 /* Ignore return value for now */ 247 /* Ignore return value for now */
240 __memory_failure(page_to_pfn(p), 0, 1); 248 __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
241 put_page(p);
242 } 249 }
243 return ret; 250 return ret;
244} 251}
@@ -335,8 +342,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
335 size_t len; 342 size_t len;
336 343
337#ifdef CONFIG_MEMORY_FAILURE 344#ifdef CONFIG_MEMORY_FAILURE
338 if (behavior == MADV_HWPOISON) 345 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
339 return madvise_hwpoison(start, start+len_in); 346 return madvise_hwpoison(behavior, start, start+len_in);
340#endif 347#endif
341 if (!madvise_behavior_valid(behavior)) 348 if (!madvise_behavior_valid(behavior))
342 return error; 349 return error;
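madvise_hwpoison() above now also handles MADV_SOFT_OFFLINE, which migrates the page contents away and retires the physical page rather than forcibly poisoning it. A user-space sketch, assuming a kernel built with CONFIG_MEMORY_FAILURE and a caller with CAP_SYS_ADMIN; the fallback MADV_SOFT_OFFLINE value matches asm-generic/mman-common.h from this series, but the system headers should be preferred when they define it:

#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* asm-generic value at the time of this series */
#endif

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	char *buf;

	if (posix_memalign((void **)&buf, pagesize, pagesize))
		return 1;
	buf[0] = 1;		/* make sure the page is populated */

	/* Contents are preserved; only the backing physical page is retired. */
	return madvise(buf, pagesize, MADV_SOFT_OFFLINE);
}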
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 878808c4fcbe..954032b80bed 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -283,6 +283,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
283 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 283 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
284} 284}
285 285
286struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
287{
288 return &mem->css;
289}
290
286static struct mem_cgroup_per_zone * 291static struct mem_cgroup_per_zone *
287page_cgroup_zoneinfo(struct page_cgroup *pc) 292page_cgroup_zoneinfo(struct page_cgroup *pc)
288{ 293{
@@ -1536,25 +1541,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1536 return container_of(css, struct mem_cgroup, css); 1541 return container_of(css, struct mem_cgroup, css);
1537} 1542}
1538 1543
1539static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) 1544struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
1540{ 1545{
1541 struct mem_cgroup *mem; 1546 struct mem_cgroup *mem = NULL;
1542 struct page_cgroup *pc; 1547 struct page_cgroup *pc;
1543 unsigned short id; 1548 unsigned short id;
1544 swp_entry_t ent; 1549 swp_entry_t ent;
1545 1550
1546 VM_BUG_ON(!PageLocked(page)); 1551 VM_BUG_ON(!PageLocked(page));
1547 1552
1548 if (!PageSwapCache(page))
1549 return NULL;
1550
1551 pc = lookup_page_cgroup(page); 1553 pc = lookup_page_cgroup(page);
1552 lock_page_cgroup(pc); 1554 lock_page_cgroup(pc);
1553 if (PageCgroupUsed(pc)) { 1555 if (PageCgroupUsed(pc)) {
1554 mem = pc->mem_cgroup; 1556 mem = pc->mem_cgroup;
1555 if (mem && !css_tryget(&mem->css)) 1557 if (mem && !css_tryget(&mem->css))
1556 mem = NULL; 1558 mem = NULL;
1557 } else { 1559 } else if (PageSwapCache(page)) {
1558 ent.val = page_private(page); 1560 ent.val = page_private(page);
1559 id = lookup_swap_cgroup(ent); 1561 id = lookup_swap_cgroup(ent);
1560 rcu_read_lock(); 1562 rcu_read_lock();
@@ -1874,7 +1876,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1874 */ 1876 */
1875 if (!PageSwapCache(page)) 1877 if (!PageSwapCache(page))
1876 goto charge_cur_mm; 1878 goto charge_cur_mm;
1877 mem = try_get_mem_cgroup_from_swapcache(page); 1879 mem = try_get_mem_cgroup_from_page(page);
1878 if (!mem) 1880 if (!mem)
1879 goto charge_cur_mm; 1881 goto charge_cur_mm;
1880 *ptr = mem; 1882 *ptr = mem;
@@ -2584,7 +2586,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
2584 if (free_all) 2586 if (free_all)
2585 goto try_to_free; 2587 goto try_to_free;
2586move_account: 2588move_account:
2587 while (mem->res.usage > 0) { 2589 do {
2588 ret = -EBUSY; 2590 ret = -EBUSY;
2589 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 2591 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
2590 goto out; 2592 goto out;
@@ -2612,8 +2614,8 @@ move_account:
2612 if (ret == -ENOMEM) 2614 if (ret == -ENOMEM)
2613 goto try_to_free; 2615 goto try_to_free;
2614 cond_resched(); 2616 cond_resched();
2615 } 2617 /* "ret" should also be checked to ensure all lists are empty. */
2616 ret = 0; 2618 } while (mem->res.usage > 0 || ret);
2617out: 2619out:
2618 css_put(&mem->css); 2620 css_put(&mem->css);
2619 return ret; 2621 return ret;
@@ -2646,10 +2648,7 @@ try_to_free:
2646 } 2648 }
2647 lru_add_drain(); 2649 lru_add_drain();
2648 /* try move_account...there may be some *locked* pages. */ 2650 /* try move_account...there may be some *locked* pages. */
2649 if (mem->res.usage) 2651 goto move_account;
2650 goto move_account;
2651 ret = 0;
2652 goto out;
2653} 2652}
2654 2653
2655int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 2654int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 50d4f8d7024a..17299fd4577c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -34,12 +34,16 @@
34#include <linux/kernel.h> 34#include <linux/kernel.h>
35#include <linux/mm.h> 35#include <linux/mm.h>
36#include <linux/page-flags.h> 36#include <linux/page-flags.h>
37#include <linux/kernel-page-flags.h>
37#include <linux/sched.h> 38#include <linux/sched.h>
38#include <linux/ksm.h> 39#include <linux/ksm.h>
39#include <linux/rmap.h> 40#include <linux/rmap.h>
40#include <linux/pagemap.h> 41#include <linux/pagemap.h>
41#include <linux/swap.h> 42#include <linux/swap.h>
42#include <linux/backing-dev.h> 43#include <linux/backing-dev.h>
44#include <linux/migrate.h>
45#include <linux/page-isolation.h>
46#include <linux/suspend.h>
43#include "internal.h" 47#include "internal.h"
44 48
45int sysctl_memory_failure_early_kill __read_mostly = 0; 49int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -48,6 +52,129 @@ int sysctl_memory_failure_recovery __read_mostly = 1;
48 52
49atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); 53atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
50 54
55#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
56
57u32 hwpoison_filter_enable = 0;
58u32 hwpoison_filter_dev_major = ~0U;
59u32 hwpoison_filter_dev_minor = ~0U;
60u64 hwpoison_filter_flags_mask;
61u64 hwpoison_filter_flags_value;
62EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
63EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
64EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
65EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
66EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
67
68static int hwpoison_filter_dev(struct page *p)
69{
70 struct address_space *mapping;
71 dev_t dev;
72
73 if (hwpoison_filter_dev_major == ~0U &&
74 hwpoison_filter_dev_minor == ~0U)
75 return 0;
76
77 /*
78 * page_mapping() does not accept slab page
79 */
80 if (PageSlab(p))
81 return -EINVAL;
82
83 mapping = page_mapping(p);
84 if (mapping == NULL || mapping->host == NULL)
85 return -EINVAL;
86
87 dev = mapping->host->i_sb->s_dev;
88 if (hwpoison_filter_dev_major != ~0U &&
89 hwpoison_filter_dev_major != MAJOR(dev))
90 return -EINVAL;
91 if (hwpoison_filter_dev_minor != ~0U &&
92 hwpoison_filter_dev_minor != MINOR(dev))
93 return -EINVAL;
94
95 return 0;
96}
97
98static int hwpoison_filter_flags(struct page *p)
99{
100 if (!hwpoison_filter_flags_mask)
101 return 0;
102
103 if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
104 hwpoison_filter_flags_value)
105 return 0;
106 else
107 return -EINVAL;
108}
109
110/*
111 * This allows stress tests to limit test scope to a collection of tasks
112 * by putting them under some memcg. This prevents killing unrelated/important
113 * processes such as /sbin/init. Note that the target task may share clean
114 * pages with init (eg. libc text), which is harmless. If the target task
115 * share _dirty_ pages with another task B, the test scheme must make sure B
116 * is also included in the memcg. At last, due to race conditions this filter
117 * can only guarantee that the page either belongs to the memcg tasks, or is
118 * a freed page.
119 */
120#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
121u64 hwpoison_filter_memcg;
122EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
123static int hwpoison_filter_task(struct page *p)
124{
125 struct mem_cgroup *mem;
126 struct cgroup_subsys_state *css;
127 unsigned long ino;
128
129 if (!hwpoison_filter_memcg)
130 return 0;
131
132 mem = try_get_mem_cgroup_from_page(p);
133 if (!mem)
134 return -EINVAL;
135
136 css = mem_cgroup_css(mem);
137 /* root_mem_cgroup has NULL dentries */
138 if (!css->cgroup->dentry)
139 return -EINVAL;
140
141 ino = css->cgroup->dentry->d_inode->i_ino;
142 css_put(css);
143
144 if (ino != hwpoison_filter_memcg)
145 return -EINVAL;
146
147 return 0;
148}
149#else
150static int hwpoison_filter_task(struct page *p) { return 0; }
151#endif
152
153int hwpoison_filter(struct page *p)
154{
155 if (!hwpoison_filter_enable)
156 return 0;
157
158 if (hwpoison_filter_dev(p))
159 return -EINVAL;
160
161 if (hwpoison_filter_flags(p))
162 return -EINVAL;
163
164 if (hwpoison_filter_task(p))
165 return -EINVAL;
166
167 return 0;
168}
169#else
170int hwpoison_filter(struct page *p)
171{
172 return 0;
173}
174#endif
175
176EXPORT_SYMBOL_GPL(hwpoison_filter);
177
51/* 178/*
52 * Send all the processes who have the page mapped an ``action optional'' 179 * Send all the processes who have the page mapped an ``action optional''
53 * signal. 180 * signal.
@@ -83,6 +210,36 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
83} 210}
84 211
85/* 212/*
213 * When a unknown page type is encountered drain as many buffers as possible
214 * in the hope to turn the page into a LRU or free page, which we can handle.
215 */
216void shake_page(struct page *p, int access)
217{
218 if (!PageSlab(p)) {
219 lru_add_drain_all();
220 if (PageLRU(p))
221 return;
222 drain_all_pages();
223 if (PageLRU(p) || is_free_buddy_page(p))
224 return;
225 }
226
227 /*
228 * Only all shrink_slab here (which would also
229 * shrink other caches) if access is not potentially fatal.
230 */
231 if (access) {
232 int nr;
233 do {
234 nr = shrink_slab(1000, GFP_KERNEL, 1000);
235 if (page_count(p) == 0)
236 break;
237 } while (nr > 10);
238 }
239}
240EXPORT_SYMBOL_GPL(shake_page);
241
242/*
86 * Kill all processes that have a poisoned page mapped and then isolate 243 * Kill all processes that have a poisoned page mapped and then isolate
87 * the page. 244 * the page.
88 * 245 *
@@ -177,7 +334,6 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
177 * In case something went wrong with munmapping 334 * In case something went wrong with munmapping
178 * make sure the process doesn't catch the 335 * make sure the process doesn't catch the
179 * signal and then access the memory. Just kill it. 336 * signal and then access the memory. Just kill it.
180 * the signal handlers
181 */ 337 */
182 if (fail || tk->addr_valid == 0) { 338 if (fail || tk->addr_valid == 0) {
183 printk(KERN_ERR 339 printk(KERN_ERR
@@ -314,33 +470,49 @@ static void collect_procs(struct page *page, struct list_head *tokill)
314 */ 470 */
315 471
316enum outcome { 472enum outcome {
317 FAILED, /* Error handling failed */ 473 IGNORED, /* Error: cannot be handled */
474 FAILED, /* Error: handling failed */
318 DELAYED, /* Will be handled later */ 475 DELAYED, /* Will be handled later */
319 IGNORED, /* Error safely ignored */
320 RECOVERED, /* Successfully recovered */ 476 RECOVERED, /* Successfully recovered */
321}; 477};
322 478
323static const char *action_name[] = { 479static const char *action_name[] = {
480 [IGNORED] = "Ignored",
324 [FAILED] = "Failed", 481 [FAILED] = "Failed",
325 [DELAYED] = "Delayed", 482 [DELAYED] = "Delayed",
326 [IGNORED] = "Ignored",
327 [RECOVERED] = "Recovered", 483 [RECOVERED] = "Recovered",
328}; 484};
329 485
330/* 486/*
331 * Error hit kernel page. 487 * XXX: It is possible that a page is isolated from LRU cache,
332 * Do nothing, try to be lucky and not touch this instead. For a few cases we 488 * and then kept in swap cache or failed to remove from page cache.
333 * could be more sophisticated. 489 * The page count will stop it from being freed by unpoison.
490 * Stress tests should be aware of this memory leak problem.
334 */ 491 */
335static int me_kernel(struct page *p, unsigned long pfn) 492static int delete_from_lru_cache(struct page *p)
336{ 493{
337 return DELAYED; 494 if (!isolate_lru_page(p)) {
495 /*
496 * Clear sensible page flags, so that the buddy system won't
497 * complain when the page is unpoison-and-freed.
498 */
499 ClearPageActive(p);
500 ClearPageUnevictable(p);
501 /*
502 * drop the page count elevated by isolate_lru_page()
503 */
504 page_cache_release(p);
505 return 0;
506 }
507 return -EIO;
338} 508}
339 509
340/* 510/*
341 * Already poisoned page. 511 * Error hit kernel page.
512 * Do nothing, try to be lucky and not touch this instead. For a few cases we
513 * could be more sophisticated.
342 */ 514 */
343static int me_ignore(struct page *p, unsigned long pfn) 515static int me_kernel(struct page *p, unsigned long pfn)
344{ 516{
345 return IGNORED; 517 return IGNORED;
346} 518}
@@ -355,14 +527,6 @@ static int me_unknown(struct page *p, unsigned long pfn)
355} 527}
356 528
357/* 529/*
358 * Free memory
359 */
360static int me_free(struct page *p, unsigned long pfn)
361{
362 return DELAYED;
363}
364
365/*
366 * Clean (or cleaned) page cache page. 530 * Clean (or cleaned) page cache page.
367 */ 531 */
368static int me_pagecache_clean(struct page *p, unsigned long pfn) 532static int me_pagecache_clean(struct page *p, unsigned long pfn)
@@ -371,6 +535,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
371 int ret = FAILED; 535 int ret = FAILED;
372 struct address_space *mapping; 536 struct address_space *mapping;
373 537
538 delete_from_lru_cache(p);
539
374 /* 540 /*
375 * For anonymous pages we're done the only reference left 541 * For anonymous pages we're done the only reference left
376 * should be the one m_f() holds. 542 * should be the one m_f() holds.
@@ -500,14 +666,20 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
500 /* Trigger EIO in shmem: */ 666 /* Trigger EIO in shmem: */
501 ClearPageUptodate(p); 667 ClearPageUptodate(p);
502 668
503 return DELAYED; 669 if (!delete_from_lru_cache(p))
670 return DELAYED;
671 else
672 return FAILED;
504} 673}
505 674
506static int me_swapcache_clean(struct page *p, unsigned long pfn) 675static int me_swapcache_clean(struct page *p, unsigned long pfn)
507{ 676{
508 delete_from_swap_cache(p); 677 delete_from_swap_cache(p);
509 678
510 return RECOVERED; 679 if (!delete_from_lru_cache(p))
680 return RECOVERED;
681 else
682 return FAILED;
511} 683}
512 684
513/* 685/*
@@ -550,7 +722,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
550#define tail (1UL << PG_tail) 722#define tail (1UL << PG_tail)
551#define compound (1UL << PG_compound) 723#define compound (1UL << PG_compound)
552#define slab (1UL << PG_slab) 724#define slab (1UL << PG_slab)
553#define buddy (1UL << PG_buddy)
554#define reserved (1UL << PG_reserved) 725#define reserved (1UL << PG_reserved)
555 726
556static struct page_state { 727static struct page_state {
@@ -559,8 +730,11 @@ static struct page_state {
559 char *msg; 730 char *msg;
560 int (*action)(struct page *p, unsigned long pfn); 731 int (*action)(struct page *p, unsigned long pfn);
561} error_states[] = { 732} error_states[] = {
562 { reserved, reserved, "reserved kernel", me_ignore }, 733 { reserved, reserved, "reserved kernel", me_kernel },
563 { buddy, buddy, "free kernel", me_free }, 734 /*
735 * free pages are specially detected outside this table:
736 * PG_buddy pages only make up a small fraction of all free pages.
737 */
564 738
565 /* 739 /*
566 * Could in theory check if slab page is free or if we can drop 740 * Could in theory check if slab page is free or if we can drop
@@ -587,7 +761,6 @@ static struct page_state {
587 761
588 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, 762 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
589 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 763 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
590 { swapbacked, swapbacked, "anonymous", me_pagecache_clean },
591 764
592 /* 765 /*
593 * Catchall entry: must be at end. 766 * Catchall entry: must be at end.
@@ -595,20 +768,31 @@ static struct page_state {
595 { 0, 0, "unknown page state", me_unknown }, 768 { 0, 0, "unknown page state", me_unknown },
596}; 769};
597 770
771#undef dirty
772#undef sc
773#undef unevict
774#undef mlock
775#undef writeback
776#undef lru
777#undef swapbacked
778#undef head
779#undef tail
780#undef compound
781#undef slab
782#undef reserved
783
598static void action_result(unsigned long pfn, char *msg, int result) 784static void action_result(unsigned long pfn, char *msg, int result)
599{ 785{
600 struct page *page = NULL; 786 struct page *page = pfn_to_page(pfn);
601 if (pfn_valid(pfn))
602 page = pfn_to_page(pfn);
603 787
604 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", 788 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
605 pfn, 789 pfn,
606 page && PageDirty(page) ? "dirty " : "", 790 PageDirty(page) ? "dirty " : "",
607 msg, action_name[result]); 791 msg, action_name[result]);
608} 792}
609 793
610static int page_action(struct page_state *ps, struct page *p, 794static int page_action(struct page_state *ps, struct page *p,
611 unsigned long pfn, int ref) 795 unsigned long pfn)
612{ 796{
613 int result; 797 int result;
614 int count; 798 int count;
@@ -616,18 +800,22 @@ static int page_action(struct page_state *ps, struct page *p,
616 result = ps->action(p, pfn); 800 result = ps->action(p, pfn);
617 action_result(pfn, ps->msg, result); 801 action_result(pfn, ps->msg, result);
618 802
619 count = page_count(p) - 1 - ref; 803 count = page_count(p) - 1;
620 if (count != 0) 804 if (ps->action == me_swapcache_dirty && result == DELAYED)
805 count--;
806 if (count != 0) {
621 printk(KERN_ERR 807 printk(KERN_ERR
622 "MCE %#lx: %s page still referenced by %d users\n", 808 "MCE %#lx: %s page still referenced by %d users\n",
623 pfn, ps->msg, count); 809 pfn, ps->msg, count);
810 result = FAILED;
811 }
624 812
625 /* Could do more checks here if page looks ok */ 813 /* Could do more checks here if page looks ok */
626 /* 814 /*
627 * Could adjust zone counters here to correct for the missing page. 815 * Could adjust zone counters here to correct for the missing page.
628 */ 816 */
629 817
630 return result == RECOVERED ? 0 : -EBUSY; 818 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
631} 819}
632 820
633#define N_UNMAP_TRIES 5 821#define N_UNMAP_TRIES 5
@@ -636,7 +824,7 @@ static int page_action(struct page_state *ps, struct page *p,
636 * Do all that is necessary to remove user space mappings. Unmap 824 * Do all that is necessary to remove user space mappings. Unmap
637 * the pages and send SIGBUS to the processes if the data was dirty. 825 * the pages and send SIGBUS to the processes if the data was dirty.
638 */ 826 */
639static void hwpoison_user_mappings(struct page *p, unsigned long pfn, 827static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
640 int trapno) 828 int trapno)
641{ 829{
642 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 830 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
@@ -646,15 +834,18 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
646 int i; 834 int i;
647 int kill = 1; 835 int kill = 1;
648 836
649 if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) 837 if (PageReserved(p) || PageSlab(p))
650 return; 838 return SWAP_SUCCESS;
651 839
652 /* 840 /*
653 * This check implies we don't kill processes if their pages 841 * This check implies we don't kill processes if their pages
654 * are in the swap cache early. Those are always late kills. 842 * are in the swap cache early. Those are always late kills.
655 */ 843 */
656 if (!page_mapped(p)) 844 if (!page_mapped(p))
657 return; 845 return SWAP_SUCCESS;
846
847 if (PageCompound(p) || PageKsm(p))
848 return SWAP_FAIL;
658 849
659 if (PageSwapCache(p)) { 850 if (PageSwapCache(p)) {
660 printk(KERN_ERR 851 printk(KERN_ERR
@@ -665,6 +856,8 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
665 /* 856 /*
666 * Propagate the dirty bit from PTEs to struct page first, because we 857 * Propagate the dirty bit from PTEs to struct page first, because we
667 * need this to decide if we should kill or just drop the page. 858 * need this to decide if we should kill or just drop the page.
859 * XXX: the dirty test could be racy: set_page_dirty() may not always
860 * be called inside page lock (it's recommended but not enforced).
668 */ 861 */
669 mapping = page_mapping(p); 862 mapping = page_mapping(p);
670 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { 863 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
@@ -716,11 +909,12 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
716 */ 909 */
717 kill_procs_ao(&tokill, !!PageDirty(p), trapno, 910 kill_procs_ao(&tokill, !!PageDirty(p), trapno,
718 ret != SWAP_SUCCESS, pfn); 911 ret != SWAP_SUCCESS, pfn);
912
913 return ret;
719} 914}
720 915
721int __memory_failure(unsigned long pfn, int trapno, int ref) 916int __memory_failure(unsigned long pfn, int trapno, int flags)
722{ 917{
723 unsigned long lru_flag;
724 struct page_state *ps; 918 struct page_state *ps;
725 struct page *p; 919 struct page *p;
726 int res; 920 int res;
@@ -729,13 +923,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
729 panic("Memory failure from trap %d on page %lx", trapno, pfn); 923 panic("Memory failure from trap %d on page %lx", trapno, pfn);
730 924
731 if (!pfn_valid(pfn)) { 925 if (!pfn_valid(pfn)) {
732 action_result(pfn, "memory outside kernel control", IGNORED); 926 printk(KERN_ERR
733 return -EIO; 927 "MCE %#lx: memory outside kernel control\n",
928 pfn);
929 return -ENXIO;
734 } 930 }
735 931
736 p = pfn_to_page(pfn); 932 p = pfn_to_page(pfn);
737 if (TestSetPageHWPoison(p)) { 933 if (TestSetPageHWPoison(p)) {
738 action_result(pfn, "already hardware poisoned", IGNORED); 934 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
739 return 0; 935 return 0;
740 } 936 }
741 937
@@ -752,9 +948,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
752 * In fact it's dangerous to directly bump up page count from 0, 948 * In fact it's dangerous to directly bump up page count from 0,
753 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 949 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
754 */ 950 */
755 if (!get_page_unless_zero(compound_head(p))) { 951 if (!(flags & MF_COUNT_INCREASED) &&
756 action_result(pfn, "free or high order kernel", IGNORED); 952 !get_page_unless_zero(compound_head(p))) {
757 return PageBuddy(compound_head(p)) ? 0 : -EBUSY; 953 if (is_free_buddy_page(p)) {
954 action_result(pfn, "free buddy", DELAYED);
955 return 0;
956 } else {
957 action_result(pfn, "high order kernel", IGNORED);
958 return -EBUSY;
959 }
758 } 960 }
759 961
760 /* 962 /*
@@ -766,14 +968,19 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
766 * walked by the page reclaim code, however that's not a big loss. 968 * walked by the page reclaim code, however that's not a big loss.
767 */ 969 */
768 if (!PageLRU(p)) 970 if (!PageLRU(p))
769 lru_add_drain_all(); 971 shake_page(p, 0);
770 lru_flag = p->flags & lru; 972 if (!PageLRU(p)) {
771 if (isolate_lru_page(p)) { 973 /*
974 * shake_page could have turned it free.
975 */
976 if (is_free_buddy_page(p)) {
977 action_result(pfn, "free buddy, 2nd try", DELAYED);
978 return 0;
979 }
772 action_result(pfn, "non LRU", IGNORED); 980 action_result(pfn, "non LRU", IGNORED);
773 put_page(p); 981 put_page(p);
774 return -EBUSY; 982 return -EBUSY;
775 } 983 }
776 page_cache_release(p);
777 984
778 /* 985 /*
779 * Lock the page and wait for writeback to finish. 986 * Lock the page and wait for writeback to finish.
@@ -781,26 +988,48 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
781 * and in many cases impossible, so we just avoid it here. 988 * and in many cases impossible, so we just avoid it here.
782 */ 989 */
783 lock_page_nosync(p); 990 lock_page_nosync(p);
991
992 /*
993 * unpoison always clears PG_hwpoison inside the page lock
994 */
995 if (!PageHWPoison(p)) {
996 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
997 res = 0;
998 goto out;
999 }
1000 if (hwpoison_filter(p)) {
1001 if (TestClearPageHWPoison(p))
1002 atomic_long_dec(&mce_bad_pages);
1003 unlock_page(p);
1004 put_page(p);
1005 return 0;
1006 }
1007
784 wait_on_page_writeback(p); 1008 wait_on_page_writeback(p);
785 1009
786 /* 1010 /*
787 * Now take care of user space mappings. 1011 * Now take care of user space mappings.
1012 * Abort on fail: __remove_from_page_cache() assumes unmapped page.
788 */ 1013 */
789 hwpoison_user_mappings(p, pfn, trapno); 1014 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
1015 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1016 res = -EBUSY;
1017 goto out;
1018 }
790 1019
791 /* 1020 /*
792 * Torn down by someone else? 1021 * Torn down by someone else?
793 */ 1022 */
794 if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) { 1023 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
795 action_result(pfn, "already truncated LRU", IGNORED); 1024 action_result(pfn, "already truncated LRU", IGNORED);
796 res = 0; 1025 res = -EBUSY;
797 goto out; 1026 goto out;
798 } 1027 }
799 1028
800 res = -EBUSY; 1029 res = -EBUSY;
801 for (ps = error_states;; ps++) { 1030 for (ps = error_states;; ps++) {
802 if (((p->flags | lru_flag)& ps->mask) == ps->res) { 1031 if ((p->flags & ps->mask) == ps->res) {
803 res = page_action(ps, p, pfn, ref); 1032 res = page_action(ps, p, pfn);
804 break; 1033 break;
805 } 1034 }
806 } 1035 }
@@ -831,3 +1060,235 @@ void memory_failure(unsigned long pfn, int trapno)
831{ 1060{
832 __memory_failure(pfn, trapno, 0); 1061 __memory_failure(pfn, trapno, 0);
833} 1062}
1063
1064/**
1065 * unpoison_memory - Unpoison a previously poisoned page
1066 * @pfn: Page number of the to be unpoisoned page
1067 *
1068 * Software-unpoison a page that has been poisoned by
1069 * memory_failure() earlier.
1070 *
1071 * This is only done at the software level, so it only works
1072 * for Linux-injected failures, not for real hardware failures.
1073 *
1074 * Returns 0 for success, otherwise -errno.
1075 */
1076int unpoison_memory(unsigned long pfn)
1077{
1078 struct page *page;
1079 struct page *p;
1080 int freeit = 0;
1081
1082 if (!pfn_valid(pfn))
1083 return -ENXIO;
1084
1085 p = pfn_to_page(pfn);
1086 page = compound_head(p);
1087
1088 if (!PageHWPoison(p)) {
1089 pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
1090 return 0;
1091 }
1092
1093 if (!get_page_unless_zero(page)) {
1094 if (TestClearPageHWPoison(p))
1095 atomic_long_dec(&mce_bad_pages);
1096 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
1097 return 0;
1098 }
1099
1100 lock_page_nosync(page);
1101 /*
1102 * This test is racy because PG_hwpoison is set outside of page lock.
1103 * That's acceptable because that won't trigger kernel panic. Instead,
1104 * the PG_hwpoison page will be caught and isolated on the entrance to
1105 * the free buddy page pool.
1106 */
1107 if (TestClearPageHWPoison(p)) {
1108 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
1109 atomic_long_dec(&mce_bad_pages);
1110 freeit = 1;
1111 }
1112 unlock_page(page);
1113
1114 put_page(page);
1115 if (freeit)
1116 put_page(page);
1117
1118 return 0;
1119}
1120EXPORT_SYMBOL(unpoison_memory);
1121
1122static struct page *new_page(struct page *p, unsigned long private, int **x)
1123{
1124 int nid = page_to_nid(p);
1125 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1126}
1127
1128/*
1129 * Safely get reference count of an arbitrary page.
1130 * Returns 0 for a free page, -EIO for a zero refcount page
1131 * that is not free, and 1 for any other page type.
1132 * For 1 the page is returned with increased page count, otherwise not.
1133 */
1134static int get_any_page(struct page *p, unsigned long pfn, int flags)
1135{
1136 int ret;
1137
1138 if (flags & MF_COUNT_INCREASED)
1139 return 1;
1140
1141 /*
1142 * The lock_system_sleep prevents a race with memory hotplug,
1143 * because the isolation assumes there's only a single user.
1144 * This is a big hammer; a finer-grained mechanism would be nicer.
1145 */
1146 lock_system_sleep();
1147
1148 /*
1149 * Isolate the page, so that it doesn't get reallocated if it
1150 * was free.
1151 */
1152 set_migratetype_isolate(p);
1153 if (!get_page_unless_zero(compound_head(p))) {
1154 if (is_free_buddy_page(p)) {
1155 pr_debug("get_any_page: %#lx free buddy page\n", pfn);
1156 /* Set hwpoison bit while page is still isolated */
1157 SetPageHWPoison(p);
1158 ret = 0;
1159 } else {
1160 pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1161 pfn, p->flags);
1162 ret = -EIO;
1163 }
1164 } else {
1165 /* Not a free page */
1166 ret = 1;
1167 }
1168 unset_migratetype_isolate(p);
1169 unlock_system_sleep();
1170 return ret;
1171}
1172
1173/**
1174 * soft_offline_page - Soft offline a page.
1175 * @page: page to offline
1176 * @flags: flags. Same as memory_failure().
1177 *
1178 * Returns 0 on success, otherwise negated errno.
1179 *
1180 * Soft offline a page, by migration or invalidation,
1181 * without killing anything. This is for the case when
1182 * a page is not corrupted yet (so it's still valid to access),
1183 * but has had a number of corrected errors and is better taken
1184 * out.
1185 *
1186 * The actual policy on when to do that is maintained by
1187 * user space.
1188 *
1189 * This should never impact any application or cause data loss;
1190 * however, it might take some time.
1191 *
1192 * This is not a 100% solution for all memory, but tries to be
1193 * ``good enough'' for the majority of memory.
1194 */
1195int soft_offline_page(struct page *page, int flags)
1196{
1197 int ret;
1198 unsigned long pfn = page_to_pfn(page);
1199
1200 ret = get_any_page(page, pfn, flags);
1201 if (ret < 0)
1202 return ret;
1203 if (ret == 0)
1204 goto done;
1205
1206 /*
1207 * Page cache page we can handle?
1208 */
1209 if (!PageLRU(page)) {
1210 /*
1211 * Try to free it.
1212 */
1213 put_page(page);
1214 shake_page(page, 1);
1215
1216 /*
1217 * Did it turn free?
1218 */
1219 ret = get_any_page(page, pfn, 0);
1220 if (ret < 0)
1221 return ret;
1222 if (ret == 0)
1223 goto done;
1224 }
1225 if (!PageLRU(page)) {
1226 pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
1227 pfn, page->flags);
1228 return -EIO;
1229 }
1230
1231 lock_page(page);
1232 wait_on_page_writeback(page);
1233
1234 /*
1235 * Synchronized using the page lock with memory_failure()
1236 */
1237 if (PageHWPoison(page)) {
1238 unlock_page(page);
1239 put_page(page);
1240 pr_debug("soft offline: %#lx page already poisoned\n", pfn);
1241 return -EBUSY;
1242 }
1243
1244 /*
1245 * Try to invalidate first. This should work for
1246 * non dirty unmapped page cache pages.
1247 */
1248 ret = invalidate_inode_page(page);
1249 unlock_page(page);
1250
1251 /*
1252 * Drop count because page migration doesn't like raised
1253 * counts. The page could get re-allocated, but if it becomes
1254 * LRU the isolation will just fail.
1255 * RED-PEN would be better to keep it isolated here, but we
1256 * would need to fix isolation locking first.
1257 */
1258 put_page(page);
1259 if (ret == 1) {
1260 ret = 0;
1261 pr_debug("soft_offline: %#lx: invalidated\n", pfn);
1262 goto done;
1263 }
1264
1265 /*
1266 * Simple invalidation didn't work.
1267 * Try to migrate to a new page instead. migrate.c
1268 * handles a large number of cases for us.
1269 */
1270 ret = isolate_lru_page(page);
1271 if (!ret) {
1272 LIST_HEAD(pagelist);
1273
1274 list_add(&page->lru, &pagelist);
1275 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1276 if (ret) {
1277 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1278 pfn, ret, page->flags);
1279 if (ret > 0)
1280 ret = -EIO;
1281 }
1282 } else {
1283 pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1284 pfn, ret, page_count(page), page->flags);
1285 }
1286 if (ret)
1287 return ret;
1288
1289done:
1290 atomic_long_add(1, &mce_bad_pages);
1291 SetPageHWPoison(page);
1292 /* keep elevated page count for bad page */
1293 return ret;
1294}
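
For reference, a minimal sketch (not part of the patch above) of how a corrected-error handler might feed a suspect pfn into the new soft-offline path; report_corrected_pfn() is a hypothetical helper, while soft_offline_page(), pfn_valid() and pfn_to_page() are the interfaces shown above. Testing normally goes through the hwpoison-inject debugfs module rather than code like this.

/*
 * Illustrative sketch only: hand a page with repeated corrected errors
 * to soft_offline_page(). report_corrected_pfn() is hypothetical.
 */
static int report_corrected_pfn(unsigned long pfn)
{
	if (!pfn_valid(pfn))
		return -ENXIO;
	/* flags == 0: soft_offline_page() takes its own page reference */
	return soft_offline_page(pfn_to_page(pfn), 0);
}
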
diff --git a/mm/memory.c b/mm/memory.c
index aed45eaf8ac9..09e4b1be7b67 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2555,6 +2555,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2555 ret = VM_FAULT_MAJOR; 2555 ret = VM_FAULT_MAJOR;
2556 count_vm_event(PGMAJFAULT); 2556 count_vm_event(PGMAJFAULT);
2557 } else if (PageHWPoison(page)) { 2557 } else if (PageHWPoison(page)) {
2558 /*
2559 * hwpoisoned dirty swapcache pages are kept for killing
2560 * owner processes (which may be unknown at hwpoison time)
2561 */
2558 ret = VM_FAULT_HWPOISON; 2562 ret = VM_FAULT_HWPOISON;
2559 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2563 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2560 goto out_release; 2564 goto out_release;
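
The VM_FAULT_HWPOISON path above ultimately delivers SIGBUS to the faulting task. A hedged user-space sketch of inspecting that signal follows; it assumes the installed kernel headers expose BUS_MCEERR_AR/BUS_MCEERR_AO.

/* Illustrative only: minimal SIGBUS inspection for hwpoison signals. */
#include <signal.h>
#include <unistd.h>

static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
	/* BUS_MCEERR_AR: poison hit on access; BUS_MCEERR_AO: async report */
	if (si->si_code == BUS_MCEERR_AR || si->si_code == BUS_MCEERR_AO)
		write(2, "hwpoison SIGBUS\n", 16);
	_exit(1);
}

static void install_sigbus_handler(void)
{
	struct sigaction sa = { .sa_flags = SA_SIGINFO };

	sa.sa_sigaction = sigbus_handler;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGBUS, &sa, NULL);
}
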
diff --git a/mm/migrate.c b/mm/migrate.c
index efddbf0926b2..880bd592d38e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -912,6 +912,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
912 goto out_pm; 912 goto out_pm;
913 913
914 err = -ENODEV; 914 err = -ENODEV;
915 if (node < 0 || node >= MAX_NUMNODES)
916 goto out_pm;
917
915 if (!node_state(node, N_HIGH_MEMORY)) 918 if (!node_state(node, N_HIGH_MEMORY))
916 goto out_pm; 919 goto out_pm;
917 920
@@ -999,33 +1002,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
999#define DO_PAGES_STAT_CHUNK_NR 16 1002#define DO_PAGES_STAT_CHUNK_NR 16
1000 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; 1003 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1001 int chunk_status[DO_PAGES_STAT_CHUNK_NR]; 1004 int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1002 unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1003 int err;
1004 1005
1005 for (i = 0; i < nr_pages; i += chunk_nr) { 1006 while (nr_pages) {
1006 if (chunk_nr > nr_pages - i) 1007 unsigned long chunk_nr;
1007 chunk_nr = nr_pages - i;
1008 1008
1009 err = copy_from_user(chunk_pages, &pages[i], 1009 chunk_nr = nr_pages;
1010 chunk_nr * sizeof(*chunk_pages)); 1010 if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
1011 if (err) { 1011 chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1012 err = -EFAULT; 1012
1013 goto out; 1013 if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
1014 } 1014 break;
1015 1015
1016 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); 1016 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1017 1017
1018 err = copy_to_user(&status[i], chunk_status, 1018 if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1019 chunk_nr * sizeof(*chunk_status)); 1019 break;
1020 if (err) {
1021 err = -EFAULT;
1022 goto out;
1023 }
1024 }
1025 err = 0;
1026 1020
1027out: 1021 pages += chunk_nr;
1028 return err; 1022 status += chunk_nr;
1023 nr_pages -= chunk_nr;
1024 }
1025 return nr_pages ? -EFAULT : 0;
1029} 1026}
1030 1027
1031/* 1028/*
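
The new bounds check in do_pages_move() above maps directly onto the move_pages(2) syscall. A hedged user-space sketch (assuming the libnuma <numaif.h> wrapper) of how an out-of-range node is now rejected with ENODEV:

/* Illustrative only: exercise the node validation in sys_move_pages(). */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	void *buf = malloc(4096);
	void *pages[1] = { buf };
	int nodes[1] = { 1 << 20 };	/* deliberately out of range */
	int status[1];

	/* With the check above this fails with errno == ENODEV. */
	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) < 0)
		perror("move_pages");

	free(buf);
	return 0;
}
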
diff --git a/mm/mmap.c b/mm/mmap.c
index d9c77b2dbe9d..ee2298936fe6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1043,6 +1043,46 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1043} 1043}
1044EXPORT_SYMBOL(do_mmap_pgoff); 1044EXPORT_SYMBOL(do_mmap_pgoff);
1045 1045
1046SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1047 unsigned long, prot, unsigned long, flags,
1048 unsigned long, fd, unsigned long, pgoff)
1049{
1050 struct file *file = NULL;
1051 unsigned long retval = -EBADF;
1052
1053 if (!(flags & MAP_ANONYMOUS)) {
1054 if (unlikely(flags & MAP_HUGETLB))
1055 return -EINVAL;
1056 file = fget(fd);
1057 if (!file)
1058 goto out;
1059 } else if (flags & MAP_HUGETLB) {
1060 struct user_struct *user = NULL;
1061 /*
1062 * VM_NORESERVE is used because the reservations will be
1063 * taken when vm_ops->mmap() is called
1064 * A dummy user value is used because we are not locking
1065 * memory so no accounting is necessary
1066 */
1067 len = ALIGN(len, huge_page_size(&default_hstate));
1068 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
1069 &user, HUGETLB_ANONHUGE_INODE);
1070 if (IS_ERR(file))
1071 return PTR_ERR(file);
1072 }
1073
1074 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1075
1076 down_write(&current->mm->mmap_sem);
1077 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1078 up_write(&current->mm->mmap_sem);
1079
1080 if (file)
1081 fput(file);
1082out:
1083 return retval;
1084}
1085
1046/* 1086/*
1047 * Some shared mappings will want the pages marked read-only 1087 * Some shared mappings will want the pages marked read-only
1048 * to track write events. If so, we'll downgrade vm_page_prot 1088 * to track write events. If so, we'll downgrade vm_page_prot
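
From user space, the MAP_HUGETLB branch of the new sys_mmap_pgoff() above boils down to an ordinary mmap() call. A hedged sketch follows; it assumes the libc headers expose MAP_HUGETLB, a 2 MB default huge page size, and a pre-reserved hugetlb pool.

/* Illustrative only: anonymous huge pages without mounting hugetlbfs. */
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	size_t len = 2UL << 20;		/* assumed huge page size */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	munmap(p, len);
	return 0;
}
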
diff --git a/mm/nommu.c b/mm/nommu.c
index 8687973462bb..48a2ecfaf059 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
432 /* 432 /*
433 * Ok, looks good - let it rip. 433 * Ok, looks good - let it rip.
434 */ 434 */
435 flush_icache_range(mm->brk, brk);
435 return mm->brk = brk; 436 return mm->brk = brk;
436} 437}
437 438
@@ -551,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to)
551static void __put_nommu_region(struct vm_region *region) 552static void __put_nommu_region(struct vm_region *region)
552 __releases(nommu_region_sem) 553 __releases(nommu_region_sem)
553{ 554{
554 kenter("%p{%d}", region, atomic_read(&region->vm_usage)); 555 kenter("%p{%d}", region, region->vm_usage);
555 556
556 BUG_ON(!nommu_region_tree.rb_node); 557 BUG_ON(!nommu_region_tree.rb_node);
557 558
558 if (atomic_dec_and_test(&region->vm_usage)) { 559 if (--region->vm_usage == 0) {
559 if (region->vm_top > region->vm_start) 560 if (region->vm_top > region->vm_start)
560 delete_nommu_region(region); 561 delete_nommu_region(region);
561 up_write(&nommu_region_sem); 562 up_write(&nommu_region_sem);
@@ -1204,7 +1205,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1204 if (!vma) 1205 if (!vma)
1205 goto error_getting_vma; 1206 goto error_getting_vma;
1206 1207
1207 atomic_set(&region->vm_usage, 1); 1208 region->vm_usage = 1;
1208 region->vm_flags = vm_flags; 1209 region->vm_flags = vm_flags;
1209 region->vm_pgoff = pgoff; 1210 region->vm_pgoff = pgoff;
1210 1211
@@ -1271,7 +1272,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1271 } 1272 }
1272 1273
1273 /* we've found a region we can share */ 1274 /* we've found a region we can share */
1274 atomic_inc(&pregion->vm_usage); 1275 pregion->vm_usage++;
1275 vma->vm_region = pregion; 1276 vma->vm_region = pregion;
1276 start = pregion->vm_start; 1277 start = pregion->vm_start;
1277 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; 1278 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
@@ -1288,7 +1289,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1288 vma->vm_region = NULL; 1289 vma->vm_region = NULL;
1289 vma->vm_start = 0; 1290 vma->vm_start = 0;
1290 vma->vm_end = 0; 1291 vma->vm_end = 0;
1291 atomic_dec(&pregion->vm_usage); 1292 pregion->vm_usage--;
1292 pregion = NULL; 1293 pregion = NULL;
1293 goto error_just_free; 1294 goto error_just_free;
1294 } 1295 }
@@ -1353,10 +1354,14 @@ unsigned long do_mmap_pgoff(struct file *file,
1353share: 1354share:
1354 add_vma_to_mm(current->mm, vma); 1355 add_vma_to_mm(current->mm, vma);
1355 1356
1356 up_write(&nommu_region_sem); 1357 /* we flush the region from the icache only when the first executable
1358 * mapping of it is made */
1359 if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
1360 flush_icache_range(region->vm_start, region->vm_end);
1361 region->vm_icache_flushed = true;
1362 }
1357 1363
1358 if (prot & PROT_EXEC) 1364 up_write(&nommu_region_sem);
1359 flush_icache_range(result, result + len);
1360 1365
1361 kleave(" = %lx", result); 1366 kleave(" = %lx", result);
1362 return result; 1367 return result;
@@ -1398,6 +1403,31 @@ error_getting_region:
1398} 1403}
1399EXPORT_SYMBOL(do_mmap_pgoff); 1404EXPORT_SYMBOL(do_mmap_pgoff);
1400 1405
1406SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1407 unsigned long, prot, unsigned long, flags,
1408 unsigned long, fd, unsigned long, pgoff)
1409{
1410 struct file *file = NULL;
1411 unsigned long retval = -EBADF;
1412
1413 if (!(flags & MAP_ANONYMOUS)) {
1414 file = fget(fd);
1415 if (!file)
1416 goto out;
1417 }
1418
1419 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1420
1421 down_write(&current->mm->mmap_sem);
1422 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1423 up_write(&current->mm->mmap_sem);
1424
1425 if (file)
1426 fput(file);
1427out:
1428 return retval;
1429}
1430
1401/* 1431/*
1402 * split a vma into two pieces at address 'addr', a new vma is allocated either 1432 * split a vma into two pieces at address 'addr', a new vma is allocated either
1403 * for the first part or the tail. 1433 * for the first part or the tail.
@@ -1411,10 +1441,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1411 1441
1412 kenter(""); 1442 kenter("");
1413 1443
1414 /* we're only permitted to split anonymous regions that have a single 1444 /* we're only permitted to split anonymous regions (these should have
1415 * owner */ 1445 * only a single usage on the region) */
1416 if (vma->vm_file || 1446 if (vma->vm_file)
1417 atomic_read(&vma->vm_region->vm_usage) != 1)
1418 return -ENOMEM; 1447 return -ENOMEM;
1419 1448
1420 if (mm->map_count >= sysctl_max_map_count) 1449 if (mm->map_count >= sysctl_max_map_count)
@@ -1488,7 +1517,7 @@ static int shrink_vma(struct mm_struct *mm,
1488 1517
1489 /* cut the backing region down to size */ 1518 /* cut the backing region down to size */
1490 region = vma->vm_region; 1519 region = vma->vm_region;
1491 BUG_ON(atomic_read(&region->vm_usage) != 1); 1520 BUG_ON(region->vm_usage != 1);
1492 1521
1493 down_write(&nommu_region_sem); 1522 down_write(&nommu_region_sem);
1494 delete_nommu_region(region); 1523 delete_nommu_region(region);
@@ -1732,27 +1761,6 @@ void unmap_mapping_range(struct address_space *mapping,
1732EXPORT_SYMBOL(unmap_mapping_range); 1761EXPORT_SYMBOL(unmap_mapping_range);
1733 1762
1734/* 1763/*
1735 * ask for an unmapped area at which to create a mapping on a file
1736 */
1737unsigned long get_unmapped_area(struct file *file, unsigned long addr,
1738 unsigned long len, unsigned long pgoff,
1739 unsigned long flags)
1740{
1741 unsigned long (*get_area)(struct file *, unsigned long, unsigned long,
1742 unsigned long, unsigned long);
1743
1744 get_area = current->mm->get_unmapped_area;
1745 if (file && file->f_op && file->f_op->get_unmapped_area)
1746 get_area = file->f_op->get_unmapped_area;
1747
1748 if (!get_area)
1749 return -ENOSYS;
1750
1751 return get_area(file, addr, len, pgoff, flags);
1752}
1753EXPORT_SYMBOL(get_unmapped_area);
1754
1755/*
1756 * Check that a process has enough memory to allocate a new virtual 1764 * Check that a process has enough memory to allocate a new virtual
1757 * mapping. 0 means there is enough memory for the allocation to 1765 * mapping. 0 means there is enough memory for the allocation to
1758 * succeed and -ENOMEM implies there is not. 1766 * succeed and -ENOMEM implies there is not.
@@ -1891,9 +1899,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
1891 1899
1892 /* only read or write mappings where it is permitted */ 1900 /* only read or write mappings where it is permitted */
1893 if (write && vma->vm_flags & VM_MAYWRITE) 1901 if (write && vma->vm_flags & VM_MAYWRITE)
1894 len -= copy_to_user((void *) addr, buf, len); 1902 copy_to_user_page(vma, NULL, addr,
1903 (void *) addr, buf, len);
1895 else if (!write && vma->vm_flags & VM_MAYREAD) 1904 else if (!write && vma->vm_flags & VM_MAYREAD)
1896 len -= copy_from_user(buf, (void *) addr, len); 1905 copy_from_user_page(vma, NULL, addr,
1906 buf, (void *) addr, len);
1897 else 1907 else
1898 len = 0; 1908 len = 0;
1899 } else { 1909 } else {
@@ -1904,3 +1914,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
1904 mmput(mm); 1914 mmput(mm);
1905 return len; 1915 return len;
1906} 1916}
1917
1918/**
1919 * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
1920 * @inode: The inode to check
1921 * @size: The current filesize of the inode
1922 * @newsize: The proposed filesize of the inode
1923 *
1924 * Check the shared mappings on an inode on behalf of a shrinking truncate to
1925 * make sure that any outstanding VMAs aren't broken and then shrink the
1926 * vm_regions that extend beyond it so that do_mmap_pgoff() doesn't
1927 * automatically grant mappings that are too large.
1928 */
1929int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
1930 size_t newsize)
1931{
1932 struct vm_area_struct *vma;
1933 struct prio_tree_iter iter;
1934 struct vm_region *region;
1935 pgoff_t low, high;
1936 size_t r_size, r_top;
1937
1938 low = newsize >> PAGE_SHIFT;
1939 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
1940
1941 down_write(&nommu_region_sem);
1942
1943 /* search for VMAs that fall within the dead zone */
1944 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
1945 low, high) {
1946 /* found one - only interested if it's shared out of the page
1947 * cache */
1948 if (vma->vm_flags & VM_SHARED) {
1949 up_write(&nommu_region_sem);
1950 return -ETXTBSY; /* not quite true, but near enough */
1951 }
1952 }
1953
1954 /* reduce any regions that overlap the dead zone - if in existence,
1955 * these will be pointed to by VMAs that don't overlap the dead zone
1956 *
1957 * we don't check for any regions that start beyond the EOF as there
1958 * shouldn't be any
1959 */
1960 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
1961 0, ULONG_MAX) {
1962 if (!(vma->vm_flags & VM_SHARED))
1963 continue;
1964
1965 region = vma->vm_region;
1966 r_size = region->vm_top - region->vm_start;
1967 r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
1968
1969 if (r_top > newsize) {
1970 region->vm_top -= r_top - newsize;
1971 if (region->vm_end > region->vm_top)
1972 region->vm_end = region->vm_top;
1973 }
1974 }
1975
1976 up_write(&nommu_region_sem);
1977 return 0;
1978}
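
A rough caller-side sketch of the nommu_shrink_inode_mappings() contract above: reject the shrink with -ETXTBSY while shared mappings cover the dead zone, otherwise trim the backing regions. example_setsize() and the elided truncation steps are hypothetical.

/* Illustrative only: a nommu filesystem's shrinking-truncate path. */
static int example_setsize(struct inode *inode, loff_t newsize)
{
	int ret;

	if (newsize < inode->i_size) {
		ret = nommu_shrink_inode_mappings(inode, inode->i_size,
						  newsize);
		if (ret < 0)
			return ret;	/* e.g. -ETXTBSY */
	}
	/* ...filesystem-specific i_size update and page cache truncation... */
	return 0;
}
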
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f52481b1c1e5..237050478f28 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -459,6 +459,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
459 list_for_each_entry(c, &p->children, sibling) { 459 list_for_each_entry(c, &p->children, sibling) {
460 if (c->mm == p->mm) 460 if (c->mm == p->mm)
461 continue; 461 continue;
462 if (mem && !task_in_mem_cgroup(c, mem))
463 continue;
462 if (!oom_kill_task(c)) 464 if (!oom_kill_task(c))
463 return 0; 465 return 0;
464 } 466 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 850c4a7e2fe5..8deb9d0fd5b1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
48#include <linux/page_cgroup.h> 48#include <linux/page_cgroup.h>
49#include <linux/debugobjects.h> 49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h> 50#include <linux/kmemleak.h>
51#include <linux/memory.h>
51#include <trace/events/kmem.h> 52#include <trace/events/kmem.h>
52 53
53#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
@@ -555,8 +556,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
555 page = list_entry(list->prev, struct page, lru); 556 page = list_entry(list->prev, struct page, lru);
556 /* must delete as __free_one_page list manipulates */ 557 /* must delete as __free_one_page list manipulates */
557 list_del(&page->lru); 558 list_del(&page->lru);
558 __free_one_page(page, zone, 0, migratetype); 559 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
559 trace_mm_page_pcpu_drain(page, 0, migratetype); 560 __free_one_page(page, zone, 0, page_private(page));
561 trace_mm_page_pcpu_drain(page, 0, page_private(page));
560 } while (--count && --batch_free && !list_empty(list)); 562 } while (--count && --batch_free && !list_empty(list));
561 } 563 }
562 spin_unlock(&zone->lock); 564 spin_unlock(&zone->lock);
@@ -1221,10 +1223,10 @@ again:
1221 } 1223 }
1222 spin_lock_irqsave(&zone->lock, flags); 1224 spin_lock_irqsave(&zone->lock, flags);
1223 page = __rmqueue(zone, order, migratetype); 1225 page = __rmqueue(zone, order, migratetype);
1224 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1225 spin_unlock(&zone->lock); 1226 spin_unlock(&zone->lock);
1226 if (!page) 1227 if (!page)
1227 goto failed; 1228 goto failed;
1229 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1228 } 1230 }
1229 1231
1230 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1232 __count_zone_vm_events(PGALLOC, zone, 1 << order);
@@ -2401,13 +2403,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2401{ 2403{
2402 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 2404 char saved_string[NUMA_ZONELIST_ORDER_LEN];
2403 int ret; 2405 int ret;
2406 static DEFINE_MUTEX(zl_order_mutex);
2404 2407
2408 mutex_lock(&zl_order_mutex);
2405 if (write) 2409 if (write)
2406 strncpy(saved_string, (char*)table->data, 2410 strcpy(saved_string, (char*)table->data);
2407 NUMA_ZONELIST_ORDER_LEN);
2408 ret = proc_dostring(table, write, buffer, length, ppos); 2411 ret = proc_dostring(table, write, buffer, length, ppos);
2409 if (ret) 2412 if (ret)
2410 return ret; 2413 goto out;
2411 if (write) { 2414 if (write) {
2412 int oldval = user_zonelist_order; 2415 int oldval = user_zonelist_order;
2413 if (__parse_numa_zonelist_order((char*)table->data)) { 2416 if (__parse_numa_zonelist_order((char*)table->data)) {
@@ -2420,7 +2423,9 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2420 } else if (oldval != user_zonelist_order) 2423 } else if (oldval != user_zonelist_order)
2421 build_all_zonelists(); 2424 build_all_zonelists();
2422 } 2425 }
2423 return 0; 2426out:
2427 mutex_unlock(&zl_order_mutex);
2428 return ret;
2424} 2429}
2425 2430
2426 2431
@@ -3579,7 +3584,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3579 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3584 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3580 * then all holes in the requested range will be accounted for. 3585 * then all holes in the requested range will be accounted for.
3581 */ 3586 */
3582static unsigned long __meminit __absent_pages_in_range(int nid, 3587unsigned long __meminit __absent_pages_in_range(int nid,
3583 unsigned long range_start_pfn, 3588 unsigned long range_start_pfn,
3584 unsigned long range_end_pfn) 3589 unsigned long range_end_pfn)
3585{ 3590{
@@ -3994,7 +3999,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3994 } 3999 }
3995 4000
3996 /* Merge backward if suitable */ 4001 /* Merge backward if suitable */
3997 if (start_pfn < early_node_map[i].end_pfn && 4002 if (start_pfn < early_node_map[i].start_pfn &&
3998 end_pfn >= early_node_map[i].start_pfn) { 4003 end_pfn >= early_node_map[i].start_pfn) {
3999 early_node_map[i].start_pfn = start_pfn; 4004 early_node_map[i].start_pfn = start_pfn;
4000 return; 4005 return;
@@ -4108,7 +4113,7 @@ static int __init cmp_node_active_region(const void *a, const void *b)
4108} 4113}
4109 4114
4110/* sort the node_map by start_pfn */ 4115/* sort the node_map by start_pfn */
4111static void __init sort_node_map(void) 4116void __init sort_node_map(void)
4112{ 4117{
4113 sort(early_node_map, (size_t)nr_nodemap_entries, 4118 sort(early_node_map, (size_t)nr_nodemap_entries,
4114 sizeof(struct node_active_region), 4119 sizeof(struct node_active_region),
@@ -5008,23 +5013,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5008int set_migratetype_isolate(struct page *page) 5013int set_migratetype_isolate(struct page *page)
5009{ 5014{
5010 struct zone *zone; 5015 struct zone *zone;
5011 unsigned long flags; 5016 struct page *curr_page;
5017 unsigned long flags, pfn, iter;
5018 unsigned long immobile = 0;
5019 struct memory_isolate_notify arg;
5020 int notifier_ret;
5012 int ret = -EBUSY; 5021 int ret = -EBUSY;
5013 int zone_idx; 5022 int zone_idx;
5014 5023
5015 zone = page_zone(page); 5024 zone = page_zone(page);
5016 zone_idx = zone_idx(zone); 5025 zone_idx = zone_idx(zone);
5026
5017 spin_lock_irqsave(&zone->lock, flags); 5027 spin_lock_irqsave(&zone->lock, flags);
5028 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
5029 zone_idx == ZONE_MOVABLE) {
5030 ret = 0;
5031 goto out;
5032 }
5033
5034 pfn = page_to_pfn(page);
5035 arg.start_pfn = pfn;
5036 arg.nr_pages = pageblock_nr_pages;
5037 arg.pages_found = 0;
5038
5018 /* 5039 /*
5019 * In future, more migrate types will be able to be isolation target. 5040 * It may be possible to isolate a pageblock even if the
5041 * migratetype is not MIGRATE_MOVABLE. The memory isolation
5042 * notifier chain is used by balloon drivers to return the
5043 * number of pages in a range that are held by the balloon
5044 * driver to shrink memory. If all the pages are accounted for
5045 * by balloons, are free, or on the LRU, isolation can continue.
5046 * Later, for example, when the memory hotplug notifier runs, these
5047 * pages reported as "can be isolated" should be isolated (freed)
5048 * by the balloon driver through the memory notifier chain.
5020 */ 5049 */
5021 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && 5050 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5022 zone_idx != ZONE_MOVABLE) 5051 notifier_ret = notifier_to_errno(notifier_ret);
5052 if (notifier_ret || !arg.pages_found)
5023 goto out; 5053 goto out;
5024 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5054
5025 move_freepages_block(zone, page, MIGRATE_ISOLATE); 5055 for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
5026 ret = 0; 5056 if (!pfn_valid_within(iter))
5057 continue;
5058
5059 curr_page = pfn_to_page(iter);
5060 if (!page_count(curr_page) || PageLRU(curr_page))
5061 continue;
5062
5063 immobile++;
5064 }
5065
5066 if (arg.pages_found == immobile)
5067 ret = 0;
5068
5027out: 5069out:
5070 if (!ret) {
5071 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
5072 move_freepages_block(zone, page, MIGRATE_ISOLATE);
5073 }
5074
5028 spin_unlock_irqrestore(&zone->lock, flags); 5075 spin_unlock_irqrestore(&zone->lock, flags);
5029 if (!ret) 5076 if (!ret)
5030 drain_all_pages(); 5077 drain_all_pages();
@@ -5091,3 +5138,24 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5091 spin_unlock_irqrestore(&zone->lock, flags); 5138 spin_unlock_irqrestore(&zone->lock, flags);
5092} 5139}
5093#endif 5140#endif
5141
5142#ifdef CONFIG_MEMORY_FAILURE
5143bool is_free_buddy_page(struct page *page)
5144{
5145 struct zone *zone = page_zone(page);
5146 unsigned long pfn = page_to_pfn(page);
5147 unsigned long flags;
5148 int order;
5149
5150 spin_lock_irqsave(&zone->lock, flags);
5151 for (order = 0; order < MAX_ORDER; order++) {
5152 struct page *page_head = page - (pfn & ((1 << order) - 1));
5153
5154 if (PageBuddy(page_head) && page_order(page_head) >= order)
5155 break;
5156 }
5157 spin_unlock_irqrestore(&zone->lock, flags);
5158
5159 return order < MAX_ORDER;
5160}
5161#endif
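
To make the MEM_ISOLATE_COUNT notifier above concrete, a hedged sketch of the balloon-driver side; my_balloon_owns_pfn() is hypothetical, and register_memory_isolate_notifier() is assumed to be the registration hook that accompanies memory_isolate_notify() in <linux/memory.h>.

/* Illustrative only: report balloon-owned pages during isolation. */
#include <linux/memory.h>
#include <linux/notifier.h>

static bool my_balloon_owns_pfn(unsigned long pfn);	/* hypothetical */

static int balloon_isolate_notify(struct notifier_block *nb,
				  unsigned long action, void *data)
{
	struct memory_isolate_notify *arg = data;
	unsigned long pfn;

	if (action != MEM_ISOLATE_COUNT)
		return NOTIFY_OK;

	for (pfn = arg->start_pfn; pfn < arg->start_pfn + arg->nr_pages; pfn++)
		if (my_balloon_owns_pfn(pfn))
			arg->pages_found++;

	return NOTIFY_OK;
}

static struct notifier_block balloon_isolate_nb = {
	.notifier_call	= balloon_isolate_notify,
};

/* at module init: register_memory_isolate_notifier(&balloon_isolate_nb); */
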
diff --git a/mm/percpu.c b/mm/percpu.c
index 442010cc91c6..083e7c91e5f6 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1271,7 +1271,7 @@ static void pcpu_reclaim(struct work_struct *work)
1271 */ 1271 */
1272void free_percpu(void *ptr) 1272void free_percpu(void *ptr)
1273{ 1273{
1274 void *addr = __pcpu_ptr_to_addr(ptr); 1274 void *addr;
1275 struct pcpu_chunk *chunk; 1275 struct pcpu_chunk *chunk;
1276 unsigned long flags; 1276 unsigned long flags;
1277 int off; 1277 int off;
@@ -1279,6 +1279,8 @@ void free_percpu(void *ptr)
1279 if (!ptr) 1279 if (!ptr)
1280 return; 1280 return;
1281 1281
1282 addr = __pcpu_ptr_to_addr(ptr);
1283
1282 spin_lock_irqsave(&pcpu_lock, flags); 1284 spin_lock_irqsave(&pcpu_lock, flags);
1283 1285
1284 chunk = pcpu_chunk_addr_search(addr); 1286 chunk = pcpu_chunk_addr_search(addr);
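
A small usage note on the reordering above: computing the chunk address only after the NULL check lets callers treat free_percpu() like kfree(). A sketch, with my_stats hypothetical:

/* Illustrative only. */
struct my_stats {			/* hypothetical */
	unsigned long hits;
};

static struct my_stats *stats;		/* from alloc_percpu(struct my_stats) */

static void example_teardown(void)
{
	/* safe even if alloc_percpu() failed and stats is still NULL */
	free_percpu(stats);
	stats = NULL;
}
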
diff --git a/mm/readahead.c b/mm/readahead.c
index aa1aa2345235..033bc135a41f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -547,5 +547,17 @@ page_cache_async_readahead(struct address_space *mapping,
547 547
548 /* do read-ahead */ 548 /* do read-ahead */
549 ondemand_readahead(mapping, ra, filp, true, offset, req_size); 549 ondemand_readahead(mapping, ra, filp, true, offset, req_size);
550
551#ifdef CONFIG_BLOCK
552 /*
553 * Normally the current page is !uptodate and lock_page() will be
554 * immediately called to implicitly unplug the device. However this
555 * is not always true for RAID conifgurations, where data arrives
556 * not strictly in their submission order. In this case we need to
557 * explicitly kick off the IO.
558 */
559 if (PageUptodate(page))
560 blk_run_backing_dev(mapping->backing_dev_info, NULL);
561#endif
550} 562}
551EXPORT_SYMBOL_GPL(page_cache_async_readahead); 563EXPORT_SYMBOL_GPL(page_cache_async_readahead);
diff --git a/mm/shmem.c b/mm/shmem.c
index 4fb41c83daca..eef4ebea5158 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -29,7 +29,6 @@
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/swap.h> 31#include <linux/swap.h>
32#include <linux/ima.h>
33 32
34static struct vfsmount *shm_mnt; 33static struct vfsmount *shm_mnt;
35 34
@@ -42,6 +41,7 @@ static struct vfsmount *shm_mnt;
42 41
43#include <linux/xattr.h> 42#include <linux/xattr.h>
44#include <linux/exportfs.h> 43#include <linux/exportfs.h>
44#include <linux/posix_acl.h>
45#include <linux/generic_acl.h> 45#include <linux/generic_acl.h>
46#include <linux/mman.h> 46#include <linux/mman.h>
47#include <linux/string.h> 47#include <linux/string.h>
@@ -810,7 +810,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
810 error = inode_setattr(inode, attr); 810 error = inode_setattr(inode, attr);
811#ifdef CONFIG_TMPFS_POSIX_ACL 811#ifdef CONFIG_TMPFS_POSIX_ACL
812 if (!error && (attr->ia_valid & ATTR_MODE)) 812 if (!error && (attr->ia_valid & ATTR_MODE))
813 error = generic_acl_chmod(inode, &shmem_acl_ops); 813 error = generic_acl_chmod(inode);
814#endif 814#endif
815 if (page) 815 if (page)
816 page_cache_release(page); 816 page_cache_release(page);
@@ -1824,11 +1824,15 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1824 return error; 1824 return error;
1825 } 1825 }
1826 } 1826 }
1827 error = shmem_acl_init(inode, dir); 1827#ifdef CONFIG_TMPFS_POSIX_ACL
1828 error = generic_acl_init(inode, dir);
1828 if (error) { 1829 if (error) {
1829 iput(inode); 1830 iput(inode);
1830 return error; 1831 return error;
1831 } 1832 }
1833#else
1834 error = 0;
1835#endif
1832 if (dir->i_mode & S_ISGID) { 1836 if (dir->i_mode & S_ISGID) {
1833 inode->i_gid = dir->i_gid; 1837 inode->i_gid = dir->i_gid;
1834 if (S_ISDIR(mode)) 1838 if (S_ISDIR(mode))
@@ -2043,27 +2047,28 @@ static const struct inode_operations shmem_symlink_inode_operations = {
2043 * filesystem level, though. 2047 * filesystem level, though.
2044 */ 2048 */
2045 2049
2046static size_t shmem_xattr_security_list(struct inode *inode, char *list, 2050static size_t shmem_xattr_security_list(struct dentry *dentry, char *list,
2047 size_t list_len, const char *name, 2051 size_t list_len, const char *name,
2048 size_t name_len) 2052 size_t name_len, int handler_flags)
2049{ 2053{
2050 return security_inode_listsecurity(inode, list, list_len); 2054 return security_inode_listsecurity(dentry->d_inode, list, list_len);
2051} 2055}
2052 2056
2053static int shmem_xattr_security_get(struct inode *inode, const char *name, 2057static int shmem_xattr_security_get(struct dentry *dentry, const char *name,
2054 void *buffer, size_t size) 2058 void *buffer, size_t size, int handler_flags)
2055{ 2059{
2056 if (strcmp(name, "") == 0) 2060 if (strcmp(name, "") == 0)
2057 return -EINVAL; 2061 return -EINVAL;
2058 return xattr_getsecurity(inode, name, buffer, size); 2062 return xattr_getsecurity(dentry->d_inode, name, buffer, size);
2059} 2063}
2060 2064
2061static int shmem_xattr_security_set(struct inode *inode, const char *name, 2065static int shmem_xattr_security_set(struct dentry *dentry, const char *name,
2062 const void *value, size_t size, int flags) 2066 const void *value, size_t size, int flags, int handler_flags)
2063{ 2067{
2064 if (strcmp(name, "") == 0) 2068 if (strcmp(name, "") == 0)
2065 return -EINVAL; 2069 return -EINVAL;
2066 return security_inode_setsecurity(inode, name, value, size, flags); 2070 return security_inode_setsecurity(dentry->d_inode, name, value,
2071 size, flags);
2067} 2072}
2068 2073
2069static struct xattr_handler shmem_xattr_security_handler = { 2074static struct xattr_handler shmem_xattr_security_handler = {
@@ -2074,8 +2079,8 @@ static struct xattr_handler shmem_xattr_security_handler = {
2074}; 2079};
2075 2080
2076static struct xattr_handler *shmem_xattr_handlers[] = { 2081static struct xattr_handler *shmem_xattr_handlers[] = {
2077 &shmem_xattr_acl_access_handler, 2082 &generic_acl_access_handler,
2078 &shmem_xattr_acl_default_handler, 2083 &generic_acl_default_handler,
2079 &shmem_xattr_security_handler, 2084 &shmem_xattr_security_handler,
2080 NULL 2085 NULL
2081}; 2086};
@@ -2454,7 +2459,7 @@ static const struct inode_operations shmem_inode_operations = {
2454 .getxattr = generic_getxattr, 2459 .getxattr = generic_getxattr,
2455 .listxattr = generic_listxattr, 2460 .listxattr = generic_listxattr,
2456 .removexattr = generic_removexattr, 2461 .removexattr = generic_removexattr,
2457 .check_acl = shmem_check_acl, 2462 .check_acl = generic_check_acl,
2458#endif 2463#endif
2459 2464
2460}; 2465};
@@ -2477,7 +2482,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2477 .getxattr = generic_getxattr, 2482 .getxattr = generic_getxattr,
2478 .listxattr = generic_listxattr, 2483 .listxattr = generic_listxattr,
2479 .removexattr = generic_removexattr, 2484 .removexattr = generic_removexattr,
2480 .check_acl = shmem_check_acl, 2485 .check_acl = generic_check_acl,
2481#endif 2486#endif
2482}; 2487};
2483 2488
@@ -2488,7 +2493,7 @@ static const struct inode_operations shmem_special_inode_operations = {
2488 .getxattr = generic_getxattr, 2493 .getxattr = generic_getxattr,
2489 .listxattr = generic_listxattr, 2494 .listxattr = generic_listxattr,
2490 .removexattr = generic_removexattr, 2495 .removexattr = generic_removexattr,
2491 .check_acl = shmem_check_acl, 2496 .check_acl = generic_check_acl,
2492#endif 2497#endif
2493}; 2498};
2494 2499
@@ -2626,7 +2631,8 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2626 int error; 2631 int error;
2627 struct file *file; 2632 struct file *file;
2628 struct inode *inode; 2633 struct inode *inode;
2629 struct dentry *dentry, *root; 2634 struct path path;
2635 struct dentry *root;
2630 struct qstr this; 2636 struct qstr this;
2631 2637
2632 if (IS_ERR(shm_mnt)) 2638 if (IS_ERR(shm_mnt))
@@ -2643,38 +2649,35 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2643 this.len = strlen(name); 2649 this.len = strlen(name);
2644 this.hash = 0; /* will go */ 2650 this.hash = 0; /* will go */
2645 root = shm_mnt->mnt_root; 2651 root = shm_mnt->mnt_root;
2646 dentry = d_alloc(root, &this); 2652 path.dentry = d_alloc(root, &this);
2647 if (!dentry) 2653 if (!path.dentry)
2648 goto put_memory; 2654 goto put_memory;
2649 2655 path.mnt = mntget(shm_mnt);
2650 error = -ENFILE;
2651 file = get_empty_filp();
2652 if (!file)
2653 goto put_dentry;
2654 2656
2655 error = -ENOSPC; 2657 error = -ENOSPC;
2656 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); 2658 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags);
2657 if (!inode) 2659 if (!inode)
2658 goto close_file; 2660 goto put_dentry;
2659 2661
2660 d_instantiate(dentry, inode); 2662 d_instantiate(path.dentry, inode);
2661 inode->i_size = size; 2663 inode->i_size = size;
2662 inode->i_nlink = 0; /* It is unlinked */ 2664 inode->i_nlink = 0; /* It is unlinked */
2663 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
2664 &shmem_file_operations);
2665
2666#ifndef CONFIG_MMU 2665#ifndef CONFIG_MMU
2667 error = ramfs_nommu_expand_for_mapping(inode, size); 2666 error = ramfs_nommu_expand_for_mapping(inode, size);
2668 if (error) 2667 if (error)
2669 goto close_file; 2668 goto put_dentry;
2670#endif 2669#endif
2671 ima_counts_get(file); 2670
2671 error = -ENFILE;
2672 file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
2673 &shmem_file_operations);
2674 if (!file)
2675 goto put_dentry;
2676
2672 return file; 2677 return file;
2673 2678
2674close_file:
2675 put_filp(file);
2676put_dentry: 2679put_dentry:
2677 dput(dentry); 2680 path_put(&path);
2678put_memory: 2681put_memory:
2679 shmem_unacct_size(flags, size); 2682 shmem_unacct_size(flags, size);
2680 return ERR_PTR(error); 2683 return ERR_PTR(error);
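
The alloc_file() conversion above leaves the calling convention of shmem_file_setup() untouched; for reference, a hedged sketch of a typical caller (the name and size are arbitrary):

/* Illustrative only. */
static struct file *example_shmem_file(void)
{
	/* 64 KB unlinked tmpfs-backed file; flags as in other callers */
	return shmem_file_setup("example-shmem", 64 << 10, VM_NORESERVE);
}

The returned file is used like any other struct file and released with fput() when no longer needed.
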
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
deleted file mode 100644
index df2c87fdae50..000000000000
--- a/mm/shmem_acl.c
+++ /dev/null
@@ -1,171 +0,0 @@
1/*
2 * mm/shmem_acl.c
3 *
4 * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
5 *
6 * This file is released under the GPL.
7 */
8
9#include <linux/fs.h>
10#include <linux/shmem_fs.h>
11#include <linux/xattr.h>
12#include <linux/generic_acl.h>
13
14/**
15 * shmem_get_acl - generic_acl_operations->getacl() operation
16 */
17static struct posix_acl *
18shmem_get_acl(struct inode *inode, int type)
19{
20 struct posix_acl *acl = NULL;
21
22 spin_lock(&inode->i_lock);
23 switch(type) {
24 case ACL_TYPE_ACCESS:
25 acl = posix_acl_dup(inode->i_acl);
26 break;
27
28 case ACL_TYPE_DEFAULT:
29 acl = posix_acl_dup(inode->i_default_acl);
30 break;
31 }
32 spin_unlock(&inode->i_lock);
33
34 return acl;
35}
36
37/**
38 * shmem_set_acl - generic_acl_operations->setacl() operation
39 */
40static void
41shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
42{
43 struct posix_acl *free = NULL;
44
45 spin_lock(&inode->i_lock);
46 switch(type) {
47 case ACL_TYPE_ACCESS:
48 free = inode->i_acl;
49 inode->i_acl = posix_acl_dup(acl);
50 break;
51
52 case ACL_TYPE_DEFAULT:
53 free = inode->i_default_acl;
54 inode->i_default_acl = posix_acl_dup(acl);
55 break;
56 }
57 spin_unlock(&inode->i_lock);
58 posix_acl_release(free);
59}
60
61struct generic_acl_operations shmem_acl_ops = {
62 .getacl = shmem_get_acl,
63 .setacl = shmem_set_acl,
64};
65
66/**
67 * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access,
68 * shmem_xattr_acl_access_handler - plumbing code to implement the
69 * system.posix_acl_access xattr using the generic acl functions.
70 */
71
72static size_t
73shmem_list_acl_access(struct inode *inode, char *list, size_t list_size,
74 const char *name, size_t name_len)
75{
76 return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS,
77 list, list_size);
78}
79
80static int
81shmem_get_acl_access(struct inode *inode, const char *name, void *buffer,
82 size_t size)
83{
84 if (strcmp(name, "") != 0)
85 return -EINVAL;
86 return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer,
87 size);
88}
89
90static int
91shmem_set_acl_access(struct inode *inode, const char *name, const void *value,
92 size_t size, int flags)
93{
94 if (strcmp(name, "") != 0)
95 return -EINVAL;
96 return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value,
97 size);
98}
99
100struct xattr_handler shmem_xattr_acl_access_handler = {
101 .prefix = POSIX_ACL_XATTR_ACCESS,
102 .list = shmem_list_acl_access,
103 .get = shmem_get_acl_access,
104 .set = shmem_set_acl_access,
105};
106
107/**
108 * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default,
109 * shmem_xattr_acl_default_handler - plumbing code to implement the
110 * system.posix_acl_default xattr using the generic acl functions.
111 */
112
113static size_t
114shmem_list_acl_default(struct inode *inode, char *list, size_t list_size,
115 const char *name, size_t name_len)
116{
117 return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT,
118 list, list_size);
119}
120
121static int
122shmem_get_acl_default(struct inode *inode, const char *name, void *buffer,
123 size_t size)
124{
125 if (strcmp(name, "") != 0)
126 return -EINVAL;
127 return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer,
128 size);
129}
130
131static int
132shmem_set_acl_default(struct inode *inode, const char *name, const void *value,
133 size_t size, int flags)
134{
135 if (strcmp(name, "") != 0)
136 return -EINVAL;
137 return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value,
138 size);
139}
140
141struct xattr_handler shmem_xattr_acl_default_handler = {
142 .prefix = POSIX_ACL_XATTR_DEFAULT,
143 .list = shmem_list_acl_default,
144 .get = shmem_get_acl_default,
145 .set = shmem_set_acl_default,
146};
147
148/**
149 * shmem_acl_init - Inizialize the acl(s) of a new inode
150 */
151int
152shmem_acl_init(struct inode *inode, struct inode *dir)
153{
154 return generic_acl_init(inode, dir, &shmem_acl_ops);
155}
156
157/**
158 * shmem_check_acl - check_acl() callback for generic_permission()
159 */
160int
161shmem_check_acl(struct inode *inode, int mask)
162{
163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
164
165 if (acl) {
166 int error = posix_acl_permission(inode, acl, mask);
167 posix_acl_release(acl);
168 return error;
169 }
170 return -EAGAIN;
171}
diff --git a/mm/slab.c b/mm/slab.c
index 3f4822938f46..7451bdacaf18 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -654,7 +654,7 @@ static void init_node_lock_keys(int q)
654 654
655 l3 = s->cs_cachep->nodelists[q]; 655 l3 = s->cs_cachep->nodelists[q];
656 if (!l3 || OFF_SLAB(s->cs_cachep)) 656 if (!l3 || OFF_SLAB(s->cs_cachep))
657 return; 657 continue;
658 lockdep_set_class(&l3->list_lock, &on_slab_l3_key); 658 lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
659 alc = l3->alien; 659 alc = l3->alien;
660 /* 660 /*
@@ -665,7 +665,7 @@ static void init_node_lock_keys(int q)
665 * for alloc_alien_cache, 665 * for alloc_alien_cache,
666 */ 666 */
667 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) 667 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
668 return; 668 continue;
669 for_each_node(r) { 669 for_each_node(r) {
670 if (alc[r]) 670 if (alc[r])
671 lockdep_set_class(&alc[r]->lock, 671 lockdep_set_class(&alc[r]->lock,
@@ -1132,7 +1132,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1132 if (nc) 1132 if (nc)
1133 free_block(cachep, nc->entry, nc->avail, node); 1133 free_block(cachep, nc->entry, nc->avail, node);
1134 1134
1135 if (!cpus_empty(*mask)) { 1135 if (!cpumask_empty(mask)) {
1136 spin_unlock_irq(&l3->list_lock); 1136 spin_unlock_irq(&l3->list_lock);
1137 goto free_array_cache; 1137 goto free_array_cache;
1138 } 1138 }
@@ -2275,9 +2275,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2275 /* 2275 /*
2276 * Determine if the slab management is 'on' or 'off' slab. 2276 * Determine if the slab management is 'on' or 'off' slab.
2277 * (bootstrapping cannot cope with offslab caches so don't do 2277 * (bootstrapping cannot cope with offslab caches so don't do
2278 * it too early on.) 2278 * it too early on. Always use on-slab management when
2279 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
2279 */ 2280 */
2280 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) 2281 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
2282 !(flags & SLAB_NOLEAKTRACE))
2281 /* 2283 /*
2282 * Size is large, assume best to place the slab management obj 2284 * Size is large, assume best to place the slab management obj
2283 * off-slab (should allow better packing of objs). 2285 * off-slab (should allow better packing of objs).
@@ -2596,8 +2598,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2596 * kmemleak does not treat the ->s_mem pointer as a reference 2598 * kmemleak does not treat the ->s_mem pointer as a reference
2597 * to the object. Otherwise we will not report the leak. 2599 * to the object. Otherwise we will not report the leak.
2598 */ 2600 */
2599 kmemleak_scan_area(slabp, offsetof(struct slab, list), 2601 kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
2600 sizeof(struct list_head), local_flags); 2602 local_flags);
2601 if (!slabp) 2603 if (!slabp)
2602 return NULL; 2604 return NULL;
2603 } else { 2605 } else {
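
The first two slab.c hunks are a loop-control fix: init_node_lock_keys() walks all kmalloc caches, and returning on the first cache that is off-slab or has no nodelist on that node silently left every later cache without its lockdep class. A standalone sketch of the pattern, with a made-up skip_cache() predicate standing in for the real checks:

        #include <stdio.h>

        /* Made-up predicate standing in for the "no nodelist / off-slab" checks. */
        static int skip_cache(int i)
        {
                return i == 1;
        }

        static void annotate_buggy(int ncaches)
        {
                for (int i = 0; i < ncaches; i++) {
                        if (skip_cache(i))
                                return;         /* old code: everything after i is skipped */
                        printf("  annotated cache %d\n", i);
                }
        }

        static void annotate_fixed(int ncaches)
        {
                for (int i = 0; i < ncaches; i++) {
                        if (skip_cache(i))
                                continue;       /* new code: only this cache is skipped */
                        printf("  annotated cache %d\n", i);
                }
        }

        int main(void)
        {
                puts("buggy:");
                annotate_buggy(4);              /* annotates cache 0 only */
                puts("fixed:");
                annotate_fixed(4);              /* annotates caches 0, 2 and 3 */
                return 0;
        }
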
diff --git a/mm/truncate.c b/mm/truncate.c
index 342deee22684..e87e37244829 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -522,22 +522,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
522 */ 522 */
523void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) 523void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
524{ 524{
525 if (new < old) { 525 struct address_space *mapping = inode->i_mapping;
526 struct address_space *mapping = inode->i_mapping; 526
527 527 /*
528 /* 528 * unmap_mapping_range is called twice, first simply for
529 * unmap_mapping_range is called twice, first simply for 529 * efficiency so that truncate_inode_pages does fewer
530 * efficiency so that truncate_inode_pages does fewer 530 * single-page unmaps. However after this first call, and
531 * single-page unmaps. However after this first call, and 531 * before truncate_inode_pages finishes, it is possible for
532 * before truncate_inode_pages finishes, it is possible for 532 * private pages to be COWed, which remain after
533 * private pages to be COWed, which remain after 533 * truncate_inode_pages finishes, hence the second
534 * truncate_inode_pages finishes, hence the second 534 * unmap_mapping_range call must be made for correctness.
535 * unmap_mapping_range call must be made for correctness. 535 */
536 */ 536 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
537 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); 537 truncate_inode_pages(mapping, new);
538 truncate_inode_pages(mapping, new); 538 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
539 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
540 }
541} 539}
542EXPORT_SYMBOL(truncate_pagecache); 540EXPORT_SYMBOL(truncate_pagecache);
543 541
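
With the "new < old" guard gone, truncate_pagecache() performs the unmap/truncate/unmap sequence unconditionally, so callers no longer need to special-case shrinking versus extending truncates. A hedged sketch of the expected caller convention (demo_fs_setsize is a hypothetical filesystem helper, not a function in this diff; locking and error handling are elided):

        #include <linux/fs.h>
        #include <linux/mm.h>

        /* Hypothetical caller: update i_size first, then let
         * truncate_pagecache() drop the stale cache and mappings. */
        static int demo_fs_setsize(struct inode *inode, loff_t newsize)
        {
                loff_t oldsize = inode->i_size;

                i_size_write(inode, newsize);
                truncate_pagecache(inode, oldsize, newsize);
                return 0;
        }
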
diff --git a/mm/util.c b/mm/util.c
index b377ce430803..834db7be240f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,10 +4,6 @@
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/hugetlb.h>
8#include <linux/syscalls.h>
9#include <linux/mman.h>
10#include <linux/file.h>
11#include <asm/uaccess.h> 7#include <asm/uaccess.h>
12 8
13#define CREATE_TRACE_POINTS 9#define CREATE_TRACE_POINTS
@@ -224,7 +220,7 @@ char *strndup_user(const char __user *s, long n)
224} 220}
225EXPORT_SYMBOL(strndup_user); 221EXPORT_SYMBOL(strndup_user);
226 222
227#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT 223#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
228void arch_pick_mmap_layout(struct mm_struct *mm) 224void arch_pick_mmap_layout(struct mm_struct *mm)
229{ 225{
230 mm->mmap_base = TASK_UNMAPPED_BASE; 226 mm->mmap_base = TASK_UNMAPPED_BASE;
@@ -272,46 +268,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
272} 268}
273EXPORT_SYMBOL_GPL(get_user_pages_fast); 269EXPORT_SYMBOL_GPL(get_user_pages_fast);
274 270
275SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
276 unsigned long, prot, unsigned long, flags,
277 unsigned long, fd, unsigned long, pgoff)
278{
279 struct file * file = NULL;
280 unsigned long retval = -EBADF;
281
282 if (!(flags & MAP_ANONYMOUS)) {
283 if (unlikely(flags & MAP_HUGETLB))
284 return -EINVAL;
285 file = fget(fd);
286 if (!file)
287 goto out;
288 } else if (flags & MAP_HUGETLB) {
289 struct user_struct *user = NULL;
290 /*
291 * VM_NORESERVE is used because the reservations will be
292 * taken when vm_ops->mmap() is called
293 * A dummy user value is used because we are not locking
294 * memory so no accounting is necessary
295 */
296 len = ALIGN(len, huge_page_size(&default_hstate));
297 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
298 &user, HUGETLB_ANONHUGE_INODE);
299 if (IS_ERR(file))
300 return PTR_ERR(file);
301 }
302
303 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
304
305 down_write(&current->mm->mmap_sem);
306 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
307 up_write(&current->mm->mmap_sem);
308
309 if (file)
310 fput(file);
311out:
312 return retval;
313}
314
315/* Tracepoints definitions. */ 271/* Tracepoints definitions. */
316EXPORT_TRACEPOINT_SYMBOL(kmalloc); 272EXPORT_TRACEPOINT_SYMBOL(kmalloc);
317EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 273EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
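
The sys_mmap_pgoff() body leaves util.c rather than the kernel; elsewhere in this series it is carried by the mm/mmap.c and mm/nommu.c additions, which is also why the hugetlb, syscall, mman and file includes at the top of util.c are no longer needed. The anonymous MAP_HUGETLB branch it contains is reachable from userspace with a plain mmap() call; a minimal sketch, assuming the x86 MAP_HUGETLB value and that huge pages have been reserved via /proc/sys/vm/nr_hugepages:

        #define _GNU_SOURCE
        #include <stdio.h>
        #include <sys/mman.h>

        #ifndef MAP_HUGETLB
        #define MAP_HUGETLB 0x40000     /* assumed x86 value, for illustration */
        #endif

        int main(void)
        {
                size_t len = 2 * 1024 * 1024;   /* assumes 2 MiB huge pages */
                void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                               MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

                if (p == MAP_FAILED) {
                        perror("mmap(MAP_HUGETLB)");    /* e.g. no huge pages reserved */
                        return 1;
                }
                printf("huge anonymous mapping at %p\n", p);
                munmap(p, len);
                return 0;
        }

If no huge pages are reserved, the mmap() fails with ENOMEM instead of falling back to normal pages.
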
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 37e69295f250..ae007462b7f6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -509,6 +509,9 @@ static unsigned long lazy_max_pages(void)
509 509
510static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); 510static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
511 511
512/* for per-CPU blocks */
513static void purge_fragmented_blocks_allcpus(void);
514
512/* 515/*
513 * Purges all lazily-freed vmap areas. 516 * Purges all lazily-freed vmap areas.
514 * 517 *
@@ -539,6 +542,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
539 } else 542 } else
540 spin_lock(&purge_lock); 543 spin_lock(&purge_lock);
541 544
545 if (sync)
546 purge_fragmented_blocks_allcpus();
547
542 rcu_read_lock(); 548 rcu_read_lock();
543 list_for_each_entry_rcu(va, &vmap_area_list, list) { 549 list_for_each_entry_rcu(va, &vmap_area_list, list) {
544 if (va->flags & VM_LAZY_FREE) { 550 if (va->flags & VM_LAZY_FREE) {
@@ -555,10 +561,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
555 } 561 }
556 rcu_read_unlock(); 562 rcu_read_unlock();
557 563
558 if (nr) { 564 if (nr)
559 BUG_ON(nr > atomic_read(&vmap_lazy_nr));
560 atomic_sub(nr, &vmap_lazy_nr); 565 atomic_sub(nr, &vmap_lazy_nr);
561 }
562 566
563 if (nr || force_flush) 567 if (nr || force_flush)
564 flush_tlb_kernel_range(*start, *end); 568 flush_tlb_kernel_range(*start, *end);
@@ -669,8 +673,6 @@ static bool vmap_initialized __read_mostly = false;
669struct vmap_block_queue { 673struct vmap_block_queue {
670 spinlock_t lock; 674 spinlock_t lock;
671 struct list_head free; 675 struct list_head free;
672 struct list_head dirty;
673 unsigned int nr_dirty;
674}; 676};
675 677
676struct vmap_block { 678struct vmap_block {
@@ -680,10 +682,9 @@ struct vmap_block {
680 unsigned long free, dirty; 682 unsigned long free, dirty;
681 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); 683 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
682 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 684 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
683 union { 685 struct list_head free_list;
684 struct list_head free_list; 686 struct rcu_head rcu_head;
685 struct rcu_head rcu_head; 687 struct list_head purge;
686 };
687}; 688};
688 689
689/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ 690/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
@@ -759,7 +760,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
759 vbq = &get_cpu_var(vmap_block_queue); 760 vbq = &get_cpu_var(vmap_block_queue);
760 vb->vbq = vbq; 761 vb->vbq = vbq;
761 spin_lock(&vbq->lock); 762 spin_lock(&vbq->lock);
762 list_add(&vb->free_list, &vbq->free); 763 list_add_rcu(&vb->free_list, &vbq->free);
763 spin_unlock(&vbq->lock); 764 spin_unlock(&vbq->lock);
764 put_cpu_var(vmap_block_queue); 765 put_cpu_var(vmap_block_queue);
765 766
@@ -778,8 +779,6 @@ static void free_vmap_block(struct vmap_block *vb)
778 struct vmap_block *tmp; 779 struct vmap_block *tmp;
779 unsigned long vb_idx; 780 unsigned long vb_idx;
780 781
781 BUG_ON(!list_empty(&vb->free_list));
782
783 vb_idx = addr_to_vb_idx(vb->va->va_start); 782 vb_idx = addr_to_vb_idx(vb->va->va_start);
784 spin_lock(&vmap_block_tree_lock); 783 spin_lock(&vmap_block_tree_lock);
785 tmp = radix_tree_delete(&vmap_block_tree, vb_idx); 784 tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
@@ -790,12 +789,61 @@ static void free_vmap_block(struct vmap_block *vb)
790 call_rcu(&vb->rcu_head, rcu_free_vb); 789 call_rcu(&vb->rcu_head, rcu_free_vb);
791} 790}
792 791
792static void purge_fragmented_blocks(int cpu)
793{
794 LIST_HEAD(purge);
795 struct vmap_block *vb;
796 struct vmap_block *n_vb;
797 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
798
799 rcu_read_lock();
800 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
801
802 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
803 continue;
804
805 spin_lock(&vb->lock);
806 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
807 vb->free = 0; /* prevent further allocs after releasing lock */
808 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
809 bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
810 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
811 spin_lock(&vbq->lock);
812 list_del_rcu(&vb->free_list);
813 spin_unlock(&vbq->lock);
814 spin_unlock(&vb->lock);
815 list_add_tail(&vb->purge, &purge);
816 } else
817 spin_unlock(&vb->lock);
818 }
819 rcu_read_unlock();
820
821 list_for_each_entry_safe(vb, n_vb, &purge, purge) {
822 list_del(&vb->purge);
823 free_vmap_block(vb);
824 }
825}
826
827static void purge_fragmented_blocks_thiscpu(void)
828{
829 purge_fragmented_blocks(smp_processor_id());
830}
831
832static void purge_fragmented_blocks_allcpus(void)
833{
834 int cpu;
835
836 for_each_possible_cpu(cpu)
837 purge_fragmented_blocks(cpu);
838}
839
793static void *vb_alloc(unsigned long size, gfp_t gfp_mask) 840static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
794{ 841{
795 struct vmap_block_queue *vbq; 842 struct vmap_block_queue *vbq;
796 struct vmap_block *vb; 843 struct vmap_block *vb;
797 unsigned long addr = 0; 844 unsigned long addr = 0;
798 unsigned int order; 845 unsigned int order;
846 int purge = 0;
799 847
800 BUG_ON(size & ~PAGE_MASK); 848 BUG_ON(size & ~PAGE_MASK);
801 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 849 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -808,24 +856,38 @@ again:
808 int i; 856 int i;
809 857
810 spin_lock(&vb->lock); 858 spin_lock(&vb->lock);
859 if (vb->free < 1UL << order)
860 goto next;
861
811 i = bitmap_find_free_region(vb->alloc_map, 862 i = bitmap_find_free_region(vb->alloc_map,
812 VMAP_BBMAP_BITS, order); 863 VMAP_BBMAP_BITS, order);
813 864
814 if (i >= 0) { 865 if (i < 0) {
815 addr = vb->va->va_start + (i << PAGE_SHIFT); 866 if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
816 BUG_ON(addr_to_vb_idx(addr) != 867 /* fragmented and no outstanding allocations */
817 addr_to_vb_idx(vb->va->va_start)); 868 BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
818 vb->free -= 1UL << order; 869 purge = 1;
819 if (vb->free == 0) {
820 spin_lock(&vbq->lock);
821 list_del_init(&vb->free_list);
822 spin_unlock(&vbq->lock);
823 } 870 }
824 spin_unlock(&vb->lock); 871 goto next;
825 break; 872 }
873 addr = vb->va->va_start + (i << PAGE_SHIFT);
874 BUG_ON(addr_to_vb_idx(addr) !=
875 addr_to_vb_idx(vb->va->va_start));
876 vb->free -= 1UL << order;
877 if (vb->free == 0) {
878 spin_lock(&vbq->lock);
879 list_del_rcu(&vb->free_list);
880 spin_unlock(&vbq->lock);
826 } 881 }
827 spin_unlock(&vb->lock); 882 spin_unlock(&vb->lock);
883 break;
884next:
885 spin_unlock(&vb->lock);
828 } 886 }
887
888 if (purge)
889 purge_fragmented_blocks_thiscpu();
890
829 put_cpu_var(vmap_block_queue); 891 put_cpu_var(vmap_block_queue);
830 rcu_read_unlock(); 892 rcu_read_unlock();
831 893
@@ -862,11 +924,11 @@ static void vb_free(const void *addr, unsigned long size)
862 BUG_ON(!vb); 924 BUG_ON(!vb);
863 925
864 spin_lock(&vb->lock); 926 spin_lock(&vb->lock);
865 bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); 927 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
866 928
867 vb->dirty += 1UL << order; 929 vb->dirty += 1UL << order;
868 if (vb->dirty == VMAP_BBMAP_BITS) { 930 if (vb->dirty == VMAP_BBMAP_BITS) {
869 BUG_ON(vb->free || !list_empty(&vb->free_list)); 931 BUG_ON(vb->free);
870 spin_unlock(&vb->lock); 932 spin_unlock(&vb->lock);
871 free_vmap_block(vb); 933 free_vmap_block(vb);
872 } else 934 } else
@@ -1035,8 +1097,6 @@ void __init vmalloc_init(void)
1035 vbq = &per_cpu(vmap_block_queue, i); 1097 vbq = &per_cpu(vmap_block_queue, i);
1036 spin_lock_init(&vbq->lock); 1098 spin_lock_init(&vbq->lock);
1037 INIT_LIST_HEAD(&vbq->free); 1099 INIT_LIST_HEAD(&vbq->free);
1038 INIT_LIST_HEAD(&vbq->dirty);
1039 vbq->nr_dirty = 0;
1040 } 1100 }
1041 1101
1042 /* Import existing vmlist entries. */ 1102 /* Import existing vmlist entries. */
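
The heart of the new vmalloc purging code is the predicate in purge_fragmented_blocks(): a block is reclaimable when every bit is either free or dirty (so no allocation is still live in it) but not every bit is dirty (a fully dirty block is already handed to free_vmap_block() by vb_free()). A standalone restatement, with VMAP_BBMAP_BITS fixed to an assumed value purely for illustration:

        #include <stdio.h>
        #include <stdbool.h>

        #define VMAP_BBMAP_BITS 1024    /* assumed value, for illustration only */

        /* Purgeable: nothing live in the block (free + dirty covers every
         * bit), yet it is not entirely dirty. */
        static bool purgeable(unsigned long free, unsigned long dirty)
        {
                return free + dirty == VMAP_BBMAP_BITS && dirty != VMAP_BBMAP_BITS;
        }

        int main(void)
        {
                printf("%d\n", purgeable(0, 1024));     /* 0: fully dirty, freed elsewhere */
                printf("%d\n", purgeable(256, 768));    /* 1: fragmented, nothing live */
                printf("%d\n", purgeable(200, 700));    /* 0: 124 bits still allocated */
                return 0;
        }

Re-checking the same condition under vb->lock, as the kernel code does after the lockless RCU walk, is what keeps the purge safe against concurrent vb_alloc()/vb_free().
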
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 885207a6b6b7..c26986c85ce0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1922,6 +1922,9 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
1922 if (!populated_zone(zone)) 1922 if (!populated_zone(zone))
1923 continue; 1923 continue;
1924 1924
1925 if (zone_is_all_unreclaimable(zone))
1926 continue;
1927
1925 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 1928 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
1926 0, 0)) 1929 0, 0))
1927 return 1; 1930 return 1;
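
The vmscan.c hunk keeps kswapd from being held awake by zones it has already declared hopeless: an all-unreclaimable zone is skipped before the watermark test, just like an unpopulated one. A simplified, illustrative restatement of the per-zone decision (demo_zone and keeps_kswapd_awake are made-up names, not kernel code):

        #include <stdio.h>
        #include <stdbool.h>

        /* Made-up structure; only the decision logic mirrors the hunk. */
        struct demo_zone {
                bool populated;
                bool all_unreclaimable;
                bool watermark_ok;      /* stand-in for zone_watermark_ok() */
        };

        static bool keeps_kswapd_awake(const struct demo_zone *z)
        {
                if (!z->populated)
                        return false;
                if (z->all_unreclaimable)
                        return false;           /* the new check in this hunk */
                return !z->watermark_ok;        /* below high watermark: keep working */
        }

        int main(void)
        {
                struct demo_zone dead = { true, true,  false };
                struct demo_zone low  = { true, false, false };

                /* prints "0 1": the dead zone no longer keeps kswapd awake */
                printf("%d %d\n", keeps_kswapd_awake(&dead), keeps_kswapd_awake(&low));
                return 0;
        }
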