Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig           |   6
-rw-r--r-- | mm/Makefile          |   1
-rw-r--r-- | mm/filemap.c         | 118
-rw-r--r-- | mm/hugetlb.c         |   9
-rw-r--r-- | mm/hwpoison-inject.c | 113
-rw-r--r-- | mm/internal.h        |  12
-rw-r--r-- | mm/kmemleak.c        | 188
-rw-r--r-- | mm/maccess.c         |  11
-rw-r--r-- | mm/madvise.c         |  21
-rw-r--r-- | mm/memcontrol.c      |  27
-rw-r--r-- | mm/memory-failure.c  | 569
-rw-r--r-- | mm/memory.c          |   4
-rw-r--r-- | mm/migrate.c         |  39
-rw-r--r-- | mm/mmap.c            |  40
-rw-r--r-- | mm/nommu.c           | 144
-rw-r--r-- | mm/oom_kill.c        |   2
-rw-r--r-- | mm/page_alloc.c      | 102
-rw-r--r-- | mm/percpu.c          |   4
-rw-r--r-- | mm/readahead.c       |  12
-rw-r--r-- | mm/shmem.c           |  73
-rw-r--r-- | mm/shmem_acl.c       | 171
-rw-r--r-- | mm/slab.c            |  16
-rw-r--r-- | mm/truncate.c        |  30
-rw-r--r-- | mm/util.c            |  46
-rw-r--r-- | mm/vmalloc.c         | 114
-rw-r--r-- | mm/vmscan.c          |   3
26 files changed, 1273 insertions, 602 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 2310984591ed..17b8947aa7da 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -221,6 +221,7 @@ config KSM | |||
221 | 221 | ||
222 | config DEFAULT_MMAP_MIN_ADDR | 222 | config DEFAULT_MMAP_MIN_ADDR |
223 | int "Low address space to protect from user allocation" | 223 | int "Low address space to protect from user allocation" |
224 | depends on MMU | ||
224 | default 4096 | 225 | default 4096 |
225 | help | 226 | help |
226 | This is the portion of low virtual memory which should be protected | 227 | This is the portion of low virtual memory which should be protected |
@@ -251,8 +252,9 @@ config MEMORY_FAILURE | |||
251 | special hardware support and typically ECC memory. | 252 | special hardware support and typically ECC memory. |
252 | 253 | ||
253 | config HWPOISON_INJECT | 254 | config HWPOISON_INJECT |
254 | tristate "Poison pages injector" | 255 | tristate "HWPoison pages injector" |
255 | depends on MEMORY_FAILURE && DEBUG_KERNEL | 256 | depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS |
257 | select PROC_PAGE_MONITOR | ||
256 | 258 | ||
257 | config NOMMU_INITIAL_TRIM_EXCESS | 259 | config NOMMU_INITIAL_TRIM_EXCESS |
258 | int "Turn on mmap() excess space trimming before booting" | 260 | int "Turn on mmap() excess space trimming before booting" |
diff --git a/mm/Makefile b/mm/Makefile
index 82131d0f8d85..7a68d2ab5560 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o | |||
22 | obj-$(CONFIG_NUMA) += mempolicy.o | 22 | obj-$(CONFIG_NUMA) += mempolicy.o |
23 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 23 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
24 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | 24 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o |
25 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o | ||
26 | obj-$(CONFIG_SLOB) += slob.o | 25 | obj-$(CONFIG_SLOB) += slob.o |
27 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | 26 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o |
28 | obj-$(CONFIG_KSM) += ksm.o | 27 | obj-$(CONFIG_KSM) += ksm.o |
diff --git a/mm/filemap.c b/mm/filemap.c
index 8b4d88f9249e..698ea80f2102 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1634,14 +1634,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap); | |||
1634 | static struct page *__read_cache_page(struct address_space *mapping, | 1634 | static struct page *__read_cache_page(struct address_space *mapping, |
1635 | pgoff_t index, | 1635 | pgoff_t index, |
1636 | int (*filler)(void *,struct page*), | 1636 | int (*filler)(void *,struct page*), |
1637 | void *data) | 1637 | void *data, |
1638 | gfp_t gfp) | ||
1638 | { | 1639 | { |
1639 | struct page *page; | 1640 | struct page *page; |
1640 | int err; | 1641 | int err; |
1641 | repeat: | 1642 | repeat: |
1642 | page = find_get_page(mapping, index); | 1643 | page = find_get_page(mapping, index); |
1643 | if (!page) { | 1644 | if (!page) { |
1644 | page = page_cache_alloc_cold(mapping); | 1645 | page = __page_cache_alloc(gfp | __GFP_COLD); |
1645 | if (!page) | 1646 | if (!page) |
1646 | return ERR_PTR(-ENOMEM); | 1647 | return ERR_PTR(-ENOMEM); |
1647 | err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); | 1648 | err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); |
@@ -1661,31 +1662,18 @@ repeat: | |||
1661 | return page; | 1662 | return page; |
1662 | } | 1663 | } |
1663 | 1664 | ||
1664 | /** | 1665 | static struct page *do_read_cache_page(struct address_space *mapping, |
1665 | * read_cache_page_async - read into page cache, fill it if needed | ||
1666 | * @mapping: the page's address_space | ||
1667 | * @index: the page index | ||
1668 | * @filler: function to perform the read | ||
1669 | * @data: destination for read data | ||
1670 | * | ||
1671 | * Same as read_cache_page, but don't wait for page to become unlocked | ||
1672 | * after submitting it to the filler. | ||
1673 | * | ||
1674 | * Read into the page cache. If a page already exists, and PageUptodate() is | ||
1675 | * not set, try to fill the page but don't wait for it to become unlocked. | ||
1676 | * | ||
1677 | * If the page does not get brought uptodate, return -EIO. | ||
1678 | */ | ||
1679 | struct page *read_cache_page_async(struct address_space *mapping, | ||
1680 | pgoff_t index, | 1666 | pgoff_t index, |
1681 | int (*filler)(void *,struct page*), | 1667 | int (*filler)(void *,struct page*), |
1682 | void *data) | 1668 | void *data, |
1669 | gfp_t gfp) | ||
1670 | |||
1683 | { | 1671 | { |
1684 | struct page *page; | 1672 | struct page *page; |
1685 | int err; | 1673 | int err; |
1686 | 1674 | ||
1687 | retry: | 1675 | retry: |
1688 | page = __read_cache_page(mapping, index, filler, data); | 1676 | page = __read_cache_page(mapping, index, filler, data, gfp); |
1689 | if (IS_ERR(page)) | 1677 | if (IS_ERR(page)) |
1690 | return page; | 1678 | return page; |
1691 | if (PageUptodate(page)) | 1679 | if (PageUptodate(page)) |
@@ -1710,8 +1698,67 @@ out: | |||
1710 | mark_page_accessed(page); | 1698 | mark_page_accessed(page); |
1711 | return page; | 1699 | return page; |
1712 | } | 1700 | } |
1701 | |||
1702 | /** | ||
1703 | * read_cache_page_async - read into page cache, fill it if needed | ||
1704 | * @mapping: the page's address_space | ||
1705 | * @index: the page index | ||
1706 | * @filler: function to perform the read | ||
1707 | * @data: destination for read data | ||
1708 | * | ||
1709 | * Same as read_cache_page, but don't wait for page to become unlocked | ||
1710 | * after submitting it to the filler. | ||
1711 | * | ||
1712 | * Read into the page cache. If a page already exists, and PageUptodate() is | ||
1713 | * not set, try to fill the page but don't wait for it to become unlocked. | ||
1714 | * | ||
1715 | * If the page does not get brought uptodate, return -EIO. | ||
1716 | */ | ||
1717 | struct page *read_cache_page_async(struct address_space *mapping, | ||
1718 | pgoff_t index, | ||
1719 | int (*filler)(void *,struct page*), | ||
1720 | void *data) | ||
1721 | { | ||
1722 | return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); | ||
1723 | } | ||
1713 | EXPORT_SYMBOL(read_cache_page_async); | 1724 | EXPORT_SYMBOL(read_cache_page_async); |
1714 | 1725 | ||
1726 | static struct page *wait_on_page_read(struct page *page) | ||
1727 | { | ||
1728 | if (!IS_ERR(page)) { | ||
1729 | wait_on_page_locked(page); | ||
1730 | if (!PageUptodate(page)) { | ||
1731 | page_cache_release(page); | ||
1732 | page = ERR_PTR(-EIO); | ||
1733 | } | ||
1734 | } | ||
1735 | return page; | ||
1736 | } | ||
1737 | |||
1738 | /** | ||
1739 | * read_cache_page_gfp - read into page cache, using specified page allocation flags. | ||
1740 | * @mapping: the page's address_space | ||
1741 | * @index: the page index | ||
1742 | * @gfp: the page allocator flags to use if allocating | ||
1743 | * | ||
1744 | * This is the same as "read_mapping_page(mapping, index, NULL)", but with | ||
1745 | * any new page allocations done using the specified allocation flags. Note | ||
1746 | * that the Radix tree operations will still use GFP_KERNEL, so you can't | ||
1747 | * expect to do this atomically or anything like that - but you can pass in | ||
1748 | * other page requirements. | ||
1749 | * | ||
1750 | * If the page does not get brought uptodate, return -EIO. | ||
1751 | */ | ||
1752 | struct page *read_cache_page_gfp(struct address_space *mapping, | ||
1753 | pgoff_t index, | ||
1754 | gfp_t gfp) | ||
1755 | { | ||
1756 | filler_t *filler = (filler_t *)mapping->a_ops->readpage; | ||
1757 | |||
1758 | return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); | ||
1759 | } | ||
1760 | EXPORT_SYMBOL(read_cache_page_gfp); | ||
1761 | |||
1715 | /** | 1762 | /** |
1716 | * read_cache_page - read into page cache, fill it if needed | 1763 | * read_cache_page - read into page cache, fill it if needed |
1717 | * @mapping: the page's address_space | 1764 | * @mapping: the page's address_space |
@@ -1729,18 +1776,7 @@ struct page *read_cache_page(struct address_space *mapping, | |||
1729 | int (*filler)(void *,struct page*), | 1776 | int (*filler)(void *,struct page*), |
1730 | void *data) | 1777 | void *data) |
1731 | { | 1778 | { |
1732 | struct page *page; | 1779 | return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); |
1733 | |||
1734 | page = read_cache_page_async(mapping, index, filler, data); | ||
1735 | if (IS_ERR(page)) | ||
1736 | goto out; | ||
1737 | wait_on_page_locked(page); | ||
1738 | if (!PageUptodate(page)) { | ||
1739 | page_cache_release(page); | ||
1740 | page = ERR_PTR(-EIO); | ||
1741 | } | ||
1742 | out: | ||
1743 | return page; | ||
1744 | } | 1780 | } |
1745 | EXPORT_SYMBOL(read_cache_page); | 1781 | EXPORT_SYMBOL(read_cache_page); |
1746 | 1782 | ||
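The hunk above adds read_cache_page_gfp() so a caller can choose the allocation flags used for newly allocated page-cache pages, while the radix-tree insertion still uses GFP_KERNEL as the comment notes. Below is a minimal sketch of how a filesystem might call it, assuming a caller that wants GFP_NOFS so the allocation cannot recurse into filesystem reclaim; example_read_u32() is a made-up helper and not part of this commit.

/*
 * Illustrative sketch, not from this commit: read one metadata page with
 * GFP_NOFS and copy a value out of it.  read_cache_page_gfp() returns
 * either an uptodate, referenced page or ERR_PTR(-ENOMEM/-EIO).
 */
#include <linux/pagemap.h>	/* read_cache_page_gfp() declaration (added by this series) */
#include <linux/highmem.h>

static int example_read_u32(struct address_space *mapping, pgoff_t index,
			    u32 *out)
{
	struct page *page;
	void *kaddr;

	page = read_cache_page_gfp(mapping, index, GFP_NOFS);
	if (IS_ERR(page))
		return PTR_ERR(page);

	kaddr = kmap_atomic(page, KM_USER0);
	*out = *(u32 *)kaddr;		/* page is uptodate at this point */
	kunmap_atomic(kaddr, KM_USER0);

	page_cache_release(page);	/* drop the reference we were given */
	return 0;
}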
@@ -2196,6 +2232,9 @@ again: | |||
2196 | if (unlikely(status)) | 2232 | if (unlikely(status)) |
2197 | break; | 2233 | break; |
2198 | 2234 | ||
2235 | if (mapping_writably_mapped(mapping)) | ||
2236 | flush_dcache_page(page); | ||
2237 | |||
2199 | pagefault_disable(); | 2238 | pagefault_disable(); |
2200 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); | 2239 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); |
2201 | pagefault_enable(); | 2240 | pagefault_enable(); |
@@ -2240,7 +2279,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2240 | size_t count, ssize_t written) | 2279 | size_t count, ssize_t written) |
2241 | { | 2280 | { |
2242 | struct file *file = iocb->ki_filp; | 2281 | struct file *file = iocb->ki_filp; |
2243 | struct address_space *mapping = file->f_mapping; | ||
2244 | ssize_t status; | 2282 | ssize_t status; |
2245 | struct iov_iter i; | 2283 | struct iov_iter i; |
2246 | 2284 | ||
@@ -2252,15 +2290,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2252 | *ppos = pos + status; | 2290 | *ppos = pos + status; |
2253 | } | 2291 | } |
2254 | 2292 | ||
2255 | /* | ||
2256 | * If we get here for O_DIRECT writes then we must have fallen through | ||
2257 | * to buffered writes (block instantiation inside i_size). So we sync | ||
2258 | * the file data here, to try to honour O_DIRECT expectations. | ||
2259 | */ | ||
2260 | if (unlikely(file->f_flags & O_DIRECT) && written) | ||
2261 | status = filemap_write_and_wait_range(mapping, | ||
2262 | pos, pos + written - 1); | ||
2263 | |||
2264 | return written ? written : status; | 2293 | return written ? written : status; |
2265 | } | 2294 | } |
2266 | EXPORT_SYMBOL(generic_file_buffered_write); | 2295 | EXPORT_SYMBOL(generic_file_buffered_write); |
@@ -2359,10 +2388,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
2359 | * semantics. | 2388 | * semantics. |
2360 | */ | 2389 | */ |
2361 | endbyte = pos + written_buffered - written - 1; | 2390 | endbyte = pos + written_buffered - written - 1; |
2362 | err = do_sync_mapping_range(file->f_mapping, pos, endbyte, | 2391 | err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); |
2363 | SYNC_FILE_RANGE_WAIT_BEFORE| | ||
2364 | SYNC_FILE_RANGE_WRITE| | ||
2365 | SYNC_FILE_RANGE_WAIT_AFTER); | ||
2366 | if (err == 0) { | 2392 | if (err == 0) { |
2367 | written = written_buffered; | 2393 | written = written_buffered; |
2368 | invalidate_mapping_pages(mapping, | 2394 | invalidate_mapping_pages(mapping, |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 65f38c218207..2d16fa6b8c2d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -402,7 +402,7 @@ static void clear_huge_page(struct page *page, | |||
402 | { | 402 | { |
403 | int i; | 403 | int i; |
404 | 404 | ||
405 | if (unlikely(sz > MAX_ORDER_NR_PAGES)) { | 405 | if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { |
406 | clear_gigantic_page(page, addr, sz); | 406 | clear_gigantic_page(page, addr, sz); |
407 | return; | 407 | return; |
408 | } | 408 | } |
@@ -1515,10 +1515,9 @@ static struct attribute_group hstate_attr_group = { | |||
1515 | .attrs = hstate_attrs, | 1515 | .attrs = hstate_attrs, |
1516 | }; | 1516 | }; |
1517 | 1517 | ||
1518 | static int __init hugetlb_sysfs_add_hstate(struct hstate *h, | 1518 | static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, |
1519 | struct kobject *parent, | 1519 | struct kobject **hstate_kobjs, |
1520 | struct kobject **hstate_kobjs, | 1520 | struct attribute_group *hstate_attr_group) |
1521 | struct attribute_group *hstate_attr_group) | ||
1522 | { | 1521 | { |
1523 | int retval; | 1522 | int retval; |
1524 | int hi = h - hstates; | 1523 | int hi = h - hstates; |
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index e1d85137f086..10ea71905c1f 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -3,18 +3,68 @@ | |||
3 | #include <linux/debugfs.h> | 3 | #include <linux/debugfs.h> |
4 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/swap.h> | ||
7 | #include <linux/pagemap.h> | ||
8 | #include "internal.h" | ||
6 | 9 | ||
7 | static struct dentry *hwpoison_dir, *corrupt_pfn; | 10 | static struct dentry *hwpoison_dir; |
8 | 11 | ||
9 | static int hwpoison_inject(void *data, u64 val) | 12 | static int hwpoison_inject(void *data, u64 val) |
10 | { | 13 | { |
14 | unsigned long pfn = val; | ||
15 | struct page *p; | ||
16 | int err; | ||
17 | |||
18 | if (!capable(CAP_SYS_ADMIN)) | ||
19 | return -EPERM; | ||
20 | |||
21 | if (!hwpoison_filter_enable) | ||
22 | goto inject; | ||
23 | if (!pfn_valid(pfn)) | ||
24 | return -ENXIO; | ||
25 | |||
26 | p = pfn_to_page(pfn); | ||
27 | /* | ||
28 | * This implies unable to support free buddy pages. | ||
29 | */ | ||
30 | if (!get_page_unless_zero(p)) | ||
31 | return 0; | ||
32 | |||
33 | if (!PageLRU(p)) | ||
34 | shake_page(p, 0); | ||
35 | /* | ||
36 | * This implies unable to support non-LRU pages. | ||
37 | */ | ||
38 | if (!PageLRU(p)) | ||
39 | return 0; | ||
40 | |||
41 | /* | ||
42 | * do a racy check with elevated page count, to make sure PG_hwpoison | ||
43 | * will only be set for the targeted owner (or on a free page). | ||
44 | * We temporarily take page lock for try_get_mem_cgroup_from_page(). | ||
45 | * __memory_failure() will redo the check reliably inside page lock. | ||
46 | */ | ||
47 | lock_page(p); | ||
48 | err = hwpoison_filter(p); | ||
49 | unlock_page(p); | ||
50 | if (err) | ||
51 | return 0; | ||
52 | |||
53 | inject: | ||
54 | printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); | ||
55 | return __memory_failure(pfn, 18, MF_COUNT_INCREASED); | ||
56 | } | ||
57 | |||
58 | static int hwpoison_unpoison(void *data, u64 val) | ||
59 | { | ||
11 | if (!capable(CAP_SYS_ADMIN)) | 60 | if (!capable(CAP_SYS_ADMIN)) |
12 | return -EPERM; | 61 | return -EPERM; |
13 | printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); | 62 | |
14 | return __memory_failure(val, 18, 0); | 63 | return unpoison_memory(val); |
15 | } | 64 | } |
16 | 65 | ||
17 | DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); | 66 | DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); |
67 | DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); | ||
18 | 68 | ||
19 | static void pfn_inject_exit(void) | 69 | static void pfn_inject_exit(void) |
20 | { | 70 | { |
@@ -24,16 +74,63 @@ static void pfn_inject_exit(void) | |||
24 | 74 | ||
25 | static int pfn_inject_init(void) | 75 | static int pfn_inject_init(void) |
26 | { | 76 | { |
77 | struct dentry *dentry; | ||
78 | |||
27 | hwpoison_dir = debugfs_create_dir("hwpoison", NULL); | 79 | hwpoison_dir = debugfs_create_dir("hwpoison", NULL); |
28 | if (hwpoison_dir == NULL) | 80 | if (hwpoison_dir == NULL) |
29 | return -ENOMEM; | 81 | return -ENOMEM; |
30 | corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, | 82 | |
83 | /* | ||
84 | * Note that the below poison/unpoison interfaces do not involve | ||
85 | * hardware status change, hence do not require hardware support. | ||
86 | * They are mainly for testing hwpoison in software level. | ||
87 | */ | ||
88 | dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, | ||
31 | NULL, &hwpoison_fops); | 89 | NULL, &hwpoison_fops); |
32 | if (corrupt_pfn == NULL) { | 90 | if (!dentry) |
33 | pfn_inject_exit(); | 91 | goto fail; |
34 | return -ENOMEM; | 92 | |
35 | } | 93 | dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, |
94 | NULL, &unpoison_fops); | ||
95 | if (!dentry) | ||
96 | goto fail; | ||
97 | |||
98 | dentry = debugfs_create_u32("corrupt-filter-enable", 0600, | ||
99 | hwpoison_dir, &hwpoison_filter_enable); | ||
100 | if (!dentry) | ||
101 | goto fail; | ||
102 | |||
103 | dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600, | ||
104 | hwpoison_dir, &hwpoison_filter_dev_major); | ||
105 | if (!dentry) | ||
106 | goto fail; | ||
107 | |||
108 | dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600, | ||
109 | hwpoison_dir, &hwpoison_filter_dev_minor); | ||
110 | if (!dentry) | ||
111 | goto fail; | ||
112 | |||
113 | dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600, | ||
114 | hwpoison_dir, &hwpoison_filter_flags_mask); | ||
115 | if (!dentry) | ||
116 | goto fail; | ||
117 | |||
118 | dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600, | ||
119 | hwpoison_dir, &hwpoison_filter_flags_value); | ||
120 | if (!dentry) | ||
121 | goto fail; | ||
122 | |||
123 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
124 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, | ||
125 | hwpoison_dir, &hwpoison_filter_memcg); | ||
126 | if (!dentry) | ||
127 | goto fail; | ||
128 | #endif | ||
129 | |||
36 | return 0; | 130 | return 0; |
131 | fail: | ||
132 | pfn_inject_exit(); | ||
133 | return -ENOMEM; | ||
37 | } | 134 | } |
38 | 135 | ||
39 | module_init(pfn_inject_init); | 136 | module_init(pfn_inject_init); |
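The injector above exposes its controls as debugfs files under the hwpoison/ directory, normally /sys/kernel/debug/hwpoison/. A minimal userspace sketch of driving corrupt-pfn follows; it assumes debugfs is mounted at the usual location and the caller holds CAP_SYS_ADMIN, and it is illustrative only, not part of this commit.

/*
 * Illustrative userspace sketch, not from this commit: inject a memory
 * failure for one pfn through the debugfs file created above.
 */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	unsigned long pfn;
	FILE *f;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
		return 1;
	}
	pfn = strtoul(argv[1], NULL, 0);

	f = fopen("/sys/kernel/debug/hwpoison/corrupt-pfn", "w");
	if (!f) {
		perror("corrupt-pfn");
		return 1;
	}
	fprintf(f, "%lu\n", pfn);	/* hwpoison_inject() runs on this write */
	fclose(f);
	return 0;
}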
diff --git a/mm/internal.h b/mm/internal.h
index 4fe67a162cb4..6a697bb97fc5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -50,6 +50,9 @@ extern void putback_lru_page(struct page *page); | |||
50 | */ | 50 | */ |
51 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 51 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
52 | extern void prep_compound_page(struct page *page, unsigned long order); | 52 | extern void prep_compound_page(struct page *page, unsigned long order); |
53 | #ifdef CONFIG_MEMORY_FAILURE | ||
54 | extern bool is_free_buddy_page(struct page *page); | ||
55 | #endif | ||
53 | 56 | ||
54 | 57 | ||
55 | /* | 58 | /* |
@@ -247,3 +250,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
247 | #define ZONE_RECLAIM_SOME 0 | 250 | #define ZONE_RECLAIM_SOME 0 |
248 | #define ZONE_RECLAIM_SUCCESS 1 | 251 | #define ZONE_RECLAIM_SUCCESS 1 |
249 | #endif | 252 | #endif |
253 | |||
254 | extern int hwpoison_filter(struct page *p); | ||
255 | |||
256 | extern u32 hwpoison_filter_dev_major; | ||
257 | extern u32 hwpoison_filter_dev_minor; | ||
258 | extern u64 hwpoison_filter_flags_mask; | ||
259 | extern u64 hwpoison_filter_flags_value; | ||
260 | extern u64 hwpoison_filter_memcg; | ||
261 | extern u32 hwpoison_filter_enable; | ||
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 13f33b3081ec..5b069e4f5e48 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -93,6 +93,7 @@ | |||
93 | #include <linux/nodemask.h> | 93 | #include <linux/nodemask.h> |
94 | #include <linux/mm.h> | 94 | #include <linux/mm.h> |
95 | #include <linux/workqueue.h> | 95 | #include <linux/workqueue.h> |
96 | #include <linux/crc32.h> | ||
96 | 97 | ||
97 | #include <asm/sections.h> | 98 | #include <asm/sections.h> |
98 | #include <asm/processor.h> | 99 | #include <asm/processor.h> |
@@ -108,7 +109,6 @@ | |||
108 | #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ | 109 | #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ |
109 | #define SECS_FIRST_SCAN 60 /* delay before the first scan */ | 110 | #define SECS_FIRST_SCAN 60 /* delay before the first scan */ |
110 | #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ | 111 | #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ |
111 | #define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ | ||
112 | #define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ | 112 | #define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ |
113 | 113 | ||
114 | #define BYTES_PER_POINTER sizeof(void *) | 114 | #define BYTES_PER_POINTER sizeof(void *) |
@@ -119,8 +119,8 @@ | |||
119 | /* scanning area inside a memory block */ | 119 | /* scanning area inside a memory block */ |
120 | struct kmemleak_scan_area { | 120 | struct kmemleak_scan_area { |
121 | struct hlist_node node; | 121 | struct hlist_node node; |
122 | unsigned long offset; | 122 | unsigned long start; |
123 | size_t length; | 123 | size_t size; |
124 | }; | 124 | }; |
125 | 125 | ||
126 | #define KMEMLEAK_GREY 0 | 126 | #define KMEMLEAK_GREY 0 |
@@ -149,6 +149,8 @@ struct kmemleak_object { | |||
149 | int min_count; | 149 | int min_count; |
150 | /* the total number of pointers found pointing to this object */ | 150 | /* the total number of pointers found pointing to this object */ |
151 | int count; | 151 | int count; |
152 | /* checksum for detecting modified objects */ | ||
153 | u32 checksum; | ||
152 | /* memory ranges to be scanned inside an object (empty for all) */ | 154 | /* memory ranges to be scanned inside an object (empty for all) */ |
153 | struct hlist_head area_list; | 155 | struct hlist_head area_list; |
154 | unsigned long trace[MAX_TRACE]; | 156 | unsigned long trace[MAX_TRACE]; |
@@ -164,8 +166,6 @@ struct kmemleak_object { | |||
164 | #define OBJECT_REPORTED (1 << 1) | 166 | #define OBJECT_REPORTED (1 << 1) |
165 | /* flag set to not scan the object */ | 167 | /* flag set to not scan the object */ |
166 | #define OBJECT_NO_SCAN (1 << 2) | 168 | #define OBJECT_NO_SCAN (1 << 2) |
167 | /* flag set on newly allocated objects */ | ||
168 | #define OBJECT_NEW (1 << 3) | ||
169 | 169 | ||
170 | /* number of bytes to print per line; must be 16 or 32 */ | 170 | /* number of bytes to print per line; must be 16 or 32 */ |
171 | #define HEX_ROW_SIZE 16 | 171 | #define HEX_ROW_SIZE 16 |
@@ -241,8 +241,6 @@ struct early_log { | |||
241 | const void *ptr; /* allocated/freed memory block */ | 241 | const void *ptr; /* allocated/freed memory block */ |
242 | size_t size; /* memory block size */ | 242 | size_t size; /* memory block size */ |
243 | int min_count; /* minimum reference count */ | 243 | int min_count; /* minimum reference count */ |
244 | unsigned long offset; /* scan area offset */ | ||
245 | size_t length; /* scan area length */ | ||
246 | unsigned long trace[MAX_TRACE]; /* stack trace */ | 244 | unsigned long trace[MAX_TRACE]; /* stack trace */ |
247 | unsigned int trace_len; /* stack trace length */ | 245 | unsigned int trace_len; /* stack trace length */ |
248 | }; | 246 | }; |
@@ -323,11 +321,6 @@ static bool color_gray(const struct kmemleak_object *object) | |||
323 | object->count >= object->min_count; | 321 | object->count >= object->min_count; |
324 | } | 322 | } |
325 | 323 | ||
326 | static bool color_black(const struct kmemleak_object *object) | ||
327 | { | ||
328 | return object->min_count == KMEMLEAK_BLACK; | ||
329 | } | ||
330 | |||
331 | /* | 324 | /* |
332 | * Objects are considered unreferenced only if their color is white, they have | 325 | * Objects are considered unreferenced only if their color is white, they have |
333 | * not be deleted and have a minimum age to avoid false positives caused by | 326 | * not be deleted and have a minimum age to avoid false positives caused by |
@@ -335,7 +328,7 @@ static bool color_black(const struct kmemleak_object *object) | |||
335 | */ | 328 | */ |
336 | static bool unreferenced_object(struct kmemleak_object *object) | 329 | static bool unreferenced_object(struct kmemleak_object *object) |
337 | { | 330 | { |
338 | return (object->flags & OBJECT_ALLOCATED) && color_white(object) && | 331 | return (color_white(object) && object->flags & OBJECT_ALLOCATED) && |
339 | time_before_eq(object->jiffies + jiffies_min_age, | 332 | time_before_eq(object->jiffies + jiffies_min_age, |
340 | jiffies_last_scan); | 333 | jiffies_last_scan); |
341 | } | 334 | } |
@@ -348,11 +341,13 @@ static void print_unreferenced(struct seq_file *seq, | |||
348 | struct kmemleak_object *object) | 341 | struct kmemleak_object *object) |
349 | { | 342 | { |
350 | int i; | 343 | int i; |
344 | unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); | ||
351 | 345 | ||
352 | seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", | 346 | seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", |
353 | object->pointer, object->size); | 347 | object->pointer, object->size); |
354 | seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", | 348 | seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", |
355 | object->comm, object->pid, object->jiffies); | 349 | object->comm, object->pid, object->jiffies, |
350 | msecs_age / 1000, msecs_age % 1000); | ||
356 | hex_dump_object(seq, object); | 351 | hex_dump_object(seq, object); |
357 | seq_printf(seq, " backtrace:\n"); | 352 | seq_printf(seq, " backtrace:\n"); |
358 | 353 | ||
@@ -381,6 +376,7 @@ static void dump_object_info(struct kmemleak_object *object) | |||
381 | pr_notice(" min_count = %d\n", object->min_count); | 376 | pr_notice(" min_count = %d\n", object->min_count); |
382 | pr_notice(" count = %d\n", object->count); | 377 | pr_notice(" count = %d\n", object->count); |
383 | pr_notice(" flags = 0x%lx\n", object->flags); | 378 | pr_notice(" flags = 0x%lx\n", object->flags); |
379 | pr_notice(" checksum = %d\n", object->checksum); | ||
384 | pr_notice(" backtrace:\n"); | 380 | pr_notice(" backtrace:\n"); |
385 | print_stack_trace(&trace, 4); | 381 | print_stack_trace(&trace, 4); |
386 | } | 382 | } |
@@ -522,12 +518,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, | |||
522 | INIT_HLIST_HEAD(&object->area_list); | 518 | INIT_HLIST_HEAD(&object->area_list); |
523 | spin_lock_init(&object->lock); | 519 | spin_lock_init(&object->lock); |
524 | atomic_set(&object->use_count, 1); | 520 | atomic_set(&object->use_count, 1); |
525 | object->flags = OBJECT_ALLOCATED | OBJECT_NEW; | 521 | object->flags = OBJECT_ALLOCATED; |
526 | object->pointer = ptr; | 522 | object->pointer = ptr; |
527 | object->size = size; | 523 | object->size = size; |
528 | object->min_count = min_count; | 524 | object->min_count = min_count; |
529 | object->count = -1; /* no color initially */ | 525 | object->count = 0; /* white color initially */ |
530 | object->jiffies = jiffies; | 526 | object->jiffies = jiffies; |
527 | object->checksum = 0; | ||
531 | 528 | ||
532 | /* task information */ | 529 | /* task information */ |
533 | if (in_irq()) { | 530 | if (in_irq()) { |
@@ -720,14 +717,13 @@ static void make_black_object(unsigned long ptr) | |||
720 | * Add a scanning area to the object. If at least one such area is added, | 717 | * Add a scanning area to the object. If at least one such area is added, |
721 | * kmemleak will only scan these ranges rather than the whole memory block. | 718 | * kmemleak will only scan these ranges rather than the whole memory block. |
722 | */ | 719 | */ |
723 | static void add_scan_area(unsigned long ptr, unsigned long offset, | 720 | static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) |
724 | size_t length, gfp_t gfp) | ||
725 | { | 721 | { |
726 | unsigned long flags; | 722 | unsigned long flags; |
727 | struct kmemleak_object *object; | 723 | struct kmemleak_object *object; |
728 | struct kmemleak_scan_area *area; | 724 | struct kmemleak_scan_area *area; |
729 | 725 | ||
730 | object = find_and_get_object(ptr, 0); | 726 | object = find_and_get_object(ptr, 1); |
731 | if (!object) { | 727 | if (!object) { |
732 | kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", | 728 | kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", |
733 | ptr); | 729 | ptr); |
@@ -741,7 +737,7 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, | |||
741 | } | 737 | } |
742 | 738 | ||
743 | spin_lock_irqsave(&object->lock, flags); | 739 | spin_lock_irqsave(&object->lock, flags); |
744 | if (offset + length > object->size) { | 740 | if (ptr + size > object->pointer + object->size) { |
745 | kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); | 741 | kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); |
746 | dump_object_info(object); | 742 | dump_object_info(object); |
747 | kmem_cache_free(scan_area_cache, area); | 743 | kmem_cache_free(scan_area_cache, area); |
@@ -749,8 +745,8 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, | |||
749 | } | 745 | } |
750 | 746 | ||
751 | INIT_HLIST_NODE(&area->node); | 747 | INIT_HLIST_NODE(&area->node); |
752 | area->offset = offset; | 748 | area->start = ptr; |
753 | area->length = length; | 749 | area->size = size; |
754 | 750 | ||
755 | hlist_add_head(&area->node, &object->area_list); | 751 | hlist_add_head(&area->node, &object->area_list); |
756 | out_unlock: | 752 | out_unlock: |
@@ -786,7 +782,7 @@ static void object_no_scan(unsigned long ptr) | |||
786 | * processed later once kmemleak is fully initialized. | 782 | * processed later once kmemleak is fully initialized. |
787 | */ | 783 | */ |
788 | static void __init log_early(int op_type, const void *ptr, size_t size, | 784 | static void __init log_early(int op_type, const void *ptr, size_t size, |
789 | int min_count, unsigned long offset, size_t length) | 785 | int min_count) |
790 | { | 786 | { |
791 | unsigned long flags; | 787 | unsigned long flags; |
792 | struct early_log *log; | 788 | struct early_log *log; |
@@ -808,8 +804,6 @@ static void __init log_early(int op_type, const void *ptr, size_t size, | |||
808 | log->ptr = ptr; | 804 | log->ptr = ptr; |
809 | log->size = size; | 805 | log->size = size; |
810 | log->min_count = min_count; | 806 | log->min_count = min_count; |
811 | log->offset = offset; | ||
812 | log->length = length; | ||
813 | if (op_type == KMEMLEAK_ALLOC) | 807 | if (op_type == KMEMLEAK_ALLOC) |
814 | log->trace_len = __save_stack_trace(log->trace); | 808 | log->trace_len = __save_stack_trace(log->trace); |
815 | crt_early_log++; | 809 | crt_early_log++; |
@@ -858,7 +852,7 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, | |||
858 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 852 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
859 | create_object((unsigned long)ptr, size, min_count, gfp); | 853 | create_object((unsigned long)ptr, size, min_count, gfp); |
860 | else if (atomic_read(&kmemleak_early_log)) | 854 | else if (atomic_read(&kmemleak_early_log)) |
861 | log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); | 855 | log_early(KMEMLEAK_ALLOC, ptr, size, min_count); |
862 | } | 856 | } |
863 | EXPORT_SYMBOL_GPL(kmemleak_alloc); | 857 | EXPORT_SYMBOL_GPL(kmemleak_alloc); |
864 | 858 | ||
@@ -873,7 +867,7 @@ void __ref kmemleak_free(const void *ptr) | |||
873 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 867 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
874 | delete_object_full((unsigned long)ptr); | 868 | delete_object_full((unsigned long)ptr); |
875 | else if (atomic_read(&kmemleak_early_log)) | 869 | else if (atomic_read(&kmemleak_early_log)) |
876 | log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); | 870 | log_early(KMEMLEAK_FREE, ptr, 0, 0); |
877 | } | 871 | } |
878 | EXPORT_SYMBOL_GPL(kmemleak_free); | 872 | EXPORT_SYMBOL_GPL(kmemleak_free); |
879 | 873 | ||
@@ -888,7 +882,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) | |||
888 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 882 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
889 | delete_object_part((unsigned long)ptr, size); | 883 | delete_object_part((unsigned long)ptr, size); |
890 | else if (atomic_read(&kmemleak_early_log)) | 884 | else if (atomic_read(&kmemleak_early_log)) |
891 | log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); | 885 | log_early(KMEMLEAK_FREE_PART, ptr, size, 0); |
892 | } | 886 | } |
893 | EXPORT_SYMBOL_GPL(kmemleak_free_part); | 887 | EXPORT_SYMBOL_GPL(kmemleak_free_part); |
894 | 888 | ||
@@ -903,7 +897,7 @@ void __ref kmemleak_not_leak(const void *ptr) | |||
903 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 897 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
904 | make_gray_object((unsigned long)ptr); | 898 | make_gray_object((unsigned long)ptr); |
905 | else if (atomic_read(&kmemleak_early_log)) | 899 | else if (atomic_read(&kmemleak_early_log)) |
906 | log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); | 900 | log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); |
907 | } | 901 | } |
908 | EXPORT_SYMBOL(kmemleak_not_leak); | 902 | EXPORT_SYMBOL(kmemleak_not_leak); |
909 | 903 | ||
@@ -919,22 +913,21 @@ void __ref kmemleak_ignore(const void *ptr) | |||
919 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 913 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
920 | make_black_object((unsigned long)ptr); | 914 | make_black_object((unsigned long)ptr); |
921 | else if (atomic_read(&kmemleak_early_log)) | 915 | else if (atomic_read(&kmemleak_early_log)) |
922 | log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); | 916 | log_early(KMEMLEAK_IGNORE, ptr, 0, 0); |
923 | } | 917 | } |
924 | EXPORT_SYMBOL(kmemleak_ignore); | 918 | EXPORT_SYMBOL(kmemleak_ignore); |
925 | 919 | ||
926 | /* | 920 | /* |
927 | * Limit the range to be scanned in an allocated memory block. | 921 | * Limit the range to be scanned in an allocated memory block. |
928 | */ | 922 | */ |
929 | void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, | 923 | void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) |
930 | size_t length, gfp_t gfp) | ||
931 | { | 924 | { |
932 | pr_debug("%s(0x%p)\n", __func__, ptr); | 925 | pr_debug("%s(0x%p)\n", __func__, ptr); |
933 | 926 | ||
934 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 927 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
935 | add_scan_area((unsigned long)ptr, offset, length, gfp); | 928 | add_scan_area((unsigned long)ptr, size, gfp); |
936 | else if (atomic_read(&kmemleak_early_log)) | 929 | else if (atomic_read(&kmemleak_early_log)) |
937 | log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); | 930 | log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); |
938 | } | 931 | } |
939 | EXPORT_SYMBOL(kmemleak_scan_area); | 932 | EXPORT_SYMBOL(kmemleak_scan_area); |
940 | 933 | ||
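This hunk changes the kmemleak_scan_area() interface from an (offset, length) pair relative to the enclosing object to an absolute pointer and size, looked up with find_and_get_object(ptr, 1). A sketch of what a caller looks like with the new interface; struct example_node and example_node_alloc() are hypothetical and not taken from the kernel tree.

/*
 * Illustrative sketch, not from this commit: restrict scanning of an
 * allocation to the only member that actually holds pointers.
 */
#include <linux/slab.h>
#include <linux/kmemleak.h>

struct example_node {
	unsigned long cookie;	/* scalar, nothing for kmemleak to follow */
	void *deps[4];		/* the only pointers worth scanning */
};

static struct example_node *example_node_alloc(void)
{
	struct example_node *n = kmalloc(sizeof(*n), GFP_KERNEL);

	if (!n)
		return NULL;
	/*
	 * Old interface: kmemleak_scan_area(n, offsetof(struct example_node,
	 * deps), sizeof(n->deps), GFP_KERNEL).  New interface: pass the
	 * address of the range itself.
	 */
	kmemleak_scan_area(&n->deps, sizeof(n->deps), GFP_KERNEL);
	return n;
}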
@@ -948,11 +941,25 @@ void __ref kmemleak_no_scan(const void *ptr) | |||
948 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 941 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
949 | object_no_scan((unsigned long)ptr); | 942 | object_no_scan((unsigned long)ptr); |
950 | else if (atomic_read(&kmemleak_early_log)) | 943 | else if (atomic_read(&kmemleak_early_log)) |
951 | log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); | 944 | log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); |
952 | } | 945 | } |
953 | EXPORT_SYMBOL(kmemleak_no_scan); | 946 | EXPORT_SYMBOL(kmemleak_no_scan); |
954 | 947 | ||
955 | /* | 948 | /* |
949 | * Update an object's checksum and return true if it was modified. | ||
950 | */ | ||
951 | static bool update_checksum(struct kmemleak_object *object) | ||
952 | { | ||
953 | u32 old_csum = object->checksum; | ||
954 | |||
955 | if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) | ||
956 | return false; | ||
957 | |||
958 | object->checksum = crc32(0, (void *)object->pointer, object->size); | ||
959 | return object->checksum != old_csum; | ||
960 | } | ||
961 | |||
962 | /* | ||
956 | * Memory scanning is a long process and it needs to be interruptable. This | 963 | * Memory scanning is a long process and it needs to be interruptable. This |
957 | * function checks whether such interrupt condition occured. | 964 | * function checks whether such interrupt condition occured. |
958 | */ | 965 | */ |
@@ -1031,11 +1038,14 @@ static void scan_block(void *_start, void *_end, | |||
1031 | * added to the gray_list. | 1038 | * added to the gray_list. |
1032 | */ | 1039 | */ |
1033 | object->count++; | 1040 | object->count++; |
1034 | if (color_gray(object)) | 1041 | if (color_gray(object)) { |
1035 | list_add_tail(&object->gray_list, &gray_list); | 1042 | list_add_tail(&object->gray_list, &gray_list); |
1036 | else | 1043 | spin_unlock_irqrestore(&object->lock, flags); |
1037 | put_object(object); | 1044 | continue; |
1045 | } | ||
1046 | |||
1038 | spin_unlock_irqrestore(&object->lock, flags); | 1047 | spin_unlock_irqrestore(&object->lock, flags); |
1048 | put_object(object); | ||
1039 | } | 1049 | } |
1040 | } | 1050 | } |
1041 | 1051 | ||
@@ -1075,14 +1085,47 @@ static void scan_object(struct kmemleak_object *object) | |||
1075 | } | 1085 | } |
1076 | } else | 1086 | } else |
1077 | hlist_for_each_entry(area, elem, &object->area_list, node) | 1087 | hlist_for_each_entry(area, elem, &object->area_list, node) |
1078 | scan_block((void *)(object->pointer + area->offset), | 1088 | scan_block((void *)area->start, |
1079 | (void *)(object->pointer + area->offset | 1089 | (void *)(area->start + area->size), |
1080 | + area->length), object, 0); | 1090 | object, 0); |
1081 | out: | 1091 | out: |
1082 | spin_unlock_irqrestore(&object->lock, flags); | 1092 | spin_unlock_irqrestore(&object->lock, flags); |
1083 | } | 1093 | } |
1084 | 1094 | ||
1085 | /* | 1095 | /* |
1096 | * Scan the objects already referenced (gray objects). More objects will be | ||
1097 | * referenced and, if there are no memory leaks, all the objects are scanned. | ||
1098 | */ | ||
1099 | static void scan_gray_list(void) | ||
1100 | { | ||
1101 | struct kmemleak_object *object, *tmp; | ||
1102 | |||
1103 | /* | ||
1104 | * The list traversal is safe for both tail additions and removals | ||
1105 | * from inside the loop. The kmemleak objects cannot be freed from | ||
1106 | * outside the loop because their use_count was incremented. | ||
1107 | */ | ||
1108 | object = list_entry(gray_list.next, typeof(*object), gray_list); | ||
1109 | while (&object->gray_list != &gray_list) { | ||
1110 | cond_resched(); | ||
1111 | |||
1112 | /* may add new objects to the list */ | ||
1113 | if (!scan_should_stop()) | ||
1114 | scan_object(object); | ||
1115 | |||
1116 | tmp = list_entry(object->gray_list.next, typeof(*object), | ||
1117 | gray_list); | ||
1118 | |||
1119 | /* remove the object from the list and release it */ | ||
1120 | list_del(&object->gray_list); | ||
1121 | put_object(object); | ||
1122 | |||
1123 | object = tmp; | ||
1124 | } | ||
1125 | WARN_ON(!list_empty(&gray_list)); | ||
1126 | } | ||
1127 | |||
1128 | /* | ||
1086 | * Scan data sections and all the referenced memory blocks allocated via the | 1129 | * Scan data sections and all the referenced memory blocks allocated via the |
1087 | * kernel's standard allocators. This function must be called with the | 1130 | * kernel's standard allocators. This function must be called with the |
1088 | * scan_mutex held. | 1131 | * scan_mutex held. |
@@ -1090,10 +1133,9 @@ out: | |||
1090 | static void kmemleak_scan(void) | 1133 | static void kmemleak_scan(void) |
1091 | { | 1134 | { |
1092 | unsigned long flags; | 1135 | unsigned long flags; |
1093 | struct kmemleak_object *object, *tmp; | 1136 | struct kmemleak_object *object; |
1094 | int i; | 1137 | int i; |
1095 | int new_leaks = 0; | 1138 | int new_leaks = 0; |
1096 | int gray_list_pass = 0; | ||
1097 | 1139 | ||
1098 | jiffies_last_scan = jiffies; | 1140 | jiffies_last_scan = jiffies; |
1099 | 1141 | ||
@@ -1114,7 +1156,6 @@ static void kmemleak_scan(void) | |||
1114 | #endif | 1156 | #endif |
1115 | /* reset the reference count (whiten the object) */ | 1157 | /* reset the reference count (whiten the object) */ |
1116 | object->count = 0; | 1158 | object->count = 0; |
1117 | object->flags &= ~OBJECT_NEW; | ||
1118 | if (color_gray(object) && get_object(object)) | 1159 | if (color_gray(object) && get_object(object)) |
1119 | list_add_tail(&object->gray_list, &gray_list); | 1160 | list_add_tail(&object->gray_list, &gray_list); |
1120 | 1161 | ||
@@ -1172,62 +1213,36 @@ static void kmemleak_scan(void) | |||
1172 | 1213 | ||
1173 | /* | 1214 | /* |
1174 | * Scan the objects already referenced from the sections scanned | 1215 | * Scan the objects already referenced from the sections scanned |
1175 | * above. More objects will be referenced and, if there are no memory | 1216 | * above. |
1176 | * leaks, all the objects will be scanned. The list traversal is safe | ||
1177 | * for both tail additions and removals from inside the loop. The | ||
1178 | * kmemleak objects cannot be freed from outside the loop because their | ||
1179 | * use_count was increased. | ||
1180 | */ | 1217 | */ |
1181 | repeat: | 1218 | scan_gray_list(); |
1182 | object = list_entry(gray_list.next, typeof(*object), gray_list); | ||
1183 | while (&object->gray_list != &gray_list) { | ||
1184 | cond_resched(); | ||
1185 | |||
1186 | /* may add new objects to the list */ | ||
1187 | if (!scan_should_stop()) | ||
1188 | scan_object(object); | ||
1189 | |||
1190 | tmp = list_entry(object->gray_list.next, typeof(*object), | ||
1191 | gray_list); | ||
1192 | |||
1193 | /* remove the object from the list and release it */ | ||
1194 | list_del(&object->gray_list); | ||
1195 | put_object(object); | ||
1196 | |||
1197 | object = tmp; | ||
1198 | } | ||
1199 | |||
1200 | if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES) | ||
1201 | goto scan_end; | ||
1202 | 1219 | ||
1203 | /* | 1220 | /* |
1204 | * Check for new objects allocated during this scanning and add them | 1221 | * Check for new or unreferenced objects modified since the previous |
1205 | * to the gray list. | 1222 | * scan and color them gray until the next scan. |
1206 | */ | 1223 | */ |
1207 | rcu_read_lock(); | 1224 | rcu_read_lock(); |
1208 | list_for_each_entry_rcu(object, &object_list, object_list) { | 1225 | list_for_each_entry_rcu(object, &object_list, object_list) { |
1209 | spin_lock_irqsave(&object->lock, flags); | 1226 | spin_lock_irqsave(&object->lock, flags); |
1210 | if ((object->flags & OBJECT_NEW) && !color_black(object) && | 1227 | if (color_white(object) && (object->flags & OBJECT_ALLOCATED) |
1211 | get_object(object)) { | 1228 | && update_checksum(object) && get_object(object)) { |
1212 | object->flags &= ~OBJECT_NEW; | 1229 | /* color it gray temporarily */ |
1230 | object->count = object->min_count; | ||
1213 | list_add_tail(&object->gray_list, &gray_list); | 1231 | list_add_tail(&object->gray_list, &gray_list); |
1214 | } | 1232 | } |
1215 | spin_unlock_irqrestore(&object->lock, flags); | 1233 | spin_unlock_irqrestore(&object->lock, flags); |
1216 | } | 1234 | } |
1217 | rcu_read_unlock(); | 1235 | rcu_read_unlock(); |
1218 | 1236 | ||
1219 | if (!list_empty(&gray_list)) | 1237 | /* |
1220 | goto repeat; | 1238 | * Re-scan the gray list for modified unreferenced objects. |
1221 | 1239 | */ | |
1222 | scan_end: | 1240 | scan_gray_list(); |
1223 | WARN_ON(!list_empty(&gray_list)); | ||
1224 | 1241 | ||
1225 | /* | 1242 | /* |
1226 | * If scanning was stopped or new objects were being allocated at a | 1243 | * If scanning was stopped do not report any new unreferenced objects. |
1227 | * higher rate than gray list scanning, do not report any new | ||
1228 | * unreferenced objects. | ||
1229 | */ | 1244 | */ |
1230 | if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) | 1245 | if (scan_should_stop()) |
1231 | return; | 1246 | return; |
1232 | 1247 | ||
1233 | /* | 1248 | /* |
@@ -1642,8 +1657,7 @@ void __init kmemleak_init(void) | |||
1642 | kmemleak_ignore(log->ptr); | 1657 | kmemleak_ignore(log->ptr); |
1643 | break; | 1658 | break; |
1644 | case KMEMLEAK_SCAN_AREA: | 1659 | case KMEMLEAK_SCAN_AREA: |
1645 | kmemleak_scan_area(log->ptr, log->offset, log->length, | 1660 | kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL); |
1646 | GFP_KERNEL); | ||
1647 | break; | 1661 | break; |
1648 | case KMEMLEAK_NO_SCAN: | 1662 | case KMEMLEAK_NO_SCAN: |
1649 | kmemleak_no_scan(log->ptr); | 1663 | kmemleak_no_scan(log->ptr); |
diff --git a/mm/maccess.c b/mm/maccess.c
index 9073695ff25f..4e348dbaecd7 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -14,7 +14,11 @@ | |||
14 | * Safely read from address @src to the buffer at @dst. If a kernel fault | 14 | * Safely read from address @src to the buffer at @dst. If a kernel fault |
15 | * happens, handle that and return -EFAULT. | 15 | * happens, handle that and return -EFAULT. |
16 | */ | 16 | */ |
17 | long probe_kernel_read(void *dst, void *src, size_t size) | 17 | |
18 | long __weak probe_kernel_read(void *dst, void *src, size_t size) | ||
19 | __attribute__((alias("__probe_kernel_read"))); | ||
20 | |||
21 | long __probe_kernel_read(void *dst, void *src, size_t size) | ||
18 | { | 22 | { |
19 | long ret; | 23 | long ret; |
20 | mm_segment_t old_fs = get_fs(); | 24 | mm_segment_t old_fs = get_fs(); |
@@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); | |||
39 | * Safely write to address @dst from the buffer at @src. If a kernel fault | 43 | * Safely write to address @dst from the buffer at @src. If a kernel fault |
40 | * happens, handle that and return -EFAULT. | 44 | * happens, handle that and return -EFAULT. |
41 | */ | 45 | */ |
42 | long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) | 46 | long __weak probe_kernel_write(void *dst, void *src, size_t size) |
47 | __attribute__((alias("__probe_kernel_write"))); | ||
48 | |||
49 | long __probe_kernel_write(void *dst, void *src, size_t size) | ||
43 | { | 50 | { |
44 | long ret; | 51 | long ret; |
45 | mm_segment_t old_fs = get_fs(); | 52 | mm_segment_t old_fs = get_fs(); |
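With probe_kernel_read() and probe_kernel_write() turned into weak aliases of the double-underscore versions, an architecture can provide a strong definition that interposes its own checks and falls back to the generic code. A hedged sketch of such an override, assuming the __probe_kernel_*() prototypes are visible to arch code (the header change is outside this mm-only diff); example_addr_is_unsafe() is a made-up arch-specific predicate.

/*
 * Illustrative sketch of hypothetical architecture code, not from this
 * commit: reject known-unsafe source ranges, otherwise use the generic
 * fixup-based copy.
 */
long probe_kernel_read(void *dst, void *src, size_t size)
{
	if (example_addr_is_unsafe((unsigned long)src, size))
		return -EFAULT;

	return __probe_kernel_read(dst, src, size);
}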
diff --git a/mm/madvise.c b/mm/madvise.c
index 35b1479b7c9d..319528b8db74 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/pagemap.h> | 9 | #include <linux/pagemap.h> |
10 | #include <linux/syscalls.h> | 10 | #include <linux/syscalls.h> |
11 | #include <linux/mempolicy.h> | 11 | #include <linux/mempolicy.h> |
12 | #include <linux/page-isolation.h> | ||
12 | #include <linux/hugetlb.h> | 13 | #include <linux/hugetlb.h> |
13 | #include <linux/sched.h> | 14 | #include <linux/sched.h> |
14 | #include <linux/ksm.h> | 15 | #include <linux/ksm.h> |
@@ -222,7 +223,7 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
222 | /* | 223 | /* |
223 | * Error injection support for memory error handling. | 224 | * Error injection support for memory error handling. |
224 | */ | 225 | */ |
225 | static int madvise_hwpoison(unsigned long start, unsigned long end) | 226 | static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) |
226 | { | 227 | { |
227 | int ret = 0; | 228 | int ret = 0; |
228 | 229 | ||
@@ -230,15 +231,21 @@ static int madvise_hwpoison(unsigned long start, unsigned long end) | |||
230 | return -EPERM; | 231 | return -EPERM; |
231 | for (; start < end; start += PAGE_SIZE) { | 232 | for (; start < end; start += PAGE_SIZE) { |
232 | struct page *p; | 233 | struct page *p; |
233 | int ret = get_user_pages(current, current->mm, start, 1, | 234 | int ret = get_user_pages_fast(start, 1, 0, &p); |
234 | 0, 0, &p, NULL); | ||
235 | if (ret != 1) | 235 | if (ret != 1) |
236 | return ret; | 236 | return ret; |
237 | if (bhv == MADV_SOFT_OFFLINE) { | ||
238 | printk(KERN_INFO "Soft offlining page %lx at %lx\n", | ||
239 | page_to_pfn(p), start); | ||
240 | ret = soft_offline_page(p, MF_COUNT_INCREASED); | ||
241 | if (ret) | ||
242 | break; | ||
243 | continue; | ||
244 | } | ||
237 | printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", | 245 | printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", |
238 | page_to_pfn(p), start); | 246 | page_to_pfn(p), start); |
239 | /* Ignore return value for now */ | 247 | /* Ignore return value for now */ |
240 | __memory_failure(page_to_pfn(p), 0, 1); | 248 | __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); |
241 | put_page(p); | ||
242 | } | 249 | } |
243 | return ret; | 250 | return ret; |
244 | } | 251 | } |
@@ -335,8 +342,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
335 | size_t len; | 342 | size_t len; |
336 | 343 | ||
337 | #ifdef CONFIG_MEMORY_FAILURE | 344 | #ifdef CONFIG_MEMORY_FAILURE |
338 | if (behavior == MADV_HWPOISON) | 345 | if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) |
339 | return madvise_hwpoison(start, start+len_in); | 346 | return madvise_hwpoison(behavior, start, start+len_in); |
340 | #endif | 347 | #endif |
341 | if (!madvise_behavior_valid(behavior)) | 348 | if (!madvise_behavior_valid(behavior)) |
342 | return error; | 349 | return error; |
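With the hunk above, madvise() accepts MADV_SOFT_OFFLINE alongside MADV_HWPOISON when the kernel is built with CONFIG_MEMORY_FAILURE. A minimal userspace sketch that soft-offlines one page of its own mapping; it assumes CAP_SYS_ADMIN, and the fallback #define matches the value used by the kernel ABI in case the libc headers predate the flag.

/*
 * Illustrative userspace sketch, not from this commit: fault in one page
 * and ask the kernel to soft-offline it.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* kernel ABI value */
#endif

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	void *buf;

	if (posix_memalign(&buf, psz, psz))
		return 1;
	((char *)buf)[0] = 1;		/* make sure the page is populated */

	if (madvise(buf, psz, MADV_SOFT_OFFLINE))
		perror("madvise(MADV_SOFT_OFFLINE)");
	return 0;
}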
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 878808c4fcbe..954032b80bed 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -283,6 +283,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | |||
283 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | 283 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; |
284 | } | 284 | } |
285 | 285 | ||
286 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) | ||
287 | { | ||
288 | return &mem->css; | ||
289 | } | ||
290 | |||
286 | static struct mem_cgroup_per_zone * | 291 | static struct mem_cgroup_per_zone * |
287 | page_cgroup_zoneinfo(struct page_cgroup *pc) | 292 | page_cgroup_zoneinfo(struct page_cgroup *pc) |
288 | { | 293 | { |
@@ -1536,25 +1541,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | |||
1536 | return container_of(css, struct mem_cgroup, css); | 1541 | return container_of(css, struct mem_cgroup, css); |
1537 | } | 1542 | } |
1538 | 1543 | ||
1539 | static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) | 1544 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
1540 | { | 1545 | { |
1541 | struct mem_cgroup *mem; | 1546 | struct mem_cgroup *mem = NULL; |
1542 | struct page_cgroup *pc; | 1547 | struct page_cgroup *pc; |
1543 | unsigned short id; | 1548 | unsigned short id; |
1544 | swp_entry_t ent; | 1549 | swp_entry_t ent; |
1545 | 1550 | ||
1546 | VM_BUG_ON(!PageLocked(page)); | 1551 | VM_BUG_ON(!PageLocked(page)); |
1547 | 1552 | ||
1548 | if (!PageSwapCache(page)) | ||
1549 | return NULL; | ||
1550 | |||
1551 | pc = lookup_page_cgroup(page); | 1553 | pc = lookup_page_cgroup(page); |
1552 | lock_page_cgroup(pc); | 1554 | lock_page_cgroup(pc); |
1553 | if (PageCgroupUsed(pc)) { | 1555 | if (PageCgroupUsed(pc)) { |
1554 | mem = pc->mem_cgroup; | 1556 | mem = pc->mem_cgroup; |
1555 | if (mem && !css_tryget(&mem->css)) | 1557 | if (mem && !css_tryget(&mem->css)) |
1556 | mem = NULL; | 1558 | mem = NULL; |
1557 | } else { | 1559 | } else if (PageSwapCache(page)) { |
1558 | ent.val = page_private(page); | 1560 | ent.val = page_private(page); |
1559 | id = lookup_swap_cgroup(ent); | 1561 | id = lookup_swap_cgroup(ent); |
1560 | rcu_read_lock(); | 1562 | rcu_read_lock(); |
@@ -1874,7 +1876,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
1874 | */ | 1876 | */ |
1875 | if (!PageSwapCache(page)) | 1877 | if (!PageSwapCache(page)) |
1876 | goto charge_cur_mm; | 1878 | goto charge_cur_mm; |
1877 | mem = try_get_mem_cgroup_from_swapcache(page); | 1879 | mem = try_get_mem_cgroup_from_page(page); |
1878 | if (!mem) | 1880 | if (!mem) |
1879 | goto charge_cur_mm; | 1881 | goto charge_cur_mm; |
1880 | *ptr = mem; | 1882 | *ptr = mem; |
@@ -2584,7 +2586,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) | |||
2584 | if (free_all) | 2586 | if (free_all) |
2585 | goto try_to_free; | 2587 | goto try_to_free; |
2586 | move_account: | 2588 | move_account: |
2587 | while (mem->res.usage > 0) { | 2589 | do { |
2588 | ret = -EBUSY; | 2590 | ret = -EBUSY; |
2589 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) | 2591 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) |
2590 | goto out; | 2592 | goto out; |
@@ -2612,8 +2614,8 @@ move_account: | |||
2612 | if (ret == -ENOMEM) | 2614 | if (ret == -ENOMEM) |
2613 | goto try_to_free; | 2615 | goto try_to_free; |
2614 | cond_resched(); | 2616 | cond_resched(); |
2615 | } | 2617 | /* "ret" should also be checked to ensure all lists are empty. */ |
2616 | ret = 0; | 2618 | } while (mem->res.usage > 0 || ret); |
2617 | out: | 2619 | out: |
2618 | css_put(&mem->css); | 2620 | css_put(&mem->css); |
2619 | return ret; | 2621 | return ret; |
@@ -2646,10 +2648,7 @@ try_to_free: | |||
2646 | } | 2648 | } |
2647 | lru_add_drain(); | 2649 | lru_add_drain(); |
2648 | /* try move_account...there may be some *locked* pages. */ | 2650 | /* try move_account...there may be some *locked* pages. */ |
2649 | if (mem->res.usage) | 2651 | goto move_account; |
2650 | goto move_account; | ||
2651 | ret = 0; | ||
2652 | goto out; | ||
2653 | } | 2652 | } |
2654 | 2653 | ||
2655 | int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) | 2654 | int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 50d4f8d7024a..17299fd4577c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -34,12 +34,16 @@ | |||
34 | #include <linux/kernel.h> | 34 | #include <linux/kernel.h> |
35 | #include <linux/mm.h> | 35 | #include <linux/mm.h> |
36 | #include <linux/page-flags.h> | 36 | #include <linux/page-flags.h> |
37 | #include <linux/kernel-page-flags.h> | ||
37 | #include <linux/sched.h> | 38 | #include <linux/sched.h> |
38 | #include <linux/ksm.h> | 39 | #include <linux/ksm.h> |
39 | #include <linux/rmap.h> | 40 | #include <linux/rmap.h> |
40 | #include <linux/pagemap.h> | 41 | #include <linux/pagemap.h> |
41 | #include <linux/swap.h> | 42 | #include <linux/swap.h> |
42 | #include <linux/backing-dev.h> | 43 | #include <linux/backing-dev.h> |
44 | #include <linux/migrate.h> | ||
45 | #include <linux/page-isolation.h> | ||
46 | #include <linux/suspend.h> | ||
43 | #include "internal.h" | 47 | #include "internal.h" |
44 | 48 | ||
45 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 49 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
@@ -48,6 +52,129 @@ int sysctl_memory_failure_recovery __read_mostly = 1; | |||
48 | 52 | ||
49 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); | 53 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); |
50 | 54 | ||
55 | #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) | ||
56 | |||
57 | u32 hwpoison_filter_enable = 0; | ||
58 | u32 hwpoison_filter_dev_major = ~0U; | ||
59 | u32 hwpoison_filter_dev_minor = ~0U; | ||
60 | u64 hwpoison_filter_flags_mask; | ||
61 | u64 hwpoison_filter_flags_value; | ||
62 | EXPORT_SYMBOL_GPL(hwpoison_filter_enable); | ||
63 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); | ||
64 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); | ||
65 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); | ||
66 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); | ||
67 | |||
68 | static int hwpoison_filter_dev(struct page *p) | ||
69 | { | ||
70 | struct address_space *mapping; | ||
71 | dev_t dev; | ||
72 | |||
73 | if (hwpoison_filter_dev_major == ~0U && | ||
74 | hwpoison_filter_dev_minor == ~0U) | ||
75 | return 0; | ||
76 | |||
77 | /* | ||
78 | * page_mapping() does not accept slab page | ||
79 | */ | ||
80 | if (PageSlab(p)) | ||
81 | return -EINVAL; | ||
82 | |||
83 | mapping = page_mapping(p); | ||
84 | if (mapping == NULL || mapping->host == NULL) | ||
85 | return -EINVAL; | ||
86 | |||
87 | dev = mapping->host->i_sb->s_dev; | ||
88 | if (hwpoison_filter_dev_major != ~0U && | ||
89 | hwpoison_filter_dev_major != MAJOR(dev)) | ||
90 | return -EINVAL; | ||
91 | if (hwpoison_filter_dev_minor != ~0U && | ||
92 | hwpoison_filter_dev_minor != MINOR(dev)) | ||
93 | return -EINVAL; | ||
94 | |||
95 | return 0; | ||
96 | } | ||
97 | |||
98 | static int hwpoison_filter_flags(struct page *p) | ||
99 | { | ||
100 | if (!hwpoison_filter_flags_mask) | ||
101 | return 0; | ||
102 | |||
103 | if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == | ||
104 | hwpoison_filter_flags_value) | ||
105 | return 0; | ||
106 | else | ||
107 | return -EINVAL; | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * This allows stress tests to limit test scope to a collection of tasks | ||
112 | * by putting them under some memcg. This prevents killing unrelated/important | ||
113 | * processes such as /sbin/init. Note that the target task may share clean | ||
114 | * pages with init (e.g. libc text), which is harmless. If the target task | ||
115 | * shares _dirty_ pages with another task B, the test scheme must make sure B | ||
116 | * is also included in the memcg. Finally, due to race conditions this filter | ||
117 | * can only guarantee that the page either belongs to the memcg tasks, or is | ||
118 | * a freed page. | ||
119 | */ | ||
120 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
121 | u64 hwpoison_filter_memcg; | ||
122 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); | ||
123 | static int hwpoison_filter_task(struct page *p) | ||
124 | { | ||
125 | struct mem_cgroup *mem; | ||
126 | struct cgroup_subsys_state *css; | ||
127 | unsigned long ino; | ||
128 | |||
129 | if (!hwpoison_filter_memcg) | ||
130 | return 0; | ||
131 | |||
132 | mem = try_get_mem_cgroup_from_page(p); | ||
133 | if (!mem) | ||
134 | return -EINVAL; | ||
135 | |||
136 | css = mem_cgroup_css(mem); | ||
137 | /* root_mem_cgroup has NULL dentries */ | ||
138 | if (!css->cgroup->dentry) | ||
139 | return -EINVAL; | ||
140 | |||
141 | ino = css->cgroup->dentry->d_inode->i_ino; | ||
142 | css_put(css); | ||
143 | |||
144 | if (ino != hwpoison_filter_memcg) | ||
145 | return -EINVAL; | ||
146 | |||
147 | return 0; | ||
148 | } | ||
149 | #else | ||
150 | static int hwpoison_filter_task(struct page *p) { return 0; } | ||
151 | #endif | ||
152 | |||
153 | int hwpoison_filter(struct page *p) | ||
154 | { | ||
155 | if (!hwpoison_filter_enable) | ||
156 | return 0; | ||
157 | |||
158 | if (hwpoison_filter_dev(p)) | ||
159 | return -EINVAL; | ||
160 | |||
161 | if (hwpoison_filter_flags(p)) | ||
162 | return -EINVAL; | ||
163 | |||
164 | if (hwpoison_filter_task(p)) | ||
165 | return -EINVAL; | ||
166 | |||
167 | return 0; | ||
168 | } | ||
169 | #else | ||
170 | int hwpoison_filter(struct page *p) | ||
171 | { | ||
172 | return 0; | ||
173 | } | ||
174 | #endif | ||
175 | |||
176 | EXPORT_SYMBOL_GPL(hwpoison_filter); | ||
177 | |||
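For context on how these filter knobs are meant to be consumed, here is a minimal, illustrative C sketch of an injection path that consults hwpoison_filter() before poisoning a pfn. Only the filter call and __memory_failure() come from this patch; the helper name and the trap number are assumptions (the real consumer is the hwpoison-inject module).

/*
 * Illustrative only: honour the dev/flags/memcg filters configured by
 * the tester before poisoning a pfn.  inject_poison() and the trap
 * number 18 are assumptions, not part of this patch.
 */
#include <linux/kernel.h>
#include <linux/mm.h>

extern int hwpoison_filter(struct page *p);     /* exported above */

static int inject_poison(unsigned long pfn)
{
        struct page *p;

        if (!pfn_valid(pfn))
                return -ENXIO;
        p = pfn_to_page(pfn);

        /* skip pages the tester did not ask to target */
        if (hwpoison_filter(p))
                return 0;

        return __memory_failure(pfn, 18, 0);
}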
51 | /* | 178 | /* |
52 | * Send all the processes who have the page mapped an ``action optional'' | 179 | * Send all the processes who have the page mapped an ``action optional'' |
53 | * signal. | 180 | * signal. |
@@ -83,6 +210,36 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, | |||
83 | } | 210 | } |
84 | 211 | ||
85 | /* | 212 | /* |
213 | * When an unknown page type is encountered, drain as many buffers as possible | ||
214 | * in the hope of turning the page into an LRU or free page, which we can handle. | ||
215 | */ | ||
216 | void shake_page(struct page *p, int access) | ||
217 | { | ||
218 | if (!PageSlab(p)) { | ||
219 | lru_add_drain_all(); | ||
220 | if (PageLRU(p)) | ||
221 | return; | ||
222 | drain_all_pages(); | ||
223 | if (PageLRU(p) || is_free_buddy_page(p)) | ||
224 | return; | ||
225 | } | ||
226 | |||
227 | /* | ||
228 | * Only call shrink_slab here (which would also | ||
229 | * shrink other caches) if access is not potentially fatal. | ||
230 | */ | ||
231 | if (access) { | ||
232 | int nr; | ||
233 | do { | ||
234 | nr = shrink_slab(1000, GFP_KERNEL, 1000); | ||
235 | if (page_count(p) == 0) | ||
236 | break; | ||
237 | } while (nr > 10); | ||
238 | } | ||
239 | } | ||
240 | EXPORT_SYMBOL_GPL(shake_page); | ||
241 | |||
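A hedged usage sketch: shake_page() is best effort, so callers retest the page state afterwards, much as __memory_failure() does further down in this file. The wrapper below is illustrative only.

/*
 * Illustrative wrapper around shake_page(); not part of this patch.
 */
#include <linux/mm.h>

static int try_make_handleable(struct page *p)
{
        if (PageLRU(p))
                return 0;
        /* access == 0: the error may be fatal, so don't shrink slab caches */
        shake_page(p, 0);
        if (PageLRU(p) || is_free_buddy_page(p))
                return 0;
        return -EBUSY;                  /* still an unhandled page type */
}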
242 | /* | ||
86 | * Kill all processes that have a poisoned page mapped and then isolate | 243 | * Kill all processes that have a poisoned page mapped and then isolate |
87 | * the page. | 244 | * the page. |
88 | * | 245 | * |
@@ -177,7 +334,6 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, | |||
177 | * In case something went wrong with munmapping | 334 | * In case something went wrong with munmapping |
178 | * make sure the process doesn't catch the | 335 | * make sure the process doesn't catch the |
179 | * signal and then access the memory. Just kill it. | 336 | * signal and then access the memory. Just kill it. |
180 | * the signal handlers | ||
181 | */ | 337 | */ |
182 | if (fail || tk->addr_valid == 0) { | 338 | if (fail || tk->addr_valid == 0) { |
183 | printk(KERN_ERR | 339 | printk(KERN_ERR |
@@ -314,33 +470,49 @@ static void collect_procs(struct page *page, struct list_head *tokill) | |||
314 | */ | 470 | */ |
315 | 471 | ||
316 | enum outcome { | 472 | enum outcome { |
317 | FAILED, /* Error handling failed */ | 473 | IGNORED, /* Error: cannot be handled */ |
474 | FAILED, /* Error: handling failed */ | ||
318 | DELAYED, /* Will be handled later */ | 475 | DELAYED, /* Will be handled later */ |
319 | IGNORED, /* Error safely ignored */ | ||
320 | RECOVERED, /* Successfully recovered */ | 476 | RECOVERED, /* Successfully recovered */ |
321 | }; | 477 | }; |
322 | 478 | ||
323 | static const char *action_name[] = { | 479 | static const char *action_name[] = { |
480 | [IGNORED] = "Ignored", | ||
324 | [FAILED] = "Failed", | 481 | [FAILED] = "Failed", |
325 | [DELAYED] = "Delayed", | 482 | [DELAYED] = "Delayed", |
326 | [IGNORED] = "Ignored", | ||
327 | [RECOVERED] = "Recovered", | 483 | [RECOVERED] = "Recovered", |
328 | }; | 484 | }; |
329 | 485 | ||
330 | /* | 486 | /* |
331 | * Error hit kernel page. | 487 | * XXX: It is possible that a page is isolated from LRU cache, |
332 | * Do nothing, try to be lucky and not touch this instead. For a few cases we | 488 | * and then kept in swap cache or failed to remove from page cache. |
333 | * could be more sophisticated. | 489 | * The page count will stop it from being freed by unpoison. |
490 | * Stress tests should be aware of this memory leak problem. | ||
334 | */ | 491 | */ |
335 | static int me_kernel(struct page *p, unsigned long pfn) | 492 | static int delete_from_lru_cache(struct page *p) |
336 | { | 493 | { |
337 | return DELAYED; | 494 | if (!isolate_lru_page(p)) { |
495 | /* | ||
496 | * Clear sensible page flags, so that the buddy system won't | ||
497 | * complain when the page is unpoison-and-freed. | ||
498 | */ | ||
499 | ClearPageActive(p); | ||
500 | ClearPageUnevictable(p); | ||
501 | /* | ||
502 | * drop the page count elevated by isolate_lru_page() | ||
503 | */ | ||
504 | page_cache_release(p); | ||
505 | return 0; | ||
506 | } | ||
507 | return -EIO; | ||
338 | } | 508 | } |
339 | 509 | ||
340 | /* | 510 | /* |
341 | * Already poisoned page. | 511 | * Error hit kernel page. |
512 | * Do nothing, try to be lucky and not touch this instead. For a few cases we | ||
513 | * could be more sophisticated. | ||
342 | */ | 514 | */ |
343 | static int me_ignore(struct page *p, unsigned long pfn) | 515 | static int me_kernel(struct page *p, unsigned long pfn) |
344 | { | 516 | { |
345 | return IGNORED; | 517 | return IGNORED; |
346 | } | 518 | } |
@@ -355,14 +527,6 @@ static int me_unknown(struct page *p, unsigned long pfn) | |||
355 | } | 527 | } |
356 | 528 | ||
357 | /* | 529 | /* |
358 | * Free memory | ||
359 | */ | ||
360 | static int me_free(struct page *p, unsigned long pfn) | ||
361 | { | ||
362 | return DELAYED; | ||
363 | } | ||
364 | |||
365 | /* | ||
366 | * Clean (or cleaned) page cache page. | 530 | * Clean (or cleaned) page cache page. |
367 | */ | 531 | */ |
368 | static int me_pagecache_clean(struct page *p, unsigned long pfn) | 532 | static int me_pagecache_clean(struct page *p, unsigned long pfn) |
@@ -371,6 +535,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
371 | int ret = FAILED; | 535 | int ret = FAILED; |
372 | struct address_space *mapping; | 536 | struct address_space *mapping; |
373 | 537 | ||
538 | delete_from_lru_cache(p); | ||
539 | |||
374 | /* | 540 | /* |
375 | * For anonymous pages we're done the only reference left | 541 | * For anonymous pages we're done the only reference left |
376 | * should be the one m_f() holds. | 542 | * should be the one m_f() holds. |
@@ -500,14 +666,20 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn) | |||
500 | /* Trigger EIO in shmem: */ | 666 | /* Trigger EIO in shmem: */ |
501 | ClearPageUptodate(p); | 667 | ClearPageUptodate(p); |
502 | 668 | ||
503 | return DELAYED; | 669 | if (!delete_from_lru_cache(p)) |
670 | return DELAYED; | ||
671 | else | ||
672 | return FAILED; | ||
504 | } | 673 | } |
505 | 674 | ||
506 | static int me_swapcache_clean(struct page *p, unsigned long pfn) | 675 | static int me_swapcache_clean(struct page *p, unsigned long pfn) |
507 | { | 676 | { |
508 | delete_from_swap_cache(p); | 677 | delete_from_swap_cache(p); |
509 | 678 | ||
510 | return RECOVERED; | 679 | if (!delete_from_lru_cache(p)) |
680 | return RECOVERED; | ||
681 | else | ||
682 | return FAILED; | ||
511 | } | 683 | } |
512 | 684 | ||
513 | /* | 685 | /* |
@@ -550,7 +722,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
550 | #define tail (1UL << PG_tail) | 722 | #define tail (1UL << PG_tail) |
551 | #define compound (1UL << PG_compound) | 723 | #define compound (1UL << PG_compound) |
552 | #define slab (1UL << PG_slab) | 724 | #define slab (1UL << PG_slab) |
553 | #define buddy (1UL << PG_buddy) | ||
554 | #define reserved (1UL << PG_reserved) | 725 | #define reserved (1UL << PG_reserved) |
555 | 726 | ||
556 | static struct page_state { | 727 | static struct page_state { |
@@ -559,8 +730,11 @@ static struct page_state { | |||
559 | char *msg; | 730 | char *msg; |
560 | int (*action)(struct page *p, unsigned long pfn); | 731 | int (*action)(struct page *p, unsigned long pfn); |
561 | } error_states[] = { | 732 | } error_states[] = { |
562 | { reserved, reserved, "reserved kernel", me_ignore }, | 733 | { reserved, reserved, "reserved kernel", me_kernel }, |
563 | { buddy, buddy, "free kernel", me_free }, | 734 | /* |
735 | * free pages are specially detected outside this table: | ||
736 | * PG_buddy pages only make a small fraction of all free pages. | ||
737 | */ | ||
564 | 738 | ||
565 | /* | 739 | /* |
566 | * Could in theory check if slab page is free or if we can drop | 740 | * Could in theory check if slab page is free or if we can drop |
@@ -587,7 +761,6 @@ static struct page_state { | |||
587 | 761 | ||
588 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, | 762 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, |
589 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | 763 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, |
590 | { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, | ||
591 | 764 | ||
592 | /* | 765 | /* |
593 | * Catchall entry: must be at end. | 766 | * Catchall entry: must be at end. |
@@ -595,20 +768,31 @@ static struct page_state { | |||
595 | { 0, 0, "unknown page state", me_unknown }, | 768 | { 0, 0, "unknown page state", me_unknown }, |
596 | }; | 769 | }; |
597 | 770 | ||
771 | #undef dirty | ||
772 | #undef sc | ||
773 | #undef unevict | ||
774 | #undef mlock | ||
775 | #undef writeback | ||
776 | #undef lru | ||
777 | #undef swapbacked | ||
778 | #undef head | ||
779 | #undef tail | ||
780 | #undef compound | ||
781 | #undef slab | ||
782 | #undef reserved | ||
783 | |||
598 | static void action_result(unsigned long pfn, char *msg, int result) | 784 | static void action_result(unsigned long pfn, char *msg, int result) |
599 | { | 785 | { |
600 | struct page *page = NULL; | 786 | struct page *page = pfn_to_page(pfn); |
601 | if (pfn_valid(pfn)) | ||
602 | page = pfn_to_page(pfn); | ||
603 | 787 | ||
604 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", | 788 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", |
605 | pfn, | 789 | pfn, |
606 | page && PageDirty(page) ? "dirty " : "", | 790 | PageDirty(page) ? "dirty " : "", |
607 | msg, action_name[result]); | 791 | msg, action_name[result]); |
608 | } | 792 | } |
609 | 793 | ||
610 | static int page_action(struct page_state *ps, struct page *p, | 794 | static int page_action(struct page_state *ps, struct page *p, |
611 | unsigned long pfn, int ref) | 795 | unsigned long pfn) |
612 | { | 796 | { |
613 | int result; | 797 | int result; |
614 | int count; | 798 | int count; |
@@ -616,18 +800,22 @@ static int page_action(struct page_state *ps, struct page *p, | |||
616 | result = ps->action(p, pfn); | 800 | result = ps->action(p, pfn); |
617 | action_result(pfn, ps->msg, result); | 801 | action_result(pfn, ps->msg, result); |
618 | 802 | ||
619 | count = page_count(p) - 1 - ref; | 803 | count = page_count(p) - 1; |
620 | if (count != 0) | 804 | if (ps->action == me_swapcache_dirty && result == DELAYED) |
805 | count--; | ||
806 | if (count != 0) { | ||
621 | printk(KERN_ERR | 807 | printk(KERN_ERR |
622 | "MCE %#lx: %s page still referenced by %d users\n", | 808 | "MCE %#lx: %s page still referenced by %d users\n", |
623 | pfn, ps->msg, count); | 809 | pfn, ps->msg, count); |
810 | result = FAILED; | ||
811 | } | ||
624 | 812 | ||
625 | /* Could do more checks here if page looks ok */ | 813 | /* Could do more checks here if page looks ok */ |
626 | /* | 814 | /* |
627 | * Could adjust zone counters here to correct for the missing page. | 815 | * Could adjust zone counters here to correct for the missing page. |
628 | */ | 816 | */ |
629 | 817 | ||
630 | return result == RECOVERED ? 0 : -EBUSY; | 818 | return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; |
631 | } | 819 | } |
632 | 820 | ||
633 | #define N_UNMAP_TRIES 5 | 821 | #define N_UNMAP_TRIES 5 |
@@ -636,7 +824,7 @@ static int page_action(struct page_state *ps, struct page *p, | |||
636 | * Do all that is necessary to remove user space mappings. Unmap | 824 | * Do all that is necessary to remove user space mappings. Unmap |
637 | * the pages and send SIGBUS to the processes if the data was dirty. | 825 | * the pages and send SIGBUS to the processes if the data was dirty. |
638 | */ | 826 | */ |
639 | static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | 827 | static int hwpoison_user_mappings(struct page *p, unsigned long pfn, |
640 | int trapno) | 828 | int trapno) |
641 | { | 829 | { |
642 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; | 830 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; |
@@ -646,15 +834,18 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
646 | int i; | 834 | int i; |
647 | int kill = 1; | 835 | int kill = 1; |
648 | 836 | ||
649 | if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) | 837 | if (PageReserved(p) || PageSlab(p)) |
650 | return; | 838 | return SWAP_SUCCESS; |
651 | 839 | ||
652 | /* | 840 | /* |
653 | * This check implies we don't kill processes if their pages | 841 | * This check implies we don't kill processes if their pages |
654 | * are in the swap cache early. Those are always late kills. | 842 | * are in the swap cache early. Those are always late kills. |
655 | */ | 843 | */ |
656 | if (!page_mapped(p)) | 844 | if (!page_mapped(p)) |
657 | return; | 845 | return SWAP_SUCCESS; |
846 | |||
847 | if (PageCompound(p) || PageKsm(p)) | ||
848 | return SWAP_FAIL; | ||
658 | 849 | ||
659 | if (PageSwapCache(p)) { | 850 | if (PageSwapCache(p)) { |
660 | printk(KERN_ERR | 851 | printk(KERN_ERR |
@@ -665,6 +856,8 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
665 | /* | 856 | /* |
666 | * Propagate the dirty bit from PTEs to struct page first, because we | 857 | * Propagate the dirty bit from PTEs to struct page first, because we |
667 | * need this to decide if we should kill or just drop the page. | 858 | * need this to decide if we should kill or just drop the page. |
859 | * XXX: the dirty test could be racy: set_page_dirty() may not always | ||
860 | * be called inside page lock (it's recommended but not enforced). | ||
668 | */ | 861 | */ |
669 | mapping = page_mapping(p); | 862 | mapping = page_mapping(p); |
670 | if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { | 863 | if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { |
@@ -716,11 +909,12 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
716 | */ | 909 | */ |
717 | kill_procs_ao(&tokill, !!PageDirty(p), trapno, | 910 | kill_procs_ao(&tokill, !!PageDirty(p), trapno, |
718 | ret != SWAP_SUCCESS, pfn); | 911 | ret != SWAP_SUCCESS, pfn); |
912 | |||
913 | return ret; | ||
719 | } | 914 | } |
720 | 915 | ||
721 | int __memory_failure(unsigned long pfn, int trapno, int ref) | 916 | int __memory_failure(unsigned long pfn, int trapno, int flags) |
722 | { | 917 | { |
723 | unsigned long lru_flag; | ||
724 | struct page_state *ps; | 918 | struct page_state *ps; |
725 | struct page *p; | 919 | struct page *p; |
726 | int res; | 920 | int res; |
@@ -729,13 +923,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
729 | panic("Memory failure from trap %d on page %lx", trapno, pfn); | 923 | panic("Memory failure from trap %d on page %lx", trapno, pfn); |
730 | 924 | ||
731 | if (!pfn_valid(pfn)) { | 925 | if (!pfn_valid(pfn)) { |
732 | action_result(pfn, "memory outside kernel control", IGNORED); | 926 | printk(KERN_ERR |
733 | return -EIO; | 927 | "MCE %#lx: memory outside kernel control\n", |
928 | pfn); | ||
929 | return -ENXIO; | ||
734 | } | 930 | } |
735 | 931 | ||
736 | p = pfn_to_page(pfn); | 932 | p = pfn_to_page(pfn); |
737 | if (TestSetPageHWPoison(p)) { | 933 | if (TestSetPageHWPoison(p)) { |
738 | action_result(pfn, "already hardware poisoned", IGNORED); | 934 | printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); |
739 | return 0; | 935 | return 0; |
740 | } | 936 | } |
741 | 937 | ||
@@ -752,9 +948,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
752 | * In fact it's dangerous to directly bump up page count from 0, | 948 | * In fact it's dangerous to directly bump up page count from 0, |
753 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. | 949 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. |
754 | */ | 950 | */ |
755 | if (!get_page_unless_zero(compound_head(p))) { | 951 | if (!(flags & MF_COUNT_INCREASED) && |
756 | action_result(pfn, "free or high order kernel", IGNORED); | 952 | !get_page_unless_zero(compound_head(p))) { |
757 | return PageBuddy(compound_head(p)) ? 0 : -EBUSY; | 953 | if (is_free_buddy_page(p)) { |
954 | action_result(pfn, "free buddy", DELAYED); | ||
955 | return 0; | ||
956 | } else { | ||
957 | action_result(pfn, "high order kernel", IGNORED); | ||
958 | return -EBUSY; | ||
959 | } | ||
758 | } | 960 | } |
759 | 961 | ||
760 | /* | 962 | /* |
@@ -766,14 +968,19 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
766 | * walked by the page reclaim code, however that's not a big loss. | 968 | * walked by the page reclaim code, however that's not a big loss. |
767 | */ | 969 | */ |
768 | if (!PageLRU(p)) | 970 | if (!PageLRU(p)) |
769 | lru_add_drain_all(); | 971 | shake_page(p, 0); |
770 | lru_flag = p->flags & lru; | 972 | if (!PageLRU(p)) { |
771 | if (isolate_lru_page(p)) { | 973 | /* |
974 | * shake_page could have turned it free. | ||
975 | */ | ||
976 | if (is_free_buddy_page(p)) { | ||
977 | action_result(pfn, "free buddy, 2nd try", DELAYED); | ||
978 | return 0; | ||
979 | } | ||
772 | action_result(pfn, "non LRU", IGNORED); | 980 | action_result(pfn, "non LRU", IGNORED); |
773 | put_page(p); | 981 | put_page(p); |
774 | return -EBUSY; | 982 | return -EBUSY; |
775 | } | 983 | } |
776 | page_cache_release(p); | ||
777 | 984 | ||
778 | /* | 985 | /* |
779 | * Lock the page and wait for writeback to finish. | 986 | * Lock the page and wait for writeback to finish. |
@@ -781,26 +988,48 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
781 | * and in many cases impossible, so we just avoid it here. | 988 | * and in many cases impossible, so we just avoid it here. |
782 | */ | 989 | */ |
783 | lock_page_nosync(p); | 990 | lock_page_nosync(p); |
991 | |||
992 | /* | ||
993 | * unpoison always clear PG_hwpoison inside page lock | ||
994 | */ | ||
995 | if (!PageHWPoison(p)) { | ||
996 | printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); | ||
997 | res = 0; | ||
998 | goto out; | ||
999 | } | ||
1000 | if (hwpoison_filter(p)) { | ||
1001 | if (TestClearPageHWPoison(p)) | ||
1002 | atomic_long_dec(&mce_bad_pages); | ||
1003 | unlock_page(p); | ||
1004 | put_page(p); | ||
1005 | return 0; | ||
1006 | } | ||
1007 | |||
784 | wait_on_page_writeback(p); | 1008 | wait_on_page_writeback(p); |
785 | 1009 | ||
786 | /* | 1010 | /* |
787 | * Now take care of user space mappings. | 1011 | * Now take care of user space mappings. |
1012 | * Abort on fail: __remove_from_page_cache() assumes unmapped page. | ||
788 | */ | 1013 | */ |
789 | hwpoison_user_mappings(p, pfn, trapno); | 1014 | if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { |
1015 | printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); | ||
1016 | res = -EBUSY; | ||
1017 | goto out; | ||
1018 | } | ||
790 | 1019 | ||
791 | /* | 1020 | /* |
792 | * Torn down by someone else? | 1021 | * Torn down by someone else? |
793 | */ | 1022 | */ |
794 | if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) { | 1023 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { |
795 | action_result(pfn, "already truncated LRU", IGNORED); | 1024 | action_result(pfn, "already truncated LRU", IGNORED); |
796 | res = 0; | 1025 | res = -EBUSY; |
797 | goto out; | 1026 | goto out; |
798 | } | 1027 | } |
799 | 1028 | ||
800 | res = -EBUSY; | 1029 | res = -EBUSY; |
801 | for (ps = error_states;; ps++) { | 1030 | for (ps = error_states;; ps++) { |
802 | if (((p->flags | lru_flag)& ps->mask) == ps->res) { | 1031 | if ((p->flags & ps->mask) == ps->res) { |
803 | res = page_action(ps, p, pfn, ref); | 1032 | res = page_action(ps, p, pfn); |
804 | break; | 1033 | break; |
805 | } | 1034 | } |
806 | } | 1035 | } |
@@ -831,3 +1060,235 @@ void memory_failure(unsigned long pfn, int trapno) | |||
831 | { | 1060 | { |
832 | __memory_failure(pfn, trapno, 0); | 1061 | __memory_failure(pfn, trapno, 0); |
833 | } | 1062 | } |
1063 | |||
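The new flags argument replaces the old ref parameter; MF_COUNT_INCREASED tells __memory_failure() that the caller already holds a reference on the page. A hedged sketch of that convention, loosely modelled on an madvise(MADV_HWPOISON)-style injector; the helper itself is not part of this patch.

/*
 * Illustrative only: the caller pins the page itself and hands the
 * reference over via MF_COUNT_INCREASED.
 */
#include <linux/mm.h>

static int poison_user_page(unsigned long uaddr)
{
        struct page *page;
        int ret;

        ret = get_user_pages_fast(uaddr, 1, 0, &page);
        if (ret != 1)
                return ret < 0 ? ret : -EFAULT;

        /* ownership of the reference passes to the error handling code */
        return __memory_failure(page_to_pfn(page), 0, MF_COUNT_INCREASED);
}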
1064 | /** | ||
1065 | * unpoison_memory - Unpoison a previously poisoned page | ||
1066 | * @pfn: Page number of the to be unpoisoned page | ||
1067 | * | ||
1068 | * Software-unpoison a page that has been poisoned by | ||
1069 | * memory_failure() earlier. | ||
1070 | * | ||
1071 | * This is only done on the software-level, so it only works | ||
1072 | * for linux injected failures, not real hardware failures | ||
1073 | * | ||
1074 | * Returns 0 for success, otherwise -errno. | ||
1075 | */ | ||
1076 | int unpoison_memory(unsigned long pfn) | ||
1077 | { | ||
1078 | struct page *page; | ||
1079 | struct page *p; | ||
1080 | int freeit = 0; | ||
1081 | |||
1082 | if (!pfn_valid(pfn)) | ||
1083 | return -ENXIO; | ||
1084 | |||
1085 | p = pfn_to_page(pfn); | ||
1086 | page = compound_head(p); | ||
1087 | |||
1088 | if (!PageHWPoison(p)) { | ||
1089 | pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); | ||
1090 | return 0; | ||
1091 | } | ||
1092 | |||
1093 | if (!get_page_unless_zero(page)) { | ||
1094 | if (TestClearPageHWPoison(p)) | ||
1095 | atomic_long_dec(&mce_bad_pages); | ||
1096 | pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); | ||
1097 | return 0; | ||
1098 | } | ||
1099 | |||
1100 | lock_page_nosync(page); | ||
1101 | /* | ||
1102 | * This test is racy because PG_hwpoison is set outside of page lock. | ||
1103 | * That's acceptable because that won't trigger kernel panic. Instead, | ||
1104 | * the PG_hwpoison page will be caught and isolated on the entrance to | ||
1105 | * the free buddy page pool. | ||
1106 | */ | ||
1107 | if (TestClearPageHWPoison(p)) { | ||
1108 | pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); | ||
1109 | atomic_long_dec(&mce_bad_pages); | ||
1110 | freeit = 1; | ||
1111 | } | ||
1112 | unlock_page(page); | ||
1113 | |||
1114 | put_page(page); | ||
1115 | if (freeit) | ||
1116 | put_page(page); | ||
1117 | |||
1118 | return 0; | ||
1119 | } | ||
1120 | EXPORT_SYMBOL(unpoison_memory); | ||
1121 | |||
1122 | static struct page *new_page(struct page *p, unsigned long private, int **x) | ||
1123 | { | ||
1124 | int nid = page_to_nid(p); | ||
1125 | return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | ||
1126 | } | ||
1127 | |||
1128 | /* | ||
1129 | * Safely get reference count of an arbitrary page. | ||
1130 | * Returns 0 for a free page, -EIO for a zero refcount page | ||
1131 | * that is not free, and 1 for any other page type. | ||
1132 | * For 1 the page is returned with increased page count, otherwise not. | ||
1133 | */ | ||
1134 | static int get_any_page(struct page *p, unsigned long pfn, int flags) | ||
1135 | { | ||
1136 | int ret; | ||
1137 | |||
1138 | if (flags & MF_COUNT_INCREASED) | ||
1139 | return 1; | ||
1140 | |||
1141 | /* | ||
1142 | * The lock_system_sleep prevents a race with memory hotplug, | ||
1143 | * because the isolation assumes there's only a single user. | ||
1144 | * This is a big hammer; a better approach would be nicer. | ||
1145 | */ | ||
1146 | lock_system_sleep(); | ||
1147 | |||
1148 | /* | ||
1149 | * Isolate the page, so that it doesn't get reallocated if it | ||
1150 | * was free. | ||
1151 | */ | ||
1152 | set_migratetype_isolate(p); | ||
1153 | if (!get_page_unless_zero(compound_head(p))) { | ||
1154 | if (is_free_buddy_page(p)) { | ||
1155 | pr_debug("get_any_page: %#lx free buddy page\n", pfn); | ||
1156 | /* Set hwpoison bit while page is still isolated */ | ||
1157 | SetPageHWPoison(p); | ||
1158 | ret = 0; | ||
1159 | } else { | ||
1160 | pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", | ||
1161 | pfn, p->flags); | ||
1162 | ret = -EIO; | ||
1163 | } | ||
1164 | } else { | ||
1165 | /* Not a free page */ | ||
1166 | ret = 1; | ||
1167 | } | ||
1168 | unset_migratetype_isolate(p); | ||
1169 | unlock_system_sleep(); | ||
1170 | return ret; | ||
1171 | } | ||
1172 | |||
1173 | /** | ||
1174 | * soft_offline_page - Soft offline a page. | ||
1175 | * @page: page to offline | ||
1176 | * @flags: flags. Same as memory_failure(). | ||
1177 | * | ||
1178 | * Returns 0 on success, otherwise negated errno. | ||
1179 | * | ||
1180 | * Soft offline a page, by migration or invalidation, | ||
1181 | * without killing anything. This is for the case when | ||
1182 | * a page is not corrupted yet (so it's still valid to access), | ||
1183 | * but has had a number of corrected errors and is better taken | ||
1184 | * out. | ||
1185 | * | ||
1186 | * The actual policy on when to do that is maintained by | ||
1187 | * user space. | ||
1188 | * | ||
1189 | * This should never impact any application or cause data loss, | ||
1190 | * however it might take some time. | ||
1191 | * | ||
1192 | * This is not a 100% solution for all memory, but tries to be | ||
1193 | * ``good enough'' for the majority of memory. | ||
1194 | */ | ||
1195 | int soft_offline_page(struct page *page, int flags) | ||
1196 | { | ||
1197 | int ret; | ||
1198 | unsigned long pfn = page_to_pfn(page); | ||
1199 | |||
1200 | ret = get_any_page(page, pfn, flags); | ||
1201 | if (ret < 0) | ||
1202 | return ret; | ||
1203 | if (ret == 0) | ||
1204 | goto done; | ||
1205 | |||
1206 | /* | ||
1207 | * Page cache page we can handle? | ||
1208 | */ | ||
1209 | if (!PageLRU(page)) { | ||
1210 | /* | ||
1211 | * Try to free it. | ||
1212 | */ | ||
1213 | put_page(page); | ||
1214 | shake_page(page, 1); | ||
1215 | |||
1216 | /* | ||
1217 | * Did it turn free? | ||
1218 | */ | ||
1219 | ret = get_any_page(page, pfn, 0); | ||
1220 | if (ret < 0) | ||
1221 | return ret; | ||
1222 | if (ret == 0) | ||
1223 | goto done; | ||
1224 | } | ||
1225 | if (!PageLRU(page)) { | ||
1226 | pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", | ||
1227 | pfn, page->flags); | ||
1228 | return -EIO; | ||
1229 | } | ||
1230 | |||
1231 | lock_page(page); | ||
1232 | wait_on_page_writeback(page); | ||
1233 | |||
1234 | /* | ||
1235 | * Synchronized using the page lock with memory_failure() | ||
1236 | */ | ||
1237 | if (PageHWPoison(page)) { | ||
1238 | unlock_page(page); | ||
1239 | put_page(page); | ||
1240 | pr_debug("soft offline: %#lx page already poisoned\n", pfn); | ||
1241 | return -EBUSY; | ||
1242 | } | ||
1243 | |||
1244 | /* | ||
1245 | * Try to invalidate first. This should work for | ||
1246 | * non dirty unmapped page cache pages. | ||
1247 | */ | ||
1248 | ret = invalidate_inode_page(page); | ||
1249 | unlock_page(page); | ||
1250 | |||
1251 | /* | ||
1252 | * Drop count because page migration doesn't like raised | ||
1253 | * counts. The page could get re-allocated, but if it becomes | ||
1254 | * LRU the isolation will just fail. | ||
1255 | * RED-PEN would be better to keep it isolated here, but we | ||
1256 | * would need to fix isolation locking first. | ||
1257 | */ | ||
1258 | put_page(page); | ||
1259 | if (ret == 1) { | ||
1260 | ret = 0; | ||
1261 | pr_debug("soft_offline: %#lx: invalidated\n", pfn); | ||
1262 | goto done; | ||
1263 | } | ||
1264 | |||
1265 | /* | ||
1266 | * Simple invalidation didn't work. | ||
1267 | * Try to migrate to a new page instead. migrate.c | ||
1268 | * handles a large number of cases for us. | ||
1269 | */ | ||
1270 | ret = isolate_lru_page(page); | ||
1271 | if (!ret) { | ||
1272 | LIST_HEAD(pagelist); | ||
1273 | |||
1274 | list_add(&page->lru, &pagelist); | ||
1275 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | ||
1276 | if (ret) { | ||
1277 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | ||
1278 | pfn, ret, page->flags); | ||
1279 | if (ret > 0) | ||
1280 | ret = -EIO; | ||
1281 | } | ||
1282 | } else { | ||
1283 | pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", | ||
1284 | pfn, ret, page_count(page), page->flags); | ||
1285 | } | ||
1286 | if (ret) | ||
1287 | return ret; | ||
1288 | |||
1289 | done: | ||
1290 | atomic_long_add(1, &mce_bad_pages); | ||
1291 | SetPageHWPoison(page); | ||
1292 | /* keep elevated page count for bad page */ | ||
1293 | return ret; | ||
1294 | } | ||
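As the docstring says, the offlining policy itself lives in user space; the kernel only provides the mechanism. A hedged sketch of a hypothetical corrected-error threshold check built on soft_offline_page(); both the threshold and the helper are invented for illustration.

/*
 * Hypothetical policy hook: soft-offline a page once its corrected
 * error count crosses a threshold.
 */
#include <linux/mm.h>

#define CE_SOFT_OFFLINE_THRESHOLD       16      /* hypothetical */

static int maybe_soft_offline(unsigned long pfn, unsigned int ce_count)
{
        if (ce_count < CE_SOFT_OFFLINE_THRESHOLD)
                return 0;
        if (!pfn_valid(pfn))
                return -ENXIO;
        return soft_offline_page(pfn_to_page(pfn), 0);
}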
diff --git a/mm/memory.c b/mm/memory.c index aed45eaf8ac9..09e4b1be7b67 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -2555,6 +2555,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2555 | ret = VM_FAULT_MAJOR; | 2555 | ret = VM_FAULT_MAJOR; |
2556 | count_vm_event(PGMAJFAULT); | 2556 | count_vm_event(PGMAJFAULT); |
2557 | } else if (PageHWPoison(page)) { | 2557 | } else if (PageHWPoison(page)) { |
2558 | /* | ||
2559 | * hwpoisoned dirty swapcache pages are kept for killing | ||
2560 | * owner processes (which may be unknown at hwpoison time) | ||
2561 | */ | ||
2558 | ret = VM_FAULT_HWPOISON; | 2562 | ret = VM_FAULT_HWPOISON; |
2559 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2563 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2560 | goto out_release; | 2564 | goto out_release; |
diff --git a/mm/migrate.c b/mm/migrate.c index efddbf0926b2..880bd592d38e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -912,6 +912,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, | |||
912 | goto out_pm; | 912 | goto out_pm; |
913 | 913 | ||
914 | err = -ENODEV; | 914 | err = -ENODEV; |
915 | if (node < 0 || node >= MAX_NUMNODES) | ||
916 | goto out_pm; | ||
917 | |||
915 | if (!node_state(node, N_HIGH_MEMORY)) | 918 | if (!node_state(node, N_HIGH_MEMORY)) |
916 | goto out_pm; | 919 | goto out_pm; |
917 | 920 | ||
@@ -999,33 +1002,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, | |||
999 | #define DO_PAGES_STAT_CHUNK_NR 16 | 1002 | #define DO_PAGES_STAT_CHUNK_NR 16 |
1000 | const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; | 1003 | const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; |
1001 | int chunk_status[DO_PAGES_STAT_CHUNK_NR]; | 1004 | int chunk_status[DO_PAGES_STAT_CHUNK_NR]; |
1002 | unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR; | ||
1003 | int err; | ||
1004 | 1005 | ||
1005 | for (i = 0; i < nr_pages; i += chunk_nr) { | 1006 | while (nr_pages) { |
1006 | if (chunk_nr > nr_pages - i) | 1007 | unsigned long chunk_nr; |
1007 | chunk_nr = nr_pages - i; | ||
1008 | 1008 | ||
1009 | err = copy_from_user(chunk_pages, &pages[i], | 1009 | chunk_nr = nr_pages; |
1010 | chunk_nr * sizeof(*chunk_pages)); | 1010 | if (chunk_nr > DO_PAGES_STAT_CHUNK_NR) |
1011 | if (err) { | 1011 | chunk_nr = DO_PAGES_STAT_CHUNK_NR; |
1012 | err = -EFAULT; | 1012 | |
1013 | goto out; | 1013 | if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages))) |
1014 | } | 1014 | break; |
1015 | 1015 | ||
1016 | do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); | 1016 | do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); |
1017 | 1017 | ||
1018 | err = copy_to_user(&status[i], chunk_status, | 1018 | if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status))) |
1019 | chunk_nr * sizeof(*chunk_status)); | 1019 | break; |
1020 | if (err) { | ||
1021 | err = -EFAULT; | ||
1022 | goto out; | ||
1023 | } | ||
1024 | } | ||
1025 | err = 0; | ||
1026 | 1020 | ||
1027 | out: | 1021 | pages += chunk_nr; |
1028 | return err; | 1022 | status += chunk_nr; |
1023 | nr_pages -= chunk_nr; | ||
1024 | } | ||
1025 | return nr_pages ? -EFAULT : 0; | ||
1029 | } | 1026 | } |
1030 | 1027 | ||
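For reference, the user-space view of do_pages_stat(): calling move_pages(2) with a NULL nodes array queries the node of each page instead of migrating it. A hedged sketch (build with -lnuma; error handling abbreviated).

/*
 * User-space sketch: query-only move_pages(2).
 */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        enum { NPAGES = 32 };
        long pagesize = sysconf(_SC_PAGESIZE);
        void *pages[NPAGES];
        int status[NPAGES];
        char *buf = malloc(NPAGES * pagesize);
        long i;

        if (!buf)
                return 1;
        for (i = 0; i < NPAGES; i++) {
                buf[i * pagesize] = 1;          /* fault each page in */
                pages[i] = buf + i * pagesize;
        }

        /* nodes == NULL: status[] receives the node id of each page */
        if (move_pages(0 /* current process */, NPAGES, pages,
                       NULL, status, 0) < 0)
                perror("move_pages");
        else
                for (i = 0; i < NPAGES; i++)
                        printf("page %ld is on node %d\n", i, status[i]);

        free(buf);
        return 0;
}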
1031 | /* | 1028 | /* |
@@ -1043,6 +1043,46 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1043 | } | 1043 | } |
1044 | EXPORT_SYMBOL(do_mmap_pgoff); | 1044 | EXPORT_SYMBOL(do_mmap_pgoff); |
1045 | 1045 | ||
1046 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | ||
1047 | unsigned long, prot, unsigned long, flags, | ||
1048 | unsigned long, fd, unsigned long, pgoff) | ||
1049 | { | ||
1050 | struct file *file = NULL; | ||
1051 | unsigned long retval = -EBADF; | ||
1052 | |||
1053 | if (!(flags & MAP_ANONYMOUS)) { | ||
1054 | if (unlikely(flags & MAP_HUGETLB)) | ||
1055 | return -EINVAL; | ||
1056 | file = fget(fd); | ||
1057 | if (!file) | ||
1058 | goto out; | ||
1059 | } else if (flags & MAP_HUGETLB) { | ||
1060 | struct user_struct *user = NULL; | ||
1061 | /* | ||
1062 | * VM_NORESERVE is used because the reservations will be | ||
1063 | * taken when vm_ops->mmap() is called | ||
1064 | * A dummy user value is used because we are not locking | ||
1065 | * memory so no accounting is necessary | ||
1066 | */ | ||
1067 | len = ALIGN(len, huge_page_size(&default_hstate)); | ||
1068 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, | ||
1069 | &user, HUGETLB_ANONHUGE_INODE); | ||
1070 | if (IS_ERR(file)) | ||
1071 | return PTR_ERR(file); | ||
1072 | } | ||
1073 | |||
1074 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
1075 | |||
1076 | down_write(¤t->mm->mmap_sem); | ||
1077 | retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
1078 | up_write(¤t->mm->mmap_sem); | ||
1079 | |||
1080 | if (file) | ||
1081 | fput(file); | ||
1082 | out: | ||
1083 | return retval; | ||
1084 | } | ||
1085 | |||
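From user space this path is reached through an ordinary mmap() call; with MAP_ANONYMOUS | MAP_HUGETLB the kernel rounds the length up to the default huge page size as shown above. A hedged example; the MAP_HUGETLB fallback value is the x86 one and is an assumption, and huge pages must be reserved for the call to succeed.

/*
 * User-space example: anonymous MAP_HUGETLB mapping.
 */
#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000     /* assumption: x86 value, arch specific */
#endif

int main(void)
{
        size_t len = 3 * 1024 * 1024;   /* deliberately not huge-page aligned */
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap(MAP_HUGETLB)");
                return 1;
        }
        printf("huge mapping at %p\n", p);
        return munmap(p, len) ? 1 : 0;
}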
1046 | /* | 1086 | /* |
1047 | * Some shared mappigns will want the pages marked read-only | 1087 | * Some shared mappigns will want the pages marked read-only |
1048 | * to track write events. If so, we'll downgrade vm_page_prot | 1088 | * to track write events. If so, we'll downgrade vm_page_prot |
diff --git a/mm/nommu.c b/mm/nommu.c index 8687973462bb..48a2ecfaf059 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
432 | /* | 432 | /* |
433 | * Ok, looks good - let it rip. | 433 | * Ok, looks good - let it rip. |
434 | */ | 434 | */ |
435 | flush_icache_range(mm->brk, brk); | ||
435 | return mm->brk = brk; | 436 | return mm->brk = brk; |
436 | } | 437 | } |
437 | 438 | ||
@@ -551,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to) | |||
551 | static void __put_nommu_region(struct vm_region *region) | 552 | static void __put_nommu_region(struct vm_region *region) |
552 | __releases(nommu_region_sem) | 553 | __releases(nommu_region_sem) |
553 | { | 554 | { |
554 | kenter("%p{%d}", region, atomic_read(®ion->vm_usage)); | 555 | kenter("%p{%d}", region, region->vm_usage); |
555 | 556 | ||
556 | BUG_ON(!nommu_region_tree.rb_node); | 557 | BUG_ON(!nommu_region_tree.rb_node); |
557 | 558 | ||
558 | if (atomic_dec_and_test(®ion->vm_usage)) { | 559 | if (--region->vm_usage == 0) { |
559 | if (region->vm_top > region->vm_start) | 560 | if (region->vm_top > region->vm_start) |
560 | delete_nommu_region(region); | 561 | delete_nommu_region(region); |
561 | up_write(&nommu_region_sem); | 562 | up_write(&nommu_region_sem); |
@@ -1204,7 +1205,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1204 | if (!vma) | 1205 | if (!vma) |
1205 | goto error_getting_vma; | 1206 | goto error_getting_vma; |
1206 | 1207 | ||
1207 | atomic_set(®ion->vm_usage, 1); | 1208 | region->vm_usage = 1; |
1208 | region->vm_flags = vm_flags; | 1209 | region->vm_flags = vm_flags; |
1209 | region->vm_pgoff = pgoff; | 1210 | region->vm_pgoff = pgoff; |
1210 | 1211 | ||
@@ -1271,7 +1272,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1271 | } | 1272 | } |
1272 | 1273 | ||
1273 | /* we've found a region we can share */ | 1274 | /* we've found a region we can share */ |
1274 | atomic_inc(&pregion->vm_usage); | 1275 | pregion->vm_usage++; |
1275 | vma->vm_region = pregion; | 1276 | vma->vm_region = pregion; |
1276 | start = pregion->vm_start; | 1277 | start = pregion->vm_start; |
1277 | start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; | 1278 | start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; |
@@ -1288,7 +1289,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1288 | vma->vm_region = NULL; | 1289 | vma->vm_region = NULL; |
1289 | vma->vm_start = 0; | 1290 | vma->vm_start = 0; |
1290 | vma->vm_end = 0; | 1291 | vma->vm_end = 0; |
1291 | atomic_dec(&pregion->vm_usage); | 1292 | pregion->vm_usage--; |
1292 | pregion = NULL; | 1293 | pregion = NULL; |
1293 | goto error_just_free; | 1294 | goto error_just_free; |
1294 | } | 1295 | } |
@@ -1353,10 +1354,14 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1353 | share: | 1354 | share: |
1354 | add_vma_to_mm(current->mm, vma); | 1355 | add_vma_to_mm(current->mm, vma); |
1355 | 1356 | ||
1356 | up_write(&nommu_region_sem); | 1357 | /* we flush the region from the icache only when the first executable |
1358 | * mapping of it is made */ | ||
1359 | if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { | ||
1360 | flush_icache_range(region->vm_start, region->vm_end); | ||
1361 | region->vm_icache_flushed = true; | ||
1362 | } | ||
1357 | 1363 | ||
1358 | if (prot & PROT_EXEC) | 1364 | up_write(&nommu_region_sem); |
1359 | flush_icache_range(result, result + len); | ||
1360 | 1365 | ||
1361 | kleave(" = %lx", result); | 1366 | kleave(" = %lx", result); |
1362 | return result; | 1367 | return result; |
@@ -1398,6 +1403,31 @@ error_getting_region: | |||
1398 | } | 1403 | } |
1399 | EXPORT_SYMBOL(do_mmap_pgoff); | 1404 | EXPORT_SYMBOL(do_mmap_pgoff); |
1400 | 1405 | ||
1406 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | ||
1407 | unsigned long, prot, unsigned long, flags, | ||
1408 | unsigned long, fd, unsigned long, pgoff) | ||
1409 | { | ||
1410 | struct file *file = NULL; | ||
1411 | unsigned long retval = -EBADF; | ||
1412 | |||
1413 | if (!(flags & MAP_ANONYMOUS)) { | ||
1414 | file = fget(fd); | ||
1415 | if (!file) | ||
1416 | goto out; | ||
1417 | } | ||
1418 | |||
1419 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
1420 | |||
1421 | down_write(¤t->mm->mmap_sem); | ||
1422 | retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
1423 | up_write(¤t->mm->mmap_sem); | ||
1424 | |||
1425 | if (file) | ||
1426 | fput(file); | ||
1427 | out: | ||
1428 | return retval; | ||
1429 | } | ||
1430 | |||
1401 | /* | 1431 | /* |
1402 | * split a vma into two pieces at address 'addr', a new vma is allocated either | 1432 | * split a vma into two pieces at address 'addr', a new vma is allocated either |
1403 | * for the first part or the tail. | 1433 | * for the first part or the tail. |
@@ -1411,10 +1441,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1411 | 1441 | ||
1412 | kenter(""); | 1442 | kenter(""); |
1413 | 1443 | ||
1414 | /* we're only permitted to split anonymous regions that have a single | 1444 | /* we're only permitted to split anonymous regions (these should have |
1415 | * owner */ | 1445 | * only a single usage on the region) */ |
1416 | if (vma->vm_file || | 1446 | if (vma->vm_file) |
1417 | atomic_read(&vma->vm_region->vm_usage) != 1) | ||
1418 | return -ENOMEM; | 1447 | return -ENOMEM; |
1419 | 1448 | ||
1420 | if (mm->map_count >= sysctl_max_map_count) | 1449 | if (mm->map_count >= sysctl_max_map_count) |
@@ -1488,7 +1517,7 @@ static int shrink_vma(struct mm_struct *mm, | |||
1488 | 1517 | ||
1489 | /* cut the backing region down to size */ | 1518 | /* cut the backing region down to size */ |
1490 | region = vma->vm_region; | 1519 | region = vma->vm_region; |
1491 | BUG_ON(atomic_read(®ion->vm_usage) != 1); | 1520 | BUG_ON(region->vm_usage != 1); |
1492 | 1521 | ||
1493 | down_write(&nommu_region_sem); | 1522 | down_write(&nommu_region_sem); |
1494 | delete_nommu_region(region); | 1523 | delete_nommu_region(region); |
@@ -1732,27 +1761,6 @@ void unmap_mapping_range(struct address_space *mapping, | |||
1732 | EXPORT_SYMBOL(unmap_mapping_range); | 1761 | EXPORT_SYMBOL(unmap_mapping_range); |
1733 | 1762 | ||
1734 | /* | 1763 | /* |
1735 | * ask for an unmapped area at which to create a mapping on a file | ||
1736 | */ | ||
1737 | unsigned long get_unmapped_area(struct file *file, unsigned long addr, | ||
1738 | unsigned long len, unsigned long pgoff, | ||
1739 | unsigned long flags) | ||
1740 | { | ||
1741 | unsigned long (*get_area)(struct file *, unsigned long, unsigned long, | ||
1742 | unsigned long, unsigned long); | ||
1743 | |||
1744 | get_area = current->mm->get_unmapped_area; | ||
1745 | if (file && file->f_op && file->f_op->get_unmapped_area) | ||
1746 | get_area = file->f_op->get_unmapped_area; | ||
1747 | |||
1748 | if (!get_area) | ||
1749 | return -ENOSYS; | ||
1750 | |||
1751 | return get_area(file, addr, len, pgoff, flags); | ||
1752 | } | ||
1753 | EXPORT_SYMBOL(get_unmapped_area); | ||
1754 | |||
1755 | /* | ||
1756 | * Check that a process has enough memory to allocate a new virtual | 1764 | * Check that a process has enough memory to allocate a new virtual |
1757 | * mapping. 0 means there is enough memory for the allocation to | 1765 | * mapping. 0 means there is enough memory for the allocation to |
1758 | * succeed and -ENOMEM implies there is not. | 1766 | * succeed and -ENOMEM implies there is not. |
@@ -1891,9 +1899,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
1891 | 1899 | ||
1892 | /* only read or write mappings where it is permitted */ | 1900 | /* only read or write mappings where it is permitted */ |
1893 | if (write && vma->vm_flags & VM_MAYWRITE) | 1901 | if (write && vma->vm_flags & VM_MAYWRITE) |
1894 | len -= copy_to_user((void *) addr, buf, len); | 1902 | copy_to_user_page(vma, NULL, addr, |
1903 | (void *) addr, buf, len); | ||
1895 | else if (!write && vma->vm_flags & VM_MAYREAD) | 1904 | else if (!write && vma->vm_flags & VM_MAYREAD) |
1896 | len -= copy_from_user(buf, (void *) addr, len); | 1905 | copy_from_user_page(vma, NULL, addr, |
1906 | buf, (void *) addr, len); | ||
1897 | else | 1907 | else |
1898 | len = 0; | 1908 | len = 0; |
1899 | } else { | 1909 | } else { |
@@ -1904,3 +1914,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
1904 | mmput(mm); | 1914 | mmput(mm); |
1905 | return len; | 1915 | return len; |
1906 | } | 1916 | } |
1917 | |||
1918 | /** | ||
1919 | * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode | ||
1920 | * @inode: The inode to check | ||
1921 | * @size: The current filesize of the inode | ||
1922 | * @newsize: The proposed filesize of the inode | ||
1923 | * | ||
1924 | * Check the shared mappings on an inode on behalf of a shrinking truncate to | ||
1925 | * make sure that that any outstanding VMAs aren't broken and then shrink the | ||
1926 | * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't | ||
1927 | * automatically grant mappings that are too large. | ||
1928 | */ | ||
1929 | int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | ||
1930 | size_t newsize) | ||
1931 | { | ||
1932 | struct vm_area_struct *vma; | ||
1933 | struct prio_tree_iter iter; | ||
1934 | struct vm_region *region; | ||
1935 | pgoff_t low, high; | ||
1936 | size_t r_size, r_top; | ||
1937 | |||
1938 | low = newsize >> PAGE_SHIFT; | ||
1939 | high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
1940 | |||
1941 | down_write(&nommu_region_sem); | ||
1942 | |||
1943 | /* search for VMAs that fall within the dead zone */ | ||
1944 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, | ||
1945 | low, high) { | ||
1946 | /* found one - only interested if it's shared out of the page | ||
1947 | * cache */ | ||
1948 | if (vma->vm_flags & VM_SHARED) { | ||
1949 | up_write(&nommu_region_sem); | ||
1950 | return -ETXTBSY; /* not quite true, but near enough */ | ||
1951 | } | ||
1952 | } | ||
1953 | |||
1954 | /* reduce any regions that overlap the dead zone - if in existence, | ||
1955 | * these will be pointed to by VMAs that don't overlap the dead zone | ||
1956 | * | ||
1957 | * we don't check for any regions that start beyond the EOF as there | ||
1958 | * shouldn't be any | ||
1959 | */ | ||
1960 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, | ||
1961 | 0, ULONG_MAX) { | ||
1962 | if (!(vma->vm_flags & VM_SHARED)) | ||
1963 | continue; | ||
1964 | |||
1965 | region = vma->vm_region; | ||
1966 | r_size = region->vm_top - region->vm_start; | ||
1967 | r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size; | ||
1968 | |||
1969 | if (r_top > newsize) { | ||
1970 | region->vm_top -= r_top - newsize; | ||
1971 | if (region->vm_end > region->vm_top) | ||
1972 | region->vm_end = region->vm_top; | ||
1973 | } | ||
1974 | } | ||
1975 | |||
1976 | up_write(&nommu_region_sem); | ||
1977 | return 0; | ||
1978 | } | ||
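A hedged sketch of the expected caller: a NOMMU filesystem checks shared mappings before carrying out a shrinking truncate. ramfs does something along these lines, but the helper below is illustrative rather than taken from this patch.

/*
 * Illustrative caller: refuse or shrink mappings, then truncate.
 */
#include <linux/fs.h>
#include <linux/mm.h>

static int my_shrink_truncate(struct inode *inode, loff_t newsize)
{
        int ret;

        if (newsize >= inode->i_size)
                return 0;               /* only shrinking truncates matter */

        ret = nommu_shrink_inode_mappings(inode, inode->i_size, newsize);
        if (ret)
                return ret;             /* e.g. -ETXTBSY: shared mapping exists */

        return vmtruncate(inode, newsize);
}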
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f52481b1c1e5..237050478f28 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -459,6 +459,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
459 | list_for_each_entry(c, &p->children, sibling) { | 459 | list_for_each_entry(c, &p->children, sibling) { |
460 | if (c->mm == p->mm) | 460 | if (c->mm == p->mm) |
461 | continue; | 461 | continue; |
462 | if (mem && !task_in_mem_cgroup(c, mem)) | ||
463 | continue; | ||
462 | if (!oom_kill_task(c)) | 464 | if (!oom_kill_task(c)) |
463 | return 0; | 465 | return 0; |
464 | } | 466 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 850c4a7e2fe5..8deb9d0fd5b1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -48,6 +48,7 @@ | |||
48 | #include <linux/page_cgroup.h> | 48 | #include <linux/page_cgroup.h> |
49 | #include <linux/debugobjects.h> | 49 | #include <linux/debugobjects.h> |
50 | #include <linux/kmemleak.h> | 50 | #include <linux/kmemleak.h> |
51 | #include <linux/memory.h> | ||
51 | #include <trace/events/kmem.h> | 52 | #include <trace/events/kmem.h> |
52 | 53 | ||
53 | #include <asm/tlbflush.h> | 54 | #include <asm/tlbflush.h> |
@@ -555,8 +556,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
555 | page = list_entry(list->prev, struct page, lru); | 556 | page = list_entry(list->prev, struct page, lru); |
556 | /* must delete as __free_one_page list manipulates */ | 557 | /* must delete as __free_one_page list manipulates */ |
557 | list_del(&page->lru); | 558 | list_del(&page->lru); |
558 | __free_one_page(page, zone, 0, migratetype); | 559 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
559 | trace_mm_page_pcpu_drain(page, 0, migratetype); | 560 | __free_one_page(page, zone, 0, page_private(page)); |
561 | trace_mm_page_pcpu_drain(page, 0, page_private(page)); | ||
560 | } while (--count && --batch_free && !list_empty(list)); | 562 | } while (--count && --batch_free && !list_empty(list)); |
561 | } | 563 | } |
562 | spin_unlock(&zone->lock); | 564 | spin_unlock(&zone->lock); |
@@ -1221,10 +1223,10 @@ again: | |||
1221 | } | 1223 | } |
1222 | spin_lock_irqsave(&zone->lock, flags); | 1224 | spin_lock_irqsave(&zone->lock, flags); |
1223 | page = __rmqueue(zone, order, migratetype); | 1225 | page = __rmqueue(zone, order, migratetype); |
1224 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); | ||
1225 | spin_unlock(&zone->lock); | 1226 | spin_unlock(&zone->lock); |
1226 | if (!page) | 1227 | if (!page) |
1227 | goto failed; | 1228 | goto failed; |
1229 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); | ||
1228 | } | 1230 | } |
1229 | 1231 | ||
1230 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1232 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
@@ -2401,13 +2403,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
2401 | { | 2403 | { |
2402 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; | 2404 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; |
2403 | int ret; | 2405 | int ret; |
2406 | static DEFINE_MUTEX(zl_order_mutex); | ||
2404 | 2407 | ||
2408 | mutex_lock(&zl_order_mutex); | ||
2405 | if (write) | 2409 | if (write) |
2406 | strncpy(saved_string, (char*)table->data, | 2410 | strcpy(saved_string, (char*)table->data); |
2407 | NUMA_ZONELIST_ORDER_LEN); | ||
2408 | ret = proc_dostring(table, write, buffer, length, ppos); | 2411 | ret = proc_dostring(table, write, buffer, length, ppos); |
2409 | if (ret) | 2412 | if (ret) |
2410 | return ret; | 2413 | goto out; |
2411 | if (write) { | 2414 | if (write) { |
2412 | int oldval = user_zonelist_order; | 2415 | int oldval = user_zonelist_order; |
2413 | if (__parse_numa_zonelist_order((char*)table->data)) { | 2416 | if (__parse_numa_zonelist_order((char*)table->data)) { |
@@ -2420,7 +2423,9 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
2420 | } else if (oldval != user_zonelist_order) | 2423 | } else if (oldval != user_zonelist_order) |
2421 | build_all_zonelists(); | 2424 | build_all_zonelists(); |
2422 | } | 2425 | } |
2423 | return 0; | 2426 | out: |
2427 | mutex_unlock(&zl_order_mutex); | ||
2428 | return ret; | ||
2424 | } | 2429 | } |
2425 | 2430 | ||
2426 | 2431 | ||
@@ -3579,7 +3584,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, | |||
3579 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | 3584 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, |
3580 | * then all holes in the requested range will be accounted for. | 3585 | * then all holes in the requested range will be accounted for. |
3581 | */ | 3586 | */ |
3582 | static unsigned long __meminit __absent_pages_in_range(int nid, | 3587 | unsigned long __meminit __absent_pages_in_range(int nid, |
3583 | unsigned long range_start_pfn, | 3588 | unsigned long range_start_pfn, |
3584 | unsigned long range_end_pfn) | 3589 | unsigned long range_end_pfn) |
3585 | { | 3590 | { |
@@ -3994,7 +3999,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, | |||
3994 | } | 3999 | } |
3995 | 4000 | ||
3996 | /* Merge backward if suitable */ | 4001 | /* Merge backward if suitable */ |
3997 | if (start_pfn < early_node_map[i].end_pfn && | 4002 | if (start_pfn < early_node_map[i].start_pfn && |
3998 | end_pfn >= early_node_map[i].start_pfn) { | 4003 | end_pfn >= early_node_map[i].start_pfn) { |
3999 | early_node_map[i].start_pfn = start_pfn; | 4004 | early_node_map[i].start_pfn = start_pfn; |
4000 | return; | 4005 | return; |
@@ -4108,7 +4113,7 @@ static int __init cmp_node_active_region(const void *a, const void *b) | |||
4108 | } | 4113 | } |
4109 | 4114 | ||
4110 | /* sort the node_map by start_pfn */ | 4115 | /* sort the node_map by start_pfn */ |
4111 | static void __init sort_node_map(void) | 4116 | void __init sort_node_map(void) |
4112 | { | 4117 | { |
4113 | sort(early_node_map, (size_t)nr_nodemap_entries, | 4118 | sort(early_node_map, (size_t)nr_nodemap_entries, |
4114 | sizeof(struct node_active_region), | 4119 | sizeof(struct node_active_region), |
@@ -5008,23 +5013,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
5008 | int set_migratetype_isolate(struct page *page) | 5013 | int set_migratetype_isolate(struct page *page) |
5009 | { | 5014 | { |
5010 | struct zone *zone; | 5015 | struct zone *zone; |
5011 | unsigned long flags; | 5016 | struct page *curr_page; |
5017 | unsigned long flags, pfn, iter; | ||
5018 | unsigned long immobile = 0; | ||
5019 | struct memory_isolate_notify arg; | ||
5020 | int notifier_ret; | ||
5012 | int ret = -EBUSY; | 5021 | int ret = -EBUSY; |
5013 | int zone_idx; | 5022 | int zone_idx; |
5014 | 5023 | ||
5015 | zone = page_zone(page); | 5024 | zone = page_zone(page); |
5016 | zone_idx = zone_idx(zone); | 5025 | zone_idx = zone_idx(zone); |
5026 | |||
5017 | spin_lock_irqsave(&zone->lock, flags); | 5027 | spin_lock_irqsave(&zone->lock, flags); |
5028 | if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE || | ||
5029 | zone_idx == ZONE_MOVABLE) { | ||
5030 | ret = 0; | ||
5031 | goto out; | ||
5032 | } | ||
5033 | |||
5034 | pfn = page_to_pfn(page); | ||
5035 | arg.start_pfn = pfn; | ||
5036 | arg.nr_pages = pageblock_nr_pages; | ||
5037 | arg.pages_found = 0; | ||
5038 | |||
5018 | /* | 5039 | /* |
5019 | * In future, more migrate types will be able to be isolation target. | 5040 | * It may be possible to isolate a pageblock even if the |
5041 | * migratetype is not MIGRATE_MOVABLE. The memory isolation | ||
5042 | * notifier chain is used by balloon drivers to return the | ||
5043 | * number of pages in a range that are held by the balloon | ||
5044 | * driver to shrink memory. If all the pages are accounted for | ||
5045 | * by balloons, are free, or on the LRU, isolation can continue. | ||
5046 | * Later, for example, when the memory hotplug notifier runs, these | ||
5047 | * pages reported as "can be isolated" should be isolated (freed) | ||
5048 | * by the balloon driver through the memory notifier chain. | ||
5020 | */ | 5049 | */ |
5021 | if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && | 5050 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); |
5022 | zone_idx != ZONE_MOVABLE) | 5051 | notifier_ret = notifier_to_errno(notifier_ret); |
5052 | if (notifier_ret || !arg.pages_found) | ||
5023 | goto out; | 5053 | goto out; |
5024 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | 5054 | |
5025 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | 5055 | for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { |
5026 | ret = 0; | 5056 | if (!pfn_valid_within(pfn)) |
5057 | continue; | ||
5058 | |||
5059 | curr_page = pfn_to_page(iter); | ||
5060 | if (!page_count(curr_page) || PageLRU(curr_page)) | ||
5061 | continue; | ||
5062 | |||
5063 | immobile++; | ||
5064 | } | ||
5065 | |||
5066 | if (arg.pages_found == immobile) | ||
5067 | ret = 0; | ||
5068 | |||
5027 | out: | 5069 | out: |
5070 | if (!ret) { | ||
5071 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
5072 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
5073 | } | ||
5074 | |||
5028 | spin_unlock_irqrestore(&zone->lock, flags); | 5075 | spin_unlock_irqrestore(&zone->lock, flags); |
5029 | if (!ret) | 5076 | if (!ret) |
5030 | drain_all_pages(); | 5077 | drain_all_pages(); |
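A hedged sketch of the balloon-driver side of this handshake: a notifier that answers MEM_ISOLATE_COUNT by reporting how many pages in the range it owns. The notify action and the struct fields come from this patch; the registration call and the balloon_page_in_range() lookup are assumptions.

/*
 * Balloon-driver sketch: count balloon-owned pages in the queried range.
 */
#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/types.h>

/* hypothetical: does the balloon own this pfn? */
static bool balloon_page_in_range(unsigned long pfn)
{
        return false;                   /* stub for the sketch */
}

static int balloon_isolate_notify(struct notifier_block *self,
                                  unsigned long action, void *arg)
{
        struct memory_isolate_notify *mn = arg;
        unsigned long pfn;

        if (action != MEM_ISOLATE_COUNT)
                return NOTIFY_OK;

        for (pfn = mn->start_pfn; pfn < mn->start_pfn + mn->nr_pages; pfn++)
                if (balloon_page_in_range(pfn))
                        mn->pages_found++;

        return NOTIFY_OK;
}

static struct notifier_block balloon_isolate_nb = {
        .notifier_call = balloon_isolate_notify,
};

/* at driver init: register_memory_isolate_notifier(&balloon_isolate_nb); */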
@@ -5091,3 +5138,24 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
5091 | spin_unlock_irqrestore(&zone->lock, flags); | 5138 | spin_unlock_irqrestore(&zone->lock, flags); |
5092 | } | 5139 | } |
5093 | #endif | 5140 | #endif |
5141 | |||
5142 | #ifdef CONFIG_MEMORY_FAILURE | ||
5143 | bool is_free_buddy_page(struct page *page) | ||
5144 | { | ||
5145 | struct zone *zone = page_zone(page); | ||
5146 | unsigned long pfn = page_to_pfn(page); | ||
5147 | unsigned long flags; | ||
5148 | int order; | ||
5149 | |||
5150 | spin_lock_irqsave(&zone->lock, flags); | ||
5151 | for (order = 0; order < MAX_ORDER; order++) { | ||
5152 | struct page *page_head = page - (pfn & ((1 << order) - 1)); | ||
5153 | |||
5154 | if (PageBuddy(page_head) && page_order(page_head) >= order) | ||
5155 | break; | ||
5156 | } | ||
5157 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5158 | |||
5159 | return order < MAX_ORDER; | ||
5160 | } | ||
5161 | #endif | ||
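
The MEM_ISOLATE_COUNT round trip added above only pays off when some driver answers it. As a rough sketch (not part of this diff), a balloon-style driver could register a callback and report how many pages it holds in the queried range; the register_memory_isolate_notifier() call and the balloon_pages_in_range() helper are assumptions made purely for illustration.

#include <linux/memory.h>
#include <linux/notifier.h>

/* Assumed driver-specific bookkeeping: pages the balloon holds in the range. */
static unsigned long balloon_pages_in_range(unsigned long start_pfn,
					    unsigned long nr_pages)
{
	return 0;
}

static int balloon_isolate_notify(struct notifier_block *nb,
				  unsigned long action, void *data)
{
	struct memory_isolate_notify *arg = data;

	/* Only the counting request is of interest here. */
	if (action == MEM_ISOLATE_COUNT)
		arg->pages_found += balloon_pages_in_range(arg->start_pfn,
							   arg->nr_pages);
	return NOTIFY_OK;
}

static struct notifier_block balloon_isolate_nb = {
	.notifier_call = balloon_isolate_notify,
};

/* In driver init, assuming the registration interface added with this series:
 *	register_memory_isolate_notifier(&balloon_isolate_nb);
 */

With such a handler in place, set_migratetype_isolate() accepts the pageblock only when the reported pages_found matches the number of pages it could not classify as free or on the LRU.
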
diff --git a/mm/percpu.c b/mm/percpu.c index 442010cc91c6..083e7c91e5f6 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -1271,7 +1271,7 @@ static void pcpu_reclaim(struct work_struct *work) | |||
1271 | */ | 1271 | */ |
1272 | void free_percpu(void *ptr) | 1272 | void free_percpu(void *ptr) |
1273 | { | 1273 | { |
1274 | void *addr = __pcpu_ptr_to_addr(ptr); | 1274 | void *addr; |
1275 | struct pcpu_chunk *chunk; | 1275 | struct pcpu_chunk *chunk; |
1276 | unsigned long flags; | 1276 | unsigned long flags; |
1277 | int off; | 1277 | int off; |
@@ -1279,6 +1279,8 @@ void free_percpu(void *ptr) | |||
1279 | if (!ptr) | 1279 | if (!ptr) |
1280 | return; | 1280 | return; |
1281 | 1281 | ||
1282 | addr = __pcpu_ptr_to_addr(ptr); | ||
1283 | |||
1282 | spin_lock_irqsave(&pcpu_lock, flags); | 1284 | spin_lock_irqsave(&pcpu_lock, flags); |
1283 | 1285 | ||
1284 | chunk = pcpu_chunk_addr_search(addr); | 1286 | chunk = pcpu_chunk_addr_search(addr); |
diff --git a/mm/readahead.c b/mm/readahead.c index aa1aa2345235..033bc135a41f 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -547,5 +547,17 @@ page_cache_async_readahead(struct address_space *mapping, | |||
547 | 547 | ||
548 | /* do read-ahead */ | 548 | /* do read-ahead */ |
549 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); | 549 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); |
550 | |||
551 | #ifdef CONFIG_BLOCK | ||
552 | /* | ||
553 | * Normally the current page is !uptodate and lock_page() will be | ||
554 | * immediately called to implicitly unplug the device. However this | ||
555 | * is not always true for RAID configurations, where data does not | ||
556 | * always arrive in submission order. In this case we need to | ||
557 | * explicitly kick off the IO. | ||
558 | */ | ||
559 | if (PageUptodate(page)) | ||
560 | blk_run_backing_dev(mapping->backing_dev_info, NULL); | ||
561 | #endif | ||
550 | } | 562 | } |
551 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); | 563 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); |
diff --git a/mm/shmem.c b/mm/shmem.c index 4fb41c83daca..eef4ebea5158 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -29,7 +29,6 @@ | |||
29 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
30 | #include <linux/module.h> | 30 | #include <linux/module.h> |
31 | #include <linux/swap.h> | 31 | #include <linux/swap.h> |
32 | #include <linux/ima.h> | ||
33 | 32 | ||
34 | static struct vfsmount *shm_mnt; | 33 | static struct vfsmount *shm_mnt; |
35 | 34 | ||
@@ -42,6 +41,7 @@ static struct vfsmount *shm_mnt; | |||
42 | 41 | ||
43 | #include <linux/xattr.h> | 42 | #include <linux/xattr.h> |
44 | #include <linux/exportfs.h> | 43 | #include <linux/exportfs.h> |
44 | #include <linux/posix_acl.h> | ||
45 | #include <linux/generic_acl.h> | 45 | #include <linux/generic_acl.h> |
46 | #include <linux/mman.h> | 46 | #include <linux/mman.h> |
47 | #include <linux/string.h> | 47 | #include <linux/string.h> |
@@ -810,7 +810,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | |||
810 | error = inode_setattr(inode, attr); | 810 | error = inode_setattr(inode, attr); |
811 | #ifdef CONFIG_TMPFS_POSIX_ACL | 811 | #ifdef CONFIG_TMPFS_POSIX_ACL |
812 | if (!error && (attr->ia_valid & ATTR_MODE)) | 812 | if (!error && (attr->ia_valid & ATTR_MODE)) |
813 | error = generic_acl_chmod(inode, &shmem_acl_ops); | 813 | error = generic_acl_chmod(inode); |
814 | #endif | 814 | #endif |
815 | if (page) | 815 | if (page) |
816 | page_cache_release(page); | 816 | page_cache_release(page); |
@@ -1824,11 +1824,15 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) | |||
1824 | return error; | 1824 | return error; |
1825 | } | 1825 | } |
1826 | } | 1826 | } |
1827 | error = shmem_acl_init(inode, dir); | 1827 | #ifdef CONFIG_TMPFS_POSIX_ACL |
1828 | error = generic_acl_init(inode, dir); | ||
1828 | if (error) { | 1829 | if (error) { |
1829 | iput(inode); | 1830 | iput(inode); |
1830 | return error; | 1831 | return error; |
1831 | } | 1832 | } |
1833 | #else | ||
1834 | error = 0; | ||
1835 | #endif | ||
1832 | if (dir->i_mode & S_ISGID) { | 1836 | if (dir->i_mode & S_ISGID) { |
1833 | inode->i_gid = dir->i_gid; | 1837 | inode->i_gid = dir->i_gid; |
1834 | if (S_ISDIR(mode)) | 1838 | if (S_ISDIR(mode)) |
@@ -2043,27 +2047,28 @@ static const struct inode_operations shmem_symlink_inode_operations = { | |||
2043 | * filesystem level, though. | 2047 | * filesystem level, though. |
2044 | */ | 2048 | */ |
2045 | 2049 | ||
2046 | static size_t shmem_xattr_security_list(struct inode *inode, char *list, | 2050 | static size_t shmem_xattr_security_list(struct dentry *dentry, char *list, |
2047 | size_t list_len, const char *name, | 2051 | size_t list_len, const char *name, |
2048 | size_t name_len) | 2052 | size_t name_len, int handler_flags) |
2049 | { | 2053 | { |
2050 | return security_inode_listsecurity(inode, list, list_len); | 2054 | return security_inode_listsecurity(dentry->d_inode, list, list_len); |
2051 | } | 2055 | } |
2052 | 2056 | ||
2053 | static int shmem_xattr_security_get(struct inode *inode, const char *name, | 2057 | static int shmem_xattr_security_get(struct dentry *dentry, const char *name, |
2054 | void *buffer, size_t size) | 2058 | void *buffer, size_t size, int handler_flags) |
2055 | { | 2059 | { |
2056 | if (strcmp(name, "") == 0) | 2060 | if (strcmp(name, "") == 0) |
2057 | return -EINVAL; | 2061 | return -EINVAL; |
2058 | return xattr_getsecurity(inode, name, buffer, size); | 2062 | return xattr_getsecurity(dentry->d_inode, name, buffer, size); |
2059 | } | 2063 | } |
2060 | 2064 | ||
2061 | static int shmem_xattr_security_set(struct inode *inode, const char *name, | 2065 | static int shmem_xattr_security_set(struct dentry *dentry, const char *name, |
2062 | const void *value, size_t size, int flags) | 2066 | const void *value, size_t size, int flags, int handler_flags) |
2063 | { | 2067 | { |
2064 | if (strcmp(name, "") == 0) | 2068 | if (strcmp(name, "") == 0) |
2065 | return -EINVAL; | 2069 | return -EINVAL; |
2066 | return security_inode_setsecurity(inode, name, value, size, flags); | 2070 | return security_inode_setsecurity(dentry->d_inode, name, value, |
2071 | size, flags); | ||
2067 | } | 2072 | } |
2068 | 2073 | ||
2069 | static struct xattr_handler shmem_xattr_security_handler = { | 2074 | static struct xattr_handler shmem_xattr_security_handler = { |
@@ -2074,8 +2079,8 @@ static struct xattr_handler shmem_xattr_security_handler = { | |||
2074 | }; | 2079 | }; |
2075 | 2080 | ||
2076 | static struct xattr_handler *shmem_xattr_handlers[] = { | 2081 | static struct xattr_handler *shmem_xattr_handlers[] = { |
2077 | &shmem_xattr_acl_access_handler, | 2082 | &generic_acl_access_handler, |
2078 | &shmem_xattr_acl_default_handler, | 2083 | &generic_acl_default_handler, |
2079 | &shmem_xattr_security_handler, | 2084 | &shmem_xattr_security_handler, |
2080 | NULL | 2085 | NULL |
2081 | }; | 2086 | }; |
@@ -2454,7 +2459,7 @@ static const struct inode_operations shmem_inode_operations = { | |||
2454 | .getxattr = generic_getxattr, | 2459 | .getxattr = generic_getxattr, |
2455 | .listxattr = generic_listxattr, | 2460 | .listxattr = generic_listxattr, |
2456 | .removexattr = generic_removexattr, | 2461 | .removexattr = generic_removexattr, |
2457 | .check_acl = shmem_check_acl, | 2462 | .check_acl = generic_check_acl, |
2458 | #endif | 2463 | #endif |
2459 | 2464 | ||
2460 | }; | 2465 | }; |
@@ -2477,7 +2482,7 @@ static const struct inode_operations shmem_dir_inode_operations = { | |||
2477 | .getxattr = generic_getxattr, | 2482 | .getxattr = generic_getxattr, |
2478 | .listxattr = generic_listxattr, | 2483 | .listxattr = generic_listxattr, |
2479 | .removexattr = generic_removexattr, | 2484 | .removexattr = generic_removexattr, |
2480 | .check_acl = shmem_check_acl, | 2485 | .check_acl = generic_check_acl, |
2481 | #endif | 2486 | #endif |
2482 | }; | 2487 | }; |
2483 | 2488 | ||
@@ -2488,7 +2493,7 @@ static const struct inode_operations shmem_special_inode_operations = { | |||
2488 | .getxattr = generic_getxattr, | 2493 | .getxattr = generic_getxattr, |
2489 | .listxattr = generic_listxattr, | 2494 | .listxattr = generic_listxattr, |
2490 | .removexattr = generic_removexattr, | 2495 | .removexattr = generic_removexattr, |
2491 | .check_acl = shmem_check_acl, | 2496 | .check_acl = generic_check_acl, |
2492 | #endif | 2497 | #endif |
2493 | }; | 2498 | }; |
2494 | 2499 | ||
@@ -2626,7 +2631,8 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2626 | int error; | 2631 | int error; |
2627 | struct file *file; | 2632 | struct file *file; |
2628 | struct inode *inode; | 2633 | struct inode *inode; |
2629 | struct dentry *dentry, *root; | 2634 | struct path path; |
2635 | struct dentry *root; | ||
2630 | struct qstr this; | 2636 | struct qstr this; |
2631 | 2637 | ||
2632 | if (IS_ERR(shm_mnt)) | 2638 | if (IS_ERR(shm_mnt)) |
@@ -2643,38 +2649,35 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2643 | this.len = strlen(name); | 2649 | this.len = strlen(name); |
2644 | this.hash = 0; /* will go */ | 2650 | this.hash = 0; /* will go */ |
2645 | root = shm_mnt->mnt_root; | 2651 | root = shm_mnt->mnt_root; |
2646 | dentry = d_alloc(root, &this); | 2652 | path.dentry = d_alloc(root, &this); |
2647 | if (!dentry) | 2653 | if (!path.dentry) |
2648 | goto put_memory; | 2654 | goto put_memory; |
2649 | 2655 | path.mnt = mntget(shm_mnt); | |
2650 | error = -ENFILE; | ||
2651 | file = get_empty_filp(); | ||
2652 | if (!file) | ||
2653 | goto put_dentry; | ||
2654 | 2656 | ||
2655 | error = -ENOSPC; | 2657 | error = -ENOSPC; |
2656 | inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); | 2658 | inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); |
2657 | if (!inode) | 2659 | if (!inode) |
2658 | goto close_file; | 2660 | goto put_dentry; |
2659 | 2661 | ||
2660 | d_instantiate(dentry, inode); | 2662 | d_instantiate(path.dentry, inode); |
2661 | inode->i_size = size; | 2663 | inode->i_size = size; |
2662 | inode->i_nlink = 0; /* It is unlinked */ | 2664 | inode->i_nlink = 0; /* It is unlinked */ |
2663 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | ||
2664 | &shmem_file_operations); | ||
2665 | |||
2666 | #ifndef CONFIG_MMU | 2665 | #ifndef CONFIG_MMU |
2667 | error = ramfs_nommu_expand_for_mapping(inode, size); | 2666 | error = ramfs_nommu_expand_for_mapping(inode, size); |
2668 | if (error) | 2667 | if (error) |
2669 | goto close_file; | 2668 | goto put_dentry; |
2670 | #endif | 2669 | #endif |
2671 | ima_counts_get(file); | 2670 | |
2671 | error = -ENFILE; | ||
2672 | file = alloc_file(&path, FMODE_WRITE | FMODE_READ, | ||
2673 | &shmem_file_operations); | ||
2674 | if (!file) | ||
2675 | goto put_dentry; | ||
2676 | |||
2672 | return file; | 2677 | return file; |
2673 | 2678 | ||
2674 | close_file: | ||
2675 | put_filp(file); | ||
2676 | put_dentry: | 2679 | put_dentry: |
2677 | dput(dentry); | 2680 | path_put(&path); |
2678 | put_memory: | 2681 | put_memory: |
2679 | shmem_unacct_size(flags, size); | 2682 | shmem_unacct_size(flags, size); |
2680 | return ERR_PTR(error); | 2683 | return ERR_PTR(error); |
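
For reference, a minimal caller of the rewritten shmem_file_setup() might look like the sketch below; the name, size, and example_use_shmem() wrapper are illustrative only, and the error handling simply follows the ERR_PTR() convention visible in the hunk.

#include <linux/err.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>

static int example_use_shmem(void)
{
	struct file *file;

	/* Anonymous tmpfs-backed file of one megabyte. */
	file = shmem_file_setup("example-shmem", 1024 * 1024, 0);
	if (IS_ERR(file))
		return PTR_ERR(file);

	/* ... use the file (e.g. mmap it or read/write its pages) ... */
	fput(file);
	return 0;
}
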
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c deleted file mode 100644 index df2c87fdae50..000000000000 --- a/mm/shmem_acl.c +++ /dev/null | |||
@@ -1,171 +0,0 @@ | |||
1 | /* | ||
2 | * mm/shmem_acl.c | ||
3 | * | ||
4 | * (C) 2005 Andreas Gruenbacher <agruen@suse.de> | ||
5 | * | ||
6 | * This file is released under the GPL. | ||
7 | */ | ||
8 | |||
9 | #include <linux/fs.h> | ||
10 | #include <linux/shmem_fs.h> | ||
11 | #include <linux/xattr.h> | ||
12 | #include <linux/generic_acl.h> | ||
13 | |||
14 | /** | ||
15 | * shmem_get_acl - generic_acl_operations->getacl() operation | ||
16 | */ | ||
17 | static struct posix_acl * | ||
18 | shmem_get_acl(struct inode *inode, int type) | ||
19 | { | ||
20 | struct posix_acl *acl = NULL; | ||
21 | |||
22 | spin_lock(&inode->i_lock); | ||
23 | switch(type) { | ||
24 | case ACL_TYPE_ACCESS: | ||
25 | acl = posix_acl_dup(inode->i_acl); | ||
26 | break; | ||
27 | |||
28 | case ACL_TYPE_DEFAULT: | ||
29 | acl = posix_acl_dup(inode->i_default_acl); | ||
30 | break; | ||
31 | } | ||
32 | spin_unlock(&inode->i_lock); | ||
33 | |||
34 | return acl; | ||
35 | } | ||
36 | |||
37 | /** | ||
38 | * shmem_set_acl - generic_acl_operations->setacl() operation | ||
39 | */ | ||
40 | static void | ||
41 | shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) | ||
42 | { | ||
43 | struct posix_acl *free = NULL; | ||
44 | |||
45 | spin_lock(&inode->i_lock); | ||
46 | switch(type) { | ||
47 | case ACL_TYPE_ACCESS: | ||
48 | free = inode->i_acl; | ||
49 | inode->i_acl = posix_acl_dup(acl); | ||
50 | break; | ||
51 | |||
52 | case ACL_TYPE_DEFAULT: | ||
53 | free = inode->i_default_acl; | ||
54 | inode->i_default_acl = posix_acl_dup(acl); | ||
55 | break; | ||
56 | } | ||
57 | spin_unlock(&inode->i_lock); | ||
58 | posix_acl_release(free); | ||
59 | } | ||
60 | |||
61 | struct generic_acl_operations shmem_acl_ops = { | ||
62 | .getacl = shmem_get_acl, | ||
63 | .setacl = shmem_set_acl, | ||
64 | }; | ||
65 | |||
66 | /** | ||
67 | * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access, | ||
68 | * shmem_xattr_acl_access_handler - plumbing code to implement the | ||
69 | * system.posix_acl_access xattr using the generic acl functions. | ||
70 | */ | ||
71 | |||
72 | static size_t | ||
73 | shmem_list_acl_access(struct inode *inode, char *list, size_t list_size, | ||
74 | const char *name, size_t name_len) | ||
75 | { | ||
76 | return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, | ||
77 | list, list_size); | ||
78 | } | ||
79 | |||
80 | static int | ||
81 | shmem_get_acl_access(struct inode *inode, const char *name, void *buffer, | ||
82 | size_t size) | ||
83 | { | ||
84 | if (strcmp(name, "") != 0) | ||
85 | return -EINVAL; | ||
86 | return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer, | ||
87 | size); | ||
88 | } | ||
89 | |||
90 | static int | ||
91 | shmem_set_acl_access(struct inode *inode, const char *name, const void *value, | ||
92 | size_t size, int flags) | ||
93 | { | ||
94 | if (strcmp(name, "") != 0) | ||
95 | return -EINVAL; | ||
96 | return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value, | ||
97 | size); | ||
98 | } | ||
99 | |||
100 | struct xattr_handler shmem_xattr_acl_access_handler = { | ||
101 | .prefix = POSIX_ACL_XATTR_ACCESS, | ||
102 | .list = shmem_list_acl_access, | ||
103 | .get = shmem_get_acl_access, | ||
104 | .set = shmem_set_acl_access, | ||
105 | }; | ||
106 | |||
107 | /** | ||
108 | * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default, | ||
109 | * shmem_xattr_acl_default_handler - plumbing code to implement the | ||
110 | * system.posix_acl_default xattr using the generic acl functions. | ||
111 | */ | ||
112 | |||
113 | static size_t | ||
114 | shmem_list_acl_default(struct inode *inode, char *list, size_t list_size, | ||
115 | const char *name, size_t name_len) | ||
116 | { | ||
117 | return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, | ||
118 | list, list_size); | ||
119 | } | ||
120 | |||
121 | static int | ||
122 | shmem_get_acl_default(struct inode *inode, const char *name, void *buffer, | ||
123 | size_t size) | ||
124 | { | ||
125 | if (strcmp(name, "") != 0) | ||
126 | return -EINVAL; | ||
127 | return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer, | ||
128 | size); | ||
129 | } | ||
130 | |||
131 | static int | ||
132 | shmem_set_acl_default(struct inode *inode, const char *name, const void *value, | ||
133 | size_t size, int flags) | ||
134 | { | ||
135 | if (strcmp(name, "") != 0) | ||
136 | return -EINVAL; | ||
137 | return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value, | ||
138 | size); | ||
139 | } | ||
140 | |||
141 | struct xattr_handler shmem_xattr_acl_default_handler = { | ||
142 | .prefix = POSIX_ACL_XATTR_DEFAULT, | ||
143 | .list = shmem_list_acl_default, | ||
144 | .get = shmem_get_acl_default, | ||
145 | .set = shmem_set_acl_default, | ||
146 | }; | ||
147 | |||
148 | /** | ||
149 | * shmem_acl_init - Initialize the acl(s) of a new inode | ||
150 | */ | ||
151 | int | ||
152 | shmem_acl_init(struct inode *inode, struct inode *dir) | ||
153 | { | ||
154 | return generic_acl_init(inode, dir, &shmem_acl_ops); | ||
155 | } | ||
156 | |||
157 | /** | ||
158 | * shmem_check_acl - check_acl() callback for generic_permission() | ||
159 | */ | ||
160 | int | ||
161 | shmem_check_acl(struct inode *inode, int mask) | ||
162 | { | ||
163 | struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); | ||
164 | |||
165 | if (acl) { | ||
166 | int error = posix_acl_permission(inode, acl, mask); | ||
167 | posix_acl_release(acl); | ||
168 | return error; | ||
169 | } | ||
170 | return -EAGAIN; | ||
171 | } | ||
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -654,7 +654,7 @@ static void init_node_lock_keys(int q) | |||
654 | 654 | ||
655 | l3 = s->cs_cachep->nodelists[q]; | 655 | l3 = s->cs_cachep->nodelists[q]; |
656 | if (!l3 || OFF_SLAB(s->cs_cachep)) | 656 | if (!l3 || OFF_SLAB(s->cs_cachep)) |
657 | return; | 657 | continue; |
658 | lockdep_set_class(&l3->list_lock, &on_slab_l3_key); | 658 | lockdep_set_class(&l3->list_lock, &on_slab_l3_key); |
659 | alc = l3->alien; | 659 | alc = l3->alien; |
660 | /* | 660 | /* |
@@ -665,7 +665,7 @@ static void init_node_lock_keys(int q) | |||
665 | * for alloc_alien_cache, | 665 | * for alloc_alien_cache, |
666 | */ | 666 | */ |
667 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) | 667 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) |
668 | return; | 668 | continue; |
669 | for_each_node(r) { | 669 | for_each_node(r) { |
670 | if (alc[r]) | 670 | if (alc[r]) |
671 | lockdep_set_class(&alc[r]->lock, | 671 | lockdep_set_class(&alc[r]->lock, |
@@ -1132,7 +1132,7 @@ static void __cpuinit cpuup_canceled(long cpu) | |||
1132 | if (nc) | 1132 | if (nc) |
1133 | free_block(cachep, nc->entry, nc->avail, node); | 1133 | free_block(cachep, nc->entry, nc->avail, node); |
1134 | 1134 | ||
1135 | if (!cpus_empty(*mask)) { | 1135 | if (!cpumask_empty(mask)) { |
1136 | spin_unlock_irq(&l3->list_lock); | 1136 | spin_unlock_irq(&l3->list_lock); |
1137 | goto free_array_cache; | 1137 | goto free_array_cache; |
1138 | } | 1138 | } |
@@ -2275,9 +2275,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2275 | /* | 2275 | /* |
2276 | * Determine if the slab management is 'on' or 'off' slab. | 2276 | * Determine if the slab management is 'on' or 'off' slab. |
2277 | * (bootstrapping cannot cope with offslab caches so don't do | 2277 | * (bootstrapping cannot cope with offslab caches so don't do |
2278 | * it too early on.) | 2278 | * it too early on. Always use on-slab management when |
2279 | * SLAB_NOLEAKTRACE is set, to avoid recursive calls into kmemleak.) | ||
2279 | */ | 2280 | */ |
2280 | if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) | 2281 | if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init && |
2282 | !(flags & SLAB_NOLEAKTRACE)) | ||
2281 | /* | 2283 | /* |
2282 | * Size is large, assume best to place the slab management obj | 2284 | * Size is large, assume best to place the slab management obj |
2283 | * off-slab (should allow better packing of objs). | 2285 | * off-slab (should allow better packing of objs). |
@@ -2596,8 +2598,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, | |||
2596 | * kmemleak does not treat the ->s_mem pointer as a reference | 2598 | * kmemleak does not treat the ->s_mem pointer as a reference |
2597 | * to the object. Otherwise we will not report the leak. | 2599 | * to the object. Otherwise we will not report the leak. |
2598 | */ | 2600 | */ |
2599 | kmemleak_scan_area(slabp, offsetof(struct slab, list), | 2601 | kmemleak_scan_area(&slabp->list, sizeof(struct list_head), |
2600 | sizeof(struct list_head), local_flags); | 2602 | local_flags); |
2601 | if (!slabp) | 2603 | if (!slabp) |
2602 | return NULL; | 2604 | return NULL; |
2603 | } else { | 2605 | } else { |
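
The practical effect of the new SLAB_NOLEAKTRACE test is that a cache created with that flag keeps its slab management on-slab even for sizes that would normally qualify for off-slab placement, so allocating the management structure cannot recurse back into kmemleak. A hedged sketch of such a cache follows; the name and object size are made up.

#include <linux/errno.h>
#include <linux/slab.h>

static struct kmem_cache *example_cache;

static int __init example_cache_init(void)
{
	/* 4096-byte objects would otherwise be candidates for off-slab management. */
	example_cache = kmem_cache_create("example_noleak", 4096, 0,
					  SLAB_NOLEAKTRACE, NULL);
	return example_cache ? 0 : -ENOMEM;
}
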
diff --git a/mm/truncate.c b/mm/truncate.c index 342deee22684..e87e37244829 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -522,22 +522,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | |||
522 | */ | 522 | */ |
523 | void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) | 523 | void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) |
524 | { | 524 | { |
525 | if (new < old) { | 525 | struct address_space *mapping = inode->i_mapping; |
526 | struct address_space *mapping = inode->i_mapping; | 526 | |
527 | 527 | /* | |
528 | /* | 528 | * unmap_mapping_range is called twice, first simply for |
529 | * unmap_mapping_range is called twice, first simply for | 529 | * efficiency so that truncate_inode_pages does fewer |
530 | * efficiency so that truncate_inode_pages does fewer | 530 | * single-page unmaps. However after this first call, and |
531 | * single-page unmaps. However after this first call, and | 531 | * before truncate_inode_pages finishes, it is possible for |
532 | * before truncate_inode_pages finishes, it is possible for | 532 | * private pages to be COWed, which remain after |
533 | * private pages to be COWed, which remain after | 533 | * truncate_inode_pages finishes, hence the second |
534 | * truncate_inode_pages finishes, hence the second | 534 | * unmap_mapping_range call must be made for correctness. |
535 | * unmap_mapping_range call must be made for correctness. | 535 | */ |
536 | */ | 536 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); |
537 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); | 537 | truncate_inode_pages(mapping, new); |
538 | truncate_inode_pages(mapping, new); | 538 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); |
539 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); | ||
540 | } | ||
541 | } | 539 | } |
542 | EXPORT_SYMBOL(truncate_pagecache); | 540 | EXPORT_SYMBOL(truncate_pagecache); |
543 | 541 | ||
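
With the size check gone, truncate_pagecache() always performs the unmap/truncate/unmap sequence described in its comment. A typical caller updates i_size first and then passes both the old and new sizes; the example_setsize() helper below is hypothetical and only illustrates that calling pattern.

#include <linux/fs.h>
#include <linux/mm.h>

static int example_setsize(struct inode *inode, loff_t newsize)
{
	loff_t oldsize = inode->i_size;

	i_size_write(inode, newsize);
	/* Drop pagecache pages and mappings that now lie beyond the new size. */
	truncate_pagecache(inode, oldsize, newsize);
	return 0;
}
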
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -4,10 +4,6 @@ | |||
4 | #include <linux/module.h> | 4 | #include <linux/module.h> |
5 | #include <linux/err.h> | 5 | #include <linux/err.h> |
6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
7 | #include <linux/hugetlb.h> | ||
8 | #include <linux/syscalls.h> | ||
9 | #include <linux/mman.h> | ||
10 | #include <linux/file.h> | ||
11 | #include <asm/uaccess.h> | 7 | #include <asm/uaccess.h> |
12 | 8 | ||
13 | #define CREATE_TRACE_POINTS | 9 | #define CREATE_TRACE_POINTS |
@@ -224,7 +220,7 @@ char *strndup_user(const char __user *s, long n) | |||
224 | } | 220 | } |
225 | EXPORT_SYMBOL(strndup_user); | 221 | EXPORT_SYMBOL(strndup_user); |
226 | 222 | ||
227 | #ifndef HAVE_ARCH_PICK_MMAP_LAYOUT | 223 | #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) |
228 | void arch_pick_mmap_layout(struct mm_struct *mm) | 224 | void arch_pick_mmap_layout(struct mm_struct *mm) |
229 | { | 225 | { |
230 | mm->mmap_base = TASK_UNMAPPED_BASE; | 226 | mm->mmap_base = TASK_UNMAPPED_BASE; |
@@ -272,46 +268,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, | |||
272 | } | 268 | } |
273 | EXPORT_SYMBOL_GPL(get_user_pages_fast); | 269 | EXPORT_SYMBOL_GPL(get_user_pages_fast); |
274 | 270 | ||
275 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | ||
276 | unsigned long, prot, unsigned long, flags, | ||
277 | unsigned long, fd, unsigned long, pgoff) | ||
278 | { | ||
279 | struct file * file = NULL; | ||
280 | unsigned long retval = -EBADF; | ||
281 | |||
282 | if (!(flags & MAP_ANONYMOUS)) { | ||
283 | if (unlikely(flags & MAP_HUGETLB)) | ||
284 | return -EINVAL; | ||
285 | file = fget(fd); | ||
286 | if (!file) | ||
287 | goto out; | ||
288 | } else if (flags & MAP_HUGETLB) { | ||
289 | struct user_struct *user = NULL; | ||
290 | /* | ||
291 | * VM_NORESERVE is used because the reservations will be | ||
292 | * taken when vm_ops->mmap() is called | ||
293 | * A dummy user value is used because we are not locking | ||
294 | * memory so no accounting is necessary | ||
295 | */ | ||
296 | len = ALIGN(len, huge_page_size(&default_hstate)); | ||
297 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, | ||
298 | &user, HUGETLB_ANONHUGE_INODE); | ||
299 | if (IS_ERR(file)) | ||
300 | return PTR_ERR(file); | ||
301 | } | ||
302 | |||
303 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
304 | |||
305 | down_write(¤t->mm->mmap_sem); | ||
306 | retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
307 | up_write(¤t->mm->mmap_sem); | ||
308 | |||
309 | if (file) | ||
310 | fput(file); | ||
311 | out: | ||
312 | return retval; | ||
313 | } | ||
314 | |||
315 | /* Tracepoints definitions. */ | 271 | /* Tracepoints definitions. */ |
316 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); | 272 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); |
317 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); | 273 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 37e69295f250..ae007462b7f6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -509,6 +509,9 @@ static unsigned long lazy_max_pages(void) | |||
509 | 509 | ||
510 | static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); | 510 | static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); |
511 | 511 | ||
512 | /* for per-CPU blocks */ | ||
513 | static void purge_fragmented_blocks_allcpus(void); | ||
514 | |||
512 | /* | 515 | /* |
513 | * Purges all lazily-freed vmap areas. | 516 | * Purges all lazily-freed vmap areas. |
514 | * | 517 | * |
@@ -539,6 +542,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | |||
539 | } else | 542 | } else |
540 | spin_lock(&purge_lock); | 543 | spin_lock(&purge_lock); |
541 | 544 | ||
545 | if (sync) | ||
546 | purge_fragmented_blocks_allcpus(); | ||
547 | |||
542 | rcu_read_lock(); | 548 | rcu_read_lock(); |
543 | list_for_each_entry_rcu(va, &vmap_area_list, list) { | 549 | list_for_each_entry_rcu(va, &vmap_area_list, list) { |
544 | if (va->flags & VM_LAZY_FREE) { | 550 | if (va->flags & VM_LAZY_FREE) { |
@@ -555,10 +561,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | |||
555 | } | 561 | } |
556 | rcu_read_unlock(); | 562 | rcu_read_unlock(); |
557 | 563 | ||
558 | if (nr) { | 564 | if (nr) |
559 | BUG_ON(nr > atomic_read(&vmap_lazy_nr)); | ||
560 | atomic_sub(nr, &vmap_lazy_nr); | 565 | atomic_sub(nr, &vmap_lazy_nr); |
561 | } | ||
562 | 566 | ||
563 | if (nr || force_flush) | 567 | if (nr || force_flush) |
564 | flush_tlb_kernel_range(*start, *end); | 568 | flush_tlb_kernel_range(*start, *end); |
@@ -669,8 +673,6 @@ static bool vmap_initialized __read_mostly = false; | |||
669 | struct vmap_block_queue { | 673 | struct vmap_block_queue { |
670 | spinlock_t lock; | 674 | spinlock_t lock; |
671 | struct list_head free; | 675 | struct list_head free; |
672 | struct list_head dirty; | ||
673 | unsigned int nr_dirty; | ||
674 | }; | 676 | }; |
675 | 677 | ||
676 | struct vmap_block { | 678 | struct vmap_block { |
@@ -680,10 +682,9 @@ struct vmap_block { | |||
680 | unsigned long free, dirty; | 682 | unsigned long free, dirty; |
681 | DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); | 683 | DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); |
682 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); | 684 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); |
683 | union { | 685 | struct list_head free_list; |
684 | struct list_head free_list; | 686 | struct rcu_head rcu_head; |
685 | struct rcu_head rcu_head; | 687 | struct list_head purge; |
686 | }; | ||
687 | }; | 688 | }; |
688 | 689 | ||
689 | /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ | 690 | /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ |
@@ -759,7 +760,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
759 | vbq = &get_cpu_var(vmap_block_queue); | 760 | vbq = &get_cpu_var(vmap_block_queue); |
760 | vb->vbq = vbq; | 761 | vb->vbq = vbq; |
761 | spin_lock(&vbq->lock); | 762 | spin_lock(&vbq->lock); |
762 | list_add(&vb->free_list, &vbq->free); | 763 | list_add_rcu(&vb->free_list, &vbq->free); |
763 | spin_unlock(&vbq->lock); | 764 | spin_unlock(&vbq->lock); |
764 | put_cpu_var(vmap_block_queue); | 765 | put_cpu_var(vmap_block_queue); |
765 | 766 | ||
@@ -778,8 +779,6 @@ static void free_vmap_block(struct vmap_block *vb) | |||
778 | struct vmap_block *tmp; | 779 | struct vmap_block *tmp; |
779 | unsigned long vb_idx; | 780 | unsigned long vb_idx; |
780 | 781 | ||
781 | BUG_ON(!list_empty(&vb->free_list)); | ||
782 | |||
783 | vb_idx = addr_to_vb_idx(vb->va->va_start); | 782 | vb_idx = addr_to_vb_idx(vb->va->va_start); |
784 | spin_lock(&vmap_block_tree_lock); | 783 | spin_lock(&vmap_block_tree_lock); |
785 | tmp = radix_tree_delete(&vmap_block_tree, vb_idx); | 784 | tmp = radix_tree_delete(&vmap_block_tree, vb_idx); |
@@ -790,12 +789,61 @@ static void free_vmap_block(struct vmap_block *vb) | |||
790 | call_rcu(&vb->rcu_head, rcu_free_vb); | 789 | call_rcu(&vb->rcu_head, rcu_free_vb); |
791 | } | 790 | } |
792 | 791 | ||
792 | static void purge_fragmented_blocks(int cpu) | ||
793 | { | ||
794 | LIST_HEAD(purge); | ||
795 | struct vmap_block *vb; | ||
796 | struct vmap_block *n_vb; | ||
797 | struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); | ||
798 | |||
799 | rcu_read_lock(); | ||
800 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | ||
801 | |||
802 | if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS)) | ||
803 | continue; | ||
804 | |||
805 | spin_lock(&vb->lock); | ||
806 | if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { | ||
807 | vb->free = 0; /* prevent further allocs after releasing lock */ | ||
808 | vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ | ||
809 | bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS); | ||
810 | bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); | ||
811 | spin_lock(&vbq->lock); | ||
812 | list_del_rcu(&vb->free_list); | ||
813 | spin_unlock(&vbq->lock); | ||
814 | spin_unlock(&vb->lock); | ||
815 | list_add_tail(&vb->purge, &purge); | ||
816 | } else | ||
817 | spin_unlock(&vb->lock); | ||
818 | } | ||
819 | rcu_read_unlock(); | ||
820 | |||
821 | list_for_each_entry_safe(vb, n_vb, &purge, purge) { | ||
822 | list_del(&vb->purge); | ||
823 | free_vmap_block(vb); | ||
824 | } | ||
825 | } | ||
826 | |||
827 | static void purge_fragmented_blocks_thiscpu(void) | ||
828 | { | ||
829 | purge_fragmented_blocks(smp_processor_id()); | ||
830 | } | ||
831 | |||
832 | static void purge_fragmented_blocks_allcpus(void) | ||
833 | { | ||
834 | int cpu; | ||
835 | |||
836 | for_each_possible_cpu(cpu) | ||
837 | purge_fragmented_blocks(cpu); | ||
838 | } | ||
839 | |||
793 | static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | 840 | static void *vb_alloc(unsigned long size, gfp_t gfp_mask) |
794 | { | 841 | { |
795 | struct vmap_block_queue *vbq; | 842 | struct vmap_block_queue *vbq; |
796 | struct vmap_block *vb; | 843 | struct vmap_block *vb; |
797 | unsigned long addr = 0; | 844 | unsigned long addr = 0; |
798 | unsigned int order; | 845 | unsigned int order; |
846 | int purge = 0; | ||
799 | 847 | ||
800 | BUG_ON(size & ~PAGE_MASK); | 848 | BUG_ON(size & ~PAGE_MASK); |
801 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | 849 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); |
@@ -808,24 +856,38 @@ again: | |||
808 | int i; | 856 | int i; |
809 | 857 | ||
810 | spin_lock(&vb->lock); | 858 | spin_lock(&vb->lock); |
859 | if (vb->free < 1UL << order) | ||
860 | goto next; | ||
861 | |||
811 | i = bitmap_find_free_region(vb->alloc_map, | 862 | i = bitmap_find_free_region(vb->alloc_map, |
812 | VMAP_BBMAP_BITS, order); | 863 | VMAP_BBMAP_BITS, order); |
813 | 864 | ||
814 | if (i >= 0) { | 865 | if (i < 0) { |
815 | addr = vb->va->va_start + (i << PAGE_SHIFT); | 866 | if (vb->free + vb->dirty == VMAP_BBMAP_BITS) { |
816 | BUG_ON(addr_to_vb_idx(addr) != | 867 | /* fragmented and no outstanding allocations */ |
817 | addr_to_vb_idx(vb->va->va_start)); | 868 | BUG_ON(vb->dirty != VMAP_BBMAP_BITS); |
818 | vb->free -= 1UL << order; | 869 | purge = 1; |
819 | if (vb->free == 0) { | ||
820 | spin_lock(&vbq->lock); | ||
821 | list_del_init(&vb->free_list); | ||
822 | spin_unlock(&vbq->lock); | ||
823 | } | 870 | } |
824 | spin_unlock(&vb->lock); | 871 | goto next; |
825 | break; | 872 | } |
873 | addr = vb->va->va_start + (i << PAGE_SHIFT); | ||
874 | BUG_ON(addr_to_vb_idx(addr) != | ||
875 | addr_to_vb_idx(vb->va->va_start)); | ||
876 | vb->free -= 1UL << order; | ||
877 | if (vb->free == 0) { | ||
878 | spin_lock(&vbq->lock); | ||
879 | list_del_rcu(&vb->free_list); | ||
880 | spin_unlock(&vbq->lock); | ||
826 | } | 881 | } |
827 | spin_unlock(&vb->lock); | 882 | spin_unlock(&vb->lock); |
883 | break; | ||
884 | next: | ||
885 | spin_unlock(&vb->lock); | ||
828 | } | 886 | } |
887 | |||
888 | if (purge) | ||
889 | purge_fragmented_blocks_thiscpu(); | ||
890 | |||
829 | put_cpu_var(vmap_block_queue); | 891 | put_cpu_var(vmap_block_queue); |
830 | rcu_read_unlock(); | 892 | rcu_read_unlock(); |
831 | 893 | ||
@@ -862,11 +924,11 @@ static void vb_free(const void *addr, unsigned long size) | |||
862 | BUG_ON(!vb); | 924 | BUG_ON(!vb); |
863 | 925 | ||
864 | spin_lock(&vb->lock); | 926 | spin_lock(&vb->lock); |
865 | bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); | 927 | BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); |
866 | 928 | ||
867 | vb->dirty += 1UL << order; | 929 | vb->dirty += 1UL << order; |
868 | if (vb->dirty == VMAP_BBMAP_BITS) { | 930 | if (vb->dirty == VMAP_BBMAP_BITS) { |
869 | BUG_ON(vb->free || !list_empty(&vb->free_list)); | 931 | BUG_ON(vb->free); |
870 | spin_unlock(&vb->lock); | 932 | spin_unlock(&vb->lock); |
871 | free_vmap_block(vb); | 933 | free_vmap_block(vb); |
872 | } else | 934 | } else |
@@ -1035,8 +1097,6 @@ void __init vmalloc_init(void) | |||
1035 | vbq = &per_cpu(vmap_block_queue, i); | 1097 | vbq = &per_cpu(vmap_block_queue, i); |
1036 | spin_lock_init(&vbq->lock); | 1098 | spin_lock_init(&vbq->lock); |
1037 | INIT_LIST_HEAD(&vbq->free); | 1099 | INIT_LIST_HEAD(&vbq->free); |
1038 | INIT_LIST_HEAD(&vbq->dirty); | ||
1039 | vbq->nr_dirty = 0; | ||
1040 | } | 1100 | } |
1041 | 1101 | ||
1042 | /* Import existing vmlist entries. */ | 1102 | /* Import existing vmlist entries. */ |
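
The condition tested twice in purge_fragmented_blocks() reads: every bit in the block is either still free or already dirty (so no allocation is outstanding), yet the block is not entirely dirty, since fully dirty blocks are released by vb_free() itself. A small restatement of that predicate, with the bitmap size passed in purely for illustration:

#include <linux/types.h>

/* Purgeable: no outstanding allocations, but not already fully dirty. */
static inline bool vb_is_purgeable(unsigned long free, unsigned long dirty,
				   unsigned long bbmap_bits)
{
	return free + dirty == bbmap_bits && dirty != bbmap_bits;
}

For a block of 1024 bits, free = 256 and dirty = 768 is purgeable, while free = 256 and dirty = 512 still has 256 bits allocated and is left alone.
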
diff --git a/mm/vmscan.c b/mm/vmscan.c index 885207a6b6b7..c26986c85ce0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1922,6 +1922,9 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | |||
1922 | if (!populated_zone(zone)) | 1922 | if (!populated_zone(zone)) |
1923 | continue; | 1923 | continue; |
1924 | 1924 | ||
1925 | if (zone_is_all_unreclaimable(zone)) | ||
1926 | continue; | ||
1927 | |||
1925 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | 1928 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), |
1926 | 0, 0)) | 1929 | 0, 0)) |
1927 | return 1; | 1930 | return 1; |