diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-04-14 18:09:40 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-04-14 18:09:40 -0400 |
commit | 6b3a707736301c2128ca85ce85fb13f60b5e350a (patch) | |
tree | 2bf1892cf29121150adece8d1221ecd513a4e792 | |
parent | 4443f8e6ac7755cd775c70d08be8042dc2f936cb (diff) | |
parent | 15fab63e1e57be9fdb5eec1bbc5916e9825e9acb (diff) |
Merge branch 'page-refs' (page ref overflow)
Merge page ref overflow branch.
Jann Horn reported that he can overflow the page ref count with
sufficient memory (and a filesystem that is intentionally extremely
slow).
Admittedly it's not exactly easy. To have more than four billion
references to a page requires a minimum of 32GB of kernel memory just
for the pointers to the pages, much less any metadata to keep track of
those pointers. Jann needed a total of 140GB of memory and a specially
crafted filesystem that leaves all reads pending (in order to not ever
free the page references and just keep adding more).
Still, we have a fairly straightforward way to limit the two obvious
user-controllable sources of page references: direct-IO-like page
references obtained through get_user_pages(), and the splice pipe page
duplication. So let's just do that.
* branch page-refs:
fs: prevent page refcount overflow in pipe_buf_get
mm: prevent get_user_pages() from overflowing page refcount
mm: add 'try_get_page()' helper function
mm: make page ref count overflow check tighter and more explicit
-rw-r--r-- | fs/fuse/dev.c | 12 | ||||
-rw-r--r-- | fs/pipe.c | 4 | ||||
-rw-r--r-- | fs/splice.c | 12 | ||||
-rw-r--r-- | include/linux/mm.h | 15 | ||||
-rw-r--r-- | include/linux/pipe_fs_i.h | 10 | ||||
-rw-r--r-- | kernel/trace/trace.c | 6 | ||||
-rw-r--r-- | mm/gup.c | 48 | ||||
-rw-r--r-- | mm/hugetlb.c | 13 |
8 files changed, 92 insertions, 28 deletions
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8a63e52785e9..9971a35cf1ef 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c | |||
@@ -2056,10 +2056,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, | |||
2056 | rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; | 2056 | rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; |
2057 | 2057 | ||
2058 | ret = -EINVAL; | 2058 | ret = -EINVAL; |
2059 | if (rem < len) { | 2059 | if (rem < len) |
2060 | pipe_unlock(pipe); | 2060 | goto out_free; |
2061 | goto out; | ||
2062 | } | ||
2063 | 2061 | ||
2064 | rem = len; | 2062 | rem = len; |
2065 | while (rem) { | 2063 | while (rem) { |
@@ -2077,7 +2075,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, | |||
2077 | pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); | 2075 | pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); |
2078 | pipe->nrbufs--; | 2076 | pipe->nrbufs--; |
2079 | } else { | 2077 | } else { |
2080 | pipe_buf_get(pipe, ibuf); | 2078 | if (!pipe_buf_get(pipe, ibuf)) |
2079 | goto out_free; | ||
2080 | |||
2081 | *obuf = *ibuf; | 2081 | *obuf = *ibuf; |
2082 | obuf->flags &= ~PIPE_BUF_FLAG_GIFT; | 2082 | obuf->flags &= ~PIPE_BUF_FLAG_GIFT; |
2083 | obuf->len = rem; | 2083 | obuf->len = rem; |
@@ -2100,11 +2100,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, | |||
2100 | ret = fuse_dev_do_write(fud, &cs, len); | 2100 | ret = fuse_dev_do_write(fud, &cs, len); |
2101 | 2101 | ||
2102 | pipe_lock(pipe); | 2102 | pipe_lock(pipe); |
2103 | out_free: | ||
2103 | for (idx = 0; idx < nbuf; idx++) | 2104 | for (idx = 0; idx < nbuf; idx++) |
2104 | pipe_buf_release(pipe, &bufs[idx]); | 2105 | pipe_buf_release(pipe, &bufs[idx]); |
2105 | pipe_unlock(pipe); | 2106 | pipe_unlock(pipe); |
2106 | 2107 | ||
2107 | out: | ||
2108 | kvfree(bufs); | 2108 | kvfree(bufs); |
2109 | return ret; | 2109 | return ret; |
2110 | } | 2110 | } |
@@ -188,9 +188,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal); | |||
188 | * in the tee() system call, when we duplicate the buffers in one | 188 | * in the tee() system call, when we duplicate the buffers in one |
189 | * pipe into another. | 189 | * pipe into another. |
190 | */ | 190 | */ |
191 | void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) | 191 | bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) |
192 | { | 192 | { |
193 | get_page(buf->page); | 193 | return try_get_page(buf->page); |
194 | } | 194 | } |
195 | EXPORT_SYMBOL(generic_pipe_buf_get); | 195 | EXPORT_SYMBOL(generic_pipe_buf_get); |
196 | 196 | ||
diff --git a/fs/splice.c b/fs/splice.c index 3ee7e82df48f..98943d9b219c 100644 --- a/fs/splice.c +++ b/fs/splice.c | |||
@@ -1593,7 +1593,11 @@ retry: | |||
1593 | * Get a reference to this pipe buffer, | 1593 | * Get a reference to this pipe buffer, |
1594 | * so we can copy the contents over. | 1594 | * so we can copy the contents over. |
1595 | */ | 1595 | */ |
1596 | pipe_buf_get(ipipe, ibuf); | 1596 | if (!pipe_buf_get(ipipe, ibuf)) { |
1597 | if (ret == 0) | ||
1598 | ret = -EFAULT; | ||
1599 | break; | ||
1600 | } | ||
1597 | *obuf = *ibuf; | 1601 | *obuf = *ibuf; |
1598 | 1602 | ||
1599 | /* | 1603 | /* |
@@ -1667,7 +1671,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, | |||
1667 | * Get a reference to this pipe buffer, | 1671 | * Get a reference to this pipe buffer, |
1668 | * so we can copy the contents over. | 1672 | * so we can copy the contents over. |
1669 | */ | 1673 | */ |
1670 | pipe_buf_get(ipipe, ibuf); | 1674 | if (!pipe_buf_get(ipipe, ibuf)) { |
1675 | if (ret == 0) | ||
1676 | ret = -EFAULT; | ||
1677 | break; | ||
1678 | } | ||
1671 | 1679 | ||
1672 | obuf = opipe->bufs + nbuf; | 1680 | obuf = opipe->bufs + nbuf; |
1673 | *obuf = *ibuf; | 1681 | *obuf = *ibuf; |
diff --git a/include/linux/mm.h b/include/linux/mm.h index 76769749b5a5..6b10c21630f5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -966,6 +966,10 @@ static inline bool is_pci_p2pdma_page(const struct page *page) | |||
966 | } | 966 | } |
967 | #endif /* CONFIG_DEV_PAGEMAP_OPS */ | 967 | #endif /* CONFIG_DEV_PAGEMAP_OPS */ |
968 | 968 | ||
969 | /* 127: arbitrary random number, small enough to assemble well */ | ||
970 | #define page_ref_zero_or_close_to_overflow(page) \ | ||
971 | ((unsigned int) page_ref_count(page) + 127u <= 127u) | ||
972 | |||
969 | static inline void get_page(struct page *page) | 973 | static inline void get_page(struct page *page) |
970 | { | 974 | { |
971 | page = compound_head(page); | 975 | page = compound_head(page); |
@@ -973,8 +977,17 @@ static inline void get_page(struct page *page) | |||
973 | * Getting a normal page or the head of a compound page | 977 | * Getting a normal page or the head of a compound page |
974 | * requires to already have an elevated page->_refcount. | 978 | * requires to already have an elevated page->_refcount. |
975 | */ | 979 | */ |
976 | VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); | 980 | VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page); |
981 | page_ref_inc(page); | ||
982 | } | ||
983 | |||
984 | static inline __must_check bool try_get_page(struct page *page) | ||
985 | { | ||
986 | page = compound_head(page); | ||
987 | if (WARN_ON_ONCE(page_ref_count(page) <= 0)) | ||
988 | return false; | ||
977 | page_ref_inc(page); | 989 | page_ref_inc(page); |
990 | return true; | ||
978 | } | 991 | } |
979 | 992 | ||
980 | static inline void put_page(struct page *page) | 993 | static inline void put_page(struct page *page) |
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 787d224ff43e..abb2dac3da9b 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h | |||
@@ -101,18 +101,20 @@ struct pipe_buf_operations { | |||
101 | /* | 101 | /* |
102 | * Get a reference to the pipe buffer. | 102 | * Get a reference to the pipe buffer. |
103 | */ | 103 | */ |
104 | void (*get)(struct pipe_inode_info *, struct pipe_buffer *); | 104 | bool (*get)(struct pipe_inode_info *, struct pipe_buffer *); |
105 | }; | 105 | }; |
106 | 106 | ||
107 | /** | 107 | /** |
108 | * pipe_buf_get - get a reference to a pipe_buffer | 108 | * pipe_buf_get - get a reference to a pipe_buffer |
109 | * @pipe: the pipe that the buffer belongs to | 109 | * @pipe: the pipe that the buffer belongs to |
110 | * @buf: the buffer to get a reference to | 110 | * @buf: the buffer to get a reference to |
111 | * | ||
112 | * Return: %true if the reference was successfully obtained. | ||
111 | */ | 113 | */ |
112 | static inline void pipe_buf_get(struct pipe_inode_info *pipe, | 114 | static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe, |
113 | struct pipe_buffer *buf) | 115 | struct pipe_buffer *buf) |
114 | { | 116 | { |
115 | buf->ops->get(pipe, buf); | 117 | return buf->ops->get(pipe, buf); |
116 | } | 118 | } |
117 | 119 | ||
118 | /** | 120 | /** |
@@ -171,7 +173,7 @@ struct pipe_inode_info *alloc_pipe_info(void); | |||
171 | void free_pipe_info(struct pipe_inode_info *); | 173 | void free_pipe_info(struct pipe_inode_info *); |
172 | 174 | ||
173 | /* Generic pipe buffer ops functions */ | 175 | /* Generic pipe buffer ops functions */ |
174 | void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); | 176 | bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); |
175 | int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); | 177 | int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); |
176 | int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); | 178 | int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); |
177 | void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); | 179 | void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 21153e64bf1c..6c24755655c7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -7041,12 +7041,16 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, | |||
7041 | buf->private = 0; | 7041 | buf->private = 0; |
7042 | } | 7042 | } |
7043 | 7043 | ||
7044 | static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, | 7044 | static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, |
7045 | struct pipe_buffer *buf) | 7045 | struct pipe_buffer *buf) |
7046 | { | 7046 | { |
7047 | struct buffer_ref *ref = (struct buffer_ref *)buf->private; | 7047 | struct buffer_ref *ref = (struct buffer_ref *)buf->private; |
7048 | 7048 | ||
7049 | if (ref->ref > INT_MAX/2) | ||
7050 | return false; | ||
7051 | |||
7049 | ref->ref++; | 7052 | ref->ref++; |
7053 | return true; | ||
7050 | } | 7054 | } |
7051 | 7055 | ||
7052 | /* Pipe buffer operations for a buffer. */ | 7056 | /* Pipe buffer operations for a buffer. */ |
@@ -160,8 +160,12 @@ retry: | |||
160 | goto retry; | 160 | goto retry; |
161 | } | 161 | } |
162 | 162 | ||
163 | if (flags & FOLL_GET) | 163 | if (flags & FOLL_GET) { |
164 | get_page(page); | 164 | if (unlikely(!try_get_page(page))) { |
165 | page = ERR_PTR(-ENOMEM); | ||
166 | goto out; | ||
167 | } | ||
168 | } | ||
165 | if (flags & FOLL_TOUCH) { | 169 | if (flags & FOLL_TOUCH) { |
166 | if ((flags & FOLL_WRITE) && | 170 | if ((flags & FOLL_WRITE) && |
167 | !pte_dirty(pte) && !PageDirty(page)) | 171 | !pte_dirty(pte) && !PageDirty(page)) |
@@ -298,7 +302,10 @@ retry_locked: | |||
298 | if (pmd_trans_unstable(pmd)) | 302 | if (pmd_trans_unstable(pmd)) |
299 | ret = -EBUSY; | 303 | ret = -EBUSY; |
300 | } else { | 304 | } else { |
301 | get_page(page); | 305 | if (unlikely(!try_get_page(page))) { |
306 | spin_unlock(ptl); | ||
307 | return ERR_PTR(-ENOMEM); | ||
308 | } | ||
302 | spin_unlock(ptl); | 309 | spin_unlock(ptl); |
303 | lock_page(page); | 310 | lock_page(page); |
304 | ret = split_huge_page(page); | 311 | ret = split_huge_page(page); |
@@ -500,7 +507,10 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, | |||
500 | if (is_device_public_page(*page)) | 507 | if (is_device_public_page(*page)) |
501 | goto unmap; | 508 | goto unmap; |
502 | } | 509 | } |
503 | get_page(*page); | 510 | if (unlikely(!try_get_page(*page))) { |
511 | ret = -ENOMEM; | ||
512 | goto unmap; | ||
513 | } | ||
504 | out: | 514 | out: |
505 | ret = 0; | 515 | ret = 0; |
506 | unmap: | 516 | unmap: |
@@ -1545,6 +1555,20 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) | |||
1545 | } | 1555 | } |
1546 | } | 1556 | } |
1547 | 1557 | ||
1558 | /* | ||
1559 | * Return the compound head page with ref appropriately incremented, | ||
1560 | * or NULL if that failed. | ||
1561 | */ | ||
1562 | static inline struct page *try_get_compound_head(struct page *page, int refs) | ||
1563 | { | ||
1564 | struct page *head = compound_head(page); | ||
1565 | if (WARN_ON_ONCE(page_ref_count(head) < 0)) | ||
1566 | return NULL; | ||
1567 | if (unlikely(!page_cache_add_speculative(head, refs))) | ||
1568 | return NULL; | ||
1569 | return head; | ||
1570 | } | ||
1571 | |||
1548 | #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL | 1572 | #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL |
1549 | static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | 1573 | static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, |
1550 | int write, struct page **pages, int *nr) | 1574 | int write, struct page **pages, int *nr) |
@@ -1579,9 +1603,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | |||
1579 | 1603 | ||
1580 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | 1604 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); |
1581 | page = pte_page(pte); | 1605 | page = pte_page(pte); |
1582 | head = compound_head(page); | ||
1583 | 1606 | ||
1584 | if (!page_cache_get_speculative(head)) | 1607 | head = try_get_compound_head(page, 1); |
1608 | if (!head) | ||
1585 | goto pte_unmap; | 1609 | goto pte_unmap; |
1586 | 1610 | ||
1587 | if (unlikely(pte_val(pte) != pte_val(*ptep))) { | 1611 | if (unlikely(pte_val(pte) != pte_val(*ptep))) { |
@@ -1720,8 +1744,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, | |||
1720 | refs++; | 1744 | refs++; |
1721 | } while (addr += PAGE_SIZE, addr != end); | 1745 | } while (addr += PAGE_SIZE, addr != end); |
1722 | 1746 | ||
1723 | head = compound_head(pmd_page(orig)); | 1747 | head = try_get_compound_head(pmd_page(orig), refs); |
1724 | if (!page_cache_add_speculative(head, refs)) { | 1748 | if (!head) { |
1725 | *nr -= refs; | 1749 | *nr -= refs; |
1726 | return 0; | 1750 | return 0; |
1727 | } | 1751 | } |
@@ -1758,8 +1782,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, | |||
1758 | refs++; | 1782 | refs++; |
1759 | } while (addr += PAGE_SIZE, addr != end); | 1783 | } while (addr += PAGE_SIZE, addr != end); |
1760 | 1784 | ||
1761 | head = compound_head(pud_page(orig)); | 1785 | head = try_get_compound_head(pud_page(orig), refs); |
1762 | if (!page_cache_add_speculative(head, refs)) { | 1786 | if (!head) { |
1763 | *nr -= refs; | 1787 | *nr -= refs; |
1764 | return 0; | 1788 | return 0; |
1765 | } | 1789 | } |
@@ -1795,8 +1819,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, | |||
1795 | refs++; | 1819 | refs++; |
1796 | } while (addr += PAGE_SIZE, addr != end); | 1820 | } while (addr += PAGE_SIZE, addr != end); |
1797 | 1821 | ||
1798 | head = compound_head(pgd_page(orig)); | 1822 | head = try_get_compound_head(pgd_page(orig), refs); |
1799 | if (!page_cache_add_speculative(head, refs)) { | 1823 | if (!head) { |
1800 | *nr -= refs; | 1824 | *nr -= refs; |
1801 | return 0; | 1825 | return 0; |
1802 | } | 1826 | } |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 97b1e0290c66..6cdc7b2d9100 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -4299,6 +4299,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
4299 | 4299 | ||
4300 | pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; | 4300 | pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; |
4301 | page = pte_page(huge_ptep_get(pte)); | 4301 | page = pte_page(huge_ptep_get(pte)); |
4302 | |||
4303 | /* | ||
4304 | * Instead of doing 'try_get_page()' below in the same_page | ||
4305 | * loop, just check the count once here. | ||
4306 | */ | ||
4307 | if (unlikely(page_count(page) <= 0)) { | ||
4308 | if (pages) { | ||
4309 | spin_unlock(ptl); | ||
4310 | remainder = 0; | ||
4311 | err = -ENOMEM; | ||
4312 | break; | ||
4313 | } | ||
4314 | } | ||
4302 | same_page: | 4315 | same_page: |
4303 | if (pages) { | 4316 | if (pages) { |
4304 | pages[i] = mem_map_offset(page, pfn_offset); | 4317 | pages[i] = mem_map_offset(page, pfn_offset); |