summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2019-04-14 18:09:40 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2019-04-14 18:09:40 -0400
commit 6b3a707736301c2128ca85ce85fb13f60b5e350a (patch)
tree 2bf1892cf29121150adece8d1221ecd513a4e792
parent 4443f8e6ac7755cd775c70d08be8042dc2f936cb (diff)
parent 15fab63e1e57be9fdb5eec1bbc5916e9825e9acb (diff)
Merge branch 'page-refs' (page ref overflow)
Merge page ref overflow branch.

Jann Horn reported that he can overflow the page ref count with sufficient memory (and a filesystem that is intentionally extremely slow).

Admittedly it's not exactly easy. To have more than four billion references to a page requires a minimum of 32GB of kernel memory just for the pointers to the pages, much less any metadata to keep track of those pointers. Jann needed a total of 140GB of memory and a specially crafted filesystem that leaves all reads pending (in order to not ever free the page references and just keep adding more).

Still, we have a fairly straightforward way to limit the two obvious user-controllable sources of page references: direct-IO like page references gotten through get_user_pages(), and the splice pipe page duplication. So let's just do that.

* branch page-refs:
  - fs: prevent page refcount overflow in pipe_buf_get
  - mm: prevent get_user_pages() from overflowing page refcount
  - mm: add 'try_get_page()' helper function
  - mm: make page ref count overflow check tighter and more explicit
-rw-r--r-- fs/fuse/dev.c 12
-rw-r--r-- fs/pipe.c 4
-rw-r--r-- fs/splice.c 12
-rw-r--r-- include/linux/mm.h 15
-rw-r--r-- include/linux/pipe_fs_i.h 10
-rw-r--r-- kernel/trace/trace.c 6
-rw-r--r-- mm/gup.c 48
-rw-r--r-- mm/hugetlb.c 13
8 files changed, 92 insertions, 28 deletions
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8a63e52785e9..9971a35cf1ef 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2056,10 +2056,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
2056 rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; 2056 rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
2057 2057
2058 ret = -EINVAL; 2058 ret = -EINVAL;
2059 if (rem < len) { 2059 if (rem < len)
2060 pipe_unlock(pipe); 2060 goto out_free;
2061 goto out;
2062 }
2063 2061
2064 rem = len; 2062 rem = len;
2065 while (rem) { 2063 while (rem) {
@@ -2077,7 +2075,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
2077 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); 2075 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
2078 pipe->nrbufs--; 2076 pipe->nrbufs--;
2079 } else { 2077 } else {
2080 pipe_buf_get(pipe, ibuf); 2078 if (!pipe_buf_get(pipe, ibuf))
2079 goto out_free;
2080
2081 *obuf = *ibuf; 2081 *obuf = *ibuf;
2082 obuf->flags &= ~PIPE_BUF_FLAG_GIFT; 2082 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
2083 obuf->len = rem; 2083 obuf->len = rem;
@@ -2100,11 +2100,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
2100 ret = fuse_dev_do_write(fud, &cs, len); 2100 ret = fuse_dev_do_write(fud, &cs, len);
2101 2101
2102 pipe_lock(pipe); 2102 pipe_lock(pipe);
2103out_free:
2103 for (idx = 0; idx < nbuf; idx++) 2104 for (idx = 0; idx < nbuf; idx++)
2104 pipe_buf_release(pipe, &bufs[idx]); 2105 pipe_buf_release(pipe, &bufs[idx]);
2105 pipe_unlock(pipe); 2106 pipe_unlock(pipe);
2106 2107
2107out:
2108 kvfree(bufs); 2108 kvfree(bufs);
2109 return ret; 2109 return ret;
2110} 2110}
diff --git a/fs/pipe.c b/fs/pipe.c
index 070aad543382..41065901106b 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -188,9 +188,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal);
188 * in the tee() system call, when we duplicate the buffers in one 188 * in the tee() system call, when we duplicate the buffers in one
189 * pipe into another. 189 * pipe into another.
190 */ 190 */
191void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) 191bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
192{ 192{
193 get_page(buf->page); 193 return try_get_page(buf->page);
194} 194}
195EXPORT_SYMBOL(generic_pipe_buf_get); 195EXPORT_SYMBOL(generic_pipe_buf_get);
196 196
diff --git a/fs/splice.c b/fs/splice.c
index 3ee7e82df48f..98943d9b219c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1593,7 +1593,11 @@ retry:
1593 * Get a reference to this pipe buffer, 1593 * Get a reference to this pipe buffer,
1594 * so we can copy the contents over. 1594 * so we can copy the contents over.
1595 */ 1595 */
1596 pipe_buf_get(ipipe, ibuf); 1596 if (!pipe_buf_get(ipipe, ibuf)) {
1597 if (ret == 0)
1598 ret = -EFAULT;
1599 break;
1600 }
1597 *obuf = *ibuf; 1601 *obuf = *ibuf;
1598 1602
1599 /* 1603 /*
@@ -1667,7 +1671,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1667 * Get a reference to this pipe buffer, 1671 * Get a reference to this pipe buffer,
1668 * so we can copy the contents over. 1672 * so we can copy the contents over.
1669 */ 1673 */
1670 pipe_buf_get(ipipe, ibuf); 1674 if (!pipe_buf_get(ipipe, ibuf)) {
1675 if (ret == 0)
1676 ret = -EFAULT;
1677 break;
1678 }
1671 1679
1672 obuf = opipe->bufs + nbuf; 1680 obuf = opipe->bufs + nbuf;
1673 *obuf = *ibuf; 1681 *obuf = *ibuf;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 76769749b5a5..6b10c21630f5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -966,6 +966,10 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
966} 966}
967#endif /* CONFIG_DEV_PAGEMAP_OPS */ 967#endif /* CONFIG_DEV_PAGEMAP_OPS */
968 968
969/* 127: arbitrary random number, small enough to assemble well */
970#define page_ref_zero_or_close_to_overflow(page) \
971 ((unsigned int) page_ref_count(page) + 127u <= 127u)
972
969static inline void get_page(struct page *page) 973static inline void get_page(struct page *page)
970{ 974{
971 page = compound_head(page); 975 page = compound_head(page);
@@ -973,8 +977,17 @@ static inline void get_page(struct page *page)
973 * Getting a normal page or the head of a compound page 977 * Getting a normal page or the head of a compound page
974 * requires to already have an elevated page->_refcount. 978 * requires to already have an elevated page->_refcount.
975 */ 979 */
976 VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); 980 VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page);
981 page_ref_inc(page);
982}
983
984static inline __must_check bool try_get_page(struct page *page)
985{
986 page = compound_head(page);
987 if (WARN_ON_ONCE(page_ref_count(page) <= 0))
988 return false;
977 page_ref_inc(page); 989 page_ref_inc(page);
990 return true;
978} 991}
979 992
980static inline void put_page(struct page *page) 993static inline void put_page(struct page *page)
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 787d224ff43e..abb2dac3da9b 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -101,18 +101,20 @@ struct pipe_buf_operations {
101 /* 101 /*
102 * Get a reference to the pipe buffer. 102 * Get a reference to the pipe buffer.
103 */ 103 */
104 void (*get)(struct pipe_inode_info *, struct pipe_buffer *); 104 bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
105}; 105};
106 106
107/** 107/**
108 * pipe_buf_get - get a reference to a pipe_buffer 108 * pipe_buf_get - get a reference to a pipe_buffer
109 * @pipe: the pipe that the buffer belongs to 109 * @pipe: the pipe that the buffer belongs to
110 * @buf: the buffer to get a reference to 110 * @buf: the buffer to get a reference to
111 *
112 * Return: %true if the reference was successfully obtained.
111 */ 113 */
112static inline void pipe_buf_get(struct pipe_inode_info *pipe, 114static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe,
113 struct pipe_buffer *buf) 115 struct pipe_buffer *buf)
114{ 116{
115 buf->ops->get(pipe, buf); 117 return buf->ops->get(pipe, buf);
116} 118}
117 119
118/** 120/**
@@ -171,7 +173,7 @@ struct pipe_inode_info *alloc_pipe_info(void);
171void free_pipe_info(struct pipe_inode_info *); 173void free_pipe_info(struct pipe_inode_info *);
172 174
173/* Generic pipe buffer ops functions */ 175/* Generic pipe buffer ops functions */
174void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); 176bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
175int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); 177int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
176int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); 178int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
177void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); 179void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 21153e64bf1c..6c24755655c7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7041,12 +7041,16 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
7041 buf->private = 0; 7041 buf->private = 0;
7042} 7042}
7043 7043
7044static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, 7044static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
7045 struct pipe_buffer *buf) 7045 struct pipe_buffer *buf)
7046{ 7046{
7047 struct buffer_ref *ref = (struct buffer_ref *)buf->private; 7047 struct buffer_ref *ref = (struct buffer_ref *)buf->private;
7048 7048
7049 if (ref->ref > INT_MAX/2)
7050 return false;
7051
7049 ref->ref++; 7052 ref->ref++;
7053 return true;
7050} 7054}
7051 7055
7052/* Pipe buffer operations for a buffer. */ 7056/* Pipe buffer operations for a buffer. */
diff --git a/mm/gup.c b/mm/gup.c
index f84e22685aaa..91819b8ad9cc 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -160,8 +160,12 @@ retry:
160 goto retry; 160 goto retry;
161 } 161 }
162 162
163 if (flags & FOLL_GET) 163 if (flags & FOLL_GET) {
164 get_page(page); 164 if (unlikely(!try_get_page(page))) {
165 page = ERR_PTR(-ENOMEM);
166 goto out;
167 }
168 }
165 if (flags & FOLL_TOUCH) { 169 if (flags & FOLL_TOUCH) {
166 if ((flags & FOLL_WRITE) && 170 if ((flags & FOLL_WRITE) &&
167 !pte_dirty(pte) && !PageDirty(page)) 171 !pte_dirty(pte) && !PageDirty(page))
@@ -298,7 +302,10 @@ retry_locked:
298 if (pmd_trans_unstable(pmd)) 302 if (pmd_trans_unstable(pmd))
299 ret = -EBUSY; 303 ret = -EBUSY;
300 } else { 304 } else {
301 get_page(page); 305 if (unlikely(!try_get_page(page))) {
306 spin_unlock(ptl);
307 return ERR_PTR(-ENOMEM);
308 }
302 spin_unlock(ptl); 309 spin_unlock(ptl);
303 lock_page(page); 310 lock_page(page);
304 ret = split_huge_page(page); 311 ret = split_huge_page(page);
@@ -500,7 +507,10 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
500 if (is_device_public_page(*page)) 507 if (is_device_public_page(*page))
501 goto unmap; 508 goto unmap;
502 } 509 }
503 get_page(*page); 510 if (unlikely(!try_get_page(*page))) {
511 ret = -ENOMEM;
512 goto unmap;
513 }
504out: 514out:
505 ret = 0; 515 ret = 0;
506unmap: 516unmap:
@@ -1545,6 +1555,20 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
1545 } 1555 }
1546} 1556}
1547 1557
1558/*
1559 * Return the compound head page with ref appropriately incremented,
1560 * or NULL if that failed.
1561 */
1562static inline struct page *try_get_compound_head(struct page *page, int refs)
1563{
1564 struct page *head = compound_head(page);
1565 if (WARN_ON_ONCE(page_ref_count(head) < 0))
1566 return NULL;
1567 if (unlikely(!page_cache_add_speculative(head, refs)))
1568 return NULL;
1569 return head;
1570}
1571
1548#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL 1572#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
1549static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, 1573static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1550 int write, struct page **pages, int *nr) 1574 int write, struct page **pages, int *nr)
@@ -1579,9 +1603,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1579 1603
1580 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 1604 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
1581 page = pte_page(pte); 1605 page = pte_page(pte);
1582 head = compound_head(page);
1583 1606
1584 if (!page_cache_get_speculative(head)) 1607 head = try_get_compound_head(page, 1);
1608 if (!head)
1585 goto pte_unmap; 1609 goto pte_unmap;
1586 1610
1587 if (unlikely(pte_val(pte) != pte_val(*ptep))) { 1611 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
@@ -1720,8 +1744,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1720 refs++; 1744 refs++;
1721 } while (addr += PAGE_SIZE, addr != end); 1745 } while (addr += PAGE_SIZE, addr != end);
1722 1746
1723 head = compound_head(pmd_page(orig)); 1747 head = try_get_compound_head(pmd_page(orig), refs);
1724 if (!page_cache_add_speculative(head, refs)) { 1748 if (!head) {
1725 *nr -= refs; 1749 *nr -= refs;
1726 return 0; 1750 return 0;
1727 } 1751 }
@@ -1758,8 +1782,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1758 refs++; 1782 refs++;
1759 } while (addr += PAGE_SIZE, addr != end); 1783 } while (addr += PAGE_SIZE, addr != end);
1760 1784
1761 head = compound_head(pud_page(orig)); 1785 head = try_get_compound_head(pud_page(orig), refs);
1762 if (!page_cache_add_speculative(head, refs)) { 1786 if (!head) {
1763 *nr -= refs; 1787 *nr -= refs;
1764 return 0; 1788 return 0;
1765 } 1789 }
@@ -1795,8 +1819,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
1795 refs++; 1819 refs++;
1796 } while (addr += PAGE_SIZE, addr != end); 1820 } while (addr += PAGE_SIZE, addr != end);
1797 1821
1798 head = compound_head(pgd_page(orig)); 1822 head = try_get_compound_head(pgd_page(orig), refs);
1799 if (!page_cache_add_speculative(head, refs)) { 1823 if (!head) {
1800 *nr -= refs; 1824 *nr -= refs;
1801 return 0; 1825 return 0;
1802 } 1826 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 97b1e0290c66..6cdc7b2d9100 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4299,6 +4299,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
4299 4299
4300 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; 4300 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
4301 page = pte_page(huge_ptep_get(pte)); 4301 page = pte_page(huge_ptep_get(pte));
4302
4303 /*
4304 * Instead of doing 'try_get_page()' below in the same_page
4305 * loop, just check the count once here.
4306 */
4307 if (unlikely(page_count(page) <= 0)) {
4308 if (pages) {
4309 spin_unlock(ptl);
4310 remainder = 0;
4311 err = -ENOMEM;
4312 break;
4313 }
4314 }
4302same_page: 4315same_page:
4303 if (pages) { 4316 if (pages) {
4304 pages[i] = mem_map_offset(page, pfn_offset); 4317 pages[i] = mem_map_offset(page, pfn_offset);