Diffstat (limited to 'mm/shmem.c')
-rw-r--r--	mm/shmem.c	741
1 file changed, 413 insertions(+), 328 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index 2c012eee133d..b3db3779a30a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -36,6 +36,7 @@
 #include <linux/uio.h>
 #include <linux/khugepaged.h>
 #include <linux/hugetlb.h>
+#include <linux/frontswap.h>
 
 #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
 
@@ -123,6 +124,10 @@ static unsigned long shmem_default_max_inodes(void)
 static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
 static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 				struct shmem_inode_info *info, pgoff_t index);
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
+			     struct page **pagep, enum sgp_type sgp,
+			     gfp_t gfp, struct vm_area_struct *vma,
+			     vm_fault_t *fault_type);
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 		struct page **pagep, enum sgp_type sgp,
 		gfp_t gfp, struct vm_area_struct *vma,
@@ -1089,159 +1094,184 @@ static void shmem_evict_inode(struct inode *inode)
 	clear_inode(inode);
 }
 
-static unsigned long find_swap_entry(struct xarray *xa, void *item)
+extern struct swap_info_struct *swap_info[];
+
+static int shmem_find_swap_entries(struct address_space *mapping,
+				   pgoff_t start, unsigned int nr_entries,
+				   struct page **entries, pgoff_t *indices,
+				   bool frontswap)
 {
-	XA_STATE(xas, xa, 0);
-	unsigned int checked = 0;
-	void *entry;
+	XA_STATE(xas, &mapping->i_pages, start);
+	struct page *page;
+	unsigned int ret = 0;
+
+	if (!nr_entries)
+		return 0;
 
 	rcu_read_lock();
-	xas_for_each(&xas, entry, ULONG_MAX) {
-		if (xas_retry(&xas, entry))
+	xas_for_each(&xas, page, ULONG_MAX) {
+		if (xas_retry(&xas, page))
 			continue;
-		if (entry == item)
-			break;
-		checked++;
-		if ((checked % XA_CHECK_SCHED) != 0)
+
+		if (!xa_is_value(page))
 			continue;
-		xas_pause(&xas);
-		cond_resched_rcu();
+
+		if (frontswap) {
+			swp_entry_t entry = radix_to_swp_entry(page);
+
+			if (!frontswap_test(swap_info[swp_type(entry)],
+					    swp_offset(entry)))
+				continue;
+		}
+
+		indices[ret] = xas.xa_index;
+		entries[ret] = page;
+
+		if (need_resched()) {
+			xas_pause(&xas);
+			cond_resched_rcu();
+		}
+		if (++ret == nr_entries)
+			break;
 	}
 	rcu_read_unlock();
 
-	return entry ? xas.xa_index : -1;
+	return ret;
 }
 
 /*
- * If swap found in inode, free it and move page from swapcache to filecache.
+ * Move the swapped pages for an inode to page cache. Returns the count
+ * of pages swapped in, or the error in case of failure.
  */
-static int shmem_unuse_inode(struct shmem_inode_info *info,
-			     swp_entry_t swap, struct page **pagep)
+static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
+				    pgoff_t *indices)
 {
-	struct address_space *mapping = info->vfs_inode.i_mapping;
-	void *radswap;
-	pgoff_t index;
-	gfp_t gfp;
+	int i = 0;
+	int ret = 0;
 	int error = 0;
+	struct address_space *mapping = inode->i_mapping;
 
-	radswap = swp_to_radix_entry(swap);
-	index = find_swap_entry(&mapping->i_pages, radswap);
-	if (index == -1)
-		return -EAGAIN;	/* tell shmem_unuse we found nothing */
+	for (i = 0; i < pvec.nr; i++) {
+		struct page *page = pvec.pages[i];
 
-	/*
-	 * Move _head_ to start search for next from here.
-	 * But be careful: shmem_evict_inode checks list_empty without taking
-	 * mutex, and there's an instant in list_move_tail when info->swaplist
-	 * would appear empty, if it were the only one on shmem_swaplist.
-	 */
-	if (shmem_swaplist.next != &info->swaplist)
-		list_move_tail(&shmem_swaplist, &info->swaplist);
-
-	gfp = mapping_gfp_mask(mapping);
-	if (shmem_should_replace_page(*pagep, gfp)) {
-		mutex_unlock(&shmem_swaplist_mutex);
-		error = shmem_replace_page(pagep, gfp, info, index);
-		mutex_lock(&shmem_swaplist_mutex);
-		/*
-		 * We needed to drop mutex to make that restrictive page
-		 * allocation, but the inode might have been freed while we
-		 * dropped it: although a racing shmem_evict_inode() cannot
-		 * complete without emptying the page cache, our page lock
-		 * on this swapcache page is not enough to prevent that -
-		 * free_swap_and_cache() of our swap entry will only
-		 * trylock_page(), removing swap from page cache whatever.
-		 *
-		 * We must not proceed to shmem_add_to_page_cache() if the
-		 * inode has been freed, but of course we cannot rely on
-		 * inode or mapping or info to check that. However, we can
-		 * safely check if our swap entry is still in use (and here
-		 * it can't have got reused for another page): if it's still
-		 * in use, then the inode cannot have been freed yet, and we
-		 * can safely proceed (if it's no longer in use, that tells
-		 * nothing about the inode, but we don't need to unuse swap).
-		 */
-		if (!page_swapcount(*pagep))
-			error = -ENOENT;
+		if (!xa_is_value(page))
+			continue;
+		error = shmem_swapin_page(inode, indices[i],
+					  &page, SGP_CACHE,
+					  mapping_gfp_mask(mapping),
+					  NULL, NULL);
+		if (error == 0) {
+			unlock_page(page);
+			put_page(page);
+			ret++;
+		}
+		if (error == -ENOMEM)
+			break;
+		error = 0;
 	}
+	return error ? error : ret;
+}
 
-	/*
-	 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
-	 * but also to hold up shmem_evict_inode(): so inode cannot be freed
-	 * beneath us (pagelock doesn't help until the page is in pagecache).
-	 */
-	if (!error)
-		error = shmem_add_to_page_cache(*pagep, mapping, index,
-						radswap, gfp);
-	if (error != -ENOMEM) {
-		/*
-		 * Truncation and eviction use free_swap_and_cache(), which
-		 * only does trylock page: if we raced, best clean up here.
-		 */
-		delete_from_swap_cache(*pagep);
-		set_page_dirty(*pagep);
-		if (!error) {
-			spin_lock_irq(&info->lock);
-			info->swapped--;
-			spin_unlock_irq(&info->lock);
-			swap_free(swap);
+/*
+ * If swap found in inode, free it and move page from swapcache to filecache.
+ */
+static int shmem_unuse_inode(struct inode *inode, unsigned int type,
+			     bool frontswap, unsigned long *fs_pages_to_unuse)
+{
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t start = 0;
+	struct pagevec pvec;
+	pgoff_t indices[PAGEVEC_SIZE];
+	bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
+	int ret = 0;
+
+	pagevec_init(&pvec);
+	do {
+		unsigned int nr_entries = PAGEVEC_SIZE;
+
+		if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
+			nr_entries = *fs_pages_to_unuse;
+
+		pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
+						  pvec.pages, indices,
+						  frontswap);
+		if (pvec.nr == 0) {
+			ret = 0;
+			break;
 		}
-	}
-	return error;
+
+		ret = shmem_unuse_swap_entries(inode, pvec, indices);
+		if (ret < 0)
+			break;
+
+		if (frontswap_partial) {
+			*fs_pages_to_unuse -= ret;
+			if (*fs_pages_to_unuse == 0) {
+				ret = FRONTSWAP_PAGES_UNUSED;
+				break;
+			}
+		}
+
+		start = indices[pvec.nr - 1];
+	} while (true);
+
+	return ret;
 }
 
 /*
- * Search through swapped inodes to find and replace swap by page.
+ * Read all the shared memory data that resides in the swap
+ * device 'type' back into memory, so the swap device can be
+ * unused.
  */
-int shmem_unuse(swp_entry_t swap, struct page *page)
+int shmem_unuse(unsigned int type, bool frontswap,
+		unsigned long *fs_pages_to_unuse)
 {
-	struct list_head *this, *next;
-	struct shmem_inode_info *info;
-	struct mem_cgroup *memcg;
+	struct shmem_inode_info *info, *next;
+	struct inode *inode;
+	struct inode *prev_inode = NULL;
 	int error = 0;
 
-	/*
-	 * There's a faint possibility that swap page was replaced before
-	 * caller locked it: caller will come back later with the right page.
-	 */
-	if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
-		goto out;
+	if (list_empty(&shmem_swaplist))
+		return 0;
+
+	mutex_lock(&shmem_swaplist_mutex);
 
 	/*
-	 * Charge page using GFP_KERNEL while we can wait, before taking
-	 * the shmem_swaplist_mutex which might hold up shmem_writepage().
-	 * Charged back to the user (not to caller) when swap account is used.
+	 * The extra refcount on the inode is necessary to safely dereference
+	 * p->next after re-acquiring the lock. New shmem inodes with swap
+	 * get added to the end of the list and we will scan them all.
 	 */
-	error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
-					    &memcg, false);
-	if (error)
-		goto out;
-	/* No memory allocation: swap entry occupies the slot for the page */
-	error = -EAGAIN;
-
-	mutex_lock(&shmem_swaplist_mutex);
-	list_for_each_safe(this, next, &shmem_swaplist) {
-		info = list_entry(this, struct shmem_inode_info, swaplist);
-		if (info->swapped)
-			error = shmem_unuse_inode(info, swap, &page);
-		else
+	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
+		if (!info->swapped) {
 			list_del_init(&info->swaplist);
+			continue;
+		}
+
+		inode = igrab(&info->vfs_inode);
+		if (!inode)
+			continue;
+
+		mutex_unlock(&shmem_swaplist_mutex);
+		if (prev_inode)
+			iput(prev_inode);
+		prev_inode = inode;
+
+		error = shmem_unuse_inode(inode, type, frontswap,
+					  fs_pages_to_unuse);
 		cond_resched();
-		if (error != -EAGAIN)
+
+		mutex_lock(&shmem_swaplist_mutex);
+		next = list_next_entry(info, swaplist);
+		if (!info->swapped)
+			list_del_init(&info->swaplist);
+		if (error)
 			break;
-		/* found nothing in this: move on to search the next */
 	}
 	mutex_unlock(&shmem_swaplist_mutex);
 
-	if (error) {
-		if (error != -ENOMEM)
-			error = 0;
-		mem_cgroup_cancel_charge(page, memcg, false);
-	} else
-		mem_cgroup_commit_charge(page, memcg, true, false);
-out:
-	unlock_page(page);
-	put_page(page);
+	if (prev_inode)
+		iput(prev_inode);
+
 	return error;
 }
 
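With this restructuring, swapoff no longer has to present shmem with one locked page at a time: a caller can drain a whole swap device in a single call. A hedged sketch of the new calling convention, modelled on (but not copied from) the matching mm/swapfile.c changes in this series:

	/*
	 * Illustrative sketch only, not the actual mm/swapfile.c code.
	 * 'type' is the swap device being disabled; pages_to_unuse is
	 * only consulted when frontswap is true (0 means "no limit").
	 */
	unsigned long pages_to_unuse = 0;
	int error;

	error = shmem_unuse(type, frontswap, &pages_to_unuse);
	if (error == FRONTSWAP_PAGES_UNUSED)
		error = 0;	/* the partial frontswap quota was met */
	/* otherwise: 0 on success, or -ENOMEM if a swapin failed */
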
@@ -1325,7 +1355,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	 */
 	mutex_lock(&shmem_swaplist_mutex);
 	if (list_empty(&info->swaplist))
-		list_add_tail(&info->swaplist, &shmem_swaplist);
+		list_add(&info->swaplist, &shmem_swaplist);
 
 	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
 		spin_lock_irq(&info->lock);
@@ -1576,6 +1606,116 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 }
 
 /*
+ * Swap in the page pointed to by *pagep.
+ * Caller has to make sure that *pagep contains a valid swapped page.
+ * Returns 0 and the page in pagep on success. On failure, returns the
+ * error code and NULL in *pagep.
+ */
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
+			     struct page **pagep, enum sgp_type sgp,
+			     gfp_t gfp, struct vm_area_struct *vma,
+			     vm_fault_t *fault_type)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
+	struct mem_cgroup *memcg;
+	struct page *page;
+	swp_entry_t swap;
+	int error;
+
+	VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
+	swap = radix_to_swp_entry(*pagep);
+	*pagep = NULL;
+
+	/* Look it up and read it in.. */
+	page = lookup_swap_cache(swap, NULL, 0);
+	if (!page) {
+		/* Or update major stats only when swapin succeeds?? */
+		if (fault_type) {
+			*fault_type |= VM_FAULT_MAJOR;
+			count_vm_event(PGMAJFAULT);
+			count_memcg_event_mm(charge_mm, PGMAJFAULT);
+		}
+		/* Here we actually start the io */
+		page = shmem_swapin(swap, gfp, info, index);
+		if (!page) {
+			error = -ENOMEM;
+			goto failed;
+		}
+	}
+
+	/* We have to do this with page locked to prevent races */
+	lock_page(page);
+	if (!PageSwapCache(page) || page_private(page) != swap.val ||
+	    !shmem_confirm_swap(mapping, index, swap)) {
+		error = -EEXIST;
+		goto unlock;
+	}
+	if (!PageUptodate(page)) {
+		error = -EIO;
+		goto failed;
+	}
+	wait_on_page_writeback(page);
+
+	if (shmem_should_replace_page(page, gfp)) {
+		error = shmem_replace_page(&page, gfp, info, index);
+		if (error)
+			goto failed;
+	}
+
+	error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
+					    false);
+	if (!error) {
+		error = shmem_add_to_page_cache(page, mapping, index,
+						swp_to_radix_entry(swap), gfp);
+		/*
+		 * We already confirmed swap under page lock, and make
+		 * no memory allocation here, so usually no possibility
+		 * of error; but free_swap_and_cache() only trylocks a
+		 * page, so it is just possible that the entry has been
+		 * truncated or holepunched since swap was confirmed.
+		 * shmem_undo_range() will have done some of the
+		 * unaccounting, now delete_from_swap_cache() will do
+		 * the rest.
+		 */
+		if (error) {
+			mem_cgroup_cancel_charge(page, memcg, false);
+			delete_from_swap_cache(page);
+		}
+	}
+	if (error)
+		goto failed;
+
+	mem_cgroup_commit_charge(page, memcg, true, false);
+
+	spin_lock_irq(&info->lock);
+	info->swapped--;
+	shmem_recalc_inode(inode);
+	spin_unlock_irq(&info->lock);
+
+	if (sgp == SGP_WRITE)
+		mark_page_accessed(page);
+
+	delete_from_swap_cache(page);
+	set_page_dirty(page);
+	swap_free(swap);
+
+	*pagep = page;
+	return 0;
+failed:
+	if (!shmem_confirm_swap(mapping, index, swap))
+		error = -EEXIST;
+unlock:
+	if (page) {
+		unlock_page(page);
+		put_page(page);
+	}
+
+	return error;
+}
+
+/*
  * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
  *
  * If we allocate a new one we do not mark it dirty. That's up to the
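The contract of the new helper deserves a note: on success the page comes back locked with a reference held, and -EEXIST is not a hard error but a retry signal (the swap entry changed underneath us through truncation, hole-punch, or a parallel swapin). A hedged, non-standalone fragment showing the intended caller pattern, mirroring what shmem_getpage_gfp() does further down in this patch:

	/* Caller-side sketch of the retry contract, not standalone code. */
repeat:
	page = find_lock_entry(mapping, index);
	if (xa_is_value(page)) {		/* still a swap entry? */
		error = shmem_swapin_page(inode, index, &page, SGP_CACHE,
					  mapping_gfp_mask(mapping),
					  NULL, NULL);
		if (error == -EEXIST)		/* benign race: look again */
			goto repeat;
		/* success: page is locked, referenced, in the page cache */
	}
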
@@ -1596,7 +1736,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	struct mm_struct *charge_mm;
 	struct mem_cgroup *memcg;
 	struct page *page;
-	swp_entry_t swap;
 	enum sgp_type sgp_huge = sgp;
 	pgoff_t hindex = index;
 	int error;
@@ -1608,17 +1747,23 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
 		sgp = SGP_CACHE;
 repeat:
-	swap.val = 0;
+	if (sgp <= SGP_CACHE &&
+	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
+		return -EINVAL;
+	}
+
+	sbinfo = SHMEM_SB(inode->i_sb);
+	charge_mm = vma ? vma->vm_mm : current->mm;
+
 	page = find_lock_entry(mapping, index);
 	if (xa_is_value(page)) {
-		swap = radix_to_swp_entry(page);
-		page = NULL;
-	}
+		error = shmem_swapin_page(inode, index, &page,
+					  sgp, gfp, vma, fault_type);
+		if (error == -EEXIST)
+			goto repeat;
 
-	if (sgp <= SGP_CACHE &&
-	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
-		error = -EINVAL;
-		goto unlock;
+		*pagep = page;
+		return error;
 	}
 
 	if (page && sgp == SGP_WRITE)
@@ -1632,7 +1777,7 @@ repeat:
 		put_page(page);
 		page = NULL;
 	}
-	if (page || (sgp == SGP_READ && !swap.val)) {
+	if (page || sgp == SGP_READ) {
 		*pagep = page;
 		return 0;
 	}
@@ -1641,215 +1786,138 @@ repeat:
 	 * Fast cache lookup did not find it:
 	 * bring it back from swap or allocate.
 	 */
-	sbinfo = SHMEM_SB(inode->i_sb);
-	charge_mm = vma ? vma->vm_mm : current->mm;
-
-	if (swap.val) {
-		/* Look it up and read it in.. */
-		page = lookup_swap_cache(swap, NULL, 0);
-		if (!page) {
-			/* Or update major stats only when swapin succeeds?? */
-			if (fault_type) {
-				*fault_type |= VM_FAULT_MAJOR;
-				count_vm_event(PGMAJFAULT);
-				count_memcg_event_mm(charge_mm, PGMAJFAULT);
-			}
-			/* Here we actually start the io */
-			page = shmem_swapin(swap, gfp, info, index);
-			if (!page) {
-				error = -ENOMEM;
-				goto failed;
-			}
-		}
-
-		/* We have to do this with page locked to prevent races */
-		lock_page(page);
-		if (!PageSwapCache(page) || page_private(page) != swap.val ||
-		    !shmem_confirm_swap(mapping, index, swap)) {
-			error = -EEXIST;	/* try again */
-			goto unlock;
-		}
-		if (!PageUptodate(page)) {
-			error = -EIO;
-			goto failed;
-		}
-		wait_on_page_writeback(page);
-
-		if (shmem_should_replace_page(page, gfp)) {
-			error = shmem_replace_page(&page, gfp, info, index);
-			if (error)
-				goto failed;
-		}
-
-		error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
-						    false);
-		if (!error) {
-			error = shmem_add_to_page_cache(page, mapping, index,
-						swp_to_radix_entry(swap), gfp);
-			/*
-			 * We already confirmed swap under page lock, and make
-			 * no memory allocation here, so usually no possibility
-			 * of error; but free_swap_and_cache() only trylocks a
-			 * page, so it is just possible that the entry has been
-			 * truncated or holepunched since swap was confirmed.
-			 * shmem_undo_range() will have done some of the
-			 * unaccounting, now delete_from_swap_cache() will do
-			 * the rest.
-			 * Reset swap.val? No, leave it so "failed" goes back to
-			 * "repeat": reading a hole and writing should succeed.
-			 */
-			if (error) {
-				mem_cgroup_cancel_charge(page, memcg, false);
-				delete_from_swap_cache(page);
-			}
-		}
-		if (error)
-			goto failed;
-
-		mem_cgroup_commit_charge(page, memcg, true, false);
-
-		spin_lock_irq(&info->lock);
-		info->swapped--;
-		shmem_recalc_inode(inode);
-		spin_unlock_irq(&info->lock);
-
-		if (sgp == SGP_WRITE)
-			mark_page_accessed(page);
 
-		delete_from_swap_cache(page);
-		set_page_dirty(page);
-		swap_free(swap);
-
-	} else {
-		if (vma && userfaultfd_missing(vma)) {
-			*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
-			return 0;
-		}
+	if (vma && userfaultfd_missing(vma)) {
+		*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
+		return 0;
+	}
 
-		/* shmem_symlink() */
-		if (mapping->a_ops != &shmem_aops)
-			goto alloc_nohuge;
-		if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
-			goto alloc_nohuge;
-		if (shmem_huge == SHMEM_HUGE_FORCE)
-			goto alloc_huge;
-		switch (sbinfo->huge) {
-			loff_t i_size;
-			pgoff_t off;
-		case SHMEM_HUGE_NEVER:
-			goto alloc_nohuge;
-		case SHMEM_HUGE_WITHIN_SIZE:
-			off = round_up(index, HPAGE_PMD_NR);
-			i_size = round_up(i_size_read(inode), PAGE_SIZE);
-			if (i_size >= HPAGE_PMD_SIZE &&
-			    i_size >> PAGE_SHIFT >= off)
-				goto alloc_huge;
-			/* fallthrough */
-		case SHMEM_HUGE_ADVISE:
-			if (sgp_huge == SGP_HUGE)
-				goto alloc_huge;
-			/* TODO: implement fadvise() hints */
-			goto alloc_nohuge;
-		}
+	/* shmem_symlink() */
+	if (mapping->a_ops != &shmem_aops)
+		goto alloc_nohuge;
+	if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
+		goto alloc_nohuge;
+	if (shmem_huge == SHMEM_HUGE_FORCE)
+		goto alloc_huge;
+	switch (sbinfo->huge) {
+		loff_t i_size;
+		pgoff_t off;
+	case SHMEM_HUGE_NEVER:
+		goto alloc_nohuge;
+	case SHMEM_HUGE_WITHIN_SIZE:
+		off = round_up(index, HPAGE_PMD_NR);
+		i_size = round_up(i_size_read(inode), PAGE_SIZE);
+		if (i_size >= HPAGE_PMD_SIZE &&
+		    i_size >> PAGE_SHIFT >= off)
+			goto alloc_huge;
+		/* fallthrough */
+	case SHMEM_HUGE_ADVISE:
+		if (sgp_huge == SGP_HUGE)
+			goto alloc_huge;
+		/* TODO: implement fadvise() hints */
+		goto alloc_nohuge;
+	}
 
 alloc_huge:
-		page = shmem_alloc_and_acct_page(gfp, inode, index, true);
-		if (IS_ERR(page)) {
-alloc_nohuge:		page = shmem_alloc_and_acct_page(gfp, inode,
-					index, false);
-		}
-		if (IS_ERR(page)) {
-			int retry = 5;
-			error = PTR_ERR(page);
-			page = NULL;
-			if (error != -ENOSPC)
-				goto failed;
-			/*
-			 * Try to reclaim some spece by splitting a huge page
-			 * beyond i_size on the filesystem.
-			 */
-			while (retry--) {
-				int ret;
-				ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
-				if (ret == SHRINK_STOP)
-					break;
-				if (ret)
-					goto alloc_nohuge;
-			}
-			goto failed;
-		}
-
-		if (PageTransHuge(page))
-			hindex = round_down(index, HPAGE_PMD_NR);
-		else
-			hindex = index;
+	page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+	if (IS_ERR(page)) {
+alloc_nohuge:
+		page = shmem_alloc_and_acct_page(gfp, inode,
+						 index, false);
+	}
+	if (IS_ERR(page)) {
+		int retry = 5;
 
-		if (sgp == SGP_WRITE)
-			__SetPageReferenced(page);
+		error = PTR_ERR(page);
+		page = NULL;
+		if (error != -ENOSPC)
+			goto unlock;
+		/*
+		 * Try to reclaim some space by splitting a huge page
+		 * beyond i_size on the filesystem.
+		 */
+		while (retry--) {
+			int ret;
 
-		error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
-				PageTransHuge(page));
-		if (error)
-			goto unacct;
-		error = shmem_add_to_page_cache(page, mapping, hindex,
-						NULL, gfp & GFP_RECLAIM_MASK);
-		if (error) {
-			mem_cgroup_cancel_charge(page, memcg,
-						 PageTransHuge(page));
-			goto unacct;
+			ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
+			if (ret == SHRINK_STOP)
+				break;
+			if (ret)
+				goto alloc_nohuge;
 		}
-		mem_cgroup_commit_charge(page, memcg, false,
-				PageTransHuge(page));
-		lru_cache_add_anon(page);
+		goto unlock;
+	}
 
-		spin_lock_irq(&info->lock);
-		info->alloced += 1 << compound_order(page);
-		inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
-		shmem_recalc_inode(inode);
-		spin_unlock_irq(&info->lock);
-		alloced = true;
+	if (PageTransHuge(page))
+		hindex = round_down(index, HPAGE_PMD_NR);
+	else
+		hindex = index;
 
-		if (PageTransHuge(page) &&
-				DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
-				hindex + HPAGE_PMD_NR - 1) {
-			/*
-			 * Part of the huge page is beyond i_size: subject
-			 * to shrink under memory pressure.
-			 */
-			spin_lock(&sbinfo->shrinklist_lock);
-			/*
-			 * _careful to defend against unlocked access to
-			 * ->shrink_list in shmem_unused_huge_shrink()
-			 */
-			if (list_empty_careful(&info->shrinklist)) {
-				list_add_tail(&info->shrinklist,
-						&sbinfo->shrinklist);
-				sbinfo->shrinklist_len++;
-			}
-			spin_unlock(&sbinfo->shrinklist_lock);
-		}
+	if (sgp == SGP_WRITE)
+		__SetPageReferenced(page);
+
+	error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
+					    PageTransHuge(page));
+	if (error)
+		goto unacct;
+	error = shmem_add_to_page_cache(page, mapping, hindex,
+					NULL, gfp & GFP_RECLAIM_MASK);
+	if (error) {
+		mem_cgroup_cancel_charge(page, memcg,
+					 PageTransHuge(page));
+		goto unacct;
+	}
+	mem_cgroup_commit_charge(page, memcg, false,
+				 PageTransHuge(page));
+	lru_cache_add_anon(page);
 
+	spin_lock_irq(&info->lock);
+	info->alloced += 1 << compound_order(page);
+	inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
+	shmem_recalc_inode(inode);
+	spin_unlock_irq(&info->lock);
+	alloced = true;
+
+	if (PageTransHuge(page) &&
+	    DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
+			hindex + HPAGE_PMD_NR - 1) {
 		/*
-		 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+		 * Part of the huge page is beyond i_size: subject
+		 * to shrink under memory pressure.
 		 */
-		if (sgp == SGP_FALLOC)
-			sgp = SGP_WRITE;
-clear:
+		spin_lock(&sbinfo->shrinklist_lock);
 		/*
-		 * Let SGP_WRITE caller clear ends if write does not fill page;
-		 * but SGP_FALLOC on a page fallocated earlier must initialize
-		 * it now, lest undo on failure cancel our earlier guarantee.
+		 * _careful to defend against unlocked access to
+		 * ->shrink_list in shmem_unused_huge_shrink()
 		 */
-		if (sgp != SGP_WRITE && !PageUptodate(page)) {
-			struct page *head = compound_head(page);
-			int i;
+		if (list_empty_careful(&info->shrinklist)) {
+			list_add_tail(&info->shrinklist,
+				      &sbinfo->shrinklist);
+			sbinfo->shrinklist_len++;
+		}
+		spin_unlock(&sbinfo->shrinklist_lock);
+	}
+
+	/*
+	 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+	 */
+	if (sgp == SGP_FALLOC)
+		sgp = SGP_WRITE;
+clear:
+	/*
+	 * Let SGP_WRITE caller clear ends if write does not fill page;
+	 * but SGP_FALLOC on a page fallocated earlier must initialize
+	 * it now, lest undo on failure cancel our earlier guarantee.
+	 */
+	if (sgp != SGP_WRITE && !PageUptodate(page)) {
+		struct page *head = compound_head(page);
+		int i;
 
-			for (i = 0; i < (1 << compound_order(head)); i++) {
-				clear_highpage(head + i);
-				flush_dcache_page(head + i);
-			}
-			SetPageUptodate(head);
+		for (i = 0; i < (1 << compound_order(head)); i++) {
+			clear_highpage(head + i);
+			flush_dcache_page(head + i);
 		}
+		SetPageUptodate(head);
 	}
 
 	/* Perhaps the file has been truncated since we checked */
@@ -1879,9 +1947,6 @@ unacct:
 		put_page(page);
 		goto alloc_nohuge;
 	}
-failed:
-	if (swap.val && !shmem_confirm_swap(mapping, index, swap))
-		error = -EEXIST;
 unlock:
 	if (page) {
 		unlock_page(page);
@@ -2125,6 +2190,24 @@ out_nomem:
 
 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	struct shmem_inode_info *info = SHMEM_I(file_inode(file));
+
+	if (info->seals & F_SEAL_FUTURE_WRITE) {
+		/*
+		 * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+		 * "future write" seal active.
+		 */
+		if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+			return -EPERM;
+
+		/*
+		 * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED
+		 * read-only mapping, take care to not allow mprotect to revert
+		 * protections.
+		 */
+		vma->vm_flags &= ~(VM_MAYWRITE);
+	}
+
 	file_accessed(file);
 	vma->vm_ops = &shmem_vm_ops;
 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
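For context, the seal this hook enforces is applied from userspace with fcntl(). A minimal self-contained demonstration (assumes a kernel with these patches applied and a libc exposing memfd_create(); the fallback #define carries the uapi value in case the libc headers predate the seal):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef F_SEAL_FUTURE_WRITE
#define F_SEAL_FUTURE_WRITE	0x0010	/* uapi value, linux/fcntl.h */
#endif

int main(void)
{
	int fd = memfd_create("demo", MFD_ALLOW_SEALING);
	char *map;

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;
	(void)write(fd, "hello", 5);	/* writes allowed before sealing */

	if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0)
		perror("F_ADD_SEALS");

	/* shmem_mmap() now refuses new shared writable mappings... */
	map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		perror("mmap(PROT_WRITE, MAP_SHARED)");	/* EPERM */

	/* ...while read-only MAP_SHARED mappings keep working. */
	map = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	if (map != MAP_FAILED)
		printf("read back: %.5s\n", map);	/* "hello" */

	close(fd);
	return 0;
}
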
@@ -2375,8 +2458,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
 	pgoff_t index = pos >> PAGE_SHIFT;
 
 	/* i_mutex is held by caller */
-	if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) {
-		if (info->seals & F_SEAL_WRITE)
+	if (unlikely(info->seals & (F_SEAL_GROW |
+				   F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
+		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
 			return -EPERM;
 		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
 			return -EPERM;
@@ -2639,7 +2723,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
 
 		/* protected by i_mutex */
-		if (info->seals & F_SEAL_WRITE) {
+		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
 			error = -EPERM;
 			goto out;
 		}
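Continuing the hypothetical demo above, the two remaining write paths this diff guards fail the same way once the seal is set:

	/* Appended to the demo after F_ADD_SEALS (same assumptions): */
	if (write(fd, "x", 1) < 0)
		perror("write");	/* EPERM, via shmem_write_begin() */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      0, 4096) < 0)
		perror("fallocate");	/* EPERM, via shmem_fallocate() */
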
@@ -3847,7 +3931,8 @@ int __init shmem_init(void)
 	return 0;
 }
 
-int shmem_unuse(swp_entry_t swap, struct page *page)
+int shmem_unuse(unsigned int type, bool frontswap,
+		unsigned long *fs_pages_to_unuse)
 {
 	return 0;
 }