Diffstat (limited to 'mm/shmem.c')
-rw-r--r--  mm/shmem.c  256
1 file changed, 99 insertions(+), 157 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index c244e93a70fa..d4e184e2a38e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -264,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
 }
 
 /*
+ * Sometimes, before we decide whether to proceed or to fail, we must check
+ * that an entry was not already brought back from swap by a racing thread.
+ *
+ * Checking page is not enough: by the time a SwapCache page is locked, it
+ * might be reused, and again be SwapCache, using the same swap as before.
+ */
+static bool shmem_confirm_swap(struct address_space *mapping,
+                               pgoff_t index, swp_entry_t swap)
+{
+        void *item;
+
+        rcu_read_lock();
+        item = radix_tree_lookup(&mapping->page_tree, index);
+        rcu_read_unlock();
+        return item == swp_to_radix_entry(swap);
+}
+
+/*
  * Like add_to_page_cache_locked, but error if expected item has gone.
  */
 static int shmem_add_to_page_cache(struct page *page,
                                    struct address_space *mapping,
                                    pgoff_t index, gfp_t gfp, void *expected)
 {
-        int error = 0;
+        int error;
 
         VM_BUG_ON(!PageLocked(page));
         VM_BUG_ON(!PageSwapBacked(page));
 
+        page_cache_get(page);
+        page->mapping = mapping;
+        page->index = index;
+
+        spin_lock_irq(&mapping->tree_lock);
         if (!expected)
-                error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+                error = radix_tree_insert(&mapping->page_tree, index, page);
+        else
+                error = shmem_radix_tree_replace(mapping, index, expected,
+                                                 page);
         if (!error) {
-                page_cache_get(page);
-                page->mapping = mapping;
-                page->index = index;
-
-                spin_lock_irq(&mapping->tree_lock);
-                if (!expected)
-                        error = radix_tree_insert(&mapping->page_tree,
-                                                  index, page);
-                else
-                        error = shmem_radix_tree_replace(mapping, index,
-                                                         expected, page);
-                if (!error) {
-                        mapping->nrpages++;
-                        __inc_zone_page_state(page, NR_FILE_PAGES);
-                        __inc_zone_page_state(page, NR_SHMEM);
-                        spin_unlock_irq(&mapping->tree_lock);
-                } else {
-                        page->mapping = NULL;
-                        spin_unlock_irq(&mapping->tree_lock);
-                        page_cache_release(page);
-                }
-                if (!expected)
-                        radix_tree_preload_end();
+                mapping->nrpages++;
+                __inc_zone_page_state(page, NR_FILE_PAGES);
+                __inc_zone_page_state(page, NR_SHMEM);
+                spin_unlock_irq(&mapping->tree_lock);
+        } else {
+                page->mapping = NULL;
+                spin_unlock_irq(&mapping->tree_lock);
+                page_cache_release(page);
         }
-        if (error)
-                mem_cgroup_uncharge_cache_page(page);
         return error;
 }
 
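The hunk above adds shmem_confirm_swap() as a cheap, lockless recheck: look up the radix_tree slot under RCU and compare it with the swap entry we expect, so a caller about to reinstate a swapped-out page can bail out if a racing thread already brought it back. As a rough userspace analogy (nothing below is from mm/shmem.c; slot, slot_confirm() and slot_replace() are invented names, and a C11 compare-exchange stands in for the replace done under tree_lock):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uintptr_t slot;          /* stands in for one radix_tree slot */

static bool slot_confirm(uintptr_t expected)
{
        /* cheap lockless read, like radix_tree_lookup() under rcu_read_lock() */
        return atomic_load(&slot) == expected;
}

static bool slot_replace(uintptr_t expected, uintptr_t newval)
{
        /* the commit step: succeeds only if nobody raced with us */
        return atomic_compare_exchange_strong(&slot, &expected, newval);
}

int main(void)
{
        uintptr_t swap_entry = (0x2a << 1) | 1; /* pretend exceptional entry */
        uintptr_t page = 0x1000;                /* pretend struct page pointer */

        atomic_store(&slot, swap_entry);

        if (slot_confirm(swap_entry) && slot_replace(swap_entry, page))
                puts("swap entry still in place: replaced it with the page");
        else
                puts("raced: another thread already brought the page back");
        return 0;
}

The point is the ordering: the cheap unlocked peek filters out the common race early, and the final replace still verifies the expected value before committing.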
@@ -683,10 +692,21 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
                 mutex_lock(&shmem_swaplist_mutex);
                 /*
                  * We needed to drop mutex to make that restrictive page
-                 * allocation; but the inode might already be freed by now,
-                 * and we cannot refer to inode or mapping or info to check.
-                 * However, we do hold page lock on the PageSwapCache page,
-                 * so can check if that still has our reference remaining.
+                 * allocation, but the inode might have been freed while we
+                 * dropped it: although a racing shmem_evict_inode() cannot
+                 * complete without emptying the radix_tree, our page lock
+                 * on this swapcache page is not enough to prevent that -
+                 * free_swap_and_cache() of our swap entry will only
+                 * trylock_page(), removing swap from radix_tree whatever.
+                 *
+                 * We must not proceed to shmem_add_to_page_cache() if the
+                 * inode has been freed, but of course we cannot rely on
+                 * inode or mapping or info to check that. However, we can
+                 * safely check if our swap entry is still in use (and here
+                 * it can't have got reused for another page): if it's still
+                 * in use, then the inode cannot have been freed yet, and we
+                 * can safely proceed (if it's no longer in use, that tells
+                 * nothing about the inode, but we don't need to unuse swap).
                  */
                 if (!page_swapcount(*pagep))
                         error = -ENOENT;
@@ -730,9 +750,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 
         /*
          * There's a faint possibility that swap page was replaced before
-         * caller locked it: it will come back later with the right page.
+         * caller locked it: caller will come back later with the right page.
          */
-        if (unlikely(!PageSwapCache(page)))
+        if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
                 goto out;
 
         /*
@@ -909,7 +929,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
 
         /* Create a pseudo vma that just contains the policy */
         pvma.vm_start = 0;
-        pvma.vm_pgoff = index;
+        /* Bias interleave by inode number to distribute better across nodes */
+        pvma.vm_pgoff = index + info->vfs_inode.i_ino;
         pvma.vm_ops = NULL;
         pvma.vm_policy = spol;
         return swapin_readahead(swap, gfp, &pvma, 0);
@@ -922,7 +943,8 @@ static struct page *shmem_alloc_page(gfp_t gfp,
 
         /* Create a pseudo vma that just contains the policy */
         pvma.vm_start = 0;
-        pvma.vm_pgoff = index;
+        /* Bias interleave by inode number to distribute better across nodes */
+        pvma.vm_pgoff = index + info->vfs_inode.i_ino;
         pvma.vm_ops = NULL;
         pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
 
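Both pseudo-vma hunks above add the same one-line bias: vm_pgoff becomes index + i_ino instead of index. With MPOL_INTERLEAVE the target node is derived from the page offset, so without the bias page 0 of every small file tends to land on the same node. A tiny standalone C sketch of the arithmetic (the four-node count and inode numbers are invented, and the plain modulo is only an approximation of the kernel's interleave calculation):

#include <stdio.h>

int main(void)
{
        const unsigned long nr_nodes = 4;
        const unsigned long inodes[] = { 1001, 1002, 1003, 1004, 1005 };
        const unsigned long index = 0;  /* first page of each small file */

        for (unsigned int i = 0; i < sizeof(inodes) / sizeof(inodes[0]); i++) {
                unsigned long ino = inodes[i];
                printf("ino %lu: plain index -> node %lu, index + ino -> node %lu\n",
                       ino,
                       index % nr_nodes,          /* always node 0 */
                       (index + ino) % nr_nodes); /* spread across nodes */
        }
        return 0;
}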
@@ -995,21 +1017,15 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
         newpage = shmem_alloc_page(gfp, info, index);
         if (!newpage)
                 return -ENOMEM;
-        VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
 
-        *pagep = newpage;
         page_cache_get(newpage);
         copy_highpage(newpage, oldpage);
+        flush_dcache_page(newpage);
 
-        VM_BUG_ON(!PageLocked(oldpage));
         __set_page_locked(newpage);
-        VM_BUG_ON(!PageUptodate(oldpage));
         SetPageUptodate(newpage);
-        VM_BUG_ON(!PageSwapBacked(oldpage));
         SetPageSwapBacked(newpage);
-        VM_BUG_ON(!swap_index);
         set_page_private(newpage, swap_index);
-        VM_BUG_ON(!PageSwapCache(oldpage));
         SetPageSwapCache(newpage);
 
         /*
@@ -1019,13 +1035,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
         spin_lock_irq(&swap_mapping->tree_lock);
         error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
                                                                    newpage);
-        __inc_zone_page_state(newpage, NR_FILE_PAGES);
-        __dec_zone_page_state(oldpage, NR_FILE_PAGES);
+        if (!error) {
+                __inc_zone_page_state(newpage, NR_FILE_PAGES);
+                __dec_zone_page_state(oldpage, NR_FILE_PAGES);
+        }
         spin_unlock_irq(&swap_mapping->tree_lock);
-        BUG_ON(error);
 
-        mem_cgroup_replace_page_cache(oldpage, newpage);
-        lru_cache_add_anon(newpage);
+        if (unlikely(error)) {
+                /*
+                 * Is this possible?  I think not, now that our callers check
+                 * both PageSwapCache and page_private after getting page lock;
+                 * but be defensive.  Reverse old to newpage for clear and free.
+                 */
+                oldpage = newpage;
+        } else {
+                mem_cgroup_replace_page_cache(oldpage, newpage);
+                lru_cache_add_anon(newpage);
+                *pagep = newpage;
+        }
 
         ClearPageSwapCache(oldpage);
         set_page_private(oldpage, 0);
@@ -1033,7 +1060,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
         unlock_page(oldpage);
         page_cache_release(oldpage);
         page_cache_release(oldpage);
-        return 0;
+        return error;
 }
 
 /*
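The error path added to shmem_replace_page() above uses a small trick worth spelling out: when the radix_tree replacement fails, "oldpage = newpage" redirects the single teardown path that follows, so it clears and releases the page we no longer want while the caller keeps the untouched original. A hedged userspace sketch of the same shape (struct buf, make_buf() and replace_buf() are invented for illustration):

#include <stdio.h>
#include <stdlib.h>

struct buf { char data[32]; };

static struct buf *make_buf(const char *s)
{
        struct buf *b = malloc(sizeof(*b));
        if (b)
                snprintf(b->data, sizeof(b->data), "%s", s);
        return b;
}

static int replace_buf(struct buf **slot, int simulate_failure)
{
        struct buf *oldbuf = *slot;
        struct buf *newbuf = make_buf("replacement");
        int error;

        if (!newbuf)
                return -1;

        error = simulate_failure ? -1 : 0;      /* stands in for the tree swap */
        if (error) {
                /* reverse old to new: the common teardown below frees newbuf,
                   and the caller keeps using the untouched *slot */
                oldbuf = newbuf;
        } else {
                *slot = newbuf;                 /* commit: caller now sees newbuf */
        }

        /* common teardown, taken on both paths */
        free(oldbuf);
        return error;
}

int main(void)
{
        struct buf *slot = make_buf("original");

        if (!slot)
                return 1;
        printf("forced failure: %d, slot still \"%s\"\n",
               replace_buf(&slot, 1), slot->data);
        printf("success: %d, slot now \"%s\"\n",
               replace_buf(&slot, 0), slot->data);
        free(slot);
        return 0;
}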
@@ -1107,9 +1134,10 @@ repeat:
 
                 /* We have to do this with page locked to prevent races */
                 lock_page(page);
-                if (!PageSwapCache(page) || page->mapping) {
+                if (!PageSwapCache(page) || page_private(page) != swap.val ||
+                    !shmem_confirm_swap(mapping, index, swap)) {
                         error = -EEXIST;        /* try again */
-                        goto failed;
+                        goto unlock;
                 }
                 if (!PageUptodate(page)) {
                         error = -EIO;
@@ -1125,9 +1153,12 @@ repeat:
 
                 error = mem_cgroup_cache_charge(page, current->mm,
                                                 gfp & GFP_RECLAIM_MASK);
-                if (!error)
+                if (!error) {
                         error = shmem_add_to_page_cache(page, mapping, index,
                                                 gfp, swp_to_radix_entry(swap));
+                        /* We already confirmed swap, and make no allocation */
+                        VM_BUG_ON(error);
+                }
                 if (error)
                         goto failed;
 
@@ -1164,11 +1195,18 @@ repeat:
                 __set_page_locked(page);
                 error = mem_cgroup_cache_charge(page, current->mm,
                                                 gfp & GFP_RECLAIM_MASK);
-                if (!error)
-                        error = shmem_add_to_page_cache(page, mapping, index,
-                                                gfp, NULL);
                 if (error)
                         goto decused;
+                error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+                if (!error) {
+                        error = shmem_add_to_page_cache(page, mapping, index,
+                                                        gfp, NULL);
+                        radix_tree_preload_end();
+                }
+                if (error) {
+                        mem_cgroup_uncharge_cache_page(page);
+                        goto decused;
+                }
                 lru_cache_add_anon(page);
 
                 spin_lock(&info->lock);
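This hunk moves radix_tree_preload() out of shmem_add_to_page_cache() and into the caller: the node allocation, which may sleep, now happens before the spinlocked insert, and radix_tree_preload_end() plus the memcg uncharge on failure become the caller's job. The general shape, allocate while you may still sleep, then insert under a non-sleeping lock without allocating, looks like this in a plain userspace sketch (the linked list and pthread spinlock are illustrative only; compile with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
        int value;
        struct node *next;
};

static struct node *head;
static pthread_spinlock_t list_lock;

static int insert_value(int value)
{
        /* "preload": do the allocation while sleeping is still allowed */
        struct node *n = malloc(sizeof(*n));
        if (!n)
                return -1;
        n->value = value;

        /* the critical section itself never allocates */
        pthread_spin_lock(&list_lock);
        n->next = head;
        head = n;
        pthread_spin_unlock(&list_lock);
        return 0;
}

int main(void)
{
        pthread_spin_init(&list_lock, PTHREAD_PROCESS_PRIVATE);
        for (int i = 0; i < 3; i++)
                insert_value(i);
        for (struct node *n = head; n; n = n->next)
                printf("%d\n", n->value);
        return 0;
}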
@@ -1228,14 +1266,10 @@ decused:
 unacct:
         shmem_unacct_blocks(info->flags, 1);
 failed:
-        if (swap.val && error != -EINVAL) {
-                struct page *test = find_get_page(mapping, index);
-                if (test && !radix_tree_exceptional_entry(test))
-                        page_cache_release(test);
-                /* Have another try if the entry has changed */
-                if (test != swp_to_radix_entry(swap))
-                        error = -EEXIST;
-        }
+        if (swap.val && error != -EINVAL &&
+            !shmem_confirm_swap(mapping, index, swap))
+                error = -EEXIST;
+unlock:
         if (page) {
                 unlock_page(page);
                 page_cache_release(page);
@@ -1247,7 +1281,7 @@ failed:
                 spin_unlock(&info->lock);
                 goto repeat;
         }
-        if (error == -EEXIST)
+        if (error == -EEXIST)   /* from above or from radix_tree_insert */
                 goto repeat;
         return error;
 }
@@ -1675,98 +1709,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
         return error;
 }
 
-/*
- * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
- */
-static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
-                                    pgoff_t index, pgoff_t end, int origin)
-{
-        struct page *page;
-        struct pagevec pvec;
-        pgoff_t indices[PAGEVEC_SIZE];
-        bool done = false;
-        int i;
-
-        pagevec_init(&pvec, 0);
-        pvec.nr = 1;            /* start small: we may be there already */
-        while (!done) {
-                pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
-                                        pvec.nr, pvec.pages, indices);
-                if (!pvec.nr) {
-                        if (origin == SEEK_DATA)
-                                index = end;
-                        break;
-                }
-                for (i = 0; i < pvec.nr; i++, index++) {
-                        if (index < indices[i]) {
-                                if (origin == SEEK_HOLE) {
-                                        done = true;
-                                        break;
-                                }
-                                index = indices[i];
-                        }
-                        page = pvec.pages[i];
-                        if (page && !radix_tree_exceptional_entry(page)) {
-                                if (!PageUptodate(page))
-                                        page = NULL;
-                        }
-                        if (index >= end ||
-                            (page && origin == SEEK_DATA) ||
-                            (!page && origin == SEEK_HOLE)) {
-                                done = true;
-                                break;
-                        }
-                }
-                shmem_deswap_pagevec(&pvec);
-                pagevec_release(&pvec);
-                pvec.nr = PAGEVEC_SIZE;
-                cond_resched();
-        }
-        return index;
-}
-
-static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
-{
-        struct address_space *mapping;
-        struct inode *inode;
-        pgoff_t start, end;
-        loff_t new_offset;
-
-        if (origin != SEEK_DATA && origin != SEEK_HOLE)
-                return generic_file_llseek_size(file, offset, origin,
-                                                MAX_LFS_FILESIZE);
-        mapping = file->f_mapping;
-        inode = mapping->host;
-        mutex_lock(&inode->i_mutex);
-        /* We're holding i_mutex so we can access i_size directly */
-
-        if (offset < 0)
-                offset = -EINVAL;
-        else if (offset >= inode->i_size)
-                offset = -ENXIO;
-        else {
-                start = offset >> PAGE_CACHE_SHIFT;
-                end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-                new_offset = shmem_seek_hole_data(mapping, start, end, origin);
-                new_offset <<= PAGE_CACHE_SHIFT;
-                if (new_offset > offset) {
-                        if (new_offset < inode->i_size)
-                                offset = new_offset;
-                        else if (origin == SEEK_DATA)
-                                offset = -ENXIO;
-                        else
-                                offset = inode->i_size;
-                }
-        }
-
-        if (offset >= 0 && offset != file->f_pos) {
-                file->f_pos = offset;
-                file->f_version = 0;
-        }
-        mutex_unlock(&inode->i_mutex);
-        return offset;
-}
-
 static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                             loff_t len)
 {
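The block removed above implemented lseek(2)'s SEEK_DATA and SEEK_HOLE for tmpfs by walking the radix_tree; with .llseek switched to generic_file_llseek (last hunk below), tmpfs falls back to the generic implementation, which reports the whole file as data. For reference, the interface that code served looks like this from userspace (a hedged sketch, not from mm/shmem.c; /tmp/probe is an arbitrary path and the whence values need a kernel and filesystem that support them):

#define _GNU_SOURCE             /* for SEEK_DATA / SEEK_HOLE */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/tmp/probe", O_RDWR | O_CREAT | O_TRUNC, 0600);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* make a 1MiB file with data only in its last byte */
        if (pwrite(fd, "x", 1, 1024 * 1024 - 1) != 1) {
                perror("pwrite");
                return 1;
        }
        off_t data = lseek(fd, 0, SEEK_DATA);
        off_t hole = lseek(fd, 0, SEEK_HOLE);
        printf("first data at %lld, first hole at %lld\n",
               (long long)data, (long long)hole);
        close(fd);
        unlink("/tmp/probe");
        return 0;
}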
@@ -1937,7 +1879,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 }
 
 static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-                        struct nameidata *nd)
+                        bool excl)
 {
         return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
 }
@@ -2770,7 +2712,7 @@ static const struct address_space_operations shmem_aops = {
 static const struct file_operations shmem_file_operations = {
         .mmap           = shmem_mmap,
 #ifdef CONFIG_TMPFS
-        .llseek         = shmem_file_llseek,
+        .llseek         = generic_file_llseek,
         .read           = do_sync_read,
         .write          = do_sync_write,
         .aio_read       = shmem_file_aio_read,