Diffstat (limited to 'mm/shmem.c')
 mm/shmem.c | 256 ++++++++++++++----------------------
 1 file changed, 99 insertions(+), 157 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index c244e93a70fa..d4e184e2a38e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -264,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
 }
 
 /*
+ * Sometimes, before we decide whether to proceed or to fail, we must check
+ * that an entry was not already brought back from swap by a racing thread.
+ *
+ * Checking page is not enough: by the time a SwapCache page is locked, it
+ * might be reused, and again be SwapCache, using the same swap as before.
+ */
+static bool shmem_confirm_swap(struct address_space *mapping,
+			       pgoff_t index, swp_entry_t swap)
+{
+	void *item;
+
+	rcu_read_lock();
+	item = radix_tree_lookup(&mapping->page_tree, index);
+	rcu_read_unlock();
+	return item == swp_to_radix_entry(swap);
+}
+
+/*
  * Like add_to_page_cache_locked, but error if expected item has gone.
  */
 static int shmem_add_to_page_cache(struct page *page,
 				   struct address_space *mapping,
 				   pgoff_t index, gfp_t gfp, void *expected)
 {
-	int error = 0;
+	int error;
 
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(!PageSwapBacked(page));
 
+	page_cache_get(page);
+	page->mapping = mapping;
+	page->index = index;
+
+	spin_lock_irq(&mapping->tree_lock);
 	if (!expected)
-		error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+		error = radix_tree_insert(&mapping->page_tree, index, page);
+	else
+		error = shmem_radix_tree_replace(mapping, index, expected,
+						 page);
 	if (!error) {
-		page_cache_get(page);
-		page->mapping = mapping;
-		page->index = index;
-
-		spin_lock_irq(&mapping->tree_lock);
-		if (!expected)
-			error = radix_tree_insert(&mapping->page_tree,
-						  index, page);
-		else
-			error = shmem_radix_tree_replace(mapping, index,
-							 expected, page);
-		if (!error) {
-			mapping->nrpages++;
-			__inc_zone_page_state(page, NR_FILE_PAGES);
-			__inc_zone_page_state(page, NR_SHMEM);
-			spin_unlock_irq(&mapping->tree_lock);
-		} else {
-			page->mapping = NULL;
-			spin_unlock_irq(&mapping->tree_lock);
-			page_cache_release(page);
-		}
-		if (!expected)
-			radix_tree_preload_end();
+		mapping->nrpages++;
+		__inc_zone_page_state(page, NR_FILE_PAGES);
+		__inc_zone_page_state(page, NR_SHMEM);
+		spin_unlock_irq(&mapping->tree_lock);
+	} else {
+		page->mapping = NULL;
+		spin_unlock_irq(&mapping->tree_lock);
+		page_cache_release(page);
 	}
-	if (error)
-		mem_cgroup_uncharge_cache_page(page);
 	return error;
 }
 
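The two changes above work together: shmem_confirm_swap() gives callers a lock-free way to recheck the radix_tree slot, while shmem_add_to_page_cache() no longer preloads the radix_tree or uncharges the memcg itself, so it never allocates and its failure handling moves out to the callers. A condensed sketch of the resulting caller contract (the "out" label is a placeholder; the real paths appear in the hunks below):

	error = mem_cgroup_cache_charge(page, current->mm,
					gfp & GFP_RECLAIM_MASK);
	if (error)
		goto out;				/* hypothetical label */
	error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);	/* may sleep */
	if (!error) {
		error = shmem_add_to_page_cache(page, mapping, index,
						gfp, NULL);
		radix_tree_preload_end();
	}
	if (error)
		mem_cgroup_uncharge_cache_page(page);	/* now the caller's job */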
@@ -683,10 +692,21 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 		mutex_lock(&shmem_swaplist_mutex);
 		/*
 		 * We needed to drop mutex to make that restrictive page
-		 * allocation; but the inode might already be freed by now,
-		 * and we cannot refer to inode or mapping or info to check.
-		 * However, we do hold page lock on the PageSwapCache page,
-		 * so can check if that still has our reference remaining.
+		 * allocation, but the inode might have been freed while we
+		 * dropped it: although a racing shmem_evict_inode() cannot
+		 * complete without emptying the radix_tree, our page lock
+		 * on this swapcache page is not enough to prevent that -
+		 * free_swap_and_cache() of our swap entry will only
+		 * trylock_page(), removing swap from radix_tree whatever.
+		 *
+		 * We must not proceed to shmem_add_to_page_cache() if the
+		 * inode has been freed, but of course we cannot rely on
+		 * inode or mapping or info to check that.  However, we can
+		 * safely check if our swap entry is still in use (and here
+		 * it can't have got reused for another page): if it's still
+		 * in use, then the inode cannot have been freed yet, and we
+		 * can safely proceed (if it's no longer in use, that tells
+		 * nothing about the inode, but we don't need to unuse swap).
 		 */
 		if (!page_swapcount(*pagep))
 			error = -ENOENT;
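The rewritten comment pins down why page_swapcount() is the one safe test here: a non-zero swapcount proves the inode's own reference to the swap entry still exists, so the inode cannot have been freed. A sketch of the pattern it describes, with the allocation elided:

	mutex_unlock(&shmem_swaplist_mutex);
	/* ... restrictive page allocation, which may sleep ... */
	mutex_lock(&shmem_swaplist_mutex);
	if (!page_swapcount(*pagep))
		error = -ENOENT;	/* safe to give up, never to proceed */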
@@ -730,9 +750,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 
 	/*
 	 * There's a faint possibility that swap page was replaced before
-	 * caller locked it: it will come back later with the right page.
+	 * caller locked it: caller will come back later with the right page.
 	 */
-	if (unlikely(!PageSwapCache(page)))
+	if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
 		goto out;
 
 	/*
@@ -909,7 +929,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
 
 	/* Create a pseudo vma that just contains the policy */
 	pvma.vm_start = 0;
-	pvma.vm_pgoff = index;
+	/* Bias interleave by inode number to distribute better across nodes */
+	pvma.vm_pgoff = index + info->vfs_inode.i_ino;
 	pvma.vm_ops = NULL;
 	pvma.vm_policy = spol;
 	return swapin_readahead(swap, gfp, &pvma, 0);
@@ -922,7 +943,8 @@ static struct page *shmem_alloc_page(gfp_t gfp,
 
 	/* Create a pseudo vma that just contains the policy */
 	pvma.vm_start = 0;
-	pvma.vm_pgoff = index;
+	/* Bias interleave by inode number to distribute better across nodes */
+	pvma.vm_pgoff = index + info->vfs_inode.i_ino;
 	pvma.vm_ops = NULL;
 	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
 
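Both pseudo-vmas get the same bias. With MPOL_INTERLEAVE the node is derived from the page offset, so without the bias index 0 of every tmpfs file would land on the same node. Illustrative arithmetic only (the real lookup goes through mempolicy's interleave code, and this assumes contiguous node numbering):

	unsigned int nid = (index + info->vfs_inode.i_ino) % num_online_nodes();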
@@ -995,21 +1017,15 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	newpage = shmem_alloc_page(gfp, info, index);
 	if (!newpage)
 		return -ENOMEM;
-	VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
 
-	*pagep = newpage;
 	page_cache_get(newpage);
 	copy_highpage(newpage, oldpage);
+	flush_dcache_page(newpage);
 
-	VM_BUG_ON(!PageLocked(oldpage));
 	__set_page_locked(newpage);
-	VM_BUG_ON(!PageUptodate(oldpage));
 	SetPageUptodate(newpage);
-	VM_BUG_ON(!PageSwapBacked(oldpage));
 	SetPageSwapBacked(newpage);
-	VM_BUG_ON(!swap_index);
 	set_page_private(newpage, swap_index);
-	VM_BUG_ON(!PageSwapCache(oldpage));
 	SetPageSwapCache(newpage);
 
 	/*
@@ -1019,13 +1035,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	spin_lock_irq(&swap_mapping->tree_lock);
 	error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
 					 newpage);
-	__inc_zone_page_state(newpage, NR_FILE_PAGES);
-	__dec_zone_page_state(oldpage, NR_FILE_PAGES);
+	if (!error) {
+		__inc_zone_page_state(newpage, NR_FILE_PAGES);
+		__dec_zone_page_state(oldpage, NR_FILE_PAGES);
+	}
 	spin_unlock_irq(&swap_mapping->tree_lock);
-	BUG_ON(error);
 
-	mem_cgroup_replace_page_cache(oldpage, newpage);
-	lru_cache_add_anon(newpage);
+	if (unlikely(error)) {
+		/*
+		 * Is this possible?  I think not, now that our callers check
+		 * both PageSwapCache and page_private after getting page lock;
+		 * but be defensive.  Reverse old to newpage for clear and free.
+		 */
+		oldpage = newpage;
+	} else {
+		mem_cgroup_replace_page_cache(oldpage, newpage);
+		lru_cache_add_anon(newpage);
+		*pagep = newpage;
+	}
 
 	ClearPageSwapCache(oldpage);
 	set_page_private(oldpage, 0);
@@ -1033,7 +1060,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	unlock_page(oldpage);
 	page_cache_release(oldpage);
 	page_cache_release(oldpage);
-	return 0;
+	return error;
 }
 
 /*
@@ -1107,9 +1134,10 @@ repeat:
 
 		/* We have to do this with page locked to prevent races */
 		lock_page(page);
-		if (!PageSwapCache(page) || page->mapping) {
+		if (!PageSwapCache(page) || page_private(page) != swap.val ||
+		    !shmem_confirm_swap(mapping, index, swap)) {
 			error = -EEXIST;	/* try again */
-			goto failed;
+			goto unlock;
 		}
 		if (!PageUptodate(page)) {
 			error = -EIO;
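Three conditions now guard the swap-in path, each closing a different stage of the same window between looking up the swap entry and taking the page lock. As a sketch (the real code goes to the new unlock label rather than returning):

	if (!PageSwapCache(page) ||		/* page freed and reused */
	    page_private(page) != swap.val ||	/* swapcache, but another slot */
	    !shmem_confirm_swap(mapping, index, swap))	/* slot changed */
		error = -EEXIST;		/* benign race: caller retries */

This confirmation is also what justifies the VM_BUG_ON(error) in the next hunk: with expected non-NULL, shmem_add_to_page_cache() only replaces an existing slot, allocating nothing, and the slot is known good under this same page lock.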
@@ -1125,9 +1153,12 @@ repeat:
 
 		error = mem_cgroup_cache_charge(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
-		if (!error)
+		if (!error) {
 			error = shmem_add_to_page_cache(page, mapping, index,
 						gfp, swp_to_radix_entry(swap));
+			/* We already confirmed swap, and make no allocation */
+			VM_BUG_ON(error);
+		}
 		if (error)
 			goto failed;
 
@@ -1164,11 +1195,18 @@ repeat:
 		__set_page_locked(page);
 		error = mem_cgroup_cache_charge(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
-		if (!error)
-			error = shmem_add_to_page_cache(page, mapping, index,
-						gfp, NULL);
 		if (error)
 			goto decused;
+		error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+		if (!error) {
+			error = shmem_add_to_page_cache(page, mapping, index,
+							gfp, NULL);
+			radix_tree_preload_end();
+		}
+		if (error) {
+			mem_cgroup_uncharge_cache_page(page);
+			goto decused;
+		}
 		lru_cache_add_anon(page);
 
 		spin_lock(&info->lock);
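Preloading here, rather than inside shmem_add_to_page_cache(), is what lets that function run entirely under tree_lock: the spinlock is taken with interrupts disabled, where radix_tree node allocation must not sleep. The standard shape of the idiom (general radix-tree usage of this era, not specific to this patch):

	error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
					/* may sleep; disables preemption */
	if (!error) {
		spin_lock_irq(&mapping->tree_lock);
		error = radix_tree_insert(&mapping->page_tree, index, page);
		spin_unlock_irq(&mapping->tree_lock);
		radix_tree_preload_end();	/* re-enables preemption */
	}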
@@ -1228,14 +1266,10 @@ decused:
 unacct:
 	shmem_unacct_blocks(info->flags, 1);
 failed:
-	if (swap.val && error != -EINVAL) {
-		struct page *test = find_get_page(mapping, index);
-		if (test && !radix_tree_exceptional_entry(test))
-			page_cache_release(test);
-		/* Have another try if the entry has changed */
-		if (test != swp_to_radix_entry(swap))
-			error = -EEXIST;
-	}
+	if (swap.val && error != -EINVAL &&
+	    !shmem_confirm_swap(mapping, index, swap))
+		error = -EEXIST;
+unlock:
 	if (page) {
 		unlock_page(page);
 		page_cache_release(page);
@@ -1247,7 +1281,7 @@ failed:
 		spin_unlock(&info->lock);
 		goto repeat;
 	}
-	if (error == -EEXIST)
+	if (error == -EEXIST)	/* from above or from radix_tree_insert */
 		goto repeat;
 	return error;
 }
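-EEXIST consistently means "the slot changed under us, try again": radix_tree_insert() returns it when another thread instantiated the page first, and the failed path now converts a changed swap slot into it via shmem_confirm_swap(), replacing the old find_get_page() probe. Both funnel into the same retry:

	if (error == -EEXIST)	/* radix_tree_insert() or shmem_confirm_swap() */
		goto repeat;	/* re-lookup the slot and start over */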
@@ -1675,98 +1709,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	return error;
 }
 
-/*
- * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
- */
-static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
-					pgoff_t index, pgoff_t end, int origin)
-{
-	struct page *page;
-	struct pagevec pvec;
-	pgoff_t indices[PAGEVEC_SIZE];
-	bool done = false;
-	int i;
-
-	pagevec_init(&pvec, 0);
-	pvec.nr = 1;		/* start small: we may be there already */
-	while (!done) {
-		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
-					pvec.nr, pvec.pages, indices);
-		if (!pvec.nr) {
-			if (origin == SEEK_DATA)
-				index = end;
-			break;
-		}
-		for (i = 0; i < pvec.nr; i++, index++) {
-			if (index < indices[i]) {
-				if (origin == SEEK_HOLE) {
-					done = true;
-					break;
-				}
-				index = indices[i];
-			}
-			page = pvec.pages[i];
-			if (page && !radix_tree_exceptional_entry(page)) {
-				if (!PageUptodate(page))
-					page = NULL;
-			}
-			if (index >= end ||
-			    (page && origin == SEEK_DATA) ||
-			    (!page && origin == SEEK_HOLE)) {
-				done = true;
-				break;
-			}
-		}
-		shmem_deswap_pagevec(&pvec);
-		pagevec_release(&pvec);
-		pvec.nr = PAGEVEC_SIZE;
-		cond_resched();
-	}
-	return index;
-}
-
-static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
-{
-	struct address_space *mapping;
-	struct inode *inode;
-	pgoff_t start, end;
-	loff_t new_offset;
-
-	if (origin != SEEK_DATA && origin != SEEK_HOLE)
-		return generic_file_llseek_size(file, offset, origin,
-					MAX_LFS_FILESIZE);
-	mapping = file->f_mapping;
-	inode = mapping->host;
-	mutex_lock(&inode->i_mutex);
-	/* We're holding i_mutex so we can access i_size directly */
-
-	if (offset < 0)
-		offset = -EINVAL;
-	else if (offset >= inode->i_size)
-		offset = -ENXIO;
-	else {
-		start = offset >> PAGE_CACHE_SHIFT;
-		end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-		new_offset = shmem_seek_hole_data(mapping, start, end, origin);
-		new_offset <<= PAGE_CACHE_SHIFT;
-		if (new_offset > offset) {
-			if (new_offset < inode->i_size)
-				offset = new_offset;
-			else if (origin == SEEK_DATA)
-				offset = -ENXIO;
-			else
-				offset = inode->i_size;
-		}
-	}
-
-	if (offset >= 0 && offset != file->f_pos) {
-		file->f_pos = offset;
-		file->f_version = 0;
-	}
-	mutex_unlock(&inode->i_mutex);
-	return offset;
-}
-
 static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 							 loff_t len)
 {
@@ -1937,7 +1879,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 }
 
 static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		struct nameidata *nd)
+		bool excl)
 {
 	return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
 }
@@ -2770,7 +2712,7 @@ static const struct address_space_operations shmem_aops = {
 static const struct file_operations shmem_file_operations = {
 	.mmap		= shmem_mmap,
 #ifdef CONFIG_TMPFS
-	.llseek		= shmem_file_llseek,
+	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= shmem_file_aio_read,