diff options
Diffstat (limited to 'mm/shmem.c')
-rw-r--r-- | mm/shmem.c | 122 |
1 file changed, 105 insertions, 17 deletions
diff --git a/mm/shmem.c b/mm/shmem.c
index f484c276e994..af68b15a8fc1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -80,11 +80,12 @@ static struct vfsmount *shm_mnt; | |||
80 | #define SHORT_SYMLINK_LEN 128 | 80 | #define SHORT_SYMLINK_LEN 128 |
81 | 81 | ||
82 | /* | 82 | /* |
83 | * shmem_fallocate and shmem_writepage communicate via inode->i_private | 83 | * shmem_fallocate communicates with shmem_fault or shmem_writepage via |
84 | * (with i_mutex making sure that it has only one user at a time): | 84 | * inode->i_private (with i_mutex making sure that it has only one user at |
85 | * we would prefer not to enlarge the shmem inode just for that. | 85 | * a time): we would prefer not to enlarge the shmem inode just for that. |
86 | */ | 86 | */ |
87 | struct shmem_falloc { | 87 | struct shmem_falloc { |
88 | wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ | ||
88 | pgoff_t start; /* start of range currently being fallocated */ | 89 | pgoff_t start; /* start of range currently being fallocated */ |
89 | pgoff_t next; /* the next page offset to be fallocated */ | 90 | pgoff_t next; /* the next page offset to be fallocated */ |
90 | pgoff_t nr_falloced; /* how many new pages have been fallocated */ | 91 | pgoff_t nr_falloced; /* how many new pages have been fallocated */ |
@@ -467,23 +468,20 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, | |||
467 | return; | 468 | return; |
468 | 469 | ||
469 | index = start; | 470 | index = start; |
470 | for ( ; ; ) { | 471 | while (index < end) { |
471 | cond_resched(); | 472 | cond_resched(); |
472 | 473 | ||
473 | pvec.nr = find_get_entries(mapping, index, | 474 | pvec.nr = find_get_entries(mapping, index, |
474 | min(end - index, (pgoff_t)PAGEVEC_SIZE), | 475 | min(end - index, (pgoff_t)PAGEVEC_SIZE), |
475 | pvec.pages, indices); | 476 | pvec.pages, indices); |
476 | if (!pvec.nr) { | 477 | if (!pvec.nr) { |
477 | if (index == start || unfalloc) | 478 | /* If all gone or hole-punch or unfalloc, we're done */ |
479 | if (index == start || end != -1) | ||
478 | break; | 480 | break; |
481 | /* But if truncating, restart to make sure all gone */ | ||
479 | index = start; | 482 | index = start; |
480 | continue; | 483 | continue; |
481 | } | 484 | } |
482 | if ((index == start || unfalloc) && indices[0] >= end) { | ||
483 | pagevec_remove_exceptionals(&pvec); | ||
484 | pagevec_release(&pvec); | ||
485 | break; | ||
486 | } | ||
487 | mem_cgroup_uncharge_start(); | 485 | mem_cgroup_uncharge_start(); |
488 | for (i = 0; i < pagevec_count(&pvec); i++) { | 486 | for (i = 0; i < pagevec_count(&pvec); i++) { |
489 | struct page *page = pvec.pages[i]; | 487 | struct page *page = pvec.pages[i]; |
@@ -495,8 +493,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, | |||
495 | if (radix_tree_exceptional_entry(page)) { | 493 | if (radix_tree_exceptional_entry(page)) { |
496 | if (unfalloc) | 494 | if (unfalloc) |
497 | continue; | 495 | continue; |
498 | nr_swaps_freed += !shmem_free_swap(mapping, | 496 | if (shmem_free_swap(mapping, index, page)) { |
499 | index, page); | 497 | /* Swap was replaced by page: retry */ |
498 | index--; | ||
499 | break; | ||
500 | } | ||
501 | nr_swaps_freed++; | ||
500 | continue; | 502 | continue; |
501 | } | 503 | } |
502 | 504 | ||
@@ -505,6 +507,11 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, | |||
505 | if (page->mapping == mapping) { | 507 | if (page->mapping == mapping) { |
506 | VM_BUG_ON_PAGE(PageWriteback(page), page); | 508 | VM_BUG_ON_PAGE(PageWriteback(page), page); |
507 | truncate_inode_page(mapping, page); | 509 | truncate_inode_page(mapping, page); |
510 | } else { | ||
511 | /* Page was replaced by swap: retry */ | ||
512 | unlock_page(page); | ||
513 | index--; | ||
514 | break; | ||
508 | } | 515 | } |
509 | } | 516 | } |
510 | unlock_page(page); | 517 | unlock_page(page); |
@@ -759,6 +766,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
759 | spin_lock(&inode->i_lock); | 766 | spin_lock(&inode->i_lock); |
760 | shmem_falloc = inode->i_private; | 767 | shmem_falloc = inode->i_private; |
761 | if (shmem_falloc && | 768 | if (shmem_falloc && |
769 | !shmem_falloc->waitq && | ||
762 | index >= shmem_falloc->start && | 770 | index >= shmem_falloc->start && |
763 | index < shmem_falloc->next) | 771 | index < shmem_falloc->next) |
764 | shmem_falloc->nr_unswapped++; | 772 | shmem_falloc->nr_unswapped++; |
@@ -1027,6 +1035,9 @@ repeat: | |||
1027 | goto failed; | 1035 | goto failed; |
1028 | } | 1036 | } |
1029 | 1037 | ||
1038 | if (page && sgp == SGP_WRITE) | ||
1039 | mark_page_accessed(page); | ||
1040 | |||
1030 | /* fallocated page? */ | 1041 | /* fallocated page? */ |
1031 | if (page && !PageUptodate(page)) { | 1042 | if (page && !PageUptodate(page)) { |
1032 | if (sgp != SGP_READ) | 1043 | if (sgp != SGP_READ) |
@@ -1108,6 +1119,9 @@ repeat: | |||
1108 | shmem_recalc_inode(inode); | 1119 | shmem_recalc_inode(inode); |
1109 | spin_unlock(&info->lock); | 1120 | spin_unlock(&info->lock); |
1110 | 1121 | ||
1122 | if (sgp == SGP_WRITE) | ||
1123 | mark_page_accessed(page); | ||
1124 | |||
1111 | delete_from_swap_cache(page); | 1125 | delete_from_swap_cache(page); |
1112 | set_page_dirty(page); | 1126 | set_page_dirty(page); |
1113 | swap_free(swap); | 1127 | swap_free(swap); |
@@ -1134,6 +1148,9 @@ repeat: | |||
1134 | 1148 | ||
1135 | __SetPageSwapBacked(page); | 1149 | __SetPageSwapBacked(page); |
1136 | __set_page_locked(page); | 1150 | __set_page_locked(page); |
1151 | if (sgp == SGP_WRITE) | ||
1152 | init_page_accessed(page); | ||
1153 | |||
1137 | error = mem_cgroup_charge_file(page, current->mm, | 1154 | error = mem_cgroup_charge_file(page, current->mm, |
1138 | gfp & GFP_RECLAIM_MASK); | 1155 | gfp & GFP_RECLAIM_MASK); |
1139 | if (error) | 1156 | if (error) |
@@ -1233,6 +1250,64 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1233 | int error; | 1250 | int error; |
1234 | int ret = VM_FAULT_LOCKED; | 1251 | int ret = VM_FAULT_LOCKED; |
1235 | 1252 | ||
1253 | /* | ||
1254 | * Trinity finds that probing a hole which tmpfs is punching can | ||
1255 | * prevent the hole-punch from ever completing: which in turn | ||
1256 | * locks writers out with its hold on i_mutex. So refrain from | ||
1257 | * faulting pages into the hole while it's being punched. Although | ||
1258 | * shmem_undo_range() does remove the additions, it may be unable to | ||
1259 | * keep up, as each new page needs its own unmap_mapping_range() call, | ||
1260 | * and the i_mmap tree grows ever slower to scan if new vmas are added. | ||
1261 | * | ||
1262 | * It does not matter if we sometimes reach this check just before the | ||
1263 | * hole-punch begins, so that one fault then races with the punch: | ||
1264 | * we just need to make racing faults a rare case. | ||
1265 | * | ||
1266 | * The implementation below would be much simpler if we just used a | ||
1267 | * standard mutex or completion: but we cannot take i_mutex in fault, | ||
1268 | * and bloating every shmem inode for this unlikely case would be sad. | ||
1269 | */ | ||
1270 | if (unlikely(inode->i_private)) { | ||
1271 | struct shmem_falloc *shmem_falloc; | ||
1272 | |||
1273 | spin_lock(&inode->i_lock); | ||
1274 | shmem_falloc = inode->i_private; | ||
1275 | if (shmem_falloc && | ||
1276 | shmem_falloc->waitq && | ||
1277 | vmf->pgoff >= shmem_falloc->start && | ||
1278 | vmf->pgoff < shmem_falloc->next) { | ||
1279 | wait_queue_head_t *shmem_falloc_waitq; | ||
1280 | DEFINE_WAIT(shmem_fault_wait); | ||
1281 | |||
1282 | ret = VM_FAULT_NOPAGE; | ||
1283 | if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && | ||
1284 | !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { | ||
1285 | /* It's polite to up mmap_sem if we can */ | ||
1286 | up_read(&vma->vm_mm->mmap_sem); | ||
1287 | ret = VM_FAULT_RETRY; | ||
1288 | } | ||
1289 | |||
1290 | shmem_falloc_waitq = shmem_falloc->waitq; | ||
1291 | prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, | ||
1292 | TASK_UNINTERRUPTIBLE); | ||
1293 | spin_unlock(&inode->i_lock); | ||
1294 | schedule(); | ||
1295 | |||
1296 | /* | ||
1297 | * shmem_falloc_waitq points into the shmem_fallocate() | ||
1298 | * stack of the hole-punching task: shmem_falloc_waitq | ||
1299 | * is usually invalid by the time we reach here, but | ||
1300 | * finish_wait() does not dereference it in that case; | ||
1301 | * though i_lock needed lest racing with wake_up_all(). | ||
1302 | */ | ||
1303 | spin_lock(&inode->i_lock); | ||
1304 | finish_wait(shmem_falloc_waitq, &shmem_fault_wait); | ||
1305 | spin_unlock(&inode->i_lock); | ||
1306 | return ret; | ||
1307 | } | ||
1308 | spin_unlock(&inode->i_lock); | ||
1309 | } | ||
1310 | |||
1236 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); | 1311 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); |
1237 | if (error) | 1312 | if (error) |
1238 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); | 1313 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); |
@@ -1372,13 +1447,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping, | |||
1372 | loff_t pos, unsigned len, unsigned flags, | 1447 | loff_t pos, unsigned len, unsigned flags, |
1373 | struct page **pagep, void **fsdata) | 1448 | struct page **pagep, void **fsdata) |
1374 | { | 1449 | { |
1375 | int ret; | ||
1376 | struct inode *inode = mapping->host; | 1450 | struct inode *inode = mapping->host; |
1377 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 1451 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
1378 | ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); | 1452 | return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); |
1379 | if (ret == 0 && *pagep) | ||
1380 | init_page_accessed(*pagep); | ||
1381 | return ret; | ||
1382 | } | 1453 | } |
1383 | 1454 | ||
1384 | static int | 1455 | static int |
@@ -1724,18 +1795,34 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, | |||
1724 | pgoff_t start, index, end; | 1795 | pgoff_t start, index, end; |
1725 | int error; | 1796 | int error; |
1726 | 1797 | ||
1798 | if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) | ||
1799 | return -EOPNOTSUPP; | ||
1800 | |||
1727 | mutex_lock(&inode->i_mutex); | 1801 | mutex_lock(&inode->i_mutex); |
1728 | 1802 | ||
1729 | if (mode & FALLOC_FL_PUNCH_HOLE) { | 1803 | if (mode & FALLOC_FL_PUNCH_HOLE) { |
1730 | struct address_space *mapping = file->f_mapping; | 1804 | struct address_space *mapping = file->f_mapping; |
1731 | loff_t unmap_start = round_up(offset, PAGE_SIZE); | 1805 | loff_t unmap_start = round_up(offset, PAGE_SIZE); |
1732 | loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; | 1806 | loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; |
1807 | DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); | ||
1808 | |||
1809 | shmem_falloc.waitq = &shmem_falloc_waitq; | ||
1810 | shmem_falloc.start = unmap_start >> PAGE_SHIFT; | ||
1811 | shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; | ||
1812 | spin_lock(&inode->i_lock); | ||
1813 | inode->i_private = &shmem_falloc; | ||
1814 | spin_unlock(&inode->i_lock); | ||
1733 | 1815 | ||
1734 | if ((u64)unmap_end > (u64)unmap_start) | 1816 | if ((u64)unmap_end > (u64)unmap_start) |
1735 | unmap_mapping_range(mapping, unmap_start, | 1817 | unmap_mapping_range(mapping, unmap_start, |
1736 | 1 + unmap_end - unmap_start, 0); | 1818 | 1 + unmap_end - unmap_start, 0); |
1737 | shmem_truncate_range(inode, offset, offset + len - 1); | 1819 | shmem_truncate_range(inode, offset, offset + len - 1); |
1738 | /* No need to unmap again: hole-punching leaves COWed pages */ | 1820 | /* No need to unmap again: hole-punching leaves COWed pages */ |
1821 | |||
1822 | spin_lock(&inode->i_lock); | ||
1823 | inode->i_private = NULL; | ||
1824 | wake_up_all(&shmem_falloc_waitq); | ||
1825 | spin_unlock(&inode->i_lock); | ||
1739 | error = 0; | 1826 | error = 0; |
1740 | goto out; | 1827 | goto out; |
1741 | } | 1828 | } |
@@ -1753,6 +1840,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, | |||
1753 | goto out; | 1840 | goto out; |
1754 | } | 1841 | } |
1755 | 1842 | ||
1843 | shmem_falloc.waitq = NULL; | ||
1756 | shmem_falloc.start = start; | 1844 | shmem_falloc.start = start; |
1757 | shmem_falloc.next = start; | 1845 | shmem_falloc.next = start; |
1758 | shmem_falloc.nr_falloced = 0; | 1846 | shmem_falloc.nr_falloced = 0; |