Diffstat (limited to 'mm/shmem.c')
-rw-r--r--  mm/shmem.c  117
1 file changed, 77 insertions(+), 40 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index 8f419cff9e34..af68b15a8fc1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -85,7 +85,7 @@ static struct vfsmount *shm_mnt;
  * a time): we would prefer not to enlarge the shmem inode just for that.
  */
 struct shmem_falloc {
-	int mode;		/* FALLOC_FL mode currently operating */
+	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
 	pgoff_t start;		/* start of range currently being fallocated */
 	pgoff_t next;		/* the next page offset to be fallocated */
 	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
@@ -468,23 +468,20 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 		return;
 
 	index = start;
-	for ( ; ; ) {
+	while (index < end) {
 		cond_resched();
 
 		pvec.nr = find_get_entries(mapping, index,
 				min(end - index, (pgoff_t)PAGEVEC_SIZE),
 				pvec.pages, indices);
 		if (!pvec.nr) {
-			if (index == start || unfalloc)
+			/* If all gone or hole-punch or unfalloc, we're done */
+			if (index == start || end != -1)
 				break;
+			/* But if truncating, restart to make sure all gone */
 			index = start;
 			continue;
 		}
-		if ((index == start || unfalloc) && indices[0] >= end) {
-			pagevec_remove_exceptionals(&pvec);
-			pagevec_release(&pvec);
-			break;
-		}
 		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
@@ -496,8 +493,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			if (radix_tree_exceptional_entry(page)) {
 				if (unfalloc)
 					continue;
-				nr_swaps_freed += !shmem_free_swap(mapping,
-								index, page);
+				if (shmem_free_swap(mapping, index, page)) {
+					/* Swap was replaced by page: retry */
+					index--;
+					break;
+				}
+				nr_swaps_freed++;
 				continue;
 			}
 
@@ -506,6 +507,11 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 				if (page->mapping == mapping) {
 					VM_BUG_ON_PAGE(PageWriteback(page), page);
 					truncate_inode_page(mapping, page);
+				} else {
+					/* Page was replaced by swap: retry */
+					unlock_page(page);
+					index--;
+					break;
 				}
 			}
 			unlock_page(page);
@@ -760,7 +766,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 			spin_lock(&inode->i_lock);
 			shmem_falloc = inode->i_private;
 			if (shmem_falloc &&
-			    !shmem_falloc->mode &&
+			    !shmem_falloc->waitq &&
 			    index >= shmem_falloc->start &&
 			    index < shmem_falloc->next)
 				shmem_falloc->nr_unswapped++;
@@ -1029,6 +1035,9 @@ repeat:
 		goto failed;
 	}
 
+	if (page && sgp == SGP_WRITE)
+		mark_page_accessed(page);
+
 	/* fallocated page? */
 	if (page && !PageUptodate(page)) {
 		if (sgp != SGP_READ)
@@ -1110,6 +1119,9 @@ repeat:
 		shmem_recalc_inode(inode);
 		spin_unlock(&info->lock);
 
+		if (sgp == SGP_WRITE)
+			mark_page_accessed(page);
+
 		delete_from_swap_cache(page);
 		set_page_dirty(page);
 		swap_free(swap);
@@ -1136,6 +1148,9 @@ repeat:
 
 		__SetPageSwapBacked(page);
 		__set_page_locked(page);
+		if (sgp == SGP_WRITE)
+			init_page_accessed(page);
+
 		error = mem_cgroup_charge_file(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
 		if (error)
@@ -1239,38 +1254,58 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * Trinity finds that probing a hole which tmpfs is punching can
 	 * prevent the hole-punch from ever completing: which in turn
 	 * locks writers out with its hold on i_mutex. So refrain from
-	 * faulting pages into the hole while it's being punched, and
-	 * wait on i_mutex to be released if vmf->flags permits.
+	 * faulting pages into the hole while it's being punched. Although
+	 * shmem_undo_range() does remove the additions, it may be unable to
+	 * keep up, as each new page needs its own unmap_mapping_range() call,
+	 * and the i_mmap tree grows ever slower to scan if new vmas are added.
+	 *
+	 * It does not matter if we sometimes reach this check just before the
+	 * hole-punch begins, so that one fault then races with the punch:
+	 * we just need to make racing faults a rare case.
+	 *
+	 * The implementation below would be much simpler if we just used a
+	 * standard mutex or completion: but we cannot take i_mutex in fault,
+	 * and bloating every shmem inode for this unlikely case would be sad.
 	 */
 	if (unlikely(inode->i_private)) {
 		struct shmem_falloc *shmem_falloc;
 
 		spin_lock(&inode->i_lock);
 		shmem_falloc = inode->i_private;
-		if (!shmem_falloc ||
-		    shmem_falloc->mode != FALLOC_FL_PUNCH_HOLE ||
-		    vmf->pgoff < shmem_falloc->start ||
-		    vmf->pgoff >= shmem_falloc->next)
-			shmem_falloc = NULL;
-		spin_unlock(&inode->i_lock);
-		/*
-		 * i_lock has protected us from taking shmem_falloc seriously
-		 * once return from shmem_fallocate() went back up that stack.
-		 * i_lock does not serialize with i_mutex at all, but it does
-		 * not matter if sometimes we wait unnecessarily, or sometimes
-		 * miss out on waiting: we just need to make those cases rare.
-		 */
-		if (shmem_falloc) {
+		if (shmem_falloc &&
+		    shmem_falloc->waitq &&
+		    vmf->pgoff >= shmem_falloc->start &&
+		    vmf->pgoff < shmem_falloc->next) {
+			wait_queue_head_t *shmem_falloc_waitq;
+			DEFINE_WAIT(shmem_fault_wait);
+
+			ret = VM_FAULT_NOPAGE;
 			if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
 			   !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
+				/* It's polite to up mmap_sem if we can */
 				up_read(&vma->vm_mm->mmap_sem);
-				mutex_lock(&inode->i_mutex);
-				mutex_unlock(&inode->i_mutex);
-				return VM_FAULT_RETRY;
+				ret = VM_FAULT_RETRY;
 			}
-			/* cond_resched? Leave that to GUP or return to user */
-			return VM_FAULT_NOPAGE;
+
+			shmem_falloc_waitq = shmem_falloc->waitq;
+			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
+					TASK_UNINTERRUPTIBLE);
+			spin_unlock(&inode->i_lock);
+			schedule();
+
+			/*
+			 * shmem_falloc_waitq points into the shmem_fallocate()
+			 * stack of the hole-punching task: shmem_falloc_waitq
+			 * is usually invalid by the time we reach here, but
+			 * finish_wait() does not dereference it in that case;
+			 * though i_lock needed lest racing with wake_up_all().
+			 */
+			spin_lock(&inode->i_lock);
+			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
+			spin_unlock(&inode->i_lock);
+			return ret;
 		}
+		spin_unlock(&inode->i_lock);
 	}
 
 	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
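
The comment block and wait loop above are the fault-side half of a handshake that the shmem_fallocate() hunks below complete: the hole-puncher publishes, through inode->i_private, a wait queue head that lives on its own stack, and any fault landing in the range being punched sleeps on it until the punch is done. Below is a rough userspace model of that protocol only, written with pthreads rather than the kernel waitqueue API; the names (falloc_info, punch_hole, fault) are invented for the sketch, and the on-stack-waitqueue lifetime trick that finish_wait() makes safe in the kernel is deliberately not reproduced, so the condition variable here has static storage.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Models the fields of struct shmem_falloc that the fault path inspects. */
struct falloc_info {
	pthread_cond_t *waitq;
	long start, next;
};

/* Models the two inode fields involved: ->i_lock and ->i_private. */
static pthread_mutex_t i_lock = PTHREAD_MUTEX_INITIALIZER;
static struct falloc_info *i_private;

/*
 * Unlike the kernel patch, the wait object is static here: pthreads does not
 * promise that a condvar on another thread's stack stays usable the way
 * finish_wait() does for a waitqueue, so only the publish/check/wait/wake
 * protocol is modelled.
 */
static pthread_cond_t punch_waitq = PTHREAD_COND_INITIALIZER;

static void *punch_hole(void *arg)
{
	struct falloc_info info = { &punch_waitq, 100, 200 };

	pthread_mutex_lock(&i_lock);
	i_private = &info;			/* publish the range being punched */
	pthread_mutex_unlock(&i_lock);

	usleep(200 * 1000);			/* stand-in for the unmap + truncate work */

	pthread_mutex_lock(&i_lock);
	i_private = NULL;			/* retract before the on-stack info goes away */
	pthread_cond_broadcast(&punch_waitq);	/* models wake_up_all() */
	pthread_mutex_unlock(&i_lock);
	return NULL;
}

static void *fault(void *arg)
{
	long pgoff = (long)arg;

	pthread_mutex_lock(&i_lock);
	while (i_private && i_private->waitq &&
	       pgoff >= i_private->start && pgoff < i_private->next)
		pthread_cond_wait(i_private->waitq, &i_lock);	/* sleep until the punch ends */
	pthread_mutex_unlock(&i_lock);

	printf("fault at pgoff %ld proceeds\n", pgoff);
	return NULL;
}

int main(void)
{
	pthread_t p, f1, f2;

	pthread_create(&p, NULL, punch_hole, NULL);
	usleep(50 * 1000);		/* let the punch publish first; a race here is tolerable, just rare */
	pthread_create(&f1, NULL, fault, (void *)150L);	/* inside the hole: waits */
	pthread_create(&f2, NULL, fault, (void *)250L);	/* outside the hole: proceeds at once */
	pthread_join(p, NULL);
	pthread_join(f1, NULL);
	pthread_join(f2, NULL);
	return 0;
}

Built with -pthread, the fault at offset 150 normally waits for the broadcast while the one at 250 proceeds immediately, mirroring the start/next range check in shmem_fault().
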
@@ -1412,13 +1447,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	int ret;
 	struct inode *inode = mapping->host;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
-	if (ret == 0 && *pagep)
-		init_page_accessed(*pagep);
-	return ret;
+	return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
 }
 
 static int
@@ -1769,13 +1800,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 
 	mutex_lock(&inode->i_mutex);
 
-	shmem_falloc.mode = mode & ~FALLOC_FL_KEEP_SIZE;
-
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
 		struct address_space *mapping = file->f_mapping;
 		loff_t unmap_start = round_up(offset, PAGE_SIZE);
 		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
 
+		shmem_falloc.waitq = &shmem_falloc_waitq;
 		shmem_falloc.start = unmap_start >> PAGE_SHIFT;
 		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
 		spin_lock(&inode->i_lock);
@@ -1787,8 +1818,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 				    1 + unmap_end - unmap_start, 0);
 		shmem_truncate_range(inode, offset, offset + len - 1);
 		/* No need to unmap again: hole-punching leaves COWed pages */
+
+		spin_lock(&inode->i_lock);
+		inode->i_private = NULL;
+		wake_up_all(&shmem_falloc_waitq);
+		spin_unlock(&inode->i_lock);
 		error = 0;
-		goto undone;
+		goto out;
 	}
 
 	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
@@ -1804,6 +1840,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		goto out;
 	}
 
+	shmem_falloc.waitq = NULL;
 	shmem_falloc.start = start;
 	shmem_falloc.next = start;
 	shmem_falloc.nr_falloced = 0;
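
Note the ordering on the fallocate side: shmem_falloc.waitq is pointed at a wait queue head declared on shmem_fallocate()'s own stack and published via inode->i_private before the unmap and truncate of the hole; afterwards i_private is cleared and wake_up_all() is issued while i_lock is still held, so a concurrent fault either sees i_private already NULL or has queued itself under the same lock and is guaranteed to be woken. The preallocation path instead sets shmem_falloc.waitq to NULL, which is how shmem_writepage() and shmem_fault() distinguish a hole-punch in progress from an ordinary fallocate.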