Diffstat (limited to 'mm/filemap.c')
 mm/filemap.c | 766
 1 file changed, 537 insertions(+), 229 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 15c8413ee929..c6049e947cd9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -30,7 +30,7 @@
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/cpuset.h> 32#include <linux/cpuset.h>
33#include "filemap.h" 33#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
34#include "internal.h" 34#include "internal.h"
35 35
36/* 36/*
@@ -593,7 +593,7 @@ void fastcall __lock_page_nosync(struct page *page)
593 * Is there a pagecache struct page at the given (mapping, offset) tuple? 593 * Is there a pagecache struct page at the given (mapping, offset) tuple?
594 * If yes, increment its refcount and return it; if no, return NULL. 594 * If yes, increment its refcount and return it; if no, return NULL.
595 */ 595 */
596struct page * find_get_page(struct address_space *mapping, unsigned long offset) 596struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
597{ 597{
598 struct page *page; 598 struct page *page;
599 599
@@ -617,30 +617,31 @@ EXPORT_SYMBOL(find_get_page);
617 * Returns zero if the page was not present. find_lock_page() may sleep. 617 * Returns zero if the page was not present. find_lock_page() may sleep.
618 */ 618 */
619struct page *find_lock_page(struct address_space *mapping, 619struct page *find_lock_page(struct address_space *mapping,
620 unsigned long offset) 620 pgoff_t offset)
621{ 621{
622 struct page *page; 622 struct page *page;
623 623
624 read_lock_irq(&mapping->tree_lock);
625repeat: 624repeat:
625 read_lock_irq(&mapping->tree_lock);
626 page = radix_tree_lookup(&mapping->page_tree, offset); 626 page = radix_tree_lookup(&mapping->page_tree, offset);
627 if (page) { 627 if (page) {
628 page_cache_get(page); 628 page_cache_get(page);
629 if (TestSetPageLocked(page)) { 629 if (TestSetPageLocked(page)) {
630 read_unlock_irq(&mapping->tree_lock); 630 read_unlock_irq(&mapping->tree_lock);
631 __lock_page(page); 631 __lock_page(page);
632 read_lock_irq(&mapping->tree_lock);
633 632
634 /* Has the page been truncated while we slept? */ 633 /* Has the page been truncated while we slept? */
635 if (unlikely(page->mapping != mapping || 634 if (unlikely(page->mapping != mapping)) {
636 page->index != offset)) {
637 unlock_page(page); 635 unlock_page(page);
638 page_cache_release(page); 636 page_cache_release(page);
639 goto repeat; 637 goto repeat;
640 } 638 }
639 VM_BUG_ON(page->index != offset);
640 goto out;
641 } 641 }
642 } 642 }
643 read_unlock_irq(&mapping->tree_lock); 643 read_unlock_irq(&mapping->tree_lock);
644out:
644 return page; 645 return page;
645} 646}
646EXPORT_SYMBOL(find_lock_page); 647EXPORT_SYMBOL(find_lock_page);
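For context, a hedged caller-side sketch of find_lock_page() after this change: the truncation recheck now lives entirely inside the helper, so a caller only ever sees a locked page that still belongs to the mapping. The function name below is illustrative and not part of this patch; it assumes <linux/pagemap.h> and <linux/mm.h>.

static int touch_cached_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

	page = find_lock_page(mapping, index);
	if (!page)
		return -ENOENT;		/* not present in the pagecache */

	/* Page is locked and still attached to 'mapping' at this point. */
	set_page_dirty(page);
	unlock_page(page);
	page_cache_release(page);
	return 0;
}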
@@ -663,29 +664,24 @@ EXPORT_SYMBOL(find_lock_page);
663 * memory exhaustion. 664 * memory exhaustion.
664 */ 665 */
665struct page *find_or_create_page(struct address_space *mapping, 666struct page *find_or_create_page(struct address_space *mapping,
666 unsigned long index, gfp_t gfp_mask) 667 pgoff_t index, gfp_t gfp_mask)
667{ 668{
668 struct page *page, *cached_page = NULL; 669 struct page *page;
669 int err; 670 int err;
670repeat: 671repeat:
671 page = find_lock_page(mapping, index); 672 page = find_lock_page(mapping, index);
672 if (!page) { 673 if (!page) {
673 if (!cached_page) { 674 page = __page_cache_alloc(gfp_mask);
674 cached_page = 675 if (!page)
675 __page_cache_alloc(gfp_mask); 676 return NULL;
676 if (!cached_page) 677 err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
677 return NULL; 678 if (unlikely(err)) {
679 page_cache_release(page);
680 page = NULL;
681 if (err == -EEXIST)
682 goto repeat;
678 } 683 }
679 err = add_to_page_cache_lru(cached_page, mapping,
680 index, gfp_mask);
681 if (!err) {
682 page = cached_page;
683 cached_page = NULL;
684 } else if (err == -EEXIST)
685 goto repeat;
686 } 684 }
687 if (cached_page)
688 page_cache_release(cached_page);
689 return page; 685 return page;
690} 686}
691EXPORT_SYMBOL(find_or_create_page); 687EXPORT_SYMBOL(find_or_create_page);
@@ -797,7 +793,7 @@ EXPORT_SYMBOL(find_get_pages_tag);
797 * and deadlock against the caller's locked page. 793 * and deadlock against the caller's locked page.
798 */ 794 */
799struct page * 795struct page *
800grab_cache_page_nowait(struct address_space *mapping, unsigned long index) 796grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
801{ 797{
802 struct page *page = find_get_page(mapping, index); 798 struct page *page = find_get_page(mapping, index);
803 799
@@ -859,34 +855,29 @@ static void shrink_readahead_size_eio(struct file *filp,
859 * It may be NULL. 855 * It may be NULL.
860 */ 856 */
861void do_generic_mapping_read(struct address_space *mapping, 857void do_generic_mapping_read(struct address_space *mapping,
862 struct file_ra_state *_ra, 858 struct file_ra_state *ra,
863 struct file *filp, 859 struct file *filp,
864 loff_t *ppos, 860 loff_t *ppos,
865 read_descriptor_t *desc, 861 read_descriptor_t *desc,
866 read_actor_t actor) 862 read_actor_t actor)
867{ 863{
868 struct inode *inode = mapping->host; 864 struct inode *inode = mapping->host;
869 unsigned long index; 865 pgoff_t index;
870 unsigned long offset; 866 pgoff_t last_index;
871 unsigned long last_index; 867 pgoff_t prev_index;
872 unsigned long next_index; 868 unsigned long offset; /* offset into pagecache page */
873 unsigned long prev_index;
874 unsigned int prev_offset; 869 unsigned int prev_offset;
875 struct page *cached_page;
876 int error; 870 int error;
877 struct file_ra_state ra = *_ra;
878 871
879 cached_page = NULL;
880 index = *ppos >> PAGE_CACHE_SHIFT; 872 index = *ppos >> PAGE_CACHE_SHIFT;
881 next_index = index; 873 prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
882 prev_index = ra.prev_index; 874 prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
883 prev_offset = ra.prev_offset;
884 last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 875 last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
885 offset = *ppos & ~PAGE_CACHE_MASK; 876 offset = *ppos & ~PAGE_CACHE_MASK;
886 877
887 for (;;) { 878 for (;;) {
888 struct page *page; 879 struct page *page;
889 unsigned long end_index; 880 pgoff_t end_index;
890 loff_t isize; 881 loff_t isize;
891 unsigned long nr, ret; 882 unsigned long nr, ret;
892 883
@@ -895,7 +886,7 @@ find_page:
895 page = find_get_page(mapping, index); 886 page = find_get_page(mapping, index);
896 if (!page) { 887 if (!page) {
897 page_cache_sync_readahead(mapping, 888 page_cache_sync_readahead(mapping,
898 &ra, filp, 889 ra, filp,
899 index, last_index - index); 890 index, last_index - index);
900 page = find_get_page(mapping, index); 891 page = find_get_page(mapping, index);
901 if (unlikely(page == NULL)) 892 if (unlikely(page == NULL))
@@ -903,7 +894,7 @@ find_page:
903 } 894 }
904 if (PageReadahead(page)) { 895 if (PageReadahead(page)) {
905 page_cache_async_readahead(mapping, 896 page_cache_async_readahead(mapping,
906 &ra, filp, page, 897 ra, filp, page,
907 index, last_index - index); 898 index, last_index - index);
908 } 899 }
909 if (!PageUptodate(page)) 900 if (!PageUptodate(page))
@@ -966,7 +957,6 @@ page_ok:
966 index += offset >> PAGE_CACHE_SHIFT; 957 index += offset >> PAGE_CACHE_SHIFT;
967 offset &= ~PAGE_CACHE_MASK; 958 offset &= ~PAGE_CACHE_MASK;
968 prev_offset = offset; 959 prev_offset = offset;
969 ra.prev_offset = offset;
970 960
971 page_cache_release(page); 961 page_cache_release(page);
972 if (ret == nr && desc->count) 962 if (ret == nr && desc->count)
@@ -1015,7 +1005,7 @@ readpage:
1015 } 1005 }
1016 unlock_page(page); 1006 unlock_page(page);
1017 error = -EIO; 1007 error = -EIO;
1018 shrink_readahead_size_eio(filp, &ra); 1008 shrink_readahead_size_eio(filp, ra);
1019 goto readpage_error; 1009 goto readpage_error;
1020 } 1010 }
1021 unlock_page(page); 1011 unlock_page(page);
@@ -1034,33 +1024,29 @@ no_cached_page:
1034 * Ok, it wasn't cached, so we need to create a new 1024 * Ok, it wasn't cached, so we need to create a new
1035 * page.. 1025 * page..
1036 */ 1026 */
1037 if (!cached_page) { 1027 page = page_cache_alloc_cold(mapping);
1038 cached_page = page_cache_alloc_cold(mapping); 1028 if (!page) {
1039 if (!cached_page) { 1029 desc->error = -ENOMEM;
1040 desc->error = -ENOMEM; 1030 goto out;
1041 goto out;
1042 }
1043 } 1031 }
1044 error = add_to_page_cache_lru(cached_page, mapping, 1032 error = add_to_page_cache_lru(page, mapping,
1045 index, GFP_KERNEL); 1033 index, GFP_KERNEL);
1046 if (error) { 1034 if (error) {
1035 page_cache_release(page);
1047 if (error == -EEXIST) 1036 if (error == -EEXIST)
1048 goto find_page; 1037 goto find_page;
1049 desc->error = error; 1038 desc->error = error;
1050 goto out; 1039 goto out;
1051 } 1040 }
1052 page = cached_page;
1053 cached_page = NULL;
1054 goto readpage; 1041 goto readpage;
1055 } 1042 }
1056 1043
1057out: 1044out:
1058 *_ra = ra; 1045 ra->prev_pos = prev_index;
1059 _ra->prev_index = prev_index; 1046 ra->prev_pos <<= PAGE_CACHE_SHIFT;
1047 ra->prev_pos |= prev_offset;
1060 1048
1061 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; 1049 *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
1062 if (cached_page)
1063 page_cache_release(cached_page);
1064 if (filp) 1050 if (filp)
1065 file_accessed(filp); 1051 file_accessed(filp);
1066} 1052}
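The ra->prev_pos bookkeeping above folds the old prev_index/prev_offset pair into a single byte position. A hedged sketch of that encoding, with illustrative helper names that are not part of the patch:

static inline loff_t pack_prev_pos(pgoff_t index, unsigned int offset)
{
	/* Same encoding as the 'out:' path above. */
	return ((loff_t)index << PAGE_CACHE_SHIFT) | offset;
}

static inline pgoff_t prev_pos_index(loff_t prev_pos)
{
	return prev_pos >> PAGE_CACHE_SHIFT;
}

static inline unsigned int prev_pos_offset(loff_t prev_pos)
{
	return prev_pos & (PAGE_CACHE_SIZE - 1);
}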
@@ -1220,7 +1206,7 @@ EXPORT_SYMBOL(generic_file_aio_read);
1220 1206
1221static ssize_t 1207static ssize_t
1222do_readahead(struct address_space *mapping, struct file *filp, 1208do_readahead(struct address_space *mapping, struct file *filp,
1223 unsigned long index, unsigned long nr) 1209 pgoff_t index, unsigned long nr)
1224{ 1210{
1225 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) 1211 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1226 return -EINVAL; 1212 return -EINVAL;
@@ -1240,8 +1226,8 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1240 if (file) { 1226 if (file) {
1241 if (file->f_mode & FMODE_READ) { 1227 if (file->f_mode & FMODE_READ) {
1242 struct address_space *mapping = file->f_mapping; 1228 struct address_space *mapping = file->f_mapping;
1243 unsigned long start = offset >> PAGE_CACHE_SHIFT; 1229 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
1244 unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT; 1230 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1245 unsigned long len = end - start + 1; 1231 unsigned long len = end - start + 1;
1246 ret = do_readahead(mapping, file, start, len); 1232 ret = do_readahead(mapping, file, start, len);
1247 } 1233 }
@@ -1251,7 +1237,6 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1251} 1237}
1252 1238
1253#ifdef CONFIG_MMU 1239#ifdef CONFIG_MMU
1254static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1255/** 1240/**
1256 * page_cache_read - adds requested page to the page cache if not already there 1241 * page_cache_read - adds requested page to the page cache if not already there
1257 * @file: file to read 1242 * @file: file to read
@@ -1260,7 +1245,7 @@ static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1260 * This adds the requested page to the page cache if it isn't already there, 1245 * This adds the requested page to the page cache if it isn't already there,
1261 * and schedules an I/O to read in its contents from disk. 1246 * and schedules an I/O to read in its contents from disk.
1262 */ 1247 */
1263static int fastcall page_cache_read(struct file * file, unsigned long offset) 1248static int fastcall page_cache_read(struct file * file, pgoff_t offset)
1264{ 1249{
1265 struct address_space *mapping = file->f_mapping; 1250 struct address_space *mapping = file->f_mapping;
1266 struct page *page; 1251 struct page *page;
@@ -1349,7 +1334,7 @@ retry_find:
1349 * Do we miss much more than hit in this file? If so, 1334 * Do we miss much more than hit in this file? If so,
1350 * stop bothering with read-ahead. It will only hurt. 1335 * stop bothering with read-ahead. It will only hurt.
1351 */ 1336 */
1352 if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) 1337 if (ra->mmap_miss > MMAP_LOTSAMISS)
1353 goto no_cached_page; 1338 goto no_cached_page;
1354 1339
1355 /* 1340 /*
@@ -1375,7 +1360,7 @@ retry_find:
1375 } 1360 }
1376 1361
1377 if (!did_readaround) 1362 if (!did_readaround)
1378 ra->mmap_hit++; 1363 ra->mmap_miss--;
1379 1364
1380 /* 1365 /*
1381 * We have a locked page in the page cache, now we need to check 1366 * We have a locked page in the page cache, now we need to check
@@ -1396,7 +1381,7 @@ retry_find:
1396 * Found the page and have a reference on it. 1381 * Found the page and have a reference on it.
1397 */ 1382 */
1398 mark_page_accessed(page); 1383 mark_page_accessed(page);
1399 ra->prev_index = page->index; 1384 ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
1400 vmf->page = page; 1385 vmf->page = page;
1401 return ret | VM_FAULT_LOCKED; 1386 return ret | VM_FAULT_LOCKED;
1402 1387
@@ -1501,39 +1486,32 @@ EXPORT_SYMBOL(generic_file_mmap);
1501EXPORT_SYMBOL(generic_file_readonly_mmap); 1486EXPORT_SYMBOL(generic_file_readonly_mmap);
1502 1487
1503static struct page *__read_cache_page(struct address_space *mapping, 1488static struct page *__read_cache_page(struct address_space *mapping,
1504 unsigned long index, 1489 pgoff_t index,
1505 int (*filler)(void *,struct page*), 1490 int (*filler)(void *,struct page*),
1506 void *data) 1491 void *data)
1507{ 1492{
1508 struct page *page, *cached_page = NULL; 1493 struct page *page;
1509 int err; 1494 int err;
1510repeat: 1495repeat:
1511 page = find_get_page(mapping, index); 1496 page = find_get_page(mapping, index);
1512 if (!page) { 1497 if (!page) {
1513 if (!cached_page) { 1498 page = page_cache_alloc_cold(mapping);
1514 cached_page = page_cache_alloc_cold(mapping); 1499 if (!page)
1515 if (!cached_page) 1500 return ERR_PTR(-ENOMEM);
1516 return ERR_PTR(-ENOMEM); 1501 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
1517 } 1502 if (unlikely(err)) {
1518 err = add_to_page_cache_lru(cached_page, mapping, 1503 page_cache_release(page);
1519 index, GFP_KERNEL); 1504 if (err == -EEXIST)
1520 if (err == -EEXIST) 1505 goto repeat;
1521 goto repeat;
1522 if (err < 0) {
1523 /* Presumably ENOMEM for radix tree node */ 1506 /* Presumably ENOMEM for radix tree node */
1524 page_cache_release(cached_page);
1525 return ERR_PTR(err); 1507 return ERR_PTR(err);
1526 } 1508 }
1527 page = cached_page;
1528 cached_page = NULL;
1529 err = filler(data, page); 1509 err = filler(data, page);
1530 if (err < 0) { 1510 if (err < 0) {
1531 page_cache_release(page); 1511 page_cache_release(page);
1532 page = ERR_PTR(err); 1512 page = ERR_PTR(err);
1533 } 1513 }
1534 } 1514 }
1535 if (cached_page)
1536 page_cache_release(cached_page);
1537 return page; 1515 return page;
1538} 1516}
1539 1517
@@ -1542,7 +1520,7 @@ repeat:
1542 * after submitting it to the filler. 1520 * after submitting it to the filler.
1543 */ 1521 */
1544struct page *read_cache_page_async(struct address_space *mapping, 1522struct page *read_cache_page_async(struct address_space *mapping,
1545 unsigned long index, 1523 pgoff_t index,
1546 int (*filler)(void *,struct page*), 1524 int (*filler)(void *,struct page*),
1547 void *data) 1525 void *data)
1548{ 1526{
@@ -1590,7 +1568,7 @@ EXPORT_SYMBOL(read_cache_page_async);
1590 * If the page does not get brought uptodate, return -EIO. 1568 * If the page does not get brought uptodate, return -EIO.
1591 */ 1569 */
1592struct page *read_cache_page(struct address_space *mapping, 1570struct page *read_cache_page(struct address_space *mapping,
1593 unsigned long index, 1571 pgoff_t index,
1594 int (*filler)(void *,struct page*), 1572 int (*filler)(void *,struct page*),
1595 void *data) 1573 void *data)
1596{ 1574{
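A hedged usage sketch for the read_cache_page() signature above: many callers pass the mapping's own ->readpage as the filler, casting it to filler_t in the same way read_mapping_page() does. The wrapper name here is illustrative.

static struct page *get_mapping_page(struct address_space *mapping,
				     pgoff_t index)
{
	struct page *page;

	page = read_cache_page(mapping, index,
			       (filler_t *)mapping->a_ops->readpage, NULL);
	if (!IS_ERR(page))
		mark_page_accessed(page);
	return page;
}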
@@ -1610,40 +1588,6 @@ struct page *read_cache_page(struct address_space *mapping,
1610EXPORT_SYMBOL(read_cache_page); 1588EXPORT_SYMBOL(read_cache_page);
1611 1589
1612/* 1590/*
1613 * If the page was newly created, increment its refcount and add it to the
1614 * caller's lru-buffering pagevec. This function is specifically for
1615 * generic_file_write().
1616 */
1617static inline struct page *
1618__grab_cache_page(struct address_space *mapping, unsigned long index,
1619 struct page **cached_page, struct pagevec *lru_pvec)
1620{
1621 int err;
1622 struct page *page;
1623repeat:
1624 page = find_lock_page(mapping, index);
1625 if (!page) {
1626 if (!*cached_page) {
1627 *cached_page = page_cache_alloc(mapping);
1628 if (!*cached_page)
1629 return NULL;
1630 }
1631 err = add_to_page_cache(*cached_page, mapping,
1632 index, GFP_KERNEL);
1633 if (err == -EEXIST)
1634 goto repeat;
1635 if (err == 0) {
1636 page = *cached_page;
1637 page_cache_get(page);
1638 if (!pagevec_add(lru_pvec, page))
1639 __pagevec_lru_add(lru_pvec);
1640 *cached_page = NULL;
1641 }
1642 }
1643 return page;
1644}
1645
1646/*
1647 * The logic we want is 1591 * The logic we want is
1648 * 1592 *
1649 * if suid or (sgid and xgrp) 1593 * if suid or (sgid and xgrp)
@@ -1691,8 +1635,7 @@ int remove_suid(struct dentry *dentry)
1691} 1635}
1692EXPORT_SYMBOL(remove_suid); 1636EXPORT_SYMBOL(remove_suid);
1693 1637
1694size_t 1638static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1695__filemap_copy_from_user_iovec_inatomic(char *vaddr,
1696 const struct iovec *iov, size_t base, size_t bytes) 1639 const struct iovec *iov, size_t base, size_t bytes)
1697{ 1640{
1698 size_t copied = 0, left = 0; 1641 size_t copied = 0, left = 0;
@@ -1715,6 +1658,124 @@ __filemap_copy_from_user_iovec_inatomic(char *vaddr,
1715} 1658}
1716 1659
1717/* 1660/*
1661 * Copy as much as we can into the page and return the number of bytes which
1662 * were successfully copied. If a fault is encountered then return the number of
1663 * bytes which were copied.
1664 */
1665size_t iov_iter_copy_from_user_atomic(struct page *page,
1666 struct iov_iter *i, unsigned long offset, size_t bytes)
1667{
1668 char *kaddr;
1669 size_t copied;
1670
1671 BUG_ON(!in_atomic());
1672 kaddr = kmap_atomic(page, KM_USER0);
1673 if (likely(i->nr_segs == 1)) {
1674 int left;
1675 char __user *buf = i->iov->iov_base + i->iov_offset;
1676 left = __copy_from_user_inatomic_nocache(kaddr + offset,
1677 buf, bytes);
1678 copied = bytes - left;
1679 } else {
1680 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1681 i->iov, i->iov_offset, bytes);
1682 }
1683 kunmap_atomic(kaddr, KM_USER0);
1684
1685 return copied;
1686}
1687EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
1688
1689/*
1690 * This has the same side effects and return value as
1691 * iov_iter_copy_from_user_atomic().
1692 * The difference is that it attempts to resolve faults.
1693 * Page must not be locked.
1694 */
1695size_t iov_iter_copy_from_user(struct page *page,
1696 struct iov_iter *i, unsigned long offset, size_t bytes)
1697{
1698 char *kaddr;
1699 size_t copied;
1700
1701 kaddr = kmap(page);
1702 if (likely(i->nr_segs == 1)) {
1703 int left;
1704 char __user *buf = i->iov->iov_base + i->iov_offset;
1705 left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
1706 copied = bytes - left;
1707 } else {
1708 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1709 i->iov, i->iov_offset, bytes);
1710 }
1711 kunmap(page);
1712 return copied;
1713}
1714EXPORT_SYMBOL(iov_iter_copy_from_user);
1715
1716static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes)
1717{
1718 if (likely(i->nr_segs == 1)) {
1719 i->iov_offset += bytes;
1720 } else {
1721 const struct iovec *iov = i->iov;
1722 size_t base = i->iov_offset;
1723
1724 while (bytes) {
1725 int copy = min(bytes, iov->iov_len - base);
1726
1727 bytes -= copy;
1728 base += copy;
1729 if (iov->iov_len == base) {
1730 iov++;
1731 base = 0;
1732 }
1733 }
1734 i->iov = iov;
1735 i->iov_offset = base;
1736 }
1737}
1738
1739void iov_iter_advance(struct iov_iter *i, size_t bytes)
1740{
1741 BUG_ON(i->count < bytes);
1742
1743 __iov_iter_advance_iov(i, bytes);
1744 i->count -= bytes;
1745}
1746EXPORT_SYMBOL(iov_iter_advance);
1747
1748/*
1749 * Fault in the first iovec of the given iov_iter, to a maximum length
1750 * of bytes. Returns 0 on success, or non-zero if the memory could not be
1751 * accessed (ie. because it is an invalid address).
1752 *
1753 * writev-intensive code may want this to prefault several iovecs -- that
1754 * would be possible (callers must not rely on the fact that _only_ the
1755 * first iovec will be faulted with the current implementation).
1756 */
1757int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
1758{
1759 char __user *buf = i->iov->iov_base + i->iov_offset;
1760 bytes = min(bytes, i->iov->iov_len - i->iov_offset);
1761 return fault_in_pages_readable(buf, bytes);
1762}
1763EXPORT_SYMBOL(iov_iter_fault_in_readable);
1764
1765/*
1766 * Return the count of just the current iov_iter segment.
1767 */
1768size_t iov_iter_single_seg_count(struct iov_iter *i)
1769{
1770 const struct iovec *iov = i->iov;
1771 if (i->nr_segs == 1)
1772 return i->count;
1773 else
1774 return min(i->count, iov->iov_len - i->iov_offset);
1775}
1776EXPORT_SYMBOL(iov_iter_single_seg_count);
1777
1778/*
1718 * Performs necessary checks before doing a write 1779 * Performs necessary checks before doing a write
1719 * 1780 *
1720 * Can adjust writing position or amount of bytes to write. 1781 * Can adjust writing position or amount of bytes to write.
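To make the contract of the new iov_iter helpers concrete, here is a minimal, hedged sketch of the copy step they were designed for, mirroring the pattern generic_perform_write() uses further down. It assumes the iov_iter_init() declaration added to linux/fs.h elsewhere in this series; the function name is illustrative, and offset+bytes must stay within one page.

static size_t copy_iov_to_page(struct page *page, unsigned long offset,
			       const struct iovec *iov, unsigned long nr_segs,
			       size_t bytes)
{
	struct iov_iter i;
	size_t copied;

	iov_iter_init(&i, iov, nr_segs, bytes, 0);

	/* Prefault so the atomic copy below has a chance to make progress. */
	if (iov_iter_fault_in_readable(&i, bytes))
		return 0;

	/* Disallow pagefaults; iov_iter_copy_from_user_atomic() expects it. */
	pagefault_disable();
	copied = iov_iter_copy_from_user_atomic(page, &i, offset, bytes);
	pagefault_enable();

	iov_iter_advance(&i, copied);
	return copied;
}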
@@ -1796,6 +1857,91 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
1796} 1857}
1797EXPORT_SYMBOL(generic_write_checks); 1858EXPORT_SYMBOL(generic_write_checks);
1798 1859
1860int pagecache_write_begin(struct file *file, struct address_space *mapping,
1861 loff_t pos, unsigned len, unsigned flags,
1862 struct page **pagep, void **fsdata)
1863{
1864 const struct address_space_operations *aops = mapping->a_ops;
1865
1866 if (aops->write_begin) {
1867 return aops->write_begin(file, mapping, pos, len, flags,
1868 pagep, fsdata);
1869 } else {
1870 int ret;
1871 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1872 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
1873 struct inode *inode = mapping->host;
1874 struct page *page;
1875again:
1876 page = __grab_cache_page(mapping, index);
1877 *pagep = page;
1878 if (!page)
1879 return -ENOMEM;
1880
1881 if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
1882 /*
1883 * There is no way to resolve a short write situation
1884 * for a !Uptodate page (except by double copying in
1885 * the caller done by generic_perform_write_2copy).
1886 *
1887 * Instead, we have to bring it uptodate here.
1888 */
1889 ret = aops->readpage(file, page);
1890 page_cache_release(page);
1891 if (ret) {
1892 if (ret == AOP_TRUNCATED_PAGE)
1893 goto again;
1894 return ret;
1895 }
1896 goto again;
1897 }
1898
1899 ret = aops->prepare_write(file, page, offset, offset+len);
1900 if (ret) {
1901 unlock_page(page);
1902 page_cache_release(page);
1903 if (pos + len > inode->i_size)
1904 vmtruncate(inode, inode->i_size);
1905 }
1906 return ret;
1907 }
1908}
1909EXPORT_SYMBOL(pagecache_write_begin);
1910
1911int pagecache_write_end(struct file *file, struct address_space *mapping,
1912 loff_t pos, unsigned len, unsigned copied,
1913 struct page *page, void *fsdata)
1914{
1915 const struct address_space_operations *aops = mapping->a_ops;
1916 int ret;
1917
1918 if (aops->write_end) {
1919 mark_page_accessed(page);
1920 ret = aops->write_end(file, mapping, pos, len, copied,
1921 page, fsdata);
1922 } else {
1923 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
1924 struct inode *inode = mapping->host;
1925
1926 flush_dcache_page(page);
1927 ret = aops->commit_write(file, page, offset, offset+len);
1928 unlock_page(page);
1929 mark_page_accessed(page);
1930 page_cache_release(page);
1931
1932 if (ret < 0) {
1933 if (pos + len > inode->i_size)
1934 vmtruncate(inode, inode->i_size);
1935 } else if (ret > 0)
1936 ret = min_t(size_t, copied, ret);
1937 else
1938 ret = copied;
1939 }
1940
1941 return ret;
1942}
1943EXPORT_SYMBOL(pagecache_write_end);
1944
1799ssize_t 1945ssize_t
1800generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, 1946generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1801 unsigned long *nr_segs, loff_t pos, loff_t *ppos, 1947 unsigned long *nr_segs, loff_t pos, loff_t *ppos,
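A hedged caller-side sketch of the begin/end pair above, writing a small kernel buffer into the pagecache much as the loop driver conversion elsewhere in this series does. The function name and the single-shot error handling are illustrative; a real caller also loops on short writes, and pos/len must not cross a page boundary here.

static int write_kernel_buf(struct file *file, loff_t pos,
			    const char *buf, unsigned len)
{
	struct address_space *mapping = file->f_mapping;
	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
	struct page *page;
	void *fsdata;
	char *kaddr;
	int ret;

	ret = pagecache_write_begin(file, mapping, pos, len,
				    AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
	if (ret)
		return ret;

	kaddr = kmap_atomic(page, KM_USER0);
	memcpy(kaddr + offset, buf, len);
	kunmap_atomic(kaddr, KM_USER0);
	flush_dcache_page(page);

	ret = pagecache_write_end(file, mapping, pos, len, len, page, fsdata);
	return ret < 0 ? ret : 0;
}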
@@ -1835,151 +1981,314 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1835} 1981}
1836EXPORT_SYMBOL(generic_file_direct_write); 1982EXPORT_SYMBOL(generic_file_direct_write);
1837 1983
1838ssize_t 1984/*
1839generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, 1985 * Find or create a page at the given pagecache position. Return the locked
1840 unsigned long nr_segs, loff_t pos, loff_t *ppos, 1986 * page. This function is specifically for buffered writes.
1841 size_t count, ssize_t written) 1987 */
1988struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
1842{ 1989{
1843 struct file *file = iocb->ki_filp; 1990 int status;
1844 struct address_space * mapping = file->f_mapping; 1991 struct page *page;
1845 const struct address_space_operations *a_ops = mapping->a_ops; 1992repeat:
1846 struct inode *inode = mapping->host; 1993 page = find_lock_page(mapping, index);
1847 long status = 0; 1994 if (likely(page))
1848 struct page *page; 1995 return page;
1849 struct page *cached_page = NULL;
1850 size_t bytes;
1851 struct pagevec lru_pvec;
1852 const struct iovec *cur_iov = iov; /* current iovec */
1853 size_t iov_base = 0; /* offset in the current iovec */
1854 char __user *buf;
1855
1856 pagevec_init(&lru_pvec, 0);
1857 1996
1858 /* 1997 page = page_cache_alloc(mapping);
1859 * handle partial DIO write. Adjust cur_iov if needed. 1998 if (!page)
1860 */ 1999 return NULL;
1861 if (likely(nr_segs == 1)) 2000 status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
1862 buf = iov->iov_base + written; 2001 if (unlikely(status)) {
1863 else { 2002 page_cache_release(page);
1864 filemap_set_next_iovec(&cur_iov, &iov_base, written); 2003 if (status == -EEXIST)
1865 buf = cur_iov->iov_base + iov_base; 2004 goto repeat;
2005 return NULL;
1866 } 2006 }
2007 return page;
2008}
2009EXPORT_SYMBOL(__grab_cache_page);
2010
2011static ssize_t generic_perform_write_2copy(struct file *file,
2012 struct iov_iter *i, loff_t pos)
2013{
2014 struct address_space *mapping = file->f_mapping;
2015 const struct address_space_operations *a_ops = mapping->a_ops;
2016 struct inode *inode = mapping->host;
2017 long status = 0;
2018 ssize_t written = 0;
1867 2019
1868 do { 2020 do {
1869 unsigned long index; 2021 struct page *src_page;
1870 unsigned long offset; 2022 struct page *page;
1871 size_t copied; 2023 pgoff_t index; /* Pagecache index for current page */
2024 unsigned long offset; /* Offset into pagecache page */
2025 unsigned long bytes; /* Bytes to write to page */
2026 size_t copied; /* Bytes copied from user */
1872 2027
1873 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ 2028 offset = (pos & (PAGE_CACHE_SIZE - 1));
1874 index = pos >> PAGE_CACHE_SHIFT; 2029 index = pos >> PAGE_CACHE_SHIFT;
1875 bytes = PAGE_CACHE_SIZE - offset; 2030 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
1876 2031 iov_iter_count(i));
1877 /* Limit the size of the copy to the caller's write size */
1878 bytes = min(bytes, count);
1879 2032
1880 /* We only need to worry about prefaulting when writes are from 2033 /*
1881 * user-space. NFSd uses vfs_writev with several non-aligned 2034 * a non-NULL src_page indicates that we're doing the
1882 * segments in the vector, and limiting to one segment a time is 2035 * copy via get_user_pages and kmap.
1883 * a noticeable performance for re-write
1884 */ 2036 */
1885 if (!segment_eq(get_fs(), KERNEL_DS)) { 2037 src_page = NULL;
1886 /*
1887 * Limit the size of the copy to that of the current
1888 * segment, because fault_in_pages_readable() doesn't
1889 * know how to walk segments.
1890 */
1891 bytes = min(bytes, cur_iov->iov_len - iov_base);
1892 2038
1893 /* 2039 /*
1894 * Bring in the user page that we will copy from 2040 * Bring in the user page that we will copy from _first_.
1895 * _first_. Otherwise there's a nasty deadlock on 2041 * Otherwise there's a nasty deadlock on copying from the
1896 * copying from the same page as we're writing to, 2042 * same page as we're writing to, without it being marked
1897 * without it being marked up-to-date. 2043 * up-to-date.
1898 */ 2044 *
1899 fault_in_pages_readable(buf, bytes); 2045 * Not only is this an optimisation, but it is also required
2046 * to check that the address is actually valid, when atomic
2047 * usercopies are used, below.
2048 */
2049 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2050 status = -EFAULT;
2051 break;
1900 } 2052 }
1901 page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); 2053
2054 page = __grab_cache_page(mapping, index);
1902 if (!page) { 2055 if (!page) {
1903 status = -ENOMEM; 2056 status = -ENOMEM;
1904 break; 2057 break;
1905 } 2058 }
1906 2059
1907 if (unlikely(bytes == 0)) { 2060 /*
1908 status = 0; 2061 * non-uptodate pages cannot cope with short copies, and we
1909 copied = 0; 2062 * cannot take a pagefault with the destination page locked.
1910 goto zero_length_segment; 2063 * So pin the source page to copy it.
1911 } 2064 */
2065 if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) {
2066 unlock_page(page);
1912 2067
1913 status = a_ops->prepare_write(file, page, offset, offset+bytes); 2068 src_page = alloc_page(GFP_KERNEL);
1914 if (unlikely(status)) { 2069 if (!src_page) {
1915 loff_t isize = i_size_read(inode); 2070 page_cache_release(page);
2071 status = -ENOMEM;
2072 break;
2073 }
2074
2075 /*
2076 * Cannot get_user_pages with a page locked for the
2077 * same reason as we can't take a page fault with a
2078 * page locked (as explained below).
2079 */
2080 copied = iov_iter_copy_from_user(src_page, i,
2081 offset, bytes);
2082 if (unlikely(copied == 0)) {
2083 status = -EFAULT;
2084 page_cache_release(page);
2085 page_cache_release(src_page);
2086 break;
2087 }
2088 bytes = copied;
1916 2089
1917 if (status != AOP_TRUNCATED_PAGE) 2090 lock_page(page);
2091 /*
2092 * Can't handle the page going uptodate here, because
2093 * that means we would use non-atomic usercopies, which
2094 * zero out the tail of the page, which can cause
2095 * zeroes to become transiently visible. We could just
2096 * use a non-zeroing copy, but the APIs aren't too
2097 * consistent.
2098 */
2099 if (unlikely(!page->mapping || PageUptodate(page))) {
1918 unlock_page(page); 2100 unlock_page(page);
1919 page_cache_release(page); 2101 page_cache_release(page);
1920 if (status == AOP_TRUNCATED_PAGE) 2102 page_cache_release(src_page);
1921 continue; 2103 continue;
2104 }
2105 }
2106
2107 status = a_ops->prepare_write(file, page, offset, offset+bytes);
2108 if (unlikely(status))
2109 goto fs_write_aop_error;
2110
2111 if (!src_page) {
1922 /* 2112 /*
1923 * prepare_write() may have instantiated a few blocks 2113 * Must not enter the pagefault handler here, because
1924 * outside i_size. Trim these off again. 2114 * we hold the page lock, so we might recursively
2115 * deadlock on the same lock, or get an ABBA deadlock
2116 * against a different lock, or against the mmap_sem
2117 * (which nests outside the page lock). So increment
2118 * preempt count, and use _atomic usercopies.
2119 *
2120 * The page is uptodate so we are OK to encounter a
2121 * short copy: if unmodified parts of the page are
2122 * marked dirty and written out to disk, it doesn't
2123 * really matter.
1925 */ 2124 */
1926 if (pos + bytes > isize) 2125 pagefault_disable();
1927 vmtruncate(inode, isize); 2126 copied = iov_iter_copy_from_user_atomic(page, i,
1928 break; 2127 offset, bytes);
2128 pagefault_enable();
2129 } else {
2130 void *src, *dst;
2131 src = kmap_atomic(src_page, KM_USER0);
2132 dst = kmap_atomic(page, KM_USER1);
2133 memcpy(dst + offset, src + offset, bytes);
2134 kunmap_atomic(dst, KM_USER1);
2135 kunmap_atomic(src, KM_USER0);
2136 copied = bytes;
1929 } 2137 }
1930 if (likely(nr_segs == 1))
1931 copied = filemap_copy_from_user(page, offset,
1932 buf, bytes);
1933 else
1934 copied = filemap_copy_from_user_iovec(page, offset,
1935 cur_iov, iov_base, bytes);
1936 flush_dcache_page(page); 2138 flush_dcache_page(page);
2139
1937 status = a_ops->commit_write(file, page, offset, offset+bytes); 2140 status = a_ops->commit_write(file, page, offset, offset+bytes);
1938 if (status == AOP_TRUNCATED_PAGE) { 2141 if (unlikely(status < 0))
1939 page_cache_release(page); 2142 goto fs_write_aop_error;
1940 continue; 2143 if (unlikely(status > 0)) /* filesystem did partial write */
1941 } 2144 copied = min_t(size_t, copied, status);
1942zero_length_segment: 2145
1943 if (likely(copied >= 0)) {
1944 if (!status)
1945 status = copied;
1946
1947 if (status >= 0) {
1948 written += status;
1949 count -= status;
1950 pos += status;
1951 buf += status;
1952 if (unlikely(nr_segs > 1)) {
1953 filemap_set_next_iovec(&cur_iov,
1954 &iov_base, status);
1955 if (count)
1956 buf = cur_iov->iov_base +
1957 iov_base;
1958 } else {
1959 iov_base += status;
1960 }
1961 }
1962 }
1963 if (unlikely(copied != bytes))
1964 if (status >= 0)
1965 status = -EFAULT;
1966 unlock_page(page); 2146 unlock_page(page);
1967 mark_page_accessed(page); 2147 mark_page_accessed(page);
1968 page_cache_release(page); 2148 page_cache_release(page);
1969 if (status < 0) 2149 if (src_page)
1970 break; 2150 page_cache_release(src_page);
2151
2152 iov_iter_advance(i, copied);
2153 pos += copied;
2154 written += copied;
2155
1971 balance_dirty_pages_ratelimited(mapping); 2156 balance_dirty_pages_ratelimited(mapping);
1972 cond_resched(); 2157 cond_resched();
1973 } while (count); 2158 continue;
1974 *ppos = pos;
1975 2159
1976 if (cached_page) 2160fs_write_aop_error:
1977 page_cache_release(cached_page); 2161 unlock_page(page);
2162 page_cache_release(page);
2163 if (src_page)
2164 page_cache_release(src_page);
2165
2166 /*
2167 * prepare_write() may have instantiated a few blocks
2168 * outside i_size. Trim these off again. Don't need
2169 * i_size_read because we hold i_mutex.
2170 */
2171 if (pos + bytes > inode->i_size)
2172 vmtruncate(inode, inode->i_size);
2173 break;
2174 } while (iov_iter_count(i));
2175
2176 return written ? written : status;
2177}
2178
2179static ssize_t generic_perform_write(struct file *file,
2180 struct iov_iter *i, loff_t pos)
2181{
2182 struct address_space *mapping = file->f_mapping;
2183 const struct address_space_operations *a_ops = mapping->a_ops;
2184 long status = 0;
2185 ssize_t written = 0;
2186 unsigned int flags = 0;
1978 2187
1979 /* 2188 /*
1980 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC 2189 * Copies from kernel address space cannot fail (NFSD is a big user).
1981 */ 2190 */
2191 if (segment_eq(get_fs(), KERNEL_DS))
2192 flags |= AOP_FLAG_UNINTERRUPTIBLE;
2193
2194 do {
2195 struct page *page;
2196 pgoff_t index; /* Pagecache index for current page */
2197 unsigned long offset; /* Offset into pagecache page */
2198 unsigned long bytes; /* Bytes to write to page */
2199 size_t copied; /* Bytes copied from user */
2200 void *fsdata;
2201
2202 offset = (pos & (PAGE_CACHE_SIZE - 1));
2203 index = pos >> PAGE_CACHE_SHIFT;
2204 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2205 iov_iter_count(i));
2206
2207again:
2208
2209 /*
2210 * Bring in the user page that we will copy from _first_.
2211 * Otherwise there's a nasty deadlock on copying from the
2212 * same page as we're writing to, without it being marked
2213 * up-to-date.
2214 *
2215 * Not only is this an optimisation, but it is also required
2216 * to check that the address is actually valid, when atomic
2217 * usercopies are used, below.
2218 */
2219 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2220 status = -EFAULT;
2221 break;
2222 }
2223
2224 status = a_ops->write_begin(file, mapping, pos, bytes, flags,
2225 &page, &fsdata);
2226 if (unlikely(status))
2227 break;
2228
2229 pagefault_disable();
2230 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2231 pagefault_enable();
2232 flush_dcache_page(page);
2233
2234 status = a_ops->write_end(file, mapping, pos, bytes, copied,
2235 page, fsdata);
2236 if (unlikely(status < 0))
2237 break;
2238 copied = status;
2239
2240 cond_resched();
2241
2242 if (unlikely(copied == 0)) {
2243 /*
2244 * If we were unable to copy any data at all, we must
2245 * fall back to a single segment length write.
2246 *
2247 * If we didn't fallback here, we could livelock
2248 * because not all segments in the iov can be copied at
2249 * once without a pagefault.
2250 */
2251 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2252 iov_iter_single_seg_count(i));
2253 goto again;
2254 }
2255 iov_iter_advance(i, copied);
2256 pos += copied;
2257 written += copied;
2258
2259 balance_dirty_pages_ratelimited(mapping);
2260
2261 } while (iov_iter_count(i));
2262
2263 return written ? written : status;
2264}
2265
2266ssize_t
2267generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2268 unsigned long nr_segs, loff_t pos, loff_t *ppos,
2269 size_t count, ssize_t written)
2270{
2271 struct file *file = iocb->ki_filp;
2272 struct address_space *mapping = file->f_mapping;
2273 const struct address_space_operations *a_ops = mapping->a_ops;
2274 struct inode *inode = mapping->host;
2275 ssize_t status;
2276 struct iov_iter i;
2277
2278 iov_iter_init(&i, iov, nr_segs, count, written);
2279 if (a_ops->write_begin)
2280 status = generic_perform_write(file, &i, pos);
2281 else
2282 status = generic_perform_write_2copy(file, &i, pos);
2283
1982 if (likely(status >= 0)) { 2284 if (likely(status >= 0)) {
2285 written += status;
2286 *ppos = pos + status;
2287
2288 /*
2289 * For now, when the user asks for O_SYNC, we'll actually give
2290 * O_DSYNC
2291 */
1983 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2292 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
1984 if (!a_ops->writepage || !is_sync_kiocb(iocb)) 2293 if (!a_ops->writepage || !is_sync_kiocb(iocb))
1985 status = generic_osync_inode(inode, mapping, 2294 status = generic_osync_inode(inode, mapping,
@@ -1995,7 +2304,6 @@ zero_length_segment:
1995 if (unlikely(file->f_flags & O_DIRECT) && written) 2304 if (unlikely(file->f_flags & O_DIRECT) && written)
1996 status = filemap_write_and_wait(mapping); 2305 status = filemap_write_and_wait(mapping);
1997 2306
1998 pagevec_lru_add(&lru_pvec);
1999 return written ? written : status; 2307 return written ? written : status;
2000} 2308}
2001EXPORT_SYMBOL(generic_file_buffered_write); 2309EXPORT_SYMBOL(generic_file_buffered_write);
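Finally, a hedged sketch of what a trivial ->write_begin/->write_end pair for a memory-backed filesystem could look like on top of the exported __grab_cache_page(). This is an illustration of the new convention, not code from this series; the names are made up, and a disk-backed filesystem would additionally have to map blocks and cope with short copies into not-uptodate pages.

static int sketch_write_begin(struct file *file, struct address_space *mapping,
			      loff_t pos, unsigned len, unsigned flags,
			      struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *page;

	page = __grab_cache_page(mapping, index);
	if (!page)
		return -ENOMEM;

	if (!PageUptodate(page)) {
		/*
		 * Memory-backed: a new page only needs zeroing, which also
		 * makes a short copy in ->write_end harmless.
		 */
		clear_highpage(page);
		flush_dcache_page(page);
		SetPageUptodate(page);
	}
	*pagep = page;
	return 0;	/* page is returned locked, as write_begin requires */
}

static int sketch_write_end(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned copied,
			    struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;

	/* The generic write path holds i_mutex, so updating i_size is safe. */
	if (pos + copied > inode->i_size)
		i_size_write(inode, pos + copied);

	set_page_dirty(page);
	unlock_page(page);
	page_cache_release(page);
	return copied;
}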