path: root/mm/shmem.c
Diffstat (limited to 'mm/shmem.c')
-rw-r--r--	mm/shmem.c	564
1 file changed, 460 insertions(+), 104 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index f99ff3e50bd6..bd106361be4b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt;
 #include <linux/blkdev.h>
 #include <linux/pagevec.h>
 #include <linux/percpu_counter.h>
+#include <linux/falloc.h>
 #include <linux/splice.h>
 #include <linux/security.h>
 #include <linux/swapops.h>
@@ -83,12 +84,25 @@ struct shmem_xattr {
 	char value[0];
 };
 
+/*
+ * shmem_fallocate and shmem_writepage communicate via inode->i_private
+ * (with i_mutex making sure that it has only one user at a time):
+ * we would prefer not to enlarge the shmem inode just for that.
+ */
+struct shmem_falloc {
+	pgoff_t start;		/* start of range currently being fallocated */
+	pgoff_t next;		/* the next page offset to be fallocated */
+	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
+	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
+};
+
 /* Flag allocation requirements to shmem_getpage */
 enum sgp_type {
 	SGP_READ,	/* don't exceed i_size, don't allocate page */
 	SGP_CACHE,	/* don't exceed i_size, may allocate page */
 	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
-	SGP_WRITE,	/* may exceed i_size, may allocate page */
+	SGP_WRITE,	/* may exceed i_size, may allocate !Uptodate page */
+	SGP_FALLOC,	/* like SGP_WRITE, but make existing page Uptodate */
 };
 
 #ifdef CONFIG_TMPFS
@@ -103,6 +117,9 @@ static unsigned long shmem_default_max_inodes(void)
 }
 #endif
 
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+				struct shmem_inode_info *info, pgoff_t index);
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
 
@@ -247,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
 }
 
 /*
+ * Sometimes, before we decide whether to proceed or to fail, we must check
+ * that an entry was not already brought back from swap by a racing thread.
+ *
+ * Checking page is not enough: by the time a SwapCache page is locked, it
+ * might be reused, and again be SwapCache, using the same swap as before.
+ */
+static bool shmem_confirm_swap(struct address_space *mapping,
+			       pgoff_t index, swp_entry_t swap)
+{
+	void *item;
+
+	rcu_read_lock();
+	item = radix_tree_lookup(&mapping->page_tree, index);
+	rcu_read_unlock();
+	return item == swp_to_radix_entry(swap);
+}
+
+/*
  * Like add_to_page_cache_locked, but error if expected item has gone.
  */
 static int shmem_add_to_page_cache(struct page *page,
 				   struct address_space *mapping,
 				   pgoff_t index, gfp_t gfp, void *expected)
 {
-	int error = 0;
+	int error;
 
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(!PageSwapBacked(page));
 
+	page_cache_get(page);
+	page->mapping = mapping;
+	page->index = index;
+
+	spin_lock_irq(&mapping->tree_lock);
 	if (!expected)
-		error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+		error = radix_tree_insert(&mapping->page_tree, index, page);
+	else
+		error = shmem_radix_tree_replace(mapping, index, expected,
+								 page);
 	if (!error) {
-		page_cache_get(page);
-		page->mapping = mapping;
-		page->index = index;
-
-		spin_lock_irq(&mapping->tree_lock);
-		if (!expected)
-			error = radix_tree_insert(&mapping->page_tree,
-							index, page);
-		else
-			error = shmem_radix_tree_replace(mapping, index,
-							expected, page);
-		if (!error) {
-			mapping->nrpages++;
-			__inc_zone_page_state(page, NR_FILE_PAGES);
-			__inc_zone_page_state(page, NR_SHMEM);
-			spin_unlock_irq(&mapping->tree_lock);
-		} else {
-			page->mapping = NULL;
-			spin_unlock_irq(&mapping->tree_lock);
-			page_cache_release(page);
-		}
-		if (!expected)
-			radix_tree_preload_end();
+		mapping->nrpages++;
+		__inc_zone_page_state(page, NR_FILE_PAGES);
+		__inc_zone_page_state(page, NR_SHMEM);
+		spin_unlock_irq(&mapping->tree_lock);
+	} else {
+		page->mapping = NULL;
+		spin_unlock_irq(&mapping->tree_lock);
+		page_cache_release(page);
 	}
-	if (error)
-		mem_cgroup_uncharge_cache_page(page);
 	return error;
 }
 
@@ -423,27 +449,31 @@ void shmem_unlock_mapping(struct address_space *mapping)
 
 /*
  * Remove range of pages and swap entries from radix tree, and free them.
+ * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
  */
-void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+								 bool unfalloc)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
-	pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
+	pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
+	unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
+	unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
 	struct pagevec pvec;
 	pgoff_t indices[PAGEVEC_SIZE];
 	long nr_swaps_freed = 0;
 	pgoff_t index;
 	int i;
 
-	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+	if (lend == -1)
+		end = -1;	/* unsigned, so actually very big */
 
 	pagevec_init(&pvec, 0);
 	index = start;
-	while (index <= end) {
+	while (index < end) {
 		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE),
 							pvec.pages, indices);
 		if (!pvec.nr)
 			break;
@@ -452,10 +482,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 			struct page *page = pvec.pages[i];
 
 			index = indices[i];
-			if (index > end)
+			if (index >= end)
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
+				if (unfalloc)
+					continue;
 				nr_swaps_freed += !shmem_free_swap(mapping,
 								index, page);
 				continue;
@@ -463,9 +495,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 
 			if (!trylock_page(page))
 				continue;
-			if (page->mapping == mapping) {
-				VM_BUG_ON(PageWriteback(page));
-				truncate_inode_page(mapping, page);
+			if (!unfalloc || !PageUptodate(page)) {
+				if (page->mapping == mapping) {
+					VM_BUG_ON(PageWriteback(page));
+					truncate_inode_page(mapping, page);
+				}
 			}
 			unlock_page(page);
 		}
@@ -476,30 +510,47 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 		index++;
 	}
 
-	if (partial) {
+	if (partial_start) {
 		struct page *page = NULL;
 		shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
 		if (page) {
-			zero_user_segment(page, partial, PAGE_CACHE_SIZE);
+			unsigned int top = PAGE_CACHE_SIZE;
+			if (start > end) {
+				top = partial_end;
+				partial_end = 0;
+			}
+			zero_user_segment(page, partial_start, top);
 			set_page_dirty(page);
 			unlock_page(page);
 			page_cache_release(page);
 		}
 	}
+	if (partial_end) {
+		struct page *page = NULL;
+		shmem_getpage(inode, end, &page, SGP_READ, NULL);
+		if (page) {
+			zero_user_segment(page, 0, partial_end);
+			set_page_dirty(page);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+	}
+	if (start >= end)
+		return;
 
 	index = start;
 	for ( ; ; ) {
 		cond_resched();
 		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE),
 							pvec.pages, indices);
 		if (!pvec.nr) {
-			if (index == start)
+			if (index == start || unfalloc)
 				break;
 			index = start;
 			continue;
 		}
-		if (index == start && indices[0] > end) {
+		if ((index == start || unfalloc) && indices[0] >= end) {
 			shmem_deswap_pagevec(&pvec);
 			pagevec_release(&pvec);
 			break;
@@ -509,19 +560,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 			struct page *page = pvec.pages[i];
 
 			index = indices[i];
-			if (index > end)
+			if (index >= end)
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
+				if (unfalloc)
+					continue;
 				nr_swaps_freed += !shmem_free_swap(mapping,
 								index, page);
 				continue;
 			}
 
 			lock_page(page);
-			if (page->mapping == mapping) {
-				VM_BUG_ON(PageWriteback(page));
-				truncate_inode_page(mapping, page);
+			if (!unfalloc || !PageUptodate(page)) {
+				if (page->mapping == mapping) {
+					VM_BUG_ON(PageWriteback(page));
+					truncate_inode_page(mapping, page);
+				}
 			}
 			unlock_page(page);
 		}
@@ -535,7 +590,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 	info->swapped -= nr_swaps_freed;
 	shmem_recalc_inode(inode);
 	spin_unlock(&info->lock);
+}
 
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+	shmem_undo_range(inode, lstart, lend, false);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 }
 EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -597,19 +656,20 @@ static void shmem_evict_inode(struct inode *inode)
 	}
 	BUG_ON(inode->i_blocks);
 	shmem_free_inode(inode->i_sb);
-	end_writeback(inode);
+	clear_inode(inode);
 }
 
 /*
  * If swap found in inode, free it and move page from swapcache to filecache.
  */
 static int shmem_unuse_inode(struct shmem_inode_info *info,
-			     swp_entry_t swap, struct page *page)
+			     swp_entry_t swap, struct page **pagep)
 {
 	struct address_space *mapping = info->vfs_inode.i_mapping;
 	void *radswap;
 	pgoff_t index;
-	int error;
+	gfp_t gfp;
+	int error = 0;
 
 	radswap = swp_to_radix_entry(swap);
 	index = radix_tree_locate_item(&mapping->page_tree, radswap);
@@ -625,22 +685,48 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 	if (shmem_swaplist.next != &info->swaplist)
 		list_move_tail(&shmem_swaplist, &info->swaplist);
 
+	gfp = mapping_gfp_mask(mapping);
+	if (shmem_should_replace_page(*pagep, gfp)) {
+		mutex_unlock(&shmem_swaplist_mutex);
+		error = shmem_replace_page(pagep, gfp, info, index);
+		mutex_lock(&shmem_swaplist_mutex);
+		/*
+		 * We needed to drop mutex to make that restrictive page
+		 * allocation, but the inode might have been freed while we
+		 * dropped it: although a racing shmem_evict_inode() cannot
+		 * complete without emptying the radix_tree, our page lock
+		 * on this swapcache page is not enough to prevent that -
+		 * free_swap_and_cache() of our swap entry will only
+		 * trylock_page(), removing swap from radix_tree whatever.
+		 *
+		 * We must not proceed to shmem_add_to_page_cache() if the
+		 * inode has been freed, but of course we cannot rely on
+		 * inode or mapping or info to check that. However, we can
+		 * safely check if our swap entry is still in use (and here
+		 * it can't have got reused for another page): if it's still
+		 * in use, then the inode cannot have been freed yet, and we
+		 * can safely proceed (if it's no longer in use, that tells
+		 * nothing about the inode, but we don't need to unuse swap).
+		 */
+		if (!page_swapcount(*pagep))
+			error = -ENOENT;
+	}
+
 	/*
 	 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
 	 * but also to hold up shmem_evict_inode(): so inode cannot be freed
 	 * beneath us (pagelock doesn't help until the page is in pagecache).
 	 */
-	error = shmem_add_to_page_cache(page, mapping, index,
+	if (!error)
+		error = shmem_add_to_page_cache(*pagep, mapping, index,
 						GFP_NOWAIT, radswap);
-	/* which does mem_cgroup_uncharge_cache_page on error */
-
 	if (error != -ENOMEM) {
 		/*
 		 * Truncation and eviction use free_swap_and_cache(), which
 		 * only does trylock page: if we raced, best clean up here.
 		 */
-		delete_from_swap_cache(page);
-		set_page_dirty(page);
+		delete_from_swap_cache(*pagep);
+		set_page_dirty(*pagep);
 		if (!error) {
 			spin_lock(&info->lock);
 			info->swapped--;
@@ -660,7 +746,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	struct list_head *this, *next;
 	struct shmem_inode_info *info;
 	int found = 0;
-	int error;
+	int error = 0;
+
+	/*
+	 * There's a faint possibility that swap page was replaced before
+	 * caller locked it: caller will come back later with the right page.
+	 */
+	if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
+		goto out;
 
 	/*
 	 * Charge page using GFP_KERNEL while we can wait, before taking
@@ -676,7 +769,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	list_for_each_safe(this, next, &shmem_swaplist) {
 		info = list_entry(this, struct shmem_inode_info, swaplist);
 		if (info->swapped)
-			found = shmem_unuse_inode(info, swap, page);
+			found = shmem_unuse_inode(info, swap, &page);
 		else
 			list_del_init(&info->swaplist);
 		cond_resched();
@@ -685,8 +778,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	}
 	mutex_unlock(&shmem_swaplist_mutex);
 
-	if (!found)
-		mem_cgroup_uncharge_cache_page(page);
 	if (found < 0)
 		error = found;
 out:
@@ -727,6 +818,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
 		goto redirty;
 	}
+
+	/*
+	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
+	 * value into swapfile.c, the only way we can correctly account for a
+	 * fallocated page arriving here is now to initialize it and write it.
+	 *
+	 * That's okay for a page already fallocated earlier, but if we have
+	 * not yet completed the fallocation, then (a) we want to keep track
+	 * of this page in case we have to undo it, and (b) it may not be a
+	 * good idea to continue anyway, once we're pushing into swap. So
+	 * reactivate the page, and let shmem_fallocate() quit when too many.
+	 */
+	if (!PageUptodate(page)) {
+		if (inode->i_private) {
+			struct shmem_falloc *shmem_falloc;
+			spin_lock(&inode->i_lock);
+			shmem_falloc = inode->i_private;
+			if (shmem_falloc &&
+			    index >= shmem_falloc->start &&
+			    index < shmem_falloc->next)
+				shmem_falloc->nr_unswapped++;
+			else
+				shmem_falloc = NULL;
+			spin_unlock(&inode->i_lock);
+			if (shmem_falloc)
+				goto redirty;
+		}
+		clear_highpage(page);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+	}
+
 	swap = get_swap_page();
 	if (!swap.val)
 		goto redirty;
@@ -856,6 +979,89 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 #endif
 
 /*
+ * When a page is moved from swapcache to shmem filecache (either by the
+ * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
+ * shmem_unuse_inode()), it may have been read in earlier from swap, in
+ * ignorance of the mapping it belongs to. If that mapping has special
+ * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
+ * we may need to copy to a suitable page before moving to filecache.
+ *
+ * In a future release, this may well be extended to respect cpuset and
+ * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
+ * but for now it is a simple matter of zone.
+ */
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
+{
+	return page_zonenum(page) > gfp_zone(gfp);
+}
+
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+				struct shmem_inode_info *info, pgoff_t index)
+{
+	struct page *oldpage, *newpage;
+	struct address_space *swap_mapping;
+	pgoff_t swap_index;
+	int error;
+
+	oldpage = *pagep;
+	swap_index = page_private(oldpage);
+	swap_mapping = page_mapping(oldpage);
+
+	/*
+	 * We have arrived here because our zones are constrained, so don't
+	 * limit chance of success by further cpuset and node constraints.
+	 */
+	gfp &= ~GFP_CONSTRAINT_MASK;
+	newpage = shmem_alloc_page(gfp, info, index);
+	if (!newpage)
+		return -ENOMEM;
+
+	page_cache_get(newpage);
+	copy_highpage(newpage, oldpage);
+	flush_dcache_page(newpage);
+
+	__set_page_locked(newpage);
+	SetPageUptodate(newpage);
+	SetPageSwapBacked(newpage);
+	set_page_private(newpage, swap_index);
+	SetPageSwapCache(newpage);
+
+	/*
+	 * Our caller will very soon move newpage out of swapcache, but it's
+	 * a nice clean interface for us to replace oldpage by newpage there.
+	 */
+	spin_lock_irq(&swap_mapping->tree_lock);
+	error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
+								   newpage);
+	if (!error) {
+		__inc_zone_page_state(newpage, NR_FILE_PAGES);
+		__dec_zone_page_state(oldpage, NR_FILE_PAGES);
+	}
+	spin_unlock_irq(&swap_mapping->tree_lock);
+
+	if (unlikely(error)) {
+		/*
+		 * Is this possible? I think not, now that our callers check
+		 * both PageSwapCache and page_private after getting page lock;
+		 * but be defensive. Reverse old to newpage for clear and free.
+		 */
+		oldpage = newpage;
+	} else {
+		mem_cgroup_replace_page_cache(oldpage, newpage);
+		lru_cache_add_anon(newpage);
+		*pagep = newpage;
+	}
+
+	ClearPageSwapCache(oldpage);
+	set_page_private(oldpage, 0);
+
+	unlock_page(oldpage);
+	page_cache_release(oldpage);
+	page_cache_release(oldpage);
+	return error;
+}
+
+/*
  * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
  *
  * If we allocate a new one we do not mark it dirty. That's up to the
@@ -872,6 +1078,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	swp_entry_t swap;
 	int error;
 	int once = 0;
+	int alloced = 0;
 
 	if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
 		return -EFBIG;
@@ -883,19 +1090,21 @@ repeat:
 		page = NULL;
 	}
 
-	if (sgp != SGP_WRITE &&
+	if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
 	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
 		error = -EINVAL;
 		goto failed;
 	}
 
+	/* fallocated page? */
+	if (page && !PageUptodate(page)) {
+		if (sgp != SGP_READ)
+			goto clear;
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
+	}
 	if (page || (sgp == SGP_READ && !swap.val)) {
-		/*
-		 * Once we can get the page lock, it must be uptodate:
-		 * if there were an error in reading back from swap,
-		 * the page would not be inserted into the filecache.
-		 */
-		BUG_ON(page && !PageUptodate(page));
 		*pagep = page;
 		return 0;
 	}
@@ -923,26 +1132,31 @@ repeat:
 
 		/* We have to do this with page locked to prevent races */
 		lock_page(page);
+		if (!PageSwapCache(page) || page_private(page) != swap.val ||
+		    !shmem_confirm_swap(mapping, index, swap)) {
+			error = -EEXIST;	/* try again */
+			goto unlock;
+		}
 		if (!PageUptodate(page)) {
 			error = -EIO;
 			goto failed;
 		}
 		wait_on_page_writeback(page);
 
-		/* Someone may have already done it for us */
-		if (page->mapping) {
-			if (page->mapping == mapping &&
-			    page->index == index)
-				goto done;
-			error = -EEXIST;
-			goto failed;
+		if (shmem_should_replace_page(page, gfp)) {
+			error = shmem_replace_page(&page, gfp, info, index);
+			if (error)
+				goto failed;
 		}
 
 		error = mem_cgroup_cache_charge(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
-		if (!error)
+		if (!error) {
 			error = shmem_add_to_page_cache(page, mapping, index,
 						gfp, swp_to_radix_entry(swap));
+			/* We already confirmed swap, and make no allocation */
+			VM_BUG_ON(error);
+		}
 		if (error)
 			goto failed;
 
@@ -979,11 +1193,18 @@ repeat:
 		__set_page_locked(page);
 		error = mem_cgroup_cache_charge(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
-		if (!error)
-			error = shmem_add_to_page_cache(page, mapping, index,
-						gfp, NULL);
 		if (error)
 			goto decused;
+		error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+		if (!error) {
+			error = shmem_add_to_page_cache(page, mapping, index,
+							gfp, NULL);
+			radix_tree_preload_end();
+		}
+		if (error) {
+			mem_cgroup_uncharge_cache_page(page);
+			goto decused;
+		}
 		lru_cache_add_anon(page);
 
 		spin_lock(&info->lock);
@@ -991,19 +1212,36 @@ repeat:
 		inode->i_blocks += BLOCKS_PER_PAGE;
 		shmem_recalc_inode(inode);
 		spin_unlock(&info->lock);
+		alloced = true;
 
-		clear_highpage(page);
-		flush_dcache_page(page);
-		SetPageUptodate(page);
+		/*
+		 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+		 */
+		if (sgp == SGP_FALLOC)
+			sgp = SGP_WRITE;
+clear:
+		/*
+		 * Let SGP_WRITE caller clear ends if write does not fill page;
+		 * but SGP_FALLOC on a page fallocated earlier must initialize
+		 * it now, lest undo on failure cancel our earlier guarantee.
+		 */
+		if (sgp != SGP_WRITE) {
+			clear_highpage(page);
+			flush_dcache_page(page);
+			SetPageUptodate(page);
+		}
 		if (sgp == SGP_DIRTY)
 			set_page_dirty(page);
 	}
-done:
+
 	/* Perhaps the file has been truncated since we checked */
-	if (sgp != SGP_WRITE &&
+	if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
 	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
 		error = -EINVAL;
-		goto trunc;
+		if (alloced)
+			goto trunc;
+		else
+			goto failed;
 	}
 	*pagep = page;
 	return 0;
@@ -1012,6 +1250,7 @@ done:
 	 * Error recovery.
 	 */
 trunc:
+	info = SHMEM_I(inode);
 	ClearPageDirty(page);
 	delete_from_page_cache(page);
 	spin_lock(&info->lock);
@@ -1019,19 +1258,16 @@ trunc:
 	inode->i_blocks -= BLOCKS_PER_PAGE;
 	spin_unlock(&info->lock);
 decused:
+	sbinfo = SHMEM_SB(inode->i_sb);
 	if (sbinfo->max_blocks)
 		percpu_counter_add(&sbinfo->used_blocks, -1);
 unacct:
 	shmem_unacct_blocks(info->flags, 1);
 failed:
-	if (swap.val && error != -EINVAL) {
-		struct page *test = find_get_page(mapping, index);
-		if (test && !radix_tree_exceptional_entry(test))
-			page_cache_release(test);
-		/* Have another try if the entry has changed */
-		if (test != swp_to_radix_entry(swap))
-			error = -EEXIST;
-	}
+	if (swap.val && error != -EINVAL &&
+	    !shmem_confirm_swap(mapping, index, swap))
+		error = -EEXIST;
+unlock:
 	if (page) {
 		unlock_page(page);
 		page_cache_release(page);
@@ -1043,7 +1279,7 @@ failed:
 		spin_unlock(&info->lock);
 		goto repeat;
 	}
-	if (error == -EEXIST)
+	if (error == -EEXIST)	/* from above or from radix_tree_insert */
 		goto repeat;
 	return error;
 }
@@ -1204,6 +1440,14 @@ shmem_write_end(struct file *file, struct address_space *mapping,
 	if (pos + copied > inode->i_size)
 		i_size_write(inode, pos + copied);
 
+	if (!PageUptodate(page)) {
+		if (copied < PAGE_CACHE_SIZE) {
+			unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+			zero_user_segments(page, 0, from,
+					from + copied, PAGE_CACHE_SIZE);
+		}
+		SetPageUptodate(page);
+	}
 	set_page_dirty(page);
 	unlock_page(page);
 	page_cache_release(page);
@@ -1365,6 +1609,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	struct splice_pipe_desc spd = {
 		.pages = pages,
 		.partial = partial,
+		.nr_pages_max = PIPE_DEF_BUFFERS,
 		.flags = flags,
 		.ops = &page_cache_pipe_buf_ops,
 		.spd_release = spd_release_page,
@@ -1453,7 +1698,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	if (spd.nr_pages)
 		error = splice_to_pipe(pipe, &spd);
 
-	splice_shrink_spd(pipe, &spd);
+	splice_shrink_spd(&spd);
 
 	if (error > 0) {
 		*ppos += error;
@@ -1462,6 +1707,107 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	return error;
 }
 
+static long shmem_fallocate(struct file *file, int mode, loff_t offset,
+							 loff_t len)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	struct shmem_falloc shmem_falloc;
+	pgoff_t start, index, end;
+	int error;
+
+	mutex_lock(&inode->i_mutex);
+
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		struct address_space *mapping = file->f_mapping;
+		loff_t unmap_start = round_up(offset, PAGE_SIZE);
+		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+
+		if ((u64)unmap_end > (u64)unmap_start)
+			unmap_mapping_range(mapping, unmap_start,
+					    1 + unmap_end - unmap_start, 0);
+		shmem_truncate_range(inode, offset, offset + len - 1);
+		/* No need to unmap again: hole-punching leaves COWed pages */
+		error = 0;
+		goto out;
+	}
+
+	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+	error = inode_newsize_ok(inode, offset + len);
+	if (error)
+		goto out;
+
+	start = offset >> PAGE_CACHE_SHIFT;
+	end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	/* Try to avoid a swapstorm if len is impossible to satisfy */
+	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
+		error = -ENOSPC;
+		goto out;
+	}
+
+	shmem_falloc.start = start;
+	shmem_falloc.next = start;
+	shmem_falloc.nr_falloced = 0;
+	shmem_falloc.nr_unswapped = 0;
+	spin_lock(&inode->i_lock);
+	inode->i_private = &shmem_falloc;
+	spin_unlock(&inode->i_lock);
+
+	for (index = start; index < end; index++) {
+		struct page *page;
+
+		/*
+		 * Good, the fallocate(2) manpage permits EINTR: we may have
+		 * been interrupted because we are using up too much memory.
+		 */
+		if (signal_pending(current))
+			error = -EINTR;
+		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
+			error = -ENOMEM;
+		else
+			error = shmem_getpage(inode, index, &page, SGP_FALLOC,
+									NULL);
+		if (error) {
+			/* Remove the !PageUptodate pages we added */
+			shmem_undo_range(inode,
+				(loff_t)start << PAGE_CACHE_SHIFT,
+				(loff_t)index << PAGE_CACHE_SHIFT, true);
+			goto undone;
+		}
+
+		/*
+		 * Inform shmem_writepage() how far we have reached.
+		 * No need for lock or barrier: we have the page lock.
+		 */
+		shmem_falloc.next++;
+		if (!PageUptodate(page))
+			shmem_falloc.nr_falloced++;
+
+		/*
+		 * If !PageUptodate, leave it that way so that freeable pages
+		 * can be recognized if we need to rollback on error later.
+		 * But set_page_dirty so that memory pressure will swap rather
+		 * than free the pages we are allocating (and SGP_CACHE pages
+		 * might still be clean: we now need to mark those dirty too).
+		 */
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+		cond_resched();
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+		i_size_write(inode, offset + len);
+	inode->i_ctime = CURRENT_TIME;
+undone:
+	spin_lock(&inode->i_lock);
+	inode->i_private = NULL;
+	spin_unlock(&inode->i_lock);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return error;
+}
+
 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
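For illustration only (not part of the diff): a minimal user-space sketch of the interface the shmem_fallocate() hunk above provides. It assumes /dev/shm is a tmpfs mount and that the running kernel carries this patch; the path and sizes are arbitrary. Plain mode 0 preallocates pages through SGP_FALLOC, while FALLOC_FL_PUNCH_HOLE (which the VFS only accepts together with FALLOC_FL_KEEP_SIZE) is served by shmem_truncate_range().

/* Illustrative sketch only: exercises tmpfs fallocate support from user space. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* "/dev/shm/falloc-demo" is a placeholder name on an assumed tmpfs mount. */
	int fd = open("/dev/shm/falloc-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Preallocate 1MiB: each page is instantiated via SGP_FALLOC above. */
	if (fallocate(fd, 0, 0, 1 << 20) < 0)
		perror("fallocate");
	/* Punch out 256KiB in the middle: handled by shmem_truncate_range(). */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      1 << 18, 1 << 18) < 0)
		perror("punch hole");
	close(fd);
	unlink("/dev/shm/falloc-demo");
	return 0;
}

On a kernel without this patch, tmpfs has no .fallocate method and both calls fail with EOPNOTSUPP.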
@@ -1665,6 +2011,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 		kaddr = kmap_atomic(page);
 		memcpy(kaddr, symname, len);
 		kunmap_atomic(kaddr);
+		SetPageUptodate(page);
 		set_page_dirty(page);
 		unlock_page(page);
 		page_cache_release(page);
@@ -2033,11 +2380,9 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
 	return dentry;
 }
 
-static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
-				int connectable)
+static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
+				struct inode *parent)
 {
-	struct inode *inode = dentry->d_inode;
-
 	if (*len < 3) {
 		*len = 3;
 		return 255;
@@ -2075,6 +2420,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 			       bool remount)
 {
 	char *this_char, *value, *rest;
+	uid_t uid;
+	gid_t gid;
 
 	while (options != NULL) {
 		this_char = options;
@@ -2134,15 +2481,21 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 		} else if (!strcmp(this_char,"uid")) {
 			if (remount)
 				continue;
-			sbinfo->uid = simple_strtoul(value, &rest, 0);
+			uid = simple_strtoul(value, &rest, 0);
 			if (*rest)
 				goto bad_val;
+			sbinfo->uid = make_kuid(current_user_ns(), uid);
+			if (!uid_valid(sbinfo->uid))
+				goto bad_val;
 		} else if (!strcmp(this_char,"gid")) {
 			if (remount)
 				continue;
-			sbinfo->gid = simple_strtoul(value, &rest, 0);
+			gid = simple_strtoul(value, &rest, 0);
 			if (*rest)
 				goto bad_val;
+			sbinfo->gid = make_kgid(current_user_ns(), gid);
+			if (!gid_valid(sbinfo->gid))
+				goto bad_val;
 		} else if (!strcmp(this_char,"mpol")) {
 			if (mpol_parse_str(value, &sbinfo->mpol, 1))
 				goto bad_val;
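For illustration only (not part of the diff): the uid=/gid= options parsed above arrive as plain numeric ids and are now mapped through make_kuid()/make_kgid() in the mounter's user namespace. A minimal sketch of supplying them with mount(2), assuming a hypothetical mount point /mnt/tmp and ids of 1000 chosen for the example (requires CAP_SYS_ADMIN):

/* Illustrative sketch only: numeric uid=/gid= options as parsed above. */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* "/mnt/tmp" and the ids are placeholders for this example. */
	if (mount("tmpfs", "/mnt/tmp", "tmpfs", 0,
		  "size=64m,mode=0770,uid=1000,gid=1000") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}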
@@ -2210,10 +2563,12 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
 	if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
 		seq_printf(seq, ",mode=%03ho", sbinfo->mode);
-	if (sbinfo->uid != 0)
-		seq_printf(seq, ",uid=%u", sbinfo->uid);
-	if (sbinfo->gid != 0)
-		seq_printf(seq, ",gid=%u", sbinfo->gid);
+	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
+		seq_printf(seq, ",uid=%u",
+				from_kuid_munged(&init_user_ns, sbinfo->uid));
+	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
+		seq_printf(seq, ",gid=%u",
+				from_kgid_munged(&init_user_ns, sbinfo->gid));
 	shmem_show_mpol(seq, sbinfo->mpol);
 	return 0;
 }
@@ -2260,6 +2615,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
 		}
 	}
 	sb->s_export_op = &shmem_export_ops;
+	sb->s_flags |= MS_NOSEC;
 #else
 	sb->s_flags |= MS_NOUSER;
 #endif
@@ -2362,12 +2718,12 @@ static const struct file_operations shmem_file_operations = {
 	.fsync		= noop_fsync,
 	.splice_read	= shmem_file_splice_read,
 	.splice_write	= generic_file_splice_write,
+	.fallocate	= shmem_fallocate,
 #endif
 };
 
 static const struct inode_operations shmem_inode_operations = {
 	.setattr	= shmem_setattr,
-	.truncate_range	= shmem_truncate_range,
 #ifdef CONFIG_TMPFS_XATTR
 	.setxattr	= shmem_setxattr,
 	.getxattr	= shmem_getxattr,