Diffstat (limited to 'mm')
 mm/filemap_xip.c |  48
 mm/madvise.c     |  19
 mm/shmem.c       | 132
 3 files changed, 151 insertions(+), 48 deletions(-)
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 9dd9fbb75139..cbb335813ec0 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -17,6 +17,29 @@
 #include "filemap.h"
 
 /*
+ * We do use our own empty page to avoid interference with other users
+ * of ZERO_PAGE(), such as /dev/zero
+ */
+static struct page *__xip_sparse_page;
+
+static struct page *xip_sparse_page(void)
+{
+        if (!__xip_sparse_page) {
+                unsigned long zeroes = get_zeroed_page(GFP_HIGHUSER);
+                if (zeroes) {
+                        static DEFINE_SPINLOCK(xip_alloc_lock);
+                        spin_lock(&xip_alloc_lock);
+                        if (!__xip_sparse_page)
+                                __xip_sparse_page = virt_to_page(zeroes);
+                        else
+                                free_page(zeroes);
+                        spin_unlock(&xip_alloc_lock);
+                }
+        }
+        return __xip_sparse_page;
+}
+
+/*
  * This is a file read routine for execute in place files, and uses
  * the mapping->a_ops->get_xip_page() function for the actual low-level
  * stuff.
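
The xip_sparse_page() allocator added above is a double-checked initialization: an unlocked fast-path test of __xip_sparse_page, then a recheck under the spinlock so that the loser of an allocation race frees its duplicate page. A minimal userspace sketch of the same pattern, assuming pthreads and calloc in place of the kernel primitives (illustrative only, not part of the patch):

#include <pthread.h>
#include <stdlib.h>

static void *sparse_buf;
static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;

/* Lazily allocate one shared zeroed buffer; mirrors xip_sparse_page().
 * The unlocked first test is the same benign-race fast path the kernel
 * code uses: worst case, two threads allocate and one frees. */
static void *sparse_buffer(size_t size)
{
        if (!sparse_buf) {
                void *zeroes = calloc(1, size);
                if (zeroes) {
                        pthread_mutex_lock(&alloc_lock);
                        if (!sparse_buf)        /* recheck under the lock */
                                sparse_buf = zeroes;
                        else                    /* lost the race */
                                free(zeroes);
                        pthread_mutex_unlock(&alloc_lock);
                }
        }
        return sparse_buf;      /* may be NULL if allocation failed */
}
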
@@ -162,7 +185,7 @@ EXPORT_SYMBOL_GPL(xip_file_sendfile);
  * xip_write
  *
  * This function walks all vmas of the address_space and unmaps the
- * ZERO_PAGE when found at pgoff. Should it go in rmap.c?
+ * __xip_sparse_page when found at pgoff.
  */
 static void
 __xip_unmap (struct address_space * mapping,
@@ -177,13 +200,16 @@ __xip_unmap (struct address_space * mapping,
         spinlock_t *ptl;
         struct page *page;
 
+        page = __xip_sparse_page;
+        if (!page)
+                return;
+
         spin_lock(&mapping->i_mmap_lock);
         vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                 mm = vma->vm_mm;
                 address = vma->vm_start +
                         ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
                 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-                page = ZERO_PAGE(0);
                 pte = page_check_address(page, mm, address, &ptl);
                 if (pte) {
                         /* Nuke the page table entry. */
@@ -222,16 +248,14 @@ xip_file_nopage(struct vm_area_struct * area,
                 + area->vm_pgoff;
 
         size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-        if (pgoff >= size) {
-                return NULL;
-        }
+        if (pgoff >= size)
+                return NOPAGE_SIGBUS;
 
         page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
-        if (!IS_ERR(page)) {
+        if (!IS_ERR(page))
                 goto out;
-        }
         if (PTR_ERR(page) != -ENODATA)
-                return NULL;
+                return NOPAGE_SIGBUS;
 
         /* sparse block */
         if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
@@ -241,12 +265,14 @@ xip_file_nopage(struct vm_area_struct * area,
                 page = mapping->a_ops->get_xip_page (mapping,
                         pgoff*(PAGE_SIZE/512), 1);
                 if (IS_ERR(page))
-                        return NULL;
+                        return NOPAGE_SIGBUS;
                 /* unmap page at pgoff from all other vmas */
                 __xip_unmap(mapping, pgoff);
         } else {
-                /* not shared and writable, use ZERO_PAGE() */
-                page = ZERO_PAGE(0);
+                /* not shared and writable, use xip_sparse_page() */
+                page = xip_sparse_page();
+                if (!page)
+                        return NOPAGE_OOM;
         }
 
 out:
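
Taken together, the xip_file_nopage() changes above replace bare NULL returns with explicit NOPAGE_SIGBUS and NOPAGE_OOM results, and route read faults on holes to the private sparse page. Reduced to control flow, the fault path now distinguishes four cases; the sketch below uses hypothetical helpers standing in for get_xip_page() and is not the kernel code:

#include <stdbool.h>

enum fault_result { FAULT_PAGE, FAULT_SIGBUS, FAULT_OOM };

/* hypothetical stand-ins for mapping->a_ops->get_xip_page() */
static bool block_present(unsigned long pgoff) { (void)pgoff; return false; }
static bool block_allocate(unsigned long pgoff) { (void)pgoff; return true; }
static bool sparse_page_available(void) { return true; }

static enum fault_result xip_fault(unsigned long pgoff, unsigned long size,
                                   bool shared_writable)
{
        if (pgoff >= size)              /* fault beyond end of file */
                return FAULT_SIGBUS;
        if (block_present(pgoff))       /* backing block already exists */
                return FAULT_PAGE;
        if (shared_writable)            /* write fault: allocate the block */
                return block_allocate(pgoff) ? FAULT_PAGE : FAULT_SIGBUS;
        /* read fault on a hole: map the shared sparse page instead */
        return sparse_page_available() ? FAULT_PAGE : FAULT_OOM;
}
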
diff --git a/mm/madvise.c b/mm/madvise.c
index 77916e9fc52b..603c5257ed6e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -159,9 +159,10 @@ static long madvise_remove(struct vm_area_struct *vma,
                                 unsigned long start, unsigned long end)
 {
         struct address_space *mapping;
         loff_t offset, endoff;
+        int error;
 
-        *prev = vma;
+        *prev = NULL;        /* tell sys_madvise we drop mmap_sem */
 
         if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
                 return -EINVAL;
@@ -180,7 +181,12 @@ static long madvise_remove(struct vm_area_struct *vma,
                         + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
         endoff = (loff_t)(end - vma->vm_start - 1)
                         + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
-        return vmtruncate_range(mapping->host, offset, endoff);
+
+        /* vmtruncate_range needs to take i_mutex and i_alloc_sem */
+        up_write(&current->mm->mmap_sem);
+        error = vmtruncate_range(mapping->host, offset, endoff);
+        down_write(&current->mm->mmap_sem);
+        return error;
 }
 
 static long
@@ -315,12 +321,15 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
                 if (error)
                         goto out;
                 start = tmp;
-                if (start < prev->vm_end)
+                if (prev && start < prev->vm_end)
                         start = prev->vm_end;
                 error = unmapped_error;
                 if (start >= end)
                         goto out;
-                vma = prev->vm_next;
+                if (prev)
+                        vma = prev->vm_next;
+                else        /* madvise_remove dropped mmap_sem */
+                        vma = find_vma(current->mm, start);
         }
 out:
         up_write(&current->mm->mmap_sem);
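
The madvise_remove() change is the familiar drop-and-revalidate pattern: vmtruncate_range() must sleep on i_mutex and i_alloc_sem, so mmap_sem cannot be held across it; after retaking mmap_sem, everything found under the old hold is stale, which is what *prev = NULL signals to sys_madvise and why it redoes find_vma(). A userspace sketch of the shape of this, with a pthread rwlock standing in for mmap_sem (illustrative, not the kernel code):

#include <pthread.h>

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;

/* hypothetical stand-in for vmtruncate_range(), which sleeps on
 * other locks and therefore must run without map_lock held */
static int truncate_backing(unsigned long start, unsigned long end)
{
        (void)start; (void)end;
        return 0;
}

/* caller holds map_lock for writing, like mmap_sem in sys_madvise */
static int remove_range(unsigned long start, unsigned long end)
{
        int error;

        pthread_rwlock_unlock(&map_lock);       /* drop across the sleep */
        error = truncate_backing(start, end);
        pthread_rwlock_wrlock(&map_lock);       /* retake before return */
        /*
         * Anything looked up under the old hold of map_lock is now
         * stale; the caller must redo its lookup, as sys_madvise
         * does with find_vma() when it sees prev == NULL.
         */
        return error;
}
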
diff --git a/mm/shmem.c b/mm/shmem.c
index b8c429a2d271..b2a35ebf071a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -402,26 +402,38 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
 /*
  * shmem_free_swp - free some swap entries in a directory
  *
  * @dir:        pointer to the directory
  * @edir:       pointer after last entry of the directory
+ * @punch_lock: pointer to spinlock when needed for the holepunch case
  */
-static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
+static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir,
+                                                spinlock_t *punch_lock)
 {
+        spinlock_t *punch_unlock = NULL;
         swp_entry_t *ptr;
         int freed = 0;
 
         for (ptr = dir; ptr < edir; ptr++) {
                 if (ptr->val) {
+                        if (unlikely(punch_lock)) {
+                                punch_unlock = punch_lock;
+                                punch_lock = NULL;
+                                spin_lock(punch_unlock);
+                                if (!ptr->val)
+                                        continue;
+                        }
                         free_swap_and_cache(*ptr);
                         *ptr = (swp_entry_t){0};
                         freed++;
                 }
         }
+        if (punch_unlock)
+                spin_unlock(punch_unlock);
         return freed;
 }
 
-static int shmem_map_and_free_swp(struct page *subdir,
-                int offset, int limit, struct page ***dir)
+static int shmem_map_and_free_swp(struct page *subdir, int offset,
+                int limit, struct page ***dir, spinlock_t *punch_lock)
 {
         swp_entry_t *ptr;
         int freed = 0;
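
The punch_lock argument added to shmem_free_swp() implements a one-shot lazy lock: while the scan sees only empty entries it takes nothing; the first live entry takes the lock once and rechecks itself under it (a racing truncation may have freed it meanwhile), and the lock is then held for the remainder of the scan. The protocol in isolation, with a mutex standing in for the kernel spinlock (hypothetical sketch, not the patch itself):

#include <pthread.h>

static int free_entries(unsigned long *p, unsigned long *end,
                        pthread_mutex_t *punch_lock)
{
        pthread_mutex_t *punch_unlock = NULL;
        int freed = 0;

        for (; p < end; p++) {
                if (*p) {
                        if (punch_lock) {       /* first live entry only */
                                punch_unlock = punch_lock;
                                punch_lock = NULL;
                                pthread_mutex_lock(punch_unlock);
                                if (!*p)        /* raced: already freed */
                                        continue;
                        }
                        *p = 0;                 /* free the entry */
                        freed++;
                }
        }
        if (punch_unlock)
                pthread_mutex_unlock(punch_unlock);
        return freed;
}
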
@@ -431,7 +443,8 @@ static int shmem_map_and_free_swp(struct page *subdir,
                 int size = limit - offset;
                 if (size > LATENCY_LIMIT)
                         size = LATENCY_LIMIT;
-                freed += shmem_free_swp(ptr+offset, ptr+offset+size);
+                freed += shmem_free_swp(ptr+offset, ptr+offset+size,
+                                                        punch_lock);
                 if (need_resched()) {
                         shmem_swp_unmap(ptr);
                         if (*dir) {
@@ -481,7 +494,10 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
         long nr_swaps_freed = 0;
         int offset;
         int freed;
-        int punch_hole = 0;
+        int punch_hole;
+        spinlock_t *needs_lock;
+        spinlock_t *punch_lock;
+        unsigned long upper_limit;
 
         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
         idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -492,11 +508,20 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
         info->flags |= SHMEM_TRUNCATE;
         if (likely(end == (loff_t) -1)) {
                 limit = info->next_index;
+                upper_limit = SHMEM_MAX_INDEX;
                 info->next_index = idx;
+                needs_lock = NULL;
+                punch_hole = 0;
         } else {
-                limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-                if (limit > info->next_index)
-                        limit = info->next_index;
+                if (end + 1 >= inode->i_size) { /* we may free a little more */
+                        limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
+                                                        PAGE_CACHE_SHIFT;
+                        upper_limit = SHMEM_MAX_INDEX;
+                } else {
+                        limit = (end + 1) >> PAGE_CACHE_SHIFT;
+                        upper_limit = limit;
+                }
+                needs_lock = &info->lock;
                 punch_hole = 1;
         }
 
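
The limit/upper_limit pair computed above bounds how far whole pages may be freed: an interior hole must stop at the last page wholly inside [start, end] (upper_limit == limit), while a truncation or a hole reaching EOF may free a little more, up to SHMEM_MAX_INDEX. A worked example, assuming 4K pages (PAGE_CACHE_SHIFT == 12) and made-up sizes:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
        unsigned long i_size = 10 * PAGE_CACHE_SIZE + 100; /* ~40K file */
        unsigned long end = 8 * PAGE_CACHE_SIZE - 1;  /* punch up to here */
        unsigned long limit, upper_limit;

        if (end + 1 >= i_size) {        /* hole reaches EOF: free more */
                limit = (i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
                upper_limit = ~0UL;     /* stands in for SHMEM_MAX_INDEX */
        } else {                        /* interior hole: whole pages only */
                limit = (end + 1) >> PAGE_CACHE_SHIFT;
                upper_limit = limit;
        }
        printf("limit=%lu upper_limit=%lu\n", limit, upper_limit); /* 8 8 */
        return 0;
}
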
@@ -513,17 +538,30 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
                 size = limit;
                 if (size > SHMEM_NR_DIRECT)
                         size = SHMEM_NR_DIRECT;
-                nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
+                nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock);
         }
 
         /*
          * If there are no indirect blocks or we are punching a hole
          * below indirect blocks, nothing to be done.
          */
-        if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT)))
+        if (!topdir || limit <= SHMEM_NR_DIRECT)
                 goto done2;
 
-        BUG_ON(limit <= SHMEM_NR_DIRECT);
+        /*
+         * The truncation case has already dropped info->lock, and we're safe
+         * because i_size and next_index have already been lowered, preventing
+         * access beyond. But in the punch_hole case, we still need to take
+         * the lock when updating the swap directory, because there might be
+         * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
+         * shmem_writepage. However, whenever we find we can remove a whole
+         * directory page (not at the misaligned start or end of the range),
+         * we first NULLify its pointer in the level above, and then have no
+         * need to take the lock when updating its contents: needs_lock and
+         * punch_lock (either pointing to info->lock or NULL) manage this.
+         */
+
+        upper_limit -= SHMEM_NR_DIRECT;
         limit -= SHMEM_NR_DIRECT;
         idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
         offset = idx % ENTRIES_PER_PAGE;
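
The comment block above is the heart of the locking scheme: once a directory page's pointer in the level above has been NULLified under info->lock, no racing lookup can reach it, so its contents can then be cleared with no lock held at all. Stripped to its essentials, with a mutex in place of the spinlock (hypothetical sketch):

#include <pthread.h>
#include <stdlib.h>

struct dirpage { unsigned long entry[512]; };

static void detach_and_clear(struct dirpage **slot, pthread_mutex_t *lock)
{
        struct dirpage *child;
        int i;

        pthread_mutex_lock(lock);
        child = *slot;
        *slot = NULL;           /* unreachable from this point on */
        pthread_mutex_unlock(lock);

        if (!child)
                return;
        for (i = 0; i < 512; i++)       /* no lock needed any more */
                child->entry[i] = 0;
        free(child);
}
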
@@ -543,8 +581,14 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
                 if (*dir) {
                         diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
                                 ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
-                        if (!diroff && !offset) {
-                                *dir = NULL;
+                        if (!diroff && !offset && upper_limit >= stage) {
+                                if (needs_lock) {
+                                        spin_lock(needs_lock);
+                                        *dir = NULL;
+                                        spin_unlock(needs_lock);
+                                        needs_lock = NULL;
+                                } else
+                                        *dir = NULL;
                                 nr_pages_to_free++;
                                 list_add(&middir->lru, &pages_to_free);
                         }
@@ -570,39 +614,55 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
                         }
                         stage = idx + ENTRIES_PER_PAGEPAGE;
                         middir = *dir;
-                        *dir = NULL;
-                        nr_pages_to_free++;
-                        list_add(&middir->lru, &pages_to_free);
+                        if (punch_hole)
+                                needs_lock = &info->lock;
+                        if (upper_limit >= stage) {
+                                if (needs_lock) {
+                                        spin_lock(needs_lock);
+                                        *dir = NULL;
+                                        spin_unlock(needs_lock);
+                                        needs_lock = NULL;
+                                } else
+                                        *dir = NULL;
+                                nr_pages_to_free++;
+                                list_add(&middir->lru, &pages_to_free);
+                        }
                         shmem_dir_unmap(dir);
                         cond_resched();
                         dir = shmem_dir_map(middir);
                         diroff = 0;
                 }
+                punch_lock = needs_lock;
                 subdir = dir[diroff];
-                if (subdir && page_private(subdir)) {
+                if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
+                        if (needs_lock) {
+                                spin_lock(needs_lock);
+                                dir[diroff] = NULL;
+                                spin_unlock(needs_lock);
+                                punch_lock = NULL;
+                        } else
+                                dir[diroff] = NULL;
+                        nr_pages_to_free++;
+                        list_add(&subdir->lru, &pages_to_free);
+                }
+                if (subdir && page_private(subdir) /* has swap entries */) {
                         size = limit - idx;
                         if (size > ENTRIES_PER_PAGE)
                                 size = ENTRIES_PER_PAGE;
                         freed = shmem_map_and_free_swp(subdir,
-                                        offset, size, &dir);
+                                        offset, size, &dir, punch_lock);
                         if (!dir)
                                 dir = shmem_dir_map(middir);
                         nr_swaps_freed += freed;
-                        if (offset)
+                        if (offset || punch_lock) {
                                 spin_lock(&info->lock);
-                        set_page_private(subdir, page_private(subdir) - freed);
-                        if (offset)
+                                set_page_private(subdir,
+                                        page_private(subdir) - freed);
                                 spin_unlock(&info->lock);
-                        if (!punch_hole)
-                                BUG_ON(page_private(subdir) > offset);
-                }
-                if (offset)
-                        offset = 0;
-                else if (subdir && !page_private(subdir)) {
-                        dir[diroff] = NULL;
-                        nr_pages_to_free++;
-                        list_add(&subdir->lru, &pages_to_free);
+                        } else
+                                BUG_ON(page_private(subdir) != freed);
                 }
+                offset = 0;
         }
 done1:
         shmem_dir_unmap(dir);
@@ -614,8 +674,16 @@ done2:
                  * generic_delete_inode did it, before we lowered next_index.
                  * Also, though shmem_getpage checks i_size before adding to
                  * cache, no recheck after: so fix the narrow window there too.
+                 *
+                 * Recalling truncate_inode_pages_range and unmap_mapping_range
+                 * every time for punch_hole (which never got a chance to clear
+                 * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive,
+                 * yet hardly ever necessary: try to optimize them out later.
                  */
                 truncate_inode_pages_range(inode->i_mapping, start, end);
+                if (punch_hole)
+                        unmap_mapping_range(inode->i_mapping, start,
+                                                end - start, 1);
         }
 
         spin_lock(&info->lock);