Diffstat (limited to 'mm/shmem.c')
-rw-r--r--	mm/shmem.c	1813
1 file changed, 667 insertions, 1146 deletions
diff --git a/mm/shmem.c b/mm/shmem.c
index 01c19c62d685..2d3577295298 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -6,7 +6,8 @@
6 * 2000-2001 Christoph Rohland 6 * 2000-2001 Christoph Rohland
7 * 2000-2001 SAP AG 7 * 2000-2001 SAP AG
8 * 2002 Red Hat Inc. 8 * 2002 Red Hat Inc.
9 * Copyright (C) 2002-2005 Hugh Dickins. 9 * Copyright (C) 2002-2011 Hugh Dickins.
10 * Copyright (C) 2011 Google Inc.
10 * Copyright (C) 2002-2005 VERITAS Software Corporation. 11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
11 * Copyright (C) 2004 Andi Kleen, SuSE Labs 12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
12 * 13 *
@@ -28,7 +29,6 @@
28#include <linux/file.h> 29#include <linux/file.h>
29#include <linux/mm.h> 30#include <linux/mm.h>
30#include <linux/module.h> 31#include <linux/module.h>
31#include <linux/percpu_counter.h>
32#include <linux/swap.h> 32#include <linux/swap.h>
33 33
34static struct vfsmount *shm_mnt; 34static struct vfsmount *shm_mnt;
@@ -51,6 +51,9 @@ static struct vfsmount *shm_mnt;
51#include <linux/shmem_fs.h> 51#include <linux/shmem_fs.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/blkdev.h> 53#include <linux/blkdev.h>
54#include <linux/pagevec.h>
55#include <linux/percpu_counter.h>
56#include <linux/splice.h>
54#include <linux/security.h> 57#include <linux/security.h>
55#include <linux/swapops.h> 58#include <linux/swapops.h>
56#include <linux/mempolicy.h> 59#include <linux/mempolicy.h>
@@ -62,43 +65,17 @@ static struct vfsmount *shm_mnt;
62#include <linux/magic.h> 65#include <linux/magic.h>
63 66
64#include <asm/uaccess.h> 67#include <asm/uaccess.h>
65#include <asm/div64.h>
66#include <asm/pgtable.h> 68#include <asm/pgtable.h>
67 69
68/*
69 * The maximum size of a shmem/tmpfs file is limited by the maximum size of
70 * its triple-indirect swap vector - see illustration at shmem_swp_entry().
71 *
72 * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
73 * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum
74 * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
75 * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
76 *
77 * We use / and * instead of shifts in the definitions below, so that the swap
78 * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
79 */
80#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
81#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
82
83#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
84#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)
85
86#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
87#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
88
89#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) 70#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
90#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) 71#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
91 72
92/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
93#define SHMEM_PAGEIN VM_READ
94#define SHMEM_TRUNCATE VM_WRITE
95
96/* Definition to limit shmem_truncate's steps between cond_rescheds */
97#define LATENCY_LIMIT 64
98
99/* Pretend that each entry is of this size in directory's i_size */ 73/* Pretend that each entry is of this size in directory's i_size */
100#define BOGO_DIRENT_SIZE 20 74#define BOGO_DIRENT_SIZE 20
101 75
76/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
77#define SHORT_SYMLINK_LEN 128
78
102struct shmem_xattr { 79struct shmem_xattr {
103 struct list_head list; /* anchored by shmem_inode_info->xattr_list */ 80 struct list_head list; /* anchored by shmem_inode_info->xattr_list */
104 char *name; /* xattr name */ 81 char *name; /* xattr name */
@@ -106,7 +83,7 @@ struct shmem_xattr {
106 char value[0]; 83 char value[0];
107}; 84};
108 85
109/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ 86/* Flag allocation requirements to shmem_getpage */
110enum sgp_type { 87enum sgp_type {
111 SGP_READ, /* don't exceed i_size, don't allocate page */ 88 SGP_READ, /* don't exceed i_size, don't allocate page */
112 SGP_CACHE, /* don't exceed i_size, may allocate page */ 89 SGP_CACHE, /* don't exceed i_size, may allocate page */
@@ -126,57 +103,14 @@ static unsigned long shmem_default_max_inodes(void)
126} 103}
127#endif 104#endif
128 105
129static int shmem_getpage(struct inode *inode, unsigned long idx, 106static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
130 struct page **pagep, enum sgp_type sgp, int *type); 107 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
131
132static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
133{
134 /*
135 * The above definition of ENTRIES_PER_PAGE, and the use of
136 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
137 * might be reconsidered if it ever diverges from PAGE_SIZE.
138 *
139 * Mobility flags are masked out as swap vectors cannot move
140 */
141 return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
142 PAGE_CACHE_SHIFT-PAGE_SHIFT);
143}
144
145static inline void shmem_dir_free(struct page *page)
146{
147 __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
148}
149
150static struct page **shmem_dir_map(struct page *page)
151{
152 return (struct page **)kmap_atomic(page, KM_USER0);
153}
154
155static inline void shmem_dir_unmap(struct page **dir)
156{
157 kunmap_atomic(dir, KM_USER0);
158}
159
160static swp_entry_t *shmem_swp_map(struct page *page)
161{
162 return (swp_entry_t *)kmap_atomic(page, KM_USER1);
163}
164
165static inline void shmem_swp_balance_unmap(void)
166{
167 /*
168 * When passing a pointer to an i_direct entry, to code which
169 * also handles indirect entries and so will shmem_swp_unmap,
170 * we must arrange for the preempt count to remain in balance.
171 * What kmap_atomic of a lowmem page does depends on config
172 * and architecture, so pretend to kmap_atomic some lowmem page.
173 */
174 (void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
175}
176 108
177static inline void shmem_swp_unmap(swp_entry_t *entry) 109static inline int shmem_getpage(struct inode *inode, pgoff_t index,
110 struct page **pagep, enum sgp_type sgp, int *fault_type)
178{ 111{
179 kunmap_atomic(entry, KM_USER1); 112 return shmem_getpage_gfp(inode, index, pagep, sgp,
113 mapping_gfp_mask(inode->i_mapping), fault_type);
180} 114}
181 115
182static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) 116static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
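The hunk above replaces the old five-argument shmem_getpage() with shmem_getpage_gfp(), plus a thin inline wrapper that supplies the mapping's default gfp mask. For orientation, a typical caller looks roughly like the sketch below; shmem_fault() is not part of this hunk, so its body here is an illustrative assumption rather than text of the patch.

/*
 * Illustrative sketch only: how a fault handler might call the
 * shmem_getpage() wrapper defined above.  Not part of this patch.
 */
static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
	int error;
	int ret = VM_FAULT_LOCKED;

	/* fault_type lets shmem_getpage_gfp() report VM_FAULT_MAJOR on swapin */
	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
	if (error)
		return (error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
	return ret;
}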
@@ -236,17 +170,6 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
236static LIST_HEAD(shmem_swaplist); 170static LIST_HEAD(shmem_swaplist);
237static DEFINE_MUTEX(shmem_swaplist_mutex); 171static DEFINE_MUTEX(shmem_swaplist_mutex);
238 172
239static void shmem_free_blocks(struct inode *inode, long pages)
240{
241 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
242 if (sbinfo->max_blocks) {
243 percpu_counter_add(&sbinfo->used_blocks, -pages);
244 spin_lock(&inode->i_lock);
245 inode->i_blocks -= pages*BLOCKS_PER_PAGE;
246 spin_unlock(&inode->i_lock);
247 }
248}
249
250static int shmem_reserve_inode(struct super_block *sb) 173static int shmem_reserve_inode(struct super_block *sb)
251{ 174{
252 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 175 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
@@ -273,7 +196,7 @@ static void shmem_free_inode(struct super_block *sb)
273} 196}
274 197
275/** 198/**
276 * shmem_recalc_inode - recalculate the size of an inode 199 * shmem_recalc_inode - recalculate the block usage of an inode
277 * @inode: inode to recalc 200 * @inode: inode to recalc
278 * 201 *
279 * We have to calculate the free blocks since the mm can drop 202 * We have to calculate the free blocks since the mm can drop
@@ -291,474 +214,297 @@ static void shmem_recalc_inode(struct inode *inode)
291 214
292 freed = info->alloced - info->swapped - inode->i_mapping->nrpages; 215 freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
293 if (freed > 0) { 216 if (freed > 0) {
217 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
218 if (sbinfo->max_blocks)
219 percpu_counter_add(&sbinfo->used_blocks, -freed);
294 info->alloced -= freed; 220 info->alloced -= freed;
221 inode->i_blocks -= freed * BLOCKS_PER_PAGE;
295 shmem_unacct_blocks(info->flags, freed); 222 shmem_unacct_blocks(info->flags, freed);
296 shmem_free_blocks(inode, freed);
297 } 223 }
298} 224}
299 225
300/** 226/*
301 * shmem_swp_entry - find the swap vector position in the info structure 227 * Replace item expected in radix tree by a new item, while holding tree lock.
302 * @info: info structure for the inode
303 * @index: index of the page to find
304 * @page: optional page to add to the structure. Has to be preset to
305 * all zeros
306 *
307 * If there is no space allocated yet it will return NULL when
308 * page is NULL, else it will use the page for the needed block,
309 * setting it to NULL on return to indicate that it has been used.
310 *
311 * The swap vector is organized the following way:
312 *
313 * There are SHMEM_NR_DIRECT entries directly stored in the
314 * shmem_inode_info structure. So small files do not need an addional
315 * allocation.
316 *
317 * For pages with index > SHMEM_NR_DIRECT there is the pointer
318 * i_indirect which points to a page which holds in the first half
319 * doubly indirect blocks, in the second half triple indirect blocks:
320 *
321 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
322 * following layout (for SHMEM_NR_DIRECT == 16):
323 *
324 * i_indirect -> dir --> 16-19
325 * | +-> 20-23
326 * |
327 * +-->dir2 --> 24-27
328 * | +-> 28-31
329 * | +-> 32-35
330 * | +-> 36-39
331 * |
332 * +-->dir3 --> 40-43
333 * +-> 44-47
334 * +-> 48-51
335 * +-> 52-55
336 */ 228 */
337static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) 229static int shmem_radix_tree_replace(struct address_space *mapping,
338{ 230 pgoff_t index, void *expected, void *replacement)
339 unsigned long offset; 231{
340 struct page **dir; 232 void **pslot;
341 struct page *subdir; 233 void *item = NULL;
234
235 VM_BUG_ON(!expected);
236 pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
237 if (pslot)
238 item = radix_tree_deref_slot_protected(pslot,
239 &mapping->tree_lock);
240 if (item != expected)
241 return -ENOENT;
242 if (replacement)
243 radix_tree_replace_slot(pslot, replacement);
244 else
245 radix_tree_delete(&mapping->page_tree, index);
246 return 0;
247}
342 248
343 if (index < SHMEM_NR_DIRECT) { 249/*
344 shmem_swp_balance_unmap(); 250 * Like add_to_page_cache_locked, but error if expected item has gone.
345 return info->i_direct+index; 251 */
346 } 252static int shmem_add_to_page_cache(struct page *page,
347 if (!info->i_indirect) { 253 struct address_space *mapping,
348 if (page) { 254 pgoff_t index, gfp_t gfp, void *expected)
349 info->i_indirect = *page; 255{
350 *page = NULL; 256 int error = 0;
351 }
352 return NULL; /* need another page */
353 }
354 257
355 index -= SHMEM_NR_DIRECT; 258 VM_BUG_ON(!PageLocked(page));
356 offset = index % ENTRIES_PER_PAGE; 259 VM_BUG_ON(!PageSwapBacked(page));
357 index /= ENTRIES_PER_PAGE;
358 dir = shmem_dir_map(info->i_indirect);
359
360 if (index >= ENTRIES_PER_PAGE/2) {
361 index -= ENTRIES_PER_PAGE/2;
362 dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
363 index %= ENTRIES_PER_PAGE;
364 subdir = *dir;
365 if (!subdir) {
366 if (page) {
367 *dir = *page;
368 *page = NULL;
369 }
370 shmem_dir_unmap(dir);
371 return NULL; /* need another page */
372 }
373 shmem_dir_unmap(dir);
374 dir = shmem_dir_map(subdir);
375 }
376 260
377 dir += index; 261 if (!expected)
378 subdir = *dir; 262 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
379 if (!subdir) { 263 if (!error) {
380 if (!page || !(subdir = *page)) { 264 page_cache_get(page);
381 shmem_dir_unmap(dir); 265 page->mapping = mapping;
382 return NULL; /* need a page */ 266 page->index = index;
267
268 spin_lock_irq(&mapping->tree_lock);
269 if (!expected)
270 error = radix_tree_insert(&mapping->page_tree,
271 index, page);
272 else
273 error = shmem_radix_tree_replace(mapping, index,
274 expected, page);
275 if (!error) {
276 mapping->nrpages++;
277 __inc_zone_page_state(page, NR_FILE_PAGES);
278 __inc_zone_page_state(page, NR_SHMEM);
279 spin_unlock_irq(&mapping->tree_lock);
280 } else {
281 page->mapping = NULL;
282 spin_unlock_irq(&mapping->tree_lock);
283 page_cache_release(page);
383 } 284 }
384 *dir = subdir; 285 if (!expected)
385 *page = NULL; 286 radix_tree_preload_end();
386 } 287 }
387 shmem_dir_unmap(dir); 288 if (error)
388 return shmem_swp_map(subdir) + offset; 289 mem_cgroup_uncharge_cache_page(page);
290 return error;
389} 291}
390 292
391static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) 293/*
294 * Like delete_from_page_cache, but substitutes swap for page.
295 */
296static void shmem_delete_from_page_cache(struct page *page, void *radswap)
392{ 297{
393 long incdec = value? 1: -1; 298 struct address_space *mapping = page->mapping;
299 int error;
394 300
395 entry->val = value; 301 spin_lock_irq(&mapping->tree_lock);
396 info->swapped += incdec; 302 error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
397 if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { 303 page->mapping = NULL;
398 struct page *page = kmap_atomic_to_page(entry); 304 mapping->nrpages--;
399 set_page_private(page, page_private(page) + incdec); 305 __dec_zone_page_state(page, NR_FILE_PAGES);
400 } 306 __dec_zone_page_state(page, NR_SHMEM);
307 spin_unlock_irq(&mapping->tree_lock);
308 page_cache_release(page);
309 BUG_ON(error);
401} 310}
402 311
403/** 312/*
404 * shmem_swp_alloc - get the position of the swap entry for the page. 313 * Like find_get_pages, but collecting swap entries as well as pages.
405 * @info: info structure for the inode
406 * @index: index of the page to find
407 * @sgp: check and recheck i_size? skip allocation?
408 *
409 * If the entry does not exist, allocate it.
410 */ 314 */
411static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) 315static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
412{ 316 pgoff_t start, unsigned int nr_pages,
413 struct inode *inode = &info->vfs_inode; 317 struct page **pages, pgoff_t *indices)
414 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 318{
415 struct page *page = NULL; 319 unsigned int i;
416 swp_entry_t *entry; 320 unsigned int ret;
417 321 unsigned int nr_found;
418 if (sgp != SGP_WRITE && 322
419 ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 323 rcu_read_lock();
420 return ERR_PTR(-EINVAL); 324restart:
421 325 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
422 while (!(entry = shmem_swp_entry(info, index, &page))) { 326 (void ***)pages, indices, start, nr_pages);
423 if (sgp == SGP_READ) 327 ret = 0;
424 return shmem_swp_map(ZERO_PAGE(0)); 328 for (i = 0; i < nr_found; i++) {
425 /* 329 struct page *page;
426 * Test used_blocks against 1 less max_blocks, since we have 1 data 330repeat:
427 * page (and perhaps indirect index pages) yet to allocate: 331 page = radix_tree_deref_slot((void **)pages[i]);
428 * a waste to allocate index if we cannot allocate data. 332 if (unlikely(!page))
429 */ 333 continue;
430 if (sbinfo->max_blocks) { 334 if (radix_tree_exception(page)) {
431 if (percpu_counter_compare(&sbinfo->used_blocks, 335 if (radix_tree_deref_retry(page))
432 sbinfo->max_blocks - 1) >= 0) 336 goto restart;
433 return ERR_PTR(-ENOSPC); 337 /*
434 percpu_counter_inc(&sbinfo->used_blocks); 338 * Otherwise, we must be storing a swap entry
435 spin_lock(&inode->i_lock); 339 * here as an exceptional entry: so return it
436 inode->i_blocks += BLOCKS_PER_PAGE; 340 * without attempting to raise page count.
437 spin_unlock(&inode->i_lock); 341 */
342 goto export;
438 } 343 }
344 if (!page_cache_get_speculative(page))
345 goto repeat;
439 346
440 spin_unlock(&info->lock); 347 /* Has the page moved? */
441 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); 348 if (unlikely(page != *((void **)pages[i]))) {
442 spin_lock(&info->lock); 349 page_cache_release(page);
443 350 goto repeat;
444 if (!page) {
445 shmem_free_blocks(inode, 1);
446 return ERR_PTR(-ENOMEM);
447 }
448 if (sgp != SGP_WRITE &&
449 ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
450 entry = ERR_PTR(-EINVAL);
451 break;
452 } 351 }
453 if (info->next_index <= index) 352export:
454 info->next_index = index + 1; 353 indices[ret] = indices[i];
455 } 354 pages[ret] = page;
456 if (page) { 355 ret++;
457 /* another task gave its page, or truncated the file */ 356 }
458 shmem_free_blocks(inode, 1); 357 if (unlikely(!ret && nr_found))
459 shmem_dir_free(page); 358 goto restart;
460 } 359 rcu_read_unlock();
461 if (info->next_index <= index && !IS_ERR(entry)) 360 return ret;
462 info->next_index = index + 1;
463 return entry;
464} 361}
465 362
466/** 363/*
467 * shmem_free_swp - free some swap entries in a directory 364 * Remove swap entry from radix tree, free the swap and its page cache.
468 * @dir: pointer to the directory
469 * @edir: pointer after last entry of the directory
470 * @punch_lock: pointer to spinlock when needed for the holepunch case
471 */ 365 */
472static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, 366static int shmem_free_swap(struct address_space *mapping,
473 spinlock_t *punch_lock) 367 pgoff_t index, void *radswap)
474{ 368{
475 spinlock_t *punch_unlock = NULL; 369 int error;
476 swp_entry_t *ptr; 370
477 int freed = 0; 371 spin_lock_irq(&mapping->tree_lock);
478 372 error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
479 for (ptr = dir; ptr < edir; ptr++) { 373 spin_unlock_irq(&mapping->tree_lock);
480 if (ptr->val) { 374 if (!error)
481 if (unlikely(punch_lock)) { 375 free_swap_and_cache(radix_to_swp_entry(radswap));
482 punch_unlock = punch_lock; 376 return error;
483 punch_lock = NULL;
484 spin_lock(punch_unlock);
485 if (!ptr->val)
486 continue;
487 }
488 free_swap_and_cache(*ptr);
489 *ptr = (swp_entry_t){0};
490 freed++;
491 }
492 }
493 if (punch_unlock)
494 spin_unlock(punch_unlock);
495 return freed;
496}
497
498static int shmem_map_and_free_swp(struct page *subdir, int offset,
499 int limit, struct page ***dir, spinlock_t *punch_lock)
500{
501 swp_entry_t *ptr;
502 int freed = 0;
503
504 ptr = shmem_swp_map(subdir);
505 for (; offset < limit; offset += LATENCY_LIMIT) {
506 int size = limit - offset;
507 if (size > LATENCY_LIMIT)
508 size = LATENCY_LIMIT;
509 freed += shmem_free_swp(ptr+offset, ptr+offset+size,
510 punch_lock);
511 if (need_resched()) {
512 shmem_swp_unmap(ptr);
513 if (*dir) {
514 shmem_dir_unmap(*dir);
515 *dir = NULL;
516 }
517 cond_resched();
518 ptr = shmem_swp_map(subdir);
519 }
520 }
521 shmem_swp_unmap(ptr);
522 return freed;
523} 377}
524 378
525static void shmem_free_pages(struct list_head *next) 379/*
380 * Pagevec may contain swap entries, so shuffle up pages before releasing.
381 */
382static void shmem_pagevec_release(struct pagevec *pvec)
526{ 383{
527 struct page *page; 384 int i, j;
528 int freed = 0; 385
529 386 for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
530 do { 387 struct page *page = pvec->pages[i];
531 page = container_of(next, struct page, lru); 388 if (!radix_tree_exceptional_entry(page))
532 next = next->next; 389 pvec->pages[j++] = page;
533 shmem_dir_free(page); 390 }
534 freed++; 391 pvec->nr = j;
535 if (freed >= LATENCY_LIMIT) { 392 pagevec_release(pvec);
536 cond_resched();
537 freed = 0;
538 }
539 } while (next);
540} 393}
541 394
542void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 395/*
396 * Remove range of pages and swap entries from radix tree, and free them.
397 */
398void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
543{ 399{
400 struct address_space *mapping = inode->i_mapping;
544 struct shmem_inode_info *info = SHMEM_I(inode); 401 struct shmem_inode_info *info = SHMEM_I(inode);
545 unsigned long idx; 402 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
546 unsigned long size; 403 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
547 unsigned long limit; 404 pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
548 unsigned long stage; 405 struct pagevec pvec;
549 unsigned long diroff; 406 pgoff_t indices[PAGEVEC_SIZE];
550 struct page **dir;
551 struct page *topdir;
552 struct page *middir;
553 struct page *subdir;
554 swp_entry_t *ptr;
555 LIST_HEAD(pages_to_free);
556 long nr_pages_to_free = 0;
557 long nr_swaps_freed = 0; 407 long nr_swaps_freed = 0;
558 int offset; 408 pgoff_t index;
559 int freed; 409 int i;
560 int punch_hole;
561 spinlock_t *needs_lock;
562 spinlock_t *punch_lock;
563 unsigned long upper_limit;
564 410
565 truncate_inode_pages_range(inode->i_mapping, start, end); 411 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
566 412
567 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 413 pagevec_init(&pvec, 0);
568 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 414 index = start;
569 if (idx >= info->next_index) 415 while (index <= end) {
570 return; 416 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
417 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
418 pvec.pages, indices);
419 if (!pvec.nr)
420 break;
421 mem_cgroup_uncharge_start();
422 for (i = 0; i < pagevec_count(&pvec); i++) {
423 struct page *page = pvec.pages[i];
571 424
572 spin_lock(&info->lock); 425 index = indices[i];
573 info->flags |= SHMEM_TRUNCATE; 426 if (index > end)
574 if (likely(end == (loff_t) -1)) { 427 break;
575 limit = info->next_index;
576 upper_limit = SHMEM_MAX_INDEX;
577 info->next_index = idx;
578 needs_lock = NULL;
579 punch_hole = 0;
580 } else {
581 if (end + 1 >= inode->i_size) { /* we may free a little more */
582 limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
583 PAGE_CACHE_SHIFT;
584 upper_limit = SHMEM_MAX_INDEX;
585 } else {
586 limit = (end + 1) >> PAGE_CACHE_SHIFT;
587 upper_limit = limit;
588 }
589 needs_lock = &info->lock;
590 punch_hole = 1;
591 }
592 428
593 topdir = info->i_indirect; 429 if (radix_tree_exceptional_entry(page)) {
594 if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { 430 nr_swaps_freed += !shmem_free_swap(mapping,
595 info->i_indirect = NULL; 431 index, page);
596 nr_pages_to_free++; 432 continue;
597 list_add(&topdir->lru, &pages_to_free); 433 }
434
435 if (!trylock_page(page))
436 continue;
437 if (page->mapping == mapping) {
438 VM_BUG_ON(PageWriteback(page));
439 truncate_inode_page(mapping, page);
440 }
441 unlock_page(page);
442 }
443 shmem_pagevec_release(&pvec);
444 mem_cgroup_uncharge_end();
445 cond_resched();
446 index++;
598 } 447 }
599 spin_unlock(&info->lock);
600 448
601 if (info->swapped && idx < SHMEM_NR_DIRECT) { 449 if (partial) {
602 ptr = info->i_direct; 450 struct page *page = NULL;
603 size = limit; 451 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
604 if (size > SHMEM_NR_DIRECT) 452 if (page) {
605 size = SHMEM_NR_DIRECT; 453 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
606 nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); 454 set_page_dirty(page);
455 unlock_page(page);
456 page_cache_release(page);
457 }
607 } 458 }
608 459
609 /* 460 index = start;
610 * If there are no indirect blocks or we are punching a hole 461 for ( ; ; ) {
611 * below indirect blocks, nothing to be done. 462 cond_resched();
612 */ 463 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
613 if (!topdir || limit <= SHMEM_NR_DIRECT) 464 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
614 goto done2; 465 pvec.pages, indices);
466 if (!pvec.nr) {
467 if (index == start)
468 break;
469 index = start;
470 continue;
471 }
472 if (index == start && indices[0] > end) {
473 shmem_pagevec_release(&pvec);
474 break;
475 }
476 mem_cgroup_uncharge_start();
477 for (i = 0; i < pagevec_count(&pvec); i++) {
478 struct page *page = pvec.pages[i];
615 479
616 /* 480 index = indices[i];
617 * The truncation case has already dropped info->lock, and we're safe 481 if (index > end)
618 * because i_size and next_index have already been lowered, preventing 482 break;
619 * access beyond. But in the punch_hole case, we still need to take
620 * the lock when updating the swap directory, because there might be
621 * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
622 * shmem_writepage. However, whenever we find we can remove a whole
623 * directory page (not at the misaligned start or end of the range),
624 * we first NULLify its pointer in the level above, and then have no
625 * need to take the lock when updating its contents: needs_lock and
626 * punch_lock (either pointing to info->lock or NULL) manage this.
627 */
628 483
629 upper_limit -= SHMEM_NR_DIRECT; 484 if (radix_tree_exceptional_entry(page)) {
630 limit -= SHMEM_NR_DIRECT; 485 nr_swaps_freed += !shmem_free_swap(mapping,
631 idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0; 486 index, page);
632 offset = idx % ENTRIES_PER_PAGE; 487 continue;
633 idx -= offset;
634
635 dir = shmem_dir_map(topdir);
636 stage = ENTRIES_PER_PAGEPAGE/2;
637 if (idx < ENTRIES_PER_PAGEPAGE/2) {
638 middir = topdir;
639 diroff = idx/ENTRIES_PER_PAGE;
640 } else {
641 dir += ENTRIES_PER_PAGE/2;
642 dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
643 while (stage <= idx)
644 stage += ENTRIES_PER_PAGEPAGE;
645 middir = *dir;
646 if (*dir) {
647 diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
648 ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
649 if (!diroff && !offset && upper_limit >= stage) {
650 if (needs_lock) {
651 spin_lock(needs_lock);
652 *dir = NULL;
653 spin_unlock(needs_lock);
654 needs_lock = NULL;
655 } else
656 *dir = NULL;
657 nr_pages_to_free++;
658 list_add(&middir->lru, &pages_to_free);
659 } 488 }
660 shmem_dir_unmap(dir);
661 dir = shmem_dir_map(middir);
662 } else {
663 diroff = 0;
664 offset = 0;
665 idx = stage;
666 }
667 }
668 489
669 for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) { 490 lock_page(page);
670 if (unlikely(idx == stage)) { 491 if (page->mapping == mapping) {
671 shmem_dir_unmap(dir); 492 VM_BUG_ON(PageWriteback(page));
672 dir = shmem_dir_map(topdir) + 493 truncate_inode_page(mapping, page);
673 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
674 while (!*dir) {
675 dir++;
676 idx += ENTRIES_PER_PAGEPAGE;
677 if (idx >= limit)
678 goto done1;
679 } 494 }
680 stage = idx + ENTRIES_PER_PAGEPAGE; 495 unlock_page(page);
681 middir = *dir;
682 if (punch_hole)
683 needs_lock = &info->lock;
684 if (upper_limit >= stage) {
685 if (needs_lock) {
686 spin_lock(needs_lock);
687 *dir = NULL;
688 spin_unlock(needs_lock);
689 needs_lock = NULL;
690 } else
691 *dir = NULL;
692 nr_pages_to_free++;
693 list_add(&middir->lru, &pages_to_free);
694 }
695 shmem_dir_unmap(dir);
696 cond_resched();
697 dir = shmem_dir_map(middir);
698 diroff = 0;
699 }
700 punch_lock = needs_lock;
701 subdir = dir[diroff];
702 if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
703 if (needs_lock) {
704 spin_lock(needs_lock);
705 dir[diroff] = NULL;
706 spin_unlock(needs_lock);
707 punch_lock = NULL;
708 } else
709 dir[diroff] = NULL;
710 nr_pages_to_free++;
711 list_add(&subdir->lru, &pages_to_free);
712 }
713 if (subdir && page_private(subdir) /* has swap entries */) {
714 size = limit - idx;
715 if (size > ENTRIES_PER_PAGE)
716 size = ENTRIES_PER_PAGE;
717 freed = shmem_map_and_free_swp(subdir,
718 offset, size, &dir, punch_lock);
719 if (!dir)
720 dir = shmem_dir_map(middir);
721 nr_swaps_freed += freed;
722 if (offset || punch_lock) {
723 spin_lock(&info->lock);
724 set_page_private(subdir,
725 page_private(subdir) - freed);
726 spin_unlock(&info->lock);
727 } else
728 BUG_ON(page_private(subdir) != freed);
729 } 496 }
730 offset = 0; 497 shmem_pagevec_release(&pvec);
731 } 498 mem_cgroup_uncharge_end();
732done1: 499 index++;
733 shmem_dir_unmap(dir);
734done2:
735 if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
736 /*
737 * Call truncate_inode_pages again: racing shmem_unuse_inode
738 * may have swizzled a page in from swap since
739 * truncate_pagecache or generic_delete_inode did it, before we
740 * lowered next_index. Also, though shmem_getpage checks
741 * i_size before adding to cache, no recheck after: so fix the
742 * narrow window there too.
743 */
744 truncate_inode_pages_range(inode->i_mapping, start, end);
745 } 500 }
746 501
747 spin_lock(&info->lock); 502 spin_lock(&info->lock);
748 info->flags &= ~SHMEM_TRUNCATE;
749 info->swapped -= nr_swaps_freed; 503 info->swapped -= nr_swaps_freed;
750 if (nr_pages_to_free)
751 shmem_free_blocks(inode, nr_pages_to_free);
752 shmem_recalc_inode(inode); 504 shmem_recalc_inode(inode);
753 spin_unlock(&info->lock); 505 spin_unlock(&info->lock);
754 506
755 /* 507 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
756 * Empty swap vector directory pages to be freed?
757 */
758 if (!list_empty(&pages_to_free)) {
759 pages_to_free.prev->next = NULL;
760 shmem_free_pages(pages_to_free.next);
761 }
762} 508}
763EXPORT_SYMBOL_GPL(shmem_truncate_range); 509EXPORT_SYMBOL_GPL(shmem_truncate_range);
764 510
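The hunk above is the heart of the conversion: instead of walking the old triple-indirect swap vector, a swapped-out page simply leaves an "exceptional" entry in its own page-cache radix-tree slot (swp_to_radix_entry()), which lookups recognise with radix_tree_exceptional_entry() and decode with radix_to_swp_entry(). The sketch below is a minimal, userspace-style illustration of the low-bit tagging that makes this work; the helper names and exact bit layout are assumptions for clarity, not the kernel's swapops.h/radix-tree.h definitions.

/*
 * Simplified illustration of the "exceptional entry" idea: pointers to
 * struct page are word-aligned, so a set low bit can never be a valid
 * page pointer and is free to mean "this slot holds an encoded swap
 * entry".  The tag bit and shift below are assumed values, chosen only
 * to demonstrate the round trip.
 */
#include <assert.h>

#define EXCEPTIONAL_BIT		0x2UL	/* assumed tag bit */
#define EXCEPTIONAL_SHIFT	2	/* assumed shift */

static void *swap_to_slot(unsigned long swap_val)
{
	return (void *)((swap_val << EXCEPTIONAL_SHIFT) | EXCEPTIONAL_BIT);
}

static int slot_is_exceptional(void *slot)
{
	return ((unsigned long)slot & EXCEPTIONAL_BIT) != 0;
}

static unsigned long slot_to_swap(void *slot)
{
	return (unsigned long)slot >> EXCEPTIONAL_SHIFT;
}

int main(void)
{
	void *slot = swap_to_slot(12345);

	assert(slot_is_exceptional(slot));	/* never mistaken for a page pointer */
	assert(slot_to_swap(slot) == 12345);	/* decodes back to the swap value */
	return 0;
}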
@@ -774,37 +520,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
774 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 520 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
775 loff_t oldsize = inode->i_size; 521 loff_t oldsize = inode->i_size;
776 loff_t newsize = attr->ia_size; 522 loff_t newsize = attr->ia_size;
777 struct page *page = NULL;
778 523
779 if (newsize < oldsize) {
780 /*
781 * If truncating down to a partial page, then
782 * if that page is already allocated, hold it
783 * in memory until the truncation is over, so
784 * truncate_partial_page cannot miss it were
785 * it assigned to swap.
786 */
787 if (newsize & (PAGE_CACHE_SIZE-1)) {
788 (void) shmem_getpage(inode,
789 newsize >> PAGE_CACHE_SHIFT,
790 &page, SGP_READ, NULL);
791 if (page)
792 unlock_page(page);
793 }
794 /*
795 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
796 * detect if any pages might have been added to cache
797 * after truncate_inode_pages. But we needn't bother
798 * if it's being fully truncated to zero-length: the
799 * nrpages check is efficient enough in that case.
800 */
801 if (newsize) {
802 struct shmem_inode_info *info = SHMEM_I(inode);
803 spin_lock(&info->lock);
804 info->flags &= ~SHMEM_PAGEIN;
805 spin_unlock(&info->lock);
806 }
807 }
808 if (newsize != oldsize) { 524 if (newsize != oldsize) {
809 i_size_write(inode, newsize); 525 i_size_write(inode, newsize);
810 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 526 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -816,8 +532,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
816 /* unmap again to remove racily COWed private pages */ 532 /* unmap again to remove racily COWed private pages */
817 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); 533 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
818 } 534 }
819 if (page)
820 page_cache_release(page);
821 } 535 }
822 536
823 setattr_copy(inode, attr); 537 setattr_copy(inode, attr);
@@ -842,7 +556,8 @@ static void shmem_evict_inode(struct inode *inode)
842 list_del_init(&info->swaplist); 556 list_del_init(&info->swaplist);
843 mutex_unlock(&shmem_swaplist_mutex); 557 mutex_unlock(&shmem_swaplist_mutex);
844 } 558 }
845 } 559 } else
560 kfree(info->symlink);
846 561
847 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { 562 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
848 kfree(xattr->name); 563 kfree(xattr->name);
@@ -853,106 +568,27 @@ static void shmem_evict_inode(struct inode *inode)
853 end_writeback(inode); 568 end_writeback(inode);
854} 569}
855 570
856static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) 571/*
857{ 572 * If swap found in inode, free it and move page from swapcache to filecache.
858 swp_entry_t *ptr; 573 */
859 574static int shmem_unuse_inode(struct shmem_inode_info *info,
860 for (ptr = dir; ptr < edir; ptr++) { 575 swp_entry_t swap, struct page *page)
861 if (ptr->val == entry.val)
862 return ptr - dir;
863 }
864 return -1;
865}
866
867static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
868{ 576{
869 struct address_space *mapping; 577 struct address_space *mapping = info->vfs_inode.i_mapping;
870 unsigned long idx; 578 void *radswap;
871 unsigned long size; 579 pgoff_t index;
872 unsigned long limit;
873 unsigned long stage;
874 struct page **dir;
875 struct page *subdir;
876 swp_entry_t *ptr;
877 int offset;
878 int error; 580 int error;
879 581
880 idx = 0; 582 radswap = swp_to_radix_entry(swap);
881 ptr = info->i_direct; 583 index = radix_tree_locate_item(&mapping->page_tree, radswap);
882 spin_lock(&info->lock); 584 if (index == -1)
883 if (!info->swapped) { 585 return 0;
884 list_del_init(&info->swaplist);
885 goto lost2;
886 }
887 limit = info->next_index;
888 size = limit;
889 if (size > SHMEM_NR_DIRECT)
890 size = SHMEM_NR_DIRECT;
891 offset = shmem_find_swp(entry, ptr, ptr+size);
892 if (offset >= 0) {
893 shmem_swp_balance_unmap();
894 goto found;
895 }
896 if (!info->i_indirect)
897 goto lost2;
898
899 dir = shmem_dir_map(info->i_indirect);
900 stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
901
902 for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
903 if (unlikely(idx == stage)) {
904 shmem_dir_unmap(dir-1);
905 if (cond_resched_lock(&info->lock)) {
906 /* check it has not been truncated */
907 if (limit > info->next_index) {
908 limit = info->next_index;
909 if (idx >= limit)
910 goto lost2;
911 }
912 }
913 dir = shmem_dir_map(info->i_indirect) +
914 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
915 while (!*dir) {
916 dir++;
917 idx += ENTRIES_PER_PAGEPAGE;
918 if (idx >= limit)
919 goto lost1;
920 }
921 stage = idx + ENTRIES_PER_PAGEPAGE;
922 subdir = *dir;
923 shmem_dir_unmap(dir);
924 dir = shmem_dir_map(subdir);
925 }
926 subdir = *dir;
927 if (subdir && page_private(subdir)) {
928 ptr = shmem_swp_map(subdir);
929 size = limit - idx;
930 if (size > ENTRIES_PER_PAGE)
931 size = ENTRIES_PER_PAGE;
932 offset = shmem_find_swp(entry, ptr, ptr+size);
933 shmem_swp_unmap(ptr);
934 if (offset >= 0) {
935 shmem_dir_unmap(dir);
936 ptr = shmem_swp_map(subdir);
937 goto found;
938 }
939 }
940 }
941lost1:
942 shmem_dir_unmap(dir-1);
943lost2:
944 spin_unlock(&info->lock);
945 return 0;
946found:
947 idx += offset;
948 ptr += offset;
949 586
950 /* 587 /*
951 * Move _head_ to start search for next from here. 588 * Move _head_ to start search for next from here.
952 * But be careful: shmem_evict_inode checks list_empty without taking 589 * But be careful: shmem_evict_inode checks list_empty without taking
953 * mutex, and there's an instant in list_move_tail when info->swaplist 590 * mutex, and there's an instant in list_move_tail when info->swaplist
954 * would appear empty, if it were the only one on shmem_swaplist. We 591 * would appear empty, if it were the only one on shmem_swaplist.
955 * could avoid doing it if inode NULL; or use this minor optimization.
956 */ 592 */
957 if (shmem_swaplist.next != &info->swaplist) 593 if (shmem_swaplist.next != &info->swaplist)
958 list_move_tail(&shmem_swaplist, &info->swaplist); 594 list_move_tail(&shmem_swaplist, &info->swaplist);
@@ -962,42 +598,34 @@ found:
962 * but also to hold up shmem_evict_inode(): so inode cannot be freed 598 * but also to hold up shmem_evict_inode(): so inode cannot be freed
963 * beneath us (pagelock doesn't help until the page is in pagecache). 599 * beneath us (pagelock doesn't help until the page is in pagecache).
964 */ 600 */
965 mapping = info->vfs_inode.i_mapping; 601 error = shmem_add_to_page_cache(page, mapping, index,
966 error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); 602 GFP_NOWAIT, radswap);
967 /* which does mem_cgroup_uncharge_cache_page on error */ 603 /* which does mem_cgroup_uncharge_cache_page on error */
968 604
969 if (error == -EEXIST) { 605 if (error != -ENOMEM) {
970 struct page *filepage = find_get_page(mapping, idx); 606 /*
971 error = 1; 607 * Truncation and eviction use free_swap_and_cache(), which
972 if (filepage) { 608 * only does trylock page: if we raced, best clean up here.
973 /* 609 */
974 * There might be a more uptodate page coming down
975 * from a stacked writepage: forget our swappage if so.
976 */
977 if (PageUptodate(filepage))
978 error = 0;
979 page_cache_release(filepage);
980 }
981 }
982 if (!error) {
983 delete_from_swap_cache(page); 610 delete_from_swap_cache(page);
984 set_page_dirty(page); 611 set_page_dirty(page);
985 info->flags |= SHMEM_PAGEIN; 612 if (!error) {
986 shmem_swp_set(info, ptr, 0); 613 spin_lock(&info->lock);
987 swap_free(entry); 614 info->swapped--;
615 spin_unlock(&info->lock);
616 swap_free(swap);
617 }
988 error = 1; /* not an error, but entry was found */ 618 error = 1; /* not an error, but entry was found */
989 } 619 }
990 shmem_swp_unmap(ptr);
991 spin_unlock(&info->lock);
992 return error; 620 return error;
993} 621}
994 622
995/* 623/*
996 * shmem_unuse() search for an eventually swapped out shmem page. 624 * Search through swapped inodes to find and replace swap by page.
997 */ 625 */
998int shmem_unuse(swp_entry_t entry, struct page *page) 626int shmem_unuse(swp_entry_t swap, struct page *page)
999{ 627{
1000 struct list_head *p, *next; 628 struct list_head *this, *next;
1001 struct shmem_inode_info *info; 629 struct shmem_inode_info *info;
1002 int found = 0; 630 int found = 0;
1003 int error; 631 int error;
@@ -1006,32 +634,25 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
1006 * Charge page using GFP_KERNEL while we can wait, before taking 634 * Charge page using GFP_KERNEL while we can wait, before taking
1007 * the shmem_swaplist_mutex which might hold up shmem_writepage(). 635 * the shmem_swaplist_mutex which might hold up shmem_writepage().
1008 * Charged back to the user (not to caller) when swap account is used. 636 * Charged back to the user (not to caller) when swap account is used.
1009 * add_to_page_cache() will be called with GFP_NOWAIT.
1010 */ 637 */
1011 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 638 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
1012 if (error) 639 if (error)
1013 goto out; 640 goto out;
1014 /* 641 /* No radix_tree_preload: swap entry keeps a place for page in tree */
1015 * Try to preload while we can wait, to not make a habit of
1016 * draining atomic reserves; but don't latch on to this cpu,
1017 * it's okay if sometimes we get rescheduled after this.
1018 */
1019 error = radix_tree_preload(GFP_KERNEL);
1020 if (error)
1021 goto uncharge;
1022 radix_tree_preload_end();
1023 642
1024 mutex_lock(&shmem_swaplist_mutex); 643 mutex_lock(&shmem_swaplist_mutex);
1025 list_for_each_safe(p, next, &shmem_swaplist) { 644 list_for_each_safe(this, next, &shmem_swaplist) {
1026 info = list_entry(p, struct shmem_inode_info, swaplist); 645 info = list_entry(this, struct shmem_inode_info, swaplist);
1027 found = shmem_unuse_inode(info, entry, page); 646 if (info->swapped)
647 found = shmem_unuse_inode(info, swap, page);
648 else
649 list_del_init(&info->swaplist);
1028 cond_resched(); 650 cond_resched();
1029 if (found) 651 if (found)
1030 break; 652 break;
1031 } 653 }
1032 mutex_unlock(&shmem_swaplist_mutex); 654 mutex_unlock(&shmem_swaplist_mutex);
1033 655
1034uncharge:
1035 if (!found) 656 if (!found)
1036 mem_cgroup_uncharge_cache_page(page); 657 mem_cgroup_uncharge_cache_page(page);
1037 if (found < 0) 658 if (found < 0)
@@ -1048,10 +669,10 @@ out:
1048static int shmem_writepage(struct page *page, struct writeback_control *wbc) 669static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1049{ 670{
1050 struct shmem_inode_info *info; 671 struct shmem_inode_info *info;
1051 swp_entry_t *entry, swap;
1052 struct address_space *mapping; 672 struct address_space *mapping;
1053 unsigned long index;
1054 struct inode *inode; 673 struct inode *inode;
674 swp_entry_t swap;
675 pgoff_t index;
1055 676
1056 BUG_ON(!PageLocked(page)); 677 BUG_ON(!PageLocked(page));
1057 mapping = page->mapping; 678 mapping = page->mapping;
@@ -1066,69 +687,46 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1066 /* 687 /*
1067 * shmem_backing_dev_info's capabilities prevent regular writeback or 688 * shmem_backing_dev_info's capabilities prevent regular writeback or
1068 * sync from ever calling shmem_writepage; but a stacking filesystem 689 * sync from ever calling shmem_writepage; but a stacking filesystem
1069 * may use the ->writepage of its underlying filesystem, in which case 690 * might use ->writepage of its underlying filesystem, in which case
1070 * tmpfs should write out to swap only in response to memory pressure, 691 * tmpfs should write out to swap only in response to memory pressure,
1071 * and not for the writeback threads or sync. However, in those cases, 692 * and not for the writeback threads or sync.
1072 * we do still want to check if there's a redundant swappage to be
1073 * discarded.
1074 */ 693 */
1075 if (wbc->for_reclaim) 694 if (!wbc->for_reclaim) {
1076 swap = get_swap_page(); 695 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
1077 else 696 goto redirty;
1078 swap.val = 0; 697 }
698 swap = get_swap_page();
699 if (!swap.val)
700 goto redirty;
1079 701
1080 /* 702 /*
1081 * Add inode to shmem_unuse()'s list of swapped-out inodes, 703 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1082 * if it's not already there. Do it now because we cannot take 704 * if it's not already there. Do it now before the page is
1083 * mutex while holding spinlock, and must do so before the page 705 * moved to swap cache, when its pagelock no longer protects
1084 * is moved to swap cache, when its pagelock no longer protects
1085 * the inode from eviction. But don't unlock the mutex until 706 * the inode from eviction. But don't unlock the mutex until
1086 * we've taken the spinlock, because shmem_unuse_inode() will 707 * we've incremented swapped, because shmem_unuse_inode() will
1087 * prune a !swapped inode from the swaplist under both locks. 708 * prune a !swapped inode from the swaplist under this mutex.
1088 */ 709 */
1089 if (swap.val) { 710 mutex_lock(&shmem_swaplist_mutex);
1090 mutex_lock(&shmem_swaplist_mutex); 711 if (list_empty(&info->swaplist))
1091 if (list_empty(&info->swaplist)) 712 list_add_tail(&info->swaplist, &shmem_swaplist);
1092 list_add_tail(&info->swaplist, &shmem_swaplist);
1093 }
1094
1095 spin_lock(&info->lock);
1096 if (swap.val)
1097 mutex_unlock(&shmem_swaplist_mutex);
1098
1099 if (index >= info->next_index) {
1100 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
1101 goto unlock;
1102 }
1103 entry = shmem_swp_entry(info, index, NULL);
1104 if (entry->val) {
1105 /*
1106 * The more uptodate page coming down from a stacked
1107 * writepage should replace our old swappage.
1108 */
1109 free_swap_and_cache(*entry);
1110 shmem_swp_set(info, entry, 0);
1111 }
1112 shmem_recalc_inode(inode);
1113 713
1114 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 714 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1115 delete_from_page_cache(page);
1116 shmem_swp_set(info, entry, swap.val);
1117 shmem_swp_unmap(entry);
1118 swap_shmem_alloc(swap); 715 swap_shmem_alloc(swap);
716 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
717
718 spin_lock(&info->lock);
719 info->swapped++;
720 shmem_recalc_inode(inode);
1119 spin_unlock(&info->lock); 721 spin_unlock(&info->lock);
722
723 mutex_unlock(&shmem_swaplist_mutex);
1120 BUG_ON(page_mapped(page)); 724 BUG_ON(page_mapped(page));
1121 swap_writepage(page, wbc); 725 swap_writepage(page, wbc);
1122 return 0; 726 return 0;
1123 } 727 }
1124 728
1125 shmem_swp_unmap(entry); 729 mutex_unlock(&shmem_swaplist_mutex);
1126unlock:
1127 spin_unlock(&info->lock);
1128 /*
1129 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
1130 * clear SWAP_HAS_CACHE flag.
1131 */
1132 swapcache_free(swap, NULL); 730 swapcache_free(swap, NULL);
1133redirty: 731redirty:
1134 set_page_dirty(page); 732 set_page_dirty(page);
@@ -1165,35 +763,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1165} 763}
1166#endif /* CONFIG_TMPFS */ 764#endif /* CONFIG_TMPFS */
1167 765
1168static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 766static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1169 struct shmem_inode_info *info, unsigned long idx) 767 struct shmem_inode_info *info, pgoff_t index)
1170{ 768{
1171 struct mempolicy mpol, *spol; 769 struct mempolicy mpol, *spol;
1172 struct vm_area_struct pvma; 770 struct vm_area_struct pvma;
1173 struct page *page;
1174 771
1175 spol = mpol_cond_copy(&mpol, 772 spol = mpol_cond_copy(&mpol,
1176 mpol_shared_policy_lookup(&info->policy, idx)); 773 mpol_shared_policy_lookup(&info->policy, index));
1177 774
1178 /* Create a pseudo vma that just contains the policy */ 775 /* Create a pseudo vma that just contains the policy */
1179 pvma.vm_start = 0; 776 pvma.vm_start = 0;
1180 pvma.vm_pgoff = idx; 777 pvma.vm_pgoff = index;
1181 pvma.vm_ops = NULL; 778 pvma.vm_ops = NULL;
1182 pvma.vm_policy = spol; 779 pvma.vm_policy = spol;
1183 page = swapin_readahead(entry, gfp, &pvma, 0); 780 return swapin_readahead(swap, gfp, &pvma, 0);
1184 return page;
1185} 781}
1186 782
1187static struct page *shmem_alloc_page(gfp_t gfp, 783static struct page *shmem_alloc_page(gfp_t gfp,
1188 struct shmem_inode_info *info, unsigned long idx) 784 struct shmem_inode_info *info, pgoff_t index)
1189{ 785{
1190 struct vm_area_struct pvma; 786 struct vm_area_struct pvma;
1191 787
1192 /* Create a pseudo vma that just contains the policy */ 788 /* Create a pseudo vma that just contains the policy */
1193 pvma.vm_start = 0; 789 pvma.vm_start = 0;
1194 pvma.vm_pgoff = idx; 790 pvma.vm_pgoff = index;
1195 pvma.vm_ops = NULL; 791 pvma.vm_ops = NULL;
1196 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); 792 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
1197 793
1198 /* 794 /*
1199 * alloc_page_vma() will drop the shared policy reference 795 * alloc_page_vma() will drop the shared policy reference
@@ -1202,19 +798,19 @@ static struct page *shmem_alloc_page(gfp_t gfp,
1202} 798}
1203#else /* !CONFIG_NUMA */ 799#else /* !CONFIG_NUMA */
1204#ifdef CONFIG_TMPFS 800#ifdef CONFIG_TMPFS
1205static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) 801static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1206{ 802{
1207} 803}
1208#endif /* CONFIG_TMPFS */ 804#endif /* CONFIG_TMPFS */
1209 805
1210static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 806static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1211 struct shmem_inode_info *info, unsigned long idx) 807 struct shmem_inode_info *info, pgoff_t index)
1212{ 808{
1213 return swapin_readahead(entry, gfp, NULL, 0); 809 return swapin_readahead(swap, gfp, NULL, 0);
1214} 810}
1215 811
1216static inline struct page *shmem_alloc_page(gfp_t gfp, 812static inline struct page *shmem_alloc_page(gfp_t gfp,
1217 struct shmem_inode_info *info, unsigned long idx) 813 struct shmem_inode_info *info, pgoff_t index)
1218{ 814{
1219 return alloc_page(gfp); 815 return alloc_page(gfp);
1220} 816}
@@ -1228,311 +824,195 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1228#endif 824#endif
1229 825
1230/* 826/*
1231 * shmem_getpage - either get the page from swap or allocate a new one 827 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
1232 * 828 *
1233 * If we allocate a new one we do not mark it dirty. That's up to the 829 * If we allocate a new one we do not mark it dirty. That's up to the
1234 * vm. If we swap it in we mark it dirty since we also free the swap 830 * vm. If we swap it in we mark it dirty since we also free the swap
1235 * entry since a page cannot live in both the swap and page cache 831 * entry since a page cannot live in both the swap and page cache
1236 */ 832 */
1237static int shmem_getpage(struct inode *inode, unsigned long idx, 833static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1238 struct page **pagep, enum sgp_type sgp, int *type) 834 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
1239{ 835{
1240 struct address_space *mapping = inode->i_mapping; 836 struct address_space *mapping = inode->i_mapping;
1241 struct shmem_inode_info *info = SHMEM_I(inode); 837 struct shmem_inode_info *info;
1242 struct shmem_sb_info *sbinfo; 838 struct shmem_sb_info *sbinfo;
1243 struct page *filepage = *pagep; 839 struct page *page;
1244 struct page *swappage;
1245 struct page *prealloc_page = NULL;
1246 swp_entry_t *entry;
1247 swp_entry_t swap; 840 swp_entry_t swap;
1248 gfp_t gfp;
1249 int error; 841 int error;
842 int once = 0;
1250 843
1251 if (idx >= SHMEM_MAX_INDEX) 844 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
1252 return -EFBIG; 845 return -EFBIG;
846repeat:
847 swap.val = 0;
848 page = find_lock_page(mapping, index);
849 if (radix_tree_exceptional_entry(page)) {
850 swap = radix_to_swp_entry(page);
851 page = NULL;
852 }
1253 853
1254 if (type) 854 if (sgp != SGP_WRITE &&
1255 *type = 0; 855 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
856 error = -EINVAL;
857 goto failed;
858 }
1256 859
1257 /* 860 if (page || (sgp == SGP_READ && !swap.val)) {
1258 * Normally, filepage is NULL on entry, and either found
1259 * uptodate immediately, or allocated and zeroed, or read
1260 * in under swappage, which is then assigned to filepage.
1261 * But shmem_readpage (required for splice) passes in a locked
1262 * filepage, which may be found not uptodate by other callers
1263 * too, and may need to be copied from the swappage read in.
1264 */
1265repeat:
1266 if (!filepage)
1267 filepage = find_lock_page(mapping, idx);
1268 if (filepage && PageUptodate(filepage))
1269 goto done;
1270 gfp = mapping_gfp_mask(mapping);
1271 if (!filepage) {
1272 /* 861 /*
1273 * Try to preload while we can wait, to not make a habit of 862 * Once we can get the page lock, it must be uptodate:
1274 * draining atomic reserves; but don't latch on to this cpu. 863 * if there were an error in reading back from swap,
864 * the page would not be inserted into the filecache.
1275 */ 865 */
1276 error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 866 BUG_ON(page && !PageUptodate(page));
1277 if (error) 867 *pagep = page;
1278 goto failed; 868 return 0;
1279 radix_tree_preload_end();
1280 if (sgp != SGP_READ && !prealloc_page) {
1281 /* We don't care if this fails */
1282 prealloc_page = shmem_alloc_page(gfp, info, idx);
1283 if (prealloc_page) {
1284 if (mem_cgroup_cache_charge(prealloc_page,
1285 current->mm, GFP_KERNEL)) {
1286 page_cache_release(prealloc_page);
1287 prealloc_page = NULL;
1288 }
1289 }
1290 }
1291 } 869 }
1292 error = 0;
1293 870
1294 spin_lock(&info->lock); 871 /*
1295 shmem_recalc_inode(inode); 872 * Fast cache lookup did not find it:
1296 entry = shmem_swp_alloc(info, idx, sgp); 873 * bring it back from swap or allocate.
1297 if (IS_ERR(entry)) { 874 */
1298 spin_unlock(&info->lock); 875 info = SHMEM_I(inode);
1299 error = PTR_ERR(entry); 876 sbinfo = SHMEM_SB(inode->i_sb);
1300 goto failed;
1301 }
1302 swap = *entry;
1303 877
1304 if (swap.val) { 878 if (swap.val) {
1305 /* Look it up and read it in.. */ 879 /* Look it up and read it in.. */
1306 swappage = lookup_swap_cache(swap); 880 page = lookup_swap_cache(swap);
1307 if (!swappage) { 881 if (!page) {
1308 shmem_swp_unmap(entry);
1309 spin_unlock(&info->lock);
1310 /* here we actually do the io */ 882 /* here we actually do the io */
1311 if (type) 883 if (fault_type)
1312 *type |= VM_FAULT_MAJOR; 884 *fault_type |= VM_FAULT_MAJOR;
1313 swappage = shmem_swapin(swap, gfp, info, idx); 885 page = shmem_swapin(swap, gfp, info, index);
1314 if (!swappage) { 886 if (!page) {
1315 spin_lock(&info->lock); 887 error = -ENOMEM;
1316 entry = shmem_swp_alloc(info, idx, sgp); 888 goto failed;
1317 if (IS_ERR(entry))
1318 error = PTR_ERR(entry);
1319 else {
1320 if (entry->val == swap.val)
1321 error = -ENOMEM;
1322 shmem_swp_unmap(entry);
1323 }
1324 spin_unlock(&info->lock);
1325 if (error)
1326 goto failed;
1327 goto repeat;
1328 } 889 }
1329 wait_on_page_locked(swappage);
1330 page_cache_release(swappage);
1331 goto repeat;
1332 } 890 }
1333 891
1334 /* We have to do this with page locked to prevent races */ 892 /* We have to do this with page locked to prevent races */
1335 if (!trylock_page(swappage)) { 893 lock_page(page);
1336 shmem_swp_unmap(entry); 894 if (!PageUptodate(page)) {
1337 spin_unlock(&info->lock);
1338 wait_on_page_locked(swappage);
1339 page_cache_release(swappage);
1340 goto repeat;
1341 }
1342 if (PageWriteback(swappage)) {
1343 shmem_swp_unmap(entry);
1344 spin_unlock(&info->lock);
1345 wait_on_page_writeback(swappage);
1346 unlock_page(swappage);
1347 page_cache_release(swappage);
1348 goto repeat;
1349 }
1350 if (!PageUptodate(swappage)) {
1351 shmem_swp_unmap(entry);
1352 spin_unlock(&info->lock);
1353 unlock_page(swappage);
1354 page_cache_release(swappage);
1355 error = -EIO; 895 error = -EIO;
1356 goto failed; 896 goto failed;
1357 } 897 }
1358 898 wait_on_page_writeback(page);
1359 if (filepage) { 899
1360 shmem_swp_set(info, entry, 0); 900 /* Someone may have already done it for us */
1361 shmem_swp_unmap(entry); 901 if (page->mapping) {
1362 delete_from_swap_cache(swappage); 902 if (page->mapping == mapping &&
1363 spin_unlock(&info->lock); 903 page->index == index)
1364 copy_highpage(filepage, swappage); 904 goto done;
1365 unlock_page(swappage); 905 error = -EEXIST;
1366 page_cache_release(swappage); 906 goto failed;
1367 flush_dcache_page(filepage);
1368 SetPageUptodate(filepage);
1369 set_page_dirty(filepage);
1370 swap_free(swap);
1371 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1372 idx, GFP_NOWAIT))) {
1373 info->flags |= SHMEM_PAGEIN;
1374 shmem_swp_set(info, entry, 0);
1375 shmem_swp_unmap(entry);
1376 delete_from_swap_cache(swappage);
1377 spin_unlock(&info->lock);
1378 filepage = swappage;
1379 set_page_dirty(filepage);
1380 swap_free(swap);
1381 } else {
1382 shmem_swp_unmap(entry);
1383 spin_unlock(&info->lock);
1384 if (error == -ENOMEM) {
1385 /*
1386 * reclaim from proper memory cgroup and
1387 * call memcg's OOM if needed.
1388 */
1389 error = mem_cgroup_shmem_charge_fallback(
1390 swappage,
1391 current->mm,
1392 gfp);
1393 if (error) {
1394 unlock_page(swappage);
1395 page_cache_release(swappage);
1396 goto failed;
1397 }
1398 }
1399 unlock_page(swappage);
1400 page_cache_release(swappage);
1401 goto repeat;
1402 }
1403 } else if (sgp == SGP_READ && !filepage) {
1404 shmem_swp_unmap(entry);
1405 filepage = find_get_page(mapping, idx);
1406 if (filepage &&
1407 (!PageUptodate(filepage) || !trylock_page(filepage))) {
1408 spin_unlock(&info->lock);
1409 wait_on_page_locked(filepage);
1410 page_cache_release(filepage);
1411 filepage = NULL;
1412 goto repeat;
1413 } 907 }
908
909 error = mem_cgroup_cache_charge(page, current->mm,
910 gfp & GFP_RECLAIM_MASK);
911 if (!error)
912 error = shmem_add_to_page_cache(page, mapping, index,
913 gfp, swp_to_radix_entry(swap));
914 if (error)
915 goto failed;
916
917 spin_lock(&info->lock);
918 info->swapped--;
919 shmem_recalc_inode(inode);
1414 spin_unlock(&info->lock); 920 spin_unlock(&info->lock);
921
922 delete_from_swap_cache(page);
923 set_page_dirty(page);
924 swap_free(swap);
925
1415 } else { 926 } else {
1416 shmem_swp_unmap(entry); 927 if (shmem_acct_block(info->flags)) {
1417 sbinfo = SHMEM_SB(inode->i_sb); 928 error = -ENOSPC;
929 goto failed;
930 }
1418 if (sbinfo->max_blocks) { 931 if (sbinfo->max_blocks) {
1419 if (percpu_counter_compare(&sbinfo->used_blocks, 932 if (percpu_counter_compare(&sbinfo->used_blocks,
1420 sbinfo->max_blocks) >= 0 || 933 sbinfo->max_blocks) >= 0) {
1421 shmem_acct_block(info->flags)) 934 error = -ENOSPC;
1422 goto nospace; 935 goto unacct;
1423 percpu_counter_inc(&sbinfo->used_blocks);
1424 spin_lock(&inode->i_lock);
1425 inode->i_blocks += BLOCKS_PER_PAGE;
1426 spin_unlock(&inode->i_lock);
1427 } else if (shmem_acct_block(info->flags))
1428 goto nospace;
1429
1430 if (!filepage) {
1431 int ret;
1432
1433 if (!prealloc_page) {
1434 spin_unlock(&info->lock);
1435 filepage = shmem_alloc_page(gfp, info, idx);
1436 if (!filepage) {
1437 shmem_unacct_blocks(info->flags, 1);
1438 shmem_free_blocks(inode, 1);
1439 error = -ENOMEM;
1440 goto failed;
1441 }
1442 SetPageSwapBacked(filepage);
1443
1444 /*
1445 * Precharge page while we can wait, compensate
1446 * after
1447 */
1448 error = mem_cgroup_cache_charge(filepage,
1449 current->mm, GFP_KERNEL);
1450 if (error) {
1451 page_cache_release(filepage);
1452 shmem_unacct_blocks(info->flags, 1);
1453 shmem_free_blocks(inode, 1);
1454 filepage = NULL;
1455 goto failed;
1456 }
1457
1458 spin_lock(&info->lock);
1459 } else {
1460 filepage = prealloc_page;
1461 prealloc_page = NULL;
1462 SetPageSwapBacked(filepage);
1463 } 936 }
937 percpu_counter_inc(&sbinfo->used_blocks);
938 }
1464 939
1465 entry = shmem_swp_alloc(info, idx, sgp); 940 page = shmem_alloc_page(gfp, info, index);
1466 if (IS_ERR(entry)) 941 if (!page) {
1467 error = PTR_ERR(entry); 942 error = -ENOMEM;
1468 else { 943 goto decused;
1469 swap = *entry;
1470 shmem_swp_unmap(entry);
1471 }
1472 ret = error || swap.val;
1473 if (ret)
1474 mem_cgroup_uncharge_cache_page(filepage);
1475 else
1476 ret = add_to_page_cache_lru(filepage, mapping,
1477 idx, GFP_NOWAIT);
1478 /*
1479 * At add_to_page_cache_lru() failure, uncharge will
1480 * be done automatically.
1481 */
1482 if (ret) {
1483 spin_unlock(&info->lock);
1484 page_cache_release(filepage);
1485 shmem_unacct_blocks(info->flags, 1);
1486 shmem_free_blocks(inode, 1);
1487 filepage = NULL;
1488 if (error)
1489 goto failed;
1490 goto repeat;
1491 }
1492 info->flags |= SHMEM_PAGEIN;
1493 } 944 }
1494 945
946 SetPageSwapBacked(page);
947 __set_page_locked(page);
948 error = mem_cgroup_cache_charge(page, current->mm,
949 gfp & GFP_RECLAIM_MASK);
950 if (!error)
951 error = shmem_add_to_page_cache(page, mapping, index,
952 gfp, NULL);
953 if (error)
954 goto decused;
955 lru_cache_add_anon(page);
956
957 spin_lock(&info->lock);
1495 info->alloced++; 958 info->alloced++;
959 inode->i_blocks += BLOCKS_PER_PAGE;
960 shmem_recalc_inode(inode);
1496 spin_unlock(&info->lock); 961 spin_unlock(&info->lock);
1497 clear_highpage(filepage); 962
1498 flush_dcache_page(filepage); 963 clear_highpage(page);
1499 SetPageUptodate(filepage); 964 flush_dcache_page(page);
965 SetPageUptodate(page);
1500 if (sgp == SGP_DIRTY) 966 if (sgp == SGP_DIRTY)
1501 set_page_dirty(filepage); 967 set_page_dirty(page);
1502 } 968 }
1503done: 969done:
1504 *pagep = filepage; 970 /* Perhaps the file has been truncated since we checked */
1505 error = 0; 971 if (sgp != SGP_WRITE &&
1506 goto out; 972 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
973 error = -EINVAL;
974 goto trunc;
975 }
976 *pagep = page;
977 return 0;
1507 978
1508nospace:
1509 /* 979 /*
1510 * Perhaps the page was brought in from swap between find_lock_page 980 * Error recovery.
1511 * and taking info->lock? We allow for that at add_to_page_cache_lru,
1512 * but must also avoid reporting a spurious ENOSPC while working on a
1513 * full tmpfs. (When filepage has been passed in to shmem_getpage, it
1514 * is already in page cache, which prevents this race from occurring.)
1515 */ 981 */
1516 if (!filepage) { 982trunc:
1517 struct page *page = find_get_page(mapping, idx); 983 ClearPageDirty(page);
1518 if (page) { 984 delete_from_page_cache(page);
1519 spin_unlock(&info->lock); 985 spin_lock(&info->lock);
1520 page_cache_release(page); 986 info->alloced--;
1521 goto repeat; 987 inode->i_blocks -= BLOCKS_PER_PAGE;
1522 }
1523 }
1524 spin_unlock(&info->lock); 988 spin_unlock(&info->lock);
1525 error = -ENOSPC; 989decused:
990 if (sbinfo->max_blocks)
991 percpu_counter_add(&sbinfo->used_blocks, -1);
992unacct:
993 shmem_unacct_blocks(info->flags, 1);
1526failed: 994failed:
1527 if (*pagep != filepage) { 995 if (swap.val && error != -EINVAL) {
1528 unlock_page(filepage); 996 struct page *test = find_get_page(mapping, index);
1529 page_cache_release(filepage); 997 if (test && !radix_tree_exceptional_entry(test))
998 page_cache_release(test);
999 /* Have another try if the entry has changed */
1000 if (test != swp_to_radix_entry(swap))
1001 error = -EEXIST;
1530 } 1002 }
1531out: 1003 if (page) {
1532 if (prealloc_page) { 1004 unlock_page(page);
1533 mem_cgroup_uncharge_cache_page(prealloc_page); 1005 page_cache_release(page);
1534 page_cache_release(prealloc_page); 1006 }
1007 if (error == -ENOSPC && !once++) {
1008 info = SHMEM_I(inode);
1009 spin_lock(&info->lock);
1010 shmem_recalc_inode(inode);
1011 spin_unlock(&info->lock);
1012 goto repeat;
1535 } 1013 }
1014 if (error == -EEXIST)
1015 goto repeat;
1536 return error; 1016 return error;
1537} 1017}
1538 1018
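Note on the new swap bookkeeping above: instead of the old shmem_swp_* vectors, a swapped-out page is now recorded directly in the page-cache radix tree via swp_to_radix_entry(), and the failed: path re-checks that slot with find_get_page()/radix_tree_exceptional_entry(). Below is a minimal sketch of the round trip, assuming radix_to_swp_entry() from the same series as the inverse helper; the exact bit layout is an assumption, not a quote of the headers.

/*
 * Sketch only: how a swp_entry_t round-trips through the page-cache
 * radix tree as an "exceptional" entry.
 */
#include <linux/radix-tree.h>
#include <linux/swap.h>
#include <linux/swapops.h>

static int example_swap_entry_round_trips(swp_entry_t swap)
{
        void *entry = swp_to_radix_entry(swap);

        /* A tag bit in the pointer marks the slot as "not a struct page". */
        if (!radix_tree_exceptional_entry(entry))
                return 0;

        /* Decoding must give back the original swap entry. */
        return radix_to_swp_entry(entry).val == swap.val;
}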
@@ -1540,36 +1020,34 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1540{ 1020{
1541 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1021 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1542 int error; 1022 int error;
1543 int ret; 1023 int ret = VM_FAULT_LOCKED;
1544
1545 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1546 return VM_FAULT_SIGBUS;
1547 1024
1548 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1025 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1549 if (error) 1026 if (error)
1550 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1027 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1028
1551 if (ret & VM_FAULT_MAJOR) { 1029 if (ret & VM_FAULT_MAJOR) {
1552 count_vm_event(PGMAJFAULT); 1030 count_vm_event(PGMAJFAULT);
1553 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 1031 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1554 } 1032 }
1555 return ret | VM_FAULT_LOCKED; 1033 return ret;
1556} 1034}
1557 1035
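shmem_fault() above now leaves the i_size check to shmem_getpage() and reports VM_FAULT_LOCKED by default. Any mapping of a tmpfs file exercises this path; here is a minimal userspace sketch, assuming /dev/shm is a tmpfs mount and using a hypothetical file name, with error handling trimmed.

/* Userspace sketch: the first store to the mapping faults a page in. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/dev/shm/example", O_RDWR | O_CREAT, 0600); /* hypothetical path */
        char *p;

        if (fd < 0 || ftruncate(fd, 4096) < 0)
                return 1;

        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;

        strcpy(p, "hello tmpfs");    /* this write triggers shmem_fault() */
        printf("%s\n", p);

        munmap(p, 4096);
        close(fd);
        return 0;
}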
1558#ifdef CONFIG_NUMA 1036#ifdef CONFIG_NUMA
1559static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 1037static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
1560{ 1038{
1561 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1039 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1562 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); 1040 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
1563} 1041}
1564 1042
1565static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 1043static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1566 unsigned long addr) 1044 unsigned long addr)
1567{ 1045{
1568 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1046 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1569 unsigned long idx; 1047 pgoff_t index;
1570 1048
1571 idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1049 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1572 return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); 1050 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
1573} 1051}
1574#endif 1052#endif
1575 1053
@@ -1667,20 +1145,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1667 1145
1668#ifdef CONFIG_TMPFS 1146#ifdef CONFIG_TMPFS
1669static const struct inode_operations shmem_symlink_inode_operations; 1147static const struct inode_operations shmem_symlink_inode_operations;
1670static const struct inode_operations shmem_symlink_inline_operations; 1148static const struct inode_operations shmem_short_symlink_operations;
1671
1672/*
1673 * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
1674 * but providing them allows a tmpfs file to be used for splice, sendfile, and
1675 * below the loop driver, in the generic fashion that many filesystems support.
1676 */
1677static int shmem_readpage(struct file *file, struct page *page)
1678{
1679 struct inode *inode = page->mapping->host;
1680 int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
1681 unlock_page(page);
1682 return error;
1683}
1684 1149
1685static int 1150static int
1686shmem_write_begin(struct file *file, struct address_space *mapping, 1151shmem_write_begin(struct file *file, struct address_space *mapping,
@@ -1689,7 +1154,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
1689{ 1154{
1690 struct inode *inode = mapping->host; 1155 struct inode *inode = mapping->host;
1691 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1156 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1692 *pagep = NULL;
1693 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); 1157 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1694} 1158}
1695 1159
@@ -1714,7 +1178,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1714{ 1178{
1715 struct inode *inode = filp->f_path.dentry->d_inode; 1179 struct inode *inode = filp->f_path.dentry->d_inode;
1716 struct address_space *mapping = inode->i_mapping; 1180 struct address_space *mapping = inode->i_mapping;
1717 unsigned long index, offset; 1181 pgoff_t index;
1182 unsigned long offset;
1718 enum sgp_type sgp = SGP_READ; 1183 enum sgp_type sgp = SGP_READ;
1719 1184
1720 /* 1185 /*
@@ -1730,7 +1195,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1730 1195
1731 for (;;) { 1196 for (;;) {
1732 struct page *page = NULL; 1197 struct page *page = NULL;
1733 unsigned long end_index, nr, ret; 1198 pgoff_t end_index;
1199 unsigned long nr, ret;
1734 loff_t i_size = i_size_read(inode); 1200 loff_t i_size = i_size_read(inode);
1735 1201
1736 end_index = i_size >> PAGE_CACHE_SHIFT; 1202 end_index = i_size >> PAGE_CACHE_SHIFT;
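The read path splits the file position into a page index and an in-page offset; with 4 KiB pages (PAGE_CACHE_SHIFT == 12), pos = 10000 lands in page 2 at offset 1808. A small self-contained sketch of the same arithmetic follows; the EXAMPLE_* macros are stand-ins for the kernel's PAGE_CACHE_* constants, and 4 KiB pages are an assumption.

/* Stand-ins for PAGE_CACHE_SHIFT/SIZE/MASK, assuming 4 KiB pages. */
#define EXAMPLE_PAGE_CACHE_SHIFT 12
#define EXAMPLE_PAGE_CACHE_SIZE  (1UL << EXAMPLE_PAGE_CACHE_SHIFT)
#define EXAMPLE_PAGE_CACHE_MASK  (~(EXAMPLE_PAGE_CACHE_SIZE - 1))

static void example_split_pos(unsigned long long pos,
                              unsigned long *index, unsigned long *offset)
{
        *index  = pos >> EXAMPLE_PAGE_CACHE_SHIFT;  /* 10000 >> 12 == 2     */
        *offset = pos & ~EXAMPLE_PAGE_CACHE_MASK;   /* 10000 & 4095 == 1808 */
}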
@@ -1846,6 +1312,119 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1846 return retval; 1312 return retval;
1847} 1313}
1848 1314
1315static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1316 struct pipe_inode_info *pipe, size_t len,
1317 unsigned int flags)
1318{
1319 struct address_space *mapping = in->f_mapping;
1320 struct inode *inode = mapping->host;
1321 unsigned int loff, nr_pages, req_pages;
1322 struct page *pages[PIPE_DEF_BUFFERS];
1323 struct partial_page partial[PIPE_DEF_BUFFERS];
1324 struct page *page;
1325 pgoff_t index, end_index;
1326 loff_t isize, left;
1327 int error, page_nr;
1328 struct splice_pipe_desc spd = {
1329 .pages = pages,
1330 .partial = partial,
1331 .flags = flags,
1332 .ops = &page_cache_pipe_buf_ops,
1333 .spd_release = spd_release_page,
1334 };
1335
1336 isize = i_size_read(inode);
1337 if (unlikely(*ppos >= isize))
1338 return 0;
1339
1340 left = isize - *ppos;
1341 if (unlikely(left < len))
1342 len = left;
1343
1344 if (splice_grow_spd(pipe, &spd))
1345 return -ENOMEM;
1346
1347 index = *ppos >> PAGE_CACHE_SHIFT;
1348 loff = *ppos & ~PAGE_CACHE_MASK;
1349 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1350 nr_pages = min(req_pages, pipe->buffers);
1351
1352 spd.nr_pages = find_get_pages_contig(mapping, index,
1353 nr_pages, spd.pages);
1354 index += spd.nr_pages;
1355 error = 0;
1356
1357 while (spd.nr_pages < nr_pages) {
1358 error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
1359 if (error)
1360 break;
1361 unlock_page(page);
1362 spd.pages[spd.nr_pages++] = page;
1363 index++;
1364 }
1365
1366 index = *ppos >> PAGE_CACHE_SHIFT;
1367 nr_pages = spd.nr_pages;
1368 spd.nr_pages = 0;
1369
1370 for (page_nr = 0; page_nr < nr_pages; page_nr++) {
1371 unsigned int this_len;
1372
1373 if (!len)
1374 break;
1375
1376 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
1377 page = spd.pages[page_nr];
1378
1379 if (!PageUptodate(page) || page->mapping != mapping) {
1380 error = shmem_getpage(inode, index, &page,
1381 SGP_CACHE, NULL);
1382 if (error)
1383 break;
1384 unlock_page(page);
1385 page_cache_release(spd.pages[page_nr]);
1386 spd.pages[page_nr] = page;
1387 }
1388
1389 isize = i_size_read(inode);
1390 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1391 if (unlikely(!isize || index > end_index))
1392 break;
1393
1394 if (end_index == index) {
1395 unsigned int plen;
1396
1397 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1398 if (plen <= loff)
1399 break;
1400
1401 this_len = min(this_len, plen - loff);
1402 len = this_len;
1403 }
1404
1405 spd.partial[page_nr].offset = loff;
1406 spd.partial[page_nr].len = this_len;
1407 len -= this_len;
1408 loff = 0;
1409 spd.nr_pages++;
1410 index++;
1411 }
1412
1413 while (page_nr < nr_pages)
1414 page_cache_release(spd.pages[page_nr++]);
1415
1416 if (spd.nr_pages)
1417 error = splice_to_pipe(pipe, &spd);
1418
1419 splice_shrink_spd(pipe, &spd);
1420
1421 if (error > 0) {
1422 *ppos += error;
1423 file_accessed(in);
1424 }
1425 return error;
1426}
1427
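shmem_file_splice_read() above takes over from generic_file_splice_read() so that splice goes through shmem_getpage() rather than a readpage method, which tmpfs no longer provides. Userspace is unchanged; here is a minimal splice(2) sketch moving data from a tmpfs file into a pipe, with a hypothetical path and trimmed error handling.

/* Userspace sketch: splice a tmpfs file into a pipe without a userspace copy. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int pipefd[2];
        int fd = open("/dev/shm/example", O_RDONLY);  /* hypothetical tmpfs file */
        ssize_t moved;

        if (fd < 0 || pipe(pipefd) < 0)
                return 1;

        /* Moves up to 64 KiB of page-cache data straight into the pipe. */
        moved = splice(fd, NULL, pipefd[1], NULL, 65536, SPLICE_F_MOVE);

        close(pipefd[0]);
        close(pipefd[1]);
        close(fd);
        return moved < 0;
}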
1849static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1428static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1850{ 1429{
1851 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1430 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -1855,8 +1434,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1855 buf->f_namelen = NAME_MAX; 1434 buf->f_namelen = NAME_MAX;
1856 if (sbinfo->max_blocks) { 1435 if (sbinfo->max_blocks) {
1857 buf->f_blocks = sbinfo->max_blocks; 1436 buf->f_blocks = sbinfo->max_blocks;
1858 buf->f_bavail = buf->f_bfree = 1437 buf->f_bavail =
1859 sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); 1438 buf->f_bfree = sbinfo->max_blocks -
1439 percpu_counter_sum(&sbinfo->used_blocks);
1860 } 1440 }
1861 if (sbinfo->max_inodes) { 1441 if (sbinfo->max_inodes) {
1862 buf->f_files = sbinfo->max_inodes; 1442 buf->f_files = sbinfo->max_inodes;
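The f_blocks/f_bfree values computed above from the percpu counter are what userspace sees through statfs(2)/statvfs(3). A small sketch, assuming /dev/shm is a tmpfs mount:

/* Userspace sketch: tmpfs limits and usage as reported by statvfs(3). */
#include <stdio.h>
#include <sys/statvfs.h>

int main(void)
{
        struct statvfs st;

        if (statvfs("/dev/shm", &st) != 0)   /* assumes /dev/shm is tmpfs */
                return 1;

        printf("blocks: total=%llu free=%llu\n",
               (unsigned long long)st.f_blocks,
               (unsigned long long)st.f_bfree);
        printf("inodes: total=%llu free=%llu\n",
               (unsigned long long)st.f_files,
               (unsigned long long)st.f_ffree);
        return 0;
}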
@@ -2006,7 +1586,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2006 int error; 1586 int error;
2007 int len; 1587 int len;
2008 struct inode *inode; 1588 struct inode *inode;
2009 struct page *page = NULL; 1589 struct page *page;
2010 char *kaddr; 1590 char *kaddr;
2011 struct shmem_inode_info *info; 1591 struct shmem_inode_info *info;
2012 1592
@@ -2030,10 +1610,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2030 1610
2031 info = SHMEM_I(inode); 1611 info = SHMEM_I(inode);
2032 inode->i_size = len-1; 1612 inode->i_size = len-1;
2033 if (len <= SHMEM_SYMLINK_INLINE_LEN) { 1613 if (len <= SHORT_SYMLINK_LEN) {
2034 /* do it inline */ 1614 info->symlink = kmemdup(symname, len, GFP_KERNEL);
2035 memcpy(info->inline_symlink, symname, len); 1615 if (!info->symlink) {
2036 inode->i_op = &shmem_symlink_inline_operations; 1616 iput(inode);
1617 return -ENOMEM;
1618 }
1619 inode->i_op = &shmem_short_symlink_operations;
2037 } else { 1620 } else {
2038 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); 1621 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
2039 if (error) { 1622 if (error) {
@@ -2056,17 +1639,17 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2056 return 0; 1639 return 0;
2057} 1640}
2058 1641
2059static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) 1642static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
2060{ 1643{
2061 nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); 1644 nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
2062 return NULL; 1645 return NULL;
2063} 1646}
2064 1647
2065static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) 1648static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
2066{ 1649{
2067 struct page *page = NULL; 1650 struct page *page = NULL;
2068 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 1651 int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
2069 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); 1652 nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
2070 if (page) 1653 if (page)
2071 unlock_page(page); 1654 unlock_page(page);
2072 return page; 1655 return page;
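With the change above, a short enough target is kmemdup()'d into info->symlink and resolved from there, while longer targets still live in page 0 of the inode's mapping; readlink(2) behaves identically either way. A brief userspace illustration with hypothetical paths:

/* Userspace sketch: readlink(2) does not care where tmpfs keeps the target. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        ssize_t n;

        if (symlink("target-file", "/dev/shm/link") != 0)  /* hypothetical path */
                return 1;

        n = readlink("/dev/shm/link", buf, sizeof(buf) - 1);
        if (n < 0)
                return 1;
        buf[n] = '\0';
        printf("link -> %s\n", buf);

        unlink("/dev/shm/link");
        return 0;
}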
@@ -2177,7 +1760,6 @@ out:
2177 return err; 1760 return err;
2178} 1761}
2179 1762
2180
2181static const struct xattr_handler *shmem_xattr_handlers[] = { 1763static const struct xattr_handler *shmem_xattr_handlers[] = {
2182#ifdef CONFIG_TMPFS_POSIX_ACL 1764#ifdef CONFIG_TMPFS_POSIX_ACL
2183 &generic_acl_access_handler, 1765 &generic_acl_access_handler,
@@ -2307,9 +1889,9 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2307} 1889}
2308#endif /* CONFIG_TMPFS_XATTR */ 1890#endif /* CONFIG_TMPFS_XATTR */
2309 1891
2310static const struct inode_operations shmem_symlink_inline_operations = { 1892static const struct inode_operations shmem_short_symlink_operations = {
2311 .readlink = generic_readlink, 1893 .readlink = generic_readlink,
2312 .follow_link = shmem_follow_link_inline, 1894 .follow_link = shmem_follow_short_symlink,
2313#ifdef CONFIG_TMPFS_XATTR 1895#ifdef CONFIG_TMPFS_XATTR
2314 .setxattr = shmem_setxattr, 1896 .setxattr = shmem_setxattr,
2315 .getxattr = shmem_getxattr, 1897 .getxattr = shmem_getxattr,
@@ -2509,8 +2091,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2509 if (config.max_inodes < inodes) 2091 if (config.max_inodes < inodes)
2510 goto out; 2092 goto out;
2511 /* 2093 /*
2512 * Those tests also disallow limited->unlimited while any are in 2094 * Those tests disallow limited->unlimited while any are in use;
2513 * use, so i_blocks will always be zero when max_blocks is zero;
2514 * but we must separately disallow unlimited->limited, because 2095 * but we must separately disallow unlimited->limited, because
2515 * in that case we have no record of how much is already in use. 2096 * in that case we have no record of how much is already in use.
2516 */ 2097 */
@@ -2602,7 +2183,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2602 goto failed; 2183 goto failed;
2603 sbinfo->free_inodes = sbinfo->max_inodes; 2184 sbinfo->free_inodes = sbinfo->max_inodes;
2604 2185
2605 sb->s_maxbytes = SHMEM_MAX_BYTES; 2186 sb->s_maxbytes = MAX_LFS_FILESIZE;
2606 sb->s_blocksize = PAGE_CACHE_SIZE; 2187 sb->s_blocksize = PAGE_CACHE_SIZE;
2607 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 2188 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2608 sb->s_magic = TMPFS_MAGIC; 2189 sb->s_magic = TMPFS_MAGIC;
@@ -2637,14 +2218,14 @@ static struct kmem_cache *shmem_inode_cachep;
2637 2218
2638static struct inode *shmem_alloc_inode(struct super_block *sb) 2219static struct inode *shmem_alloc_inode(struct super_block *sb)
2639{ 2220{
2640 struct shmem_inode_info *p; 2221 struct shmem_inode_info *info;
2641 p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); 2222 info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2642 if (!p) 2223 if (!info)
2643 return NULL; 2224 return NULL;
2644 return &p->vfs_inode; 2225 return &info->vfs_inode;
2645} 2226}
2646 2227
2647static void shmem_i_callback(struct rcu_head *head) 2228static void shmem_destroy_callback(struct rcu_head *head)
2648{ 2229{
2649 struct inode *inode = container_of(head, struct inode, i_rcu); 2230 struct inode *inode = container_of(head, struct inode, i_rcu);
2650 INIT_LIST_HEAD(&inode->i_dentry); 2231 INIT_LIST_HEAD(&inode->i_dentry);
@@ -2653,29 +2234,26 @@ static void shmem_i_callback(struct rcu_head *head)
2653 2234
2654static void shmem_destroy_inode(struct inode *inode) 2235static void shmem_destroy_inode(struct inode *inode)
2655{ 2236{
2656 if ((inode->i_mode & S_IFMT) == S_IFREG) { 2237 if ((inode->i_mode & S_IFMT) == S_IFREG)
2657 /* only struct inode is valid if it's an inline symlink */
2658 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2238 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2659 } 2239 call_rcu(&inode->i_rcu, shmem_destroy_callback);
2660 call_rcu(&inode->i_rcu, shmem_i_callback);
2661} 2240}
2662 2241
2663static void init_once(void *foo) 2242static void shmem_init_inode(void *foo)
2664{ 2243{
2665 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2244 struct shmem_inode_info *info = foo;
2666 2245 inode_init_once(&info->vfs_inode);
2667 inode_init_once(&p->vfs_inode);
2668} 2246}
2669 2247
2670static int init_inodecache(void) 2248static int shmem_init_inodecache(void)
2671{ 2249{
2672 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 2250 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2673 sizeof(struct shmem_inode_info), 2251 sizeof(struct shmem_inode_info),
2674 0, SLAB_PANIC, init_once); 2252 0, SLAB_PANIC, shmem_init_inode);
2675 return 0; 2253 return 0;
2676} 2254}
2677 2255
2678static void destroy_inodecache(void) 2256static void shmem_destroy_inodecache(void)
2679{ 2257{
2680 kmem_cache_destroy(shmem_inode_cachep); 2258 kmem_cache_destroy(shmem_inode_cachep);
2681} 2259}
@@ -2684,7 +2262,6 @@ static const struct address_space_operations shmem_aops = {
2684 .writepage = shmem_writepage, 2262 .writepage = shmem_writepage,
2685 .set_page_dirty = __set_page_dirty_no_writeback, 2263 .set_page_dirty = __set_page_dirty_no_writeback,
2686#ifdef CONFIG_TMPFS 2264#ifdef CONFIG_TMPFS
2687 .readpage = shmem_readpage,
2688 .write_begin = shmem_write_begin, 2265 .write_begin = shmem_write_begin,
2689 .write_end = shmem_write_end, 2266 .write_end = shmem_write_end,
2690#endif 2267#endif
@@ -2701,7 +2278,7 @@ static const struct file_operations shmem_file_operations = {
2701 .aio_read = shmem_file_aio_read, 2278 .aio_read = shmem_file_aio_read,
2702 .aio_write = generic_file_aio_write, 2279 .aio_write = generic_file_aio_write,
2703 .fsync = noop_fsync, 2280 .fsync = noop_fsync,
2704 .splice_read = generic_file_splice_read, 2281 .splice_read = shmem_file_splice_read,
2705 .splice_write = generic_file_splice_write, 2282 .splice_write = generic_file_splice_write,
2706#endif 2283#endif
2707}; 2284};
@@ -2715,10 +2292,6 @@ static const struct inode_operations shmem_inode_operations = {
2715 .listxattr = shmem_listxattr, 2292 .listxattr = shmem_listxattr,
2716 .removexattr = shmem_removexattr, 2293 .removexattr = shmem_removexattr,
2717#endif 2294#endif
2718#ifdef CONFIG_TMPFS_POSIX_ACL
2719 .check_acl = generic_check_acl,
2720#endif
2721
2722}; 2295};
2723 2296
2724static const struct inode_operations shmem_dir_inode_operations = { 2297static const struct inode_operations shmem_dir_inode_operations = {
@@ -2741,7 +2314,6 @@ static const struct inode_operations shmem_dir_inode_operations = {
2741#endif 2314#endif
2742#ifdef CONFIG_TMPFS_POSIX_ACL 2315#ifdef CONFIG_TMPFS_POSIX_ACL
2743 .setattr = shmem_setattr, 2316 .setattr = shmem_setattr,
2744 .check_acl = generic_check_acl,
2745#endif 2317#endif
2746}; 2318};
2747 2319
@@ -2754,7 +2326,6 @@ static const struct inode_operations shmem_special_inode_operations = {
2754#endif 2326#endif
2755#ifdef CONFIG_TMPFS_POSIX_ACL 2327#ifdef CONFIG_TMPFS_POSIX_ACL
2756 .setattr = shmem_setattr, 2328 .setattr = shmem_setattr,
2757 .check_acl = generic_check_acl,
2758#endif 2329#endif
2759}; 2330};
2760 2331
@@ -2779,21 +2350,20 @@ static const struct vm_operations_struct shmem_vm_ops = {
2779#endif 2350#endif
2780}; 2351};
2781 2352
2782
2783static struct dentry *shmem_mount(struct file_system_type *fs_type, 2353static struct dentry *shmem_mount(struct file_system_type *fs_type,
2784 int flags, const char *dev_name, void *data) 2354 int flags, const char *dev_name, void *data)
2785{ 2355{
2786 return mount_nodev(fs_type, flags, data, shmem_fill_super); 2356 return mount_nodev(fs_type, flags, data, shmem_fill_super);
2787} 2357}
2788 2358
2789static struct file_system_type tmpfs_fs_type = { 2359static struct file_system_type shmem_fs_type = {
2790 .owner = THIS_MODULE, 2360 .owner = THIS_MODULE,
2791 .name = "tmpfs", 2361 .name = "tmpfs",
2792 .mount = shmem_mount, 2362 .mount = shmem_mount,
2793 .kill_sb = kill_litter_super, 2363 .kill_sb = kill_litter_super,
2794}; 2364};
2795 2365
2796int __init init_tmpfs(void) 2366int __init shmem_init(void)
2797{ 2367{
2798 int error; 2368 int error;
2799 2369
@@ -2801,18 +2371,18 @@ int __init init_tmpfs(void)
2801 if (error) 2371 if (error)
2802 goto out4; 2372 goto out4;
2803 2373
2804 error = init_inodecache(); 2374 error = shmem_init_inodecache();
2805 if (error) 2375 if (error)
2806 goto out3; 2376 goto out3;
2807 2377
2808 error = register_filesystem(&tmpfs_fs_type); 2378 error = register_filesystem(&shmem_fs_type);
2809 if (error) { 2379 if (error) {
2810 printk(KERN_ERR "Could not register tmpfs\n"); 2380 printk(KERN_ERR "Could not register tmpfs\n");
2811 goto out2; 2381 goto out2;
2812 } 2382 }
2813 2383
2814 shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, 2384 shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER,
2815 tmpfs_fs_type.name, NULL); 2385 shmem_fs_type.name, NULL);
2816 if (IS_ERR(shm_mnt)) { 2386 if (IS_ERR(shm_mnt)) {
2817 error = PTR_ERR(shm_mnt); 2387 error = PTR_ERR(shm_mnt);
2818 printk(KERN_ERR "Could not kern_mount tmpfs\n"); 2388 printk(KERN_ERR "Could not kern_mount tmpfs\n");
@@ -2821,9 +2391,9 @@ int __init init_tmpfs(void)
2821 return 0; 2391 return 0;
2822 2392
2823out1: 2393out1:
2824 unregister_filesystem(&tmpfs_fs_type); 2394 unregister_filesystem(&shmem_fs_type);
2825out2: 2395out2:
2826 destroy_inodecache(); 2396 shmem_destroy_inodecache();
2827out3: 2397out3:
2828 bdi_destroy(&shmem_backing_dev_info); 2398 bdi_destroy(&shmem_backing_dev_info);
2829out4: 2399out4:
@@ -2831,45 +2401,6 @@ out4:
2831 return error; 2401 return error;
2832} 2402}
2833 2403
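shmem_init() registers the "tmpfs" filesystem type and creates the internal shm_mnt used for anonymous shared memory; further tmpfs instances are mounted in the usual way. A minimal userspace sketch, with a hypothetical mount point and illustrative limits:

/* Userspace sketch: mount a tmpfs instance with explicit limits. */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* "/mnt/scratch" is a hypothetical, already-existing directory. */
        if (mount("tmpfs", "/mnt/scratch", "tmpfs", 0,
                  "size=64m,nr_inodes=4096,mode=1777") != 0) {
                perror("mount");
                return 1;
        }
        return 0;
}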
2834#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2835/**
2836 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2837 * @inode: the inode to be searched
2838 * @pgoff: the offset to be searched
2839 * @pagep: the pointer for the found page to be stored
2840 * @ent: the pointer for the found swap entry to be stored
2841 *
2842 * If a page is found, refcount of it is incremented. Callers should handle
2843 * these refcount.
2844 */
2845void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2846 struct page **pagep, swp_entry_t *ent)
2847{
2848 swp_entry_t entry = { .val = 0 }, *ptr;
2849 struct page *page = NULL;
2850 struct shmem_inode_info *info = SHMEM_I(inode);
2851
2852 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2853 goto out;
2854
2855 spin_lock(&info->lock);
2856 ptr = shmem_swp_entry(info, pgoff, NULL);
2857#ifdef CONFIG_SWAP
2858 if (ptr && ptr->val) {
2859 entry.val = ptr->val;
2860 page = find_get_page(&swapper_space, entry.val);
2861 } else
2862#endif
2863 page = find_get_page(inode->i_mapping, pgoff);
2864 if (ptr)
2865 shmem_swp_unmap(ptr);
2866 spin_unlock(&info->lock);
2867out:
2868 *pagep = page;
2869 *ent = entry;
2870}
2871#endif
2872
2873#else /* !CONFIG_SHMEM */ 2404#else /* !CONFIG_SHMEM */
2874 2405
2875/* 2406/*
@@ -2883,23 +2414,23 @@ out:
2883 2414
2884#include <linux/ramfs.h> 2415#include <linux/ramfs.h>
2885 2416
2886static struct file_system_type tmpfs_fs_type = { 2417static struct file_system_type shmem_fs_type = {
2887 .name = "tmpfs", 2418 .name = "tmpfs",
2888 .mount = ramfs_mount, 2419 .mount = ramfs_mount,
2889 .kill_sb = kill_litter_super, 2420 .kill_sb = kill_litter_super,
2890}; 2421};
2891 2422
2892int __init init_tmpfs(void) 2423int __init shmem_init(void)
2893{ 2424{
2894 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); 2425 BUG_ON(register_filesystem(&shmem_fs_type) != 0);
2895 2426
2896 shm_mnt = kern_mount(&tmpfs_fs_type); 2427 shm_mnt = kern_mount(&shmem_fs_type);
2897 BUG_ON(IS_ERR(shm_mnt)); 2428 BUG_ON(IS_ERR(shm_mnt));
2898 2429
2899 return 0; 2430 return 0;
2900} 2431}
2901 2432
2902int shmem_unuse(swp_entry_t entry, struct page *page) 2433int shmem_unuse(swp_entry_t swap, struct page *page)
2903{ 2434{
2904 return 0; 2435 return 0;
2905} 2436}
@@ -2909,43 +2440,17 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
2909 return 0; 2440 return 0;
2910} 2441}
2911 2442
2912void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 2443void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
2913{ 2444{
2914 truncate_inode_pages_range(inode->i_mapping, start, end); 2445 truncate_inode_pages_range(inode->i_mapping, lstart, lend);
2915} 2446}
2916EXPORT_SYMBOL_GPL(shmem_truncate_range); 2447EXPORT_SYMBOL_GPL(shmem_truncate_range);
2917 2448
2918#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2919/**
2920 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2921 * @inode: the inode to be searched
2922 * @pgoff: the offset to be searched
2923 * @pagep: the pointer for the found page to be stored
2924 * @ent: the pointer for the found swap entry to be stored
2925 *
2926 * If a page is found, refcount of it is incremented. Callers should handle
2927 * these refcount.
2928 */
2929void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2930 struct page **pagep, swp_entry_t *ent)
2931{
2932 struct page *page = NULL;
2933
2934 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2935 goto out;
2936 page = find_get_page(inode->i_mapping, pgoff);
2937out:
2938 *pagep = page;
2939 *ent = (swp_entry_t){ .val = 0 };
2940}
2941#endif
2942
2943#define shmem_vm_ops generic_file_vm_ops 2449#define shmem_vm_ops generic_file_vm_ops
2944#define shmem_file_operations ramfs_file_operations 2450#define shmem_file_operations ramfs_file_operations
2945#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) 2451#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
2946#define shmem_acct_size(flags, size) 0 2452#define shmem_acct_size(flags, size) 0
2947#define shmem_unacct_size(flags, size) do {} while (0) 2453#define shmem_unacct_size(flags, size) do {} while (0)
2948#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE
2949 2454
2950#endif /* CONFIG_SHMEM */ 2455#endif /* CONFIG_SHMEM */
2951 2456
@@ -2969,7 +2474,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2969 if (IS_ERR(shm_mnt)) 2474 if (IS_ERR(shm_mnt))
2970 return (void *)shm_mnt; 2475 return (void *)shm_mnt;
2971 2476
2972 if (size < 0 || size > SHMEM_MAX_BYTES) 2477 if (size < 0 || size > MAX_LFS_FILESIZE)
2973 return ERR_PTR(-EINVAL); 2478 return ERR_PTR(-EINVAL);
2974 2479
2975 if (shmem_acct_size(flags, size)) 2480 if (shmem_acct_size(flags, size))
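shmem_file_setup() now caps the requested size at MAX_LFS_FILESIZE rather than the old swap-vector limit. Below is a hedged in-kernel sketch of a caller creating and dropping such a file; the wrapper function is hypothetical, and only shmem_file_setup() itself (signature as in the hunk header above) comes from this file.

/*
 * Sketch only: create an unlinked, 1 MiB tmpfs-backed file and drop it.
 * example_use_shmem_file() is a hypothetical wrapper; fput() releases the
 * unlinked inode and its pages.
 */
#include <linux/err.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/shmem_fs.h>

static int example_use_shmem_file(void)
{
        struct file *filp;

        filp = shmem_file_setup("example-buffer", 1UL << 20, VM_NORESERVE);
        if (IS_ERR(filp))
                return PTR_ERR(filp);

        /* ... use filp->f_mapping, mmap it into a vma, etc. ... */

        fput(filp);
        return 0;
}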
@@ -3048,13 +2553,29 @@ int shmem_zero_setup(struct vm_area_struct *vma)
3048 * suit tmpfs, since it may have pages in swapcache, and needs to find those 2553 * suit tmpfs, since it may have pages in swapcache, and needs to find those
3049 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. 2554 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
3050 * 2555 *
3051 * Provide a stub for those callers to start using now, then later 2556 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
3052 * flesh it out to call shmem_getpage() with additional gfp mask, when 2557 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
3053 * shmem_file_splice_read() is added and shmem_readpage() is removed.
3054 */ 2558 */
3055struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 2559struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
3056 pgoff_t index, gfp_t gfp) 2560 pgoff_t index, gfp_t gfp)
3057{ 2561{
2562#ifdef CONFIG_SHMEM
2563 struct inode *inode = mapping->host;
2564 struct page *page;
2565 int error;
2566
2567 BUG_ON(mapping->a_ops != &shmem_aops);
2568 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
2569 if (error)
2570 page = ERR_PTR(error);
2571 else
2572 unlock_page(page);
2573 return page;
2574#else
2575 /*
2576 * The tiny !SHMEM case uses ramfs without swap
2577 */
3058 return read_cache_page_gfp(mapping, index, gfp); 2578 return read_cache_page_gfp(mapping, index, gfp);
2579#endif
3059} 2580}
3060EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); 2581EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
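With CONFIG_SHMEM the helper above now calls shmem_getpage_gfp() directly, so drivers such as i915/ttm get swapcache-aware lookups and can relax the gfp mask as the comment describes. A hedged sketch of a driver-style call follows; the wrapper is hypothetical, and only shmem_read_mapping_page_gfp() and the gfp trick are taken from this file.

/*
 * Sketch only: fetch one object page the way a GEM/TTM-style driver might.
 * shmem_read_mapping_page_gfp() returns the page with an elevated refcount
 * (or an ERR_PTR), so the caller drops it with page_cache_release() when done.
 */
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/shmem_fs.h>

static struct page *example_get_object_page(struct inode *inode, pgoff_t index)
{
        struct address_space *mapping = inode->i_mapping;
        gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;

        return shmem_read_mapping_page_gfp(mapping, index, gfp);
}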