Diffstat (limited to 'mm/filemap.c')
-rw-r--r--  mm/filemap.c  297
1 file changed, 38 insertions, 259 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 876bc595d0f8..f3e5f8944d17 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
+#include <linux/mm_inline.h> /* for page_is_file_cache() */
 #include "internal.h"
 
 /*
@@ -115,12 +116,12 @@ void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
-	mem_cgroup_uncharge_cache_page(page);
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	BUG_ON(page_mapped(page));
+	mem_cgroup_uncharge_cache_page(page);
 
 	/*
 	 * Some filesystems seem to re-dirty the page even after
@@ -492,9 +493,24 @@ EXPORT_SYMBOL(add_to_page_cache_locked);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t offset, gfp_t gfp_mask)
 {
-	int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-	if (ret == 0)
-		lru_cache_add(page);
+	int ret;
+
+	/*
+	 * Splice_read and readahead add shmem/tmpfs pages into the page cache
+	 * before shmem_readpage has a chance to mark them as SwapBacked: they
+	 * need to go on the active_anon lru below, and mem_cgroup_cache_charge
+	 * (called in add_to_page_cache) needs to know where they're going too.
+	 */
+	if (mapping_cap_swap_backed(mapping))
+		SetPageSwapBacked(page);
+
+	ret = add_to_page_cache(page, mapping, offset, gfp_mask);
+	if (ret == 0) {
+		if (page_is_file_cache(page))
+			lru_cache_add_file(page);
+		else
+			lru_cache_add_active_anon(page);
+	}
 	return ret;
 }
 
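As the new comment explains, PageSwapBacked must be set before the charge taken in add_to_page_cache(), so that both the memory controller and the LRU choice see shmem/tmpfs pages as swap-backed. For orientation, here is a minimal caller-side sketch, loosely modelled on the readahead path (read_pages() in mm/readahead.c); the example_read_one() wrapper and the assumption that filp and index come from a surrounding loop are illustrative only:

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Illustrative caller: allocate a page, insert it at @index and let
 * add_to_page_cache_lru() route it to the file LRU or, for a shmem/tmpfs
 * mapping, the active anon LRU, then start the read. */
static int example_read_one(struct file *filp, struct address_space *mapping,
			    pgoff_t index)
{
	struct page *page = page_cache_alloc_cold(mapping);

	if (!page)
		return -ENOMEM;

	if (!add_to_page_cache_lru(page, mapping, index, GFP_KERNEL))
		mapping->a_ops->readpage(filp, page);	/* page was inserted locked; ->readpage() unlocks it */
	page_cache_release(page);
	return 0;
}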
@@ -557,17 +573,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
  * mechananism between PageLocked pages and PageWriteback pages is shared.
  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
  *
- * The first mb is necessary to safely close the critical section opened by the
- * test_and_set_bit() to lock the page; the second mb is necessary to enforce
- * ordering between the clear_bit and the read of the waitqueue (to avoid SMP
- * races with a parallel wait_on_page_locked()).
+ * The mb is necessary to enforce ordering between the clear_bit and the read
+ * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
  */
 void unlock_page(struct page *page)
 {
-	smp_mb__before_clear_bit();
-	if (!test_and_clear_bit(PG_locked, &page->flags))
-		BUG();
-	smp_mb__after_clear_bit();
+	VM_BUG_ON(!PageLocked(page));
+	clear_bit_unlock(PG_locked, &page->flags);
+	smp_mb__after_clear_bit();
 	wake_up_page(page, PG_locked);
 }
 EXPORT_SYMBOL(unlock_page);
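The rewritten unlock_page() relies on clear_bit_unlock(), which has release semantics, so the old smp_mb__before_clear_bit() is no longer needed; only the barrier ordering the bit clear against the waitqueue read survives. The acquire side pairs with it through the _lock bitop. A hedged sketch of that pairing (the real helper lives in include/linux/pagemap.h; example_trylock_page() is an approximation, not a quote):

/* Acquire side: test_and_set_bit_lock() has acquire semantics, so no
 * extra barrier is needed once PG_locked is taken; returns non-zero on
 * success, like a trylock. */
static inline int example_trylock_page(struct page *page)
{
	return !test_and_set_bit_lock(PG_locked, &page->flags);
}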
@@ -1100,8 +1113,9 @@ page_ok:
 
 page_not_up_to_date:
 	/* Get exclusive access to the page ... */
-	if (lock_page_killable(page))
-		goto readpage_eio;
+	error = lock_page_killable(page);
+	if (unlikely(error))
+		goto readpage_error;
 
 page_not_up_to_date_locked:
 	/* Did it get truncated before we got the lock? */
@@ -1130,8 +1144,9 @@ readpage:
 		}
 
 		if (!PageUptodate(page)) {
-			if (lock_page_killable(page))
-				goto readpage_eio;
+			error = lock_page_killable(page);
+			if (unlikely(error))
+				goto readpage_error;
 			if (!PageUptodate(page)) {
 				if (page->mapping == NULL) {
 					/*
@@ -1143,15 +1158,14 @@ readpage:
 				}
 				unlock_page(page);
 				shrink_readahead_size_eio(filp, ra);
-				goto readpage_eio;
+				error = -EIO;
+				goto readpage_error;
 			}
 			unlock_page(page);
 		}
 
 		goto page_ok;
 
-readpage_eio:
-		error = -EIO;
 readpage_error:
 		/* UHHUH! A synchronous read error occurred. Report it */
 		desc->error = error;
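With the readpage_eio label gone, the return value of lock_page_killable() (for example -EINTR when a fatal signal interrupts the sleep) now reaches the read descriptor unchanged, and -EIO is reserved for a page that genuinely failed to become uptodate. A hedged sketch of the helper's contract, approximating the include/linux/pagemap.h definition:

/* Returns 0 with the page locked, or a negative errno (such as -EINTR
 * from a fatal signal) without it; the caller above forwards that errno. */
static inline int example_lock_page_killable(struct page *page)
{
	might_sleep();
	if (!trylock_page(page))
		return __lock_page_killable(page);
	return 0;
}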
@@ -1186,8 +1200,7 @@ out:
 	ra->prev_pos |= prev_offset;
 
 	*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
-	if (filp)
-		file_accessed(filp);
+	file_accessed(filp);
 }
 
 int file_read_actor(read_descriptor_t *desc, struct page *page,
@@ -2016,48 +2029,8 @@ int pagecache_write_begin(struct file *file, struct address_space *mapping,
 {
 	const struct address_space_operations *aops = mapping->a_ops;
 
-	if (aops->write_begin) {
-		return aops->write_begin(file, mapping, pos, len, flags,
+	return aops->write_begin(file, mapping, pos, len, flags,
 							pagep, fsdata);
-	} else {
-		int ret;
-		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-		struct inode *inode = mapping->host;
-		struct page *page;
-again:
-		page = __grab_cache_page(mapping, index);
-		*pagep = page;
-		if (!page)
-			return -ENOMEM;
-
-		if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
-			/*
-			 * There is no way to resolve a short write situation
-			 * for a !Uptodate page (except by double copying in
-			 * the caller done by generic_perform_write_2copy).
-			 *
-			 * Instead, we have to bring it uptodate here.
-			 */
-			ret = aops->readpage(file, page);
-			page_cache_release(page);
-			if (ret) {
-				if (ret == AOP_TRUNCATED_PAGE)
-					goto again;
-				return ret;
-			}
-			goto again;
-		}
-
-		ret = aops->prepare_write(file, page, offset, offset+len);
-		if (ret) {
-			unlock_page(page);
-			page_cache_release(page);
-			if (pos + len > inode->i_size)
-				vmtruncate(inode, inode->i_size);
-		}
-		return ret;
-	}
 }
 EXPORT_SYMBOL(pagecache_write_begin);
 
@@ -2066,32 +2039,9 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
 		struct page *page, void *fsdata)
 {
 	const struct address_space_operations *aops = mapping->a_ops;
-	int ret;
-
-	if (aops->write_end) {
-		mark_page_accessed(page);
-		ret = aops->write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-	} else {
-		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-		struct inode *inode = mapping->host;
-
-		flush_dcache_page(page);
-		ret = aops->commit_write(file, page, offset, offset+len);
-		unlock_page(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
 
-		if (ret < 0) {
-			if (pos + len > inode->i_size)
-				vmtruncate(inode, inode->i_size);
-		} else if (ret > 0)
-			ret = min_t(size_t, copied, ret);
-		else
-			ret = copied;
-	}
-
-	return ret;
+	mark_page_accessed(page);
+	return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
 }
 EXPORT_SYMBOL(pagecache_write_end);
 
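With the prepare_write/commit_write fallbacks removed from both helpers, any filesystem reached through pagecache_write_begin()/pagecache_write_end() has to supply ->write_begin and ->write_end itself. For a simple block-device-backed filesystem the buffer-layer helpers usually do the heavy lifting; the sketch below is illustrative only, and the examplefs_* names (including the get_block, readpage and writepage callbacks, declared but not shown) are invented:

#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/pagemap.h>

static int examplefs_get_block(struct inode *, sector_t, struct buffer_head *, int);
static int examplefs_readpage(struct file *, struct page *);
static int examplefs_writepage(struct page *, struct writeback_control *);

static int examplefs_write_begin(struct file *file, struct address_space *mapping,
				 loff_t pos, unsigned len, unsigned flags,
				 struct page **pagep, void **fsdata)
{
	*pagep = NULL;
	return block_write_begin(file, mapping, pos, len, flags,
				 pagep, fsdata, examplefs_get_block);
}

static const struct address_space_operations examplefs_aops = {
	.readpage	= examplefs_readpage,
	.writepage	= examplefs_writepage,
	.write_begin	= examplefs_write_begin,
	.write_end	= generic_write_end,	/* generic buffer-layer commit */
};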
@@ -2213,174 +2163,6 @@ repeat:
 }
 EXPORT_SYMBOL(__grab_cache_page);
 
-static ssize_t generic_perform_write_2copy(struct file *file,
-		struct iov_iter *i, loff_t pos)
-{
-	struct address_space *mapping = file->f_mapping;
-	const struct address_space_operations *a_ops = mapping->a_ops;
-	struct inode *inode = mapping->host;
-	long status = 0;
-	ssize_t written = 0;
-
-	do {
-		struct page *src_page;
-		struct page *page;
-		pgoff_t index;		/* Pagecache index for current page */
-		unsigned long offset;	/* Offset into pagecache page */
-		unsigned long bytes;	/* Bytes to write to page */
-		size_t copied;		/* Bytes copied from user */
-
-		offset = (pos & (PAGE_CACHE_SIZE - 1));
-		index = pos >> PAGE_CACHE_SHIFT;
-		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
-						iov_iter_count(i));
-
-		/*
-		 * a non-NULL src_page indicates that we're doing the
-		 * copy via get_user_pages and kmap.
-		 */
-		src_page = NULL;
-
-		/*
-		 * Bring in the user page that we will copy from _first_.
-		 * Otherwise there's a nasty deadlock on copying from the
-		 * same page as we're writing to, without it being marked
-		 * up-to-date.
-		 *
-		 * Not only is this an optimisation, but it is also required
-		 * to check that the address is actually valid, when atomic
-		 * usercopies are used, below.
-		 */
-		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
-			status = -EFAULT;
-			break;
-		}
-
-		page = __grab_cache_page(mapping, index);
-		if (!page) {
-			status = -ENOMEM;
-			break;
-		}
-
-		/*
-		 * non-uptodate pages cannot cope with short copies, and we
-		 * cannot take a pagefault with the destination page locked.
-		 * So pin the source page to copy it.
-		 */
-		if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) {
-			unlock_page(page);
-
-			src_page = alloc_page(GFP_KERNEL);
-			if (!src_page) {
-				page_cache_release(page);
-				status = -ENOMEM;
-				break;
-			}
-
-			/*
-			 * Cannot get_user_pages with a page locked for the
-			 * same reason as we can't take a page fault with a
-			 * page locked (as explained below).
-			 */
-			copied = iov_iter_copy_from_user(src_page, i,
-							offset, bytes);
-			if (unlikely(copied == 0)) {
-				status = -EFAULT;
-				page_cache_release(page);
-				page_cache_release(src_page);
-				break;
-			}
-			bytes = copied;
-
-			lock_page(page);
-			/*
-			 * Can't handle the page going uptodate here, because
-			 * that means we would use non-atomic usercopies, which
-			 * zero out the tail of the page, which can cause
-			 * zeroes to become transiently visible. We could just
-			 * use a non-zeroing copy, but the APIs aren't too
-			 * consistent.
-			 */
-			if (unlikely(!page->mapping || PageUptodate(page))) {
-				unlock_page(page);
-				page_cache_release(page);
-				page_cache_release(src_page);
-				continue;
-			}
-		}
-
-		status = a_ops->prepare_write(file, page, offset, offset+bytes);
-		if (unlikely(status))
-			goto fs_write_aop_error;
-
-		if (!src_page) {
-			/*
-			 * Must not enter the pagefault handler here, because
-			 * we hold the page lock, so we might recursively
-			 * deadlock on the same lock, or get an ABBA deadlock
-			 * against a different lock, or against the mmap_sem
-			 * (which nests outside the page lock). So increment
-			 * preempt count, and use _atomic usercopies.
-			 *
-			 * The page is uptodate so we are OK to encounter a
-			 * short copy: if unmodified parts of the page are
-			 * marked dirty and written out to disk, it doesn't
-			 * really matter.
-			 */
-			pagefault_disable();
-			copied = iov_iter_copy_from_user_atomic(page, i,
-							offset, bytes);
-			pagefault_enable();
-		} else {
-			void *src, *dst;
-			src = kmap_atomic(src_page, KM_USER0);
-			dst = kmap_atomic(page, KM_USER1);
-			memcpy(dst + offset, src + offset, bytes);
-			kunmap_atomic(dst, KM_USER1);
-			kunmap_atomic(src, KM_USER0);
-			copied = bytes;
-		}
-		flush_dcache_page(page);
-
-		status = a_ops->commit_write(file, page, offset, offset+bytes);
-		if (unlikely(status < 0))
-			goto fs_write_aop_error;
-		if (unlikely(status > 0)) /* filesystem did partial write */
-			copied = min_t(size_t, copied, status);
-
-		unlock_page(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
-		if (src_page)
-			page_cache_release(src_page);
-
-		iov_iter_advance(i, copied);
-		pos += copied;
-		written += copied;
-
-		balance_dirty_pages_ratelimited(mapping);
-		cond_resched();
-		continue;
-
-fs_write_aop_error:
-		unlock_page(page);
-		page_cache_release(page);
-		if (src_page)
-			page_cache_release(src_page);
-
-		/*
-		 * prepare_write() may have instantiated a few blocks
-		 * outside i_size. Trim these off again. Don't need
-		 * i_size_read because we hold i_mutex.
-		 */
-		if (pos + bytes > inode->i_size)
-			vmtruncate(inode, inode->i_size);
-		break;
-	} while (iov_iter_count(i));
-
-	return written ? written : status;
-}
-
 static ssize_t generic_perform_write(struct file *file,
 		struct iov_iter *i, loff_t pos)
 {
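The removed double-copy path existed only to drive prepare_write()/commit_write() filesystems. The surviving generic_perform_write() avoids the same deadlock (copying from the very page being written) by faulting the source page in with iov_iter_fault_in_readable() first and then copying with page faults disabled. An abridged, hedged sketch of that surviving loop body follows (retry and error handling trimmed; an approximation, not a verbatim quote of mm/filemap.c):

		/* Abridged: one iteration of the ->write_begin/->write_end loop. */
		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
						&page, &fsdata);
		if (unlikely(status))
			break;

		pagefault_disable();	/* no recursive faults under the page lock */
		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		pagefault_enable();
		flush_dcache_page(page);

		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);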
@@ -2481,10 +2263,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	struct iov_iter i;
 
 	iov_iter_init(&i, iov, nr_segs, count, written);
-	if (a_ops->write_begin)
-		status = generic_perform_write(file, &i, pos);
-	else
-		status = generic_perform_write_2copy(file, &i, pos);
+	status = generic_perform_write(file, &i, pos);
 
 	if (likely(status >= 0)) {
 		written += status;