Diffstat (limited to 'mm/filemap.c')
-rw-r--r--	mm/filemap.c	297
1 file changed, 38 insertions, 259 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 876bc595d0f8..f3e5f8944d17 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
+#include <linux/mm_inline.h> /* for page_is_file_cache() */
 #include "internal.h"
 
 /*
@@ -115,12 +116,12 @@ void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
-	mem_cgroup_uncharge_cache_page(page);
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	BUG_ON(page_mapped(page));
+	mem_cgroup_uncharge_cache_page(page);
 
 	/*
 	 * Some filesystems seem to re-dirty the page even after
@@ -492,9 +493,24 @@ EXPORT_SYMBOL(add_to_page_cache_locked);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t offset, gfp_t gfp_mask)
 {
-	int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-	if (ret == 0)
-		lru_cache_add(page);
+	int ret;
+
+	/*
+	 * Splice_read and readahead add shmem/tmpfs pages into the page cache
+	 * before shmem_readpage has a chance to mark them as SwapBacked: they
+	 * need to go on the active_anon lru below, and mem_cgroup_cache_charge
+	 * (called in add_to_page_cache) needs to know where they're going too.
+	 */
+	if (mapping_cap_swap_backed(mapping))
+		SetPageSwapBacked(page);
+
+	ret = add_to_page_cache(page, mapping, offset, gfp_mask);
+	if (ret == 0) {
+		if (page_is_file_cache(page))
+			lru_cache_add_file(page);
+		else
+			lru_cache_add_active_anon(page);
+	}
 	return ret;
 }
 
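The comment added in this hunk carries the reasoning: pages of a swap-backed (shmem/tmpfs) mapping must be flagged before they are charged and before they go onto an LRU, so they land on the anon list rather than the file list. Below is a minimal sketch of a readahead-style caller of this interface; the function name example_add_page and the use of page_cache_alloc_cold() are illustrative assumptions, not part of this patch.

    /*
     * Illustrative sketch only: allocate a page and insert it through
     * add_to_page_cache_lru().  With the hunk above, a shmem/tmpfs mapping
     * gets PG_swapbacked set before mem_cgroup_cache_charge() runs, and the
     * page is added via lru_cache_add_active_anon() instead of the file LRU.
     */
    static int example_add_page(struct address_space *mapping, pgoff_t index)
    {
    	struct page *page = page_cache_alloc_cold(mapping);
    	int error;

    	if (!page)
    		return -ENOMEM;

    	error = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
    	if (error)
    		page_cache_release(page);	/* not inserted: drop our reference */
    	return error;
    }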
@@ -557,17 +573,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
  * mechananism between PageLocked pages and PageWriteback pages is shared.
  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
  *
- * The first mb is necessary to safely close the critical section opened by the
- * test_and_set_bit() to lock the page; the second mb is necessary to enforce
- * ordering between the clear_bit and the read of the waitqueue (to avoid SMP
- * races with a parallel wait_on_page_locked()).
+ * The mb is necessary to enforce ordering between the clear_bit and the read
+ * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
  */
 void unlock_page(struct page *page)
 {
-	smp_mb__before_clear_bit();
-	if (!test_and_clear_bit(PG_locked, &page->flags))
-		BUG();
-	smp_mb__after_clear_bit();
+	VM_BUG_ON(!PageLocked(page));
+	clear_bit_unlock(PG_locked, &page->flags);
+	smp_mb__after_clear_bit();
 	wake_up_page(page, PG_locked);
 }
 EXPORT_SYMBOL(unlock_page);
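This hunk is the unlock half of the switch to lock bitops. clear_bit_unlock() has release semantics, so the old explicit smp_mb__before_clear_bit() barrier is implied; the remaining smp_mb__after_clear_bit() only orders the flag clear against the waitqueue read in wake_up_page(). For illustration, a sketch of the acquire side that pairs with it (this mirrors trylock_page() from the same series, but treat it as a sketch rather than a quote of the patch):

    /*
     * Illustration: the acquire side that pairs with clear_bit_unlock() above.
     * test_and_set_bit_lock() has acquire semantics and clear_bit_unlock() has
     * release semantics, so no extra explicit barriers are needed around the
     * PG_locked critical section.
     */
    static inline int example_trylock_page(struct page *page)
    {
    	return !test_and_set_bit_lock(PG_locked, &page->flags);
    }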
@@ -1100,8 +1113,9 @@ page_ok:
 
 page_not_up_to_date:
 	/* Get exclusive access to the page ... */
-	if (lock_page_killable(page))
-		goto readpage_eio;
+	error = lock_page_killable(page);
+	if (unlikely(error))
+		goto readpage_error;
 
 page_not_up_to_date_locked:
 	/* Did it get truncated before we got the lock? */
@@ -1130,8 +1144,9 @@ readpage:
 	}
 
 	if (!PageUptodate(page)) {
-		if (lock_page_killable(page))
-			goto readpage_eio;
+		error = lock_page_killable(page);
+		if (unlikely(error))
+			goto readpage_error;
 		if (!PageUptodate(page)) {
 			if (page->mapping == NULL) {
 				/*
@@ -1143,15 +1158,14 @@ readpage:
 			}
 			unlock_page(page);
 			shrink_readahead_size_eio(filp, ra);
-			goto readpage_eio;
+			error = -EIO;
+			goto readpage_error;
 		}
 		unlock_page(page);
 	}
 
 	goto page_ok;
 
-readpage_eio:
-	error = -EIO;
 readpage_error:
 	/* UHHUH! A synchronous read error occurred. Report it */
 	desc->error = error;
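The three hunks above exist so that the return value of lock_page_killable() reaches the caller instead of being collapsed into -EIO: the readpage_eio label is gone, and only a genuine read failure now sets -EIO explicitly. For context, the helper whose error code is now propagated looks roughly like the sketch below in kernels of this era; it returns 0 on success or -EINTR when a fatal signal interrupts the wait (a close approximation, not a verbatim quote of <linux/pagemap.h>).

    /*
     * Sketch: __lock_page_killable() sleeps in TASK_KILLABLE and returns
     * -EINTR if a fatal signal arrives while waiting for PG_locked.
     */
    static inline int lock_page_killable(struct page *page)
    {
    	might_sleep();
    	if (!trylock_page(page))
    		return __lock_page_killable(page);
    	return 0;
    }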
@@ -1186,8 +1200,7 @@ out:
 	ra->prev_pos |= prev_offset;
 
 	*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
-	if (filp)
-		file_accessed(filp);
+	file_accessed(filp);
 }
 
 int file_read_actor(read_descriptor_t *desc, struct page *page,
@@ -2016,48 +2029,8 @@ int pagecache_write_begin(struct file *file, struct address_space *mapping,
 {
 	const struct address_space_operations *aops = mapping->a_ops;
 
-	if (aops->write_begin) {
-		return aops->write_begin(file, mapping, pos, len, flags,
+	return aops->write_begin(file, mapping, pos, len, flags,
 							pagep, fsdata);
-	} else {
-		int ret;
-		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-		struct inode *inode = mapping->host;
-		struct page *page;
-again:
-		page = __grab_cache_page(mapping, index);
-		*pagep = page;
-		if (!page)
-			return -ENOMEM;
-
-		if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
-			/*
-			 * There is no way to resolve a short write situation
-			 * for a !Uptodate page (except by double copying in
-			 * the caller done by generic_perform_write_2copy).
-			 *
-			 * Instead, we have to bring it uptodate here.
-			 */
-			ret = aops->readpage(file, page);
-			page_cache_release(page);
-			if (ret) {
-				if (ret == AOP_TRUNCATED_PAGE)
-					goto again;
-				return ret;
-			}
-			goto again;
-		}
-
-		ret = aops->prepare_write(file, page, offset, offset+len);
-		if (ret) {
-			unlock_page(page);
-			page_cache_release(page);
-			if (pos + len > inode->i_size)
-				vmtruncate(inode, inode->i_size);
-		}
-		return ret;
-	}
 }
 EXPORT_SYMBOL(pagecache_write_begin);
 
@@ -2066,32 +2039,9 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
 		struct page *page, void *fsdata)
 {
 	const struct address_space_operations *aops = mapping->a_ops;
-	int ret;
-
-	if (aops->write_end) {
-		mark_page_accessed(page);
-		ret = aops->write_end(file, mapping, pos, len, copied,
-					page, fsdata);
-	} else {
-		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-		struct inode *inode = mapping->host;
-
-		flush_dcache_page(page);
-		ret = aops->commit_write(file, page, offset, offset+len);
-		unlock_page(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
 
-		if (ret < 0) {
-			if (pos + len > inode->i_size)
-				vmtruncate(inode, inode->i_size);
-		} else if (ret > 0)
-			ret = min_t(size_t, copied, ret);
-		else
-			ret = copied;
-	}
-
-	return ret;
+	mark_page_accessed(page);
+	return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
 }
 EXPORT_SYMBOL(pagecache_write_end);
 
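With the prepare_write/commit_write fallbacks removed from both wrappers, every address_space is expected to supply ->write_begin and ->write_end, and the buffered write path reduces to the pattern sketched below. This is an abbreviated, assumption-laden sketch of one iteration of the loop that generic_perform_write() drives (the helper name example_write_one_chunk is invented for illustration; it is not the exact function body from the kernel):

    /*
     * Abbreviated sketch of one buffered-write iteration: reserve the page,
     * copy user data with page faults disabled while the page is locked,
     * then let the filesystem commit it.  write_end unlocks and releases the
     * page and returns the number of bytes it accepted.
     */
    static long example_write_one_chunk(struct file *file, struct iov_iter *i,
    				loff_t pos, unsigned long offset,
    				unsigned long bytes)
    {
    	struct address_space *mapping = file->f_mapping;
    	const struct address_space_operations *a_ops = mapping->a_ops;
    	struct page *page;
    	void *fsdata;
    	size_t copied;
    	long status;

    	status = a_ops->write_begin(file, mapping, pos, bytes, 0,
    							&page, &fsdata);
    	if (unlikely(status))
    		return status;

    	pagefault_disable();
    	copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
    	pagefault_enable();
    	flush_dcache_page(page);

    	return a_ops->write_end(file, mapping, pos, bytes, copied,
    							page, fsdata);
    }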
@@ -2213,174 +2163,6 @@ repeat:
 }
 EXPORT_SYMBOL(__grab_cache_page);
 
-static ssize_t generic_perform_write_2copy(struct file *file,
-				struct iov_iter *i, loff_t pos)
-{
-	struct address_space *mapping = file->f_mapping;
-	const struct address_space_operations *a_ops = mapping->a_ops;
-	struct inode *inode = mapping->host;
-	long status = 0;
-	ssize_t written = 0;
-
-	do {
-		struct page *src_page;
-		struct page *page;
-		pgoff_t index;		/* Pagecache index for current page */
-		unsigned long offset;	/* Offset into pagecache page */
-		unsigned long bytes;	/* Bytes to write to page */
-		size_t copied;		/* Bytes copied from user */
-
-		offset = (pos & (PAGE_CACHE_SIZE - 1));
-		index = pos >> PAGE_CACHE_SHIFT;
-		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
-						iov_iter_count(i));
-
-		/*
-		 * a non-NULL src_page indicates that we're doing the
-		 * copy via get_user_pages and kmap.
-		 */
-		src_page = NULL;
-
-		/*
-		 * Bring in the user page that we will copy from _first_.
-		 * Otherwise there's a nasty deadlock on copying from the
-		 * same page as we're writing to, without it being marked
-		 * up-to-date.
-		 *
-		 * Not only is this an optimisation, but it is also required
-		 * to check that the address is actually valid, when atomic
-		 * usercopies are used, below.
-		 */
-		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
-			status = -EFAULT;
-			break;
-		}
-
-		page = __grab_cache_page(mapping, index);
-		if (!page) {
-			status = -ENOMEM;
-			break;
-		}
-
-		/*
-		 * non-uptodate pages cannot cope with short copies, and we
-		 * cannot take a pagefault with the destination page locked.
-		 * So pin the source page to copy it.
-		 */
-		if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) {
-			unlock_page(page);
-
-			src_page = alloc_page(GFP_KERNEL);
-			if (!src_page) {
-				page_cache_release(page);
-				status = -ENOMEM;
-				break;
-			}
-
-			/*
-			 * Cannot get_user_pages with a page locked for the
-			 * same reason as we can't take a page fault with a
-			 * page locked (as explained below).
-			 */
-			copied = iov_iter_copy_from_user(src_page, i,
-							offset, bytes);
-			if (unlikely(copied == 0)) {
-				status = -EFAULT;
-				page_cache_release(page);
-				page_cache_release(src_page);
-				break;
-			}
-			bytes = copied;
-
-			lock_page(page);
-			/*
-			 * Can't handle the page going uptodate here, because
-			 * that means we would use non-atomic usercopies, which
-			 * zero out the tail of the page, which can cause
-			 * zeroes to become transiently visible. We could just
-			 * use a non-zeroing copy, but the APIs aren't too
-			 * consistent.
-			 */
-			if (unlikely(!page->mapping || PageUptodate(page))) {
-				unlock_page(page);
-				page_cache_release(page);
-				page_cache_release(src_page);
-				continue;
-			}
-		}
-
-		status = a_ops->prepare_write(file, page, offset, offset+bytes);
-		if (unlikely(status))
-			goto fs_write_aop_error;
-
-		if (!src_page) {
-			/*
-			 * Must not enter the pagefault handler here, because
-			 * we hold the page lock, so we might recursively
-			 * deadlock on the same lock, or get an ABBA deadlock
-			 * against a different lock, or against the mmap_sem
-			 * (which nests outside the page lock). So increment
-			 * preempt count, and use _atomic usercopies.
-			 *
-			 * The page is uptodate so we are OK to encounter a
-			 * short copy: if unmodified parts of the page are
-			 * marked dirty and written out to disk, it doesn't
-			 * really matter.
-			 */
-			pagefault_disable();
-			copied = iov_iter_copy_from_user_atomic(page, i,
-							offset, bytes);
-			pagefault_enable();
-		} else {
-			void *src, *dst;
-			src = kmap_atomic(src_page, KM_USER0);
-			dst = kmap_atomic(page, KM_USER1);
-			memcpy(dst + offset, src + offset, bytes);
-			kunmap_atomic(dst, KM_USER1);
-			kunmap_atomic(src, KM_USER0);
-			copied = bytes;
-		}
-		flush_dcache_page(page);
-
-		status = a_ops->commit_write(file, page, offset, offset+bytes);
-		if (unlikely(status < 0))
-			goto fs_write_aop_error;
-		if (unlikely(status > 0)) /* filesystem did partial write */
-			copied = min_t(size_t, copied, status);
-
-		unlock_page(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
-		if (src_page)
-			page_cache_release(src_page);
-
-		iov_iter_advance(i, copied);
-		pos += copied;
-		written += copied;
-
-		balance_dirty_pages_ratelimited(mapping);
-		cond_resched();
-		continue;
-
-fs_write_aop_error:
-		unlock_page(page);
-		page_cache_release(page);
-		if (src_page)
-			page_cache_release(src_page);
-
-		/*
-		 * prepare_write() may have instantiated a few blocks
-		 * outside i_size. Trim these off again. Don't need
-		 * i_size_read because we hold i_mutex.
-		 */
-		if (pos + bytes > inode->i_size)
-			vmtruncate(inode, inode->i_size);
-		break;
-	} while (iov_iter_count(i));
-
-	return written ? written : status;
-}
-
 static ssize_t generic_perform_write(struct file *file,
 				struct iov_iter *i, loff_t pos)
 {
@@ -2481,10 +2263,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	struct iov_iter i;
 
 	iov_iter_init(&i, iov, nr_segs, count, written);
-	if (a_ops->write_begin)
-		status = generic_perform_write(file, &i, pos);
-	else
-		status = generic_perform_write_2copy(file, &i, pos);
+	status = generic_perform_write(file, &i, pos);
 
 	if (likely(status >= 0)) {
 		written += status;
