aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/balloon_compaction.c2
-rw-r--r--mm/bootmem.c9
-rw-r--r--mm/cma.c68
-rw-r--r--mm/compaction.c21
-rw-r--r--mm/huge_memory.c15
-rw-r--r--mm/internal.h25
-rw-r--r--mm/iov_iter.c1062
-rw-r--r--mm/memcontrol.c105
-rw-r--r--mm/memory.c1
-rw-r--r--mm/memory_hotplug.c31
-rw-r--r--mm/mmap.c8
-rw-r--r--mm/nobootmem.c8
-rw-r--r--mm/page-writeback.c43
-rw-r--r--mm/page_alloc.c68
-rw-r--r--mm/page_cgroup.c1
-rw-r--r--mm/page_isolation.c43
-rw-r--r--mm/rmap.c88
-rw-r--r--mm/slab_common.c14
-rw-r--r--mm/truncate.c6
19 files changed, 762 insertions, 856 deletions
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index b3cbe19f71b5..fcad8322ef36 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -68,11 +68,13 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
68 * to be released by the balloon driver. 68 * to be released by the balloon driver.
69 */ 69 */
70 if (trylock_page(page)) { 70 if (trylock_page(page)) {
71#ifdef CONFIG_BALLOON_COMPACTION
71 if (!PagePrivate(page)) { 72 if (!PagePrivate(page)) {
72 /* raced with isolation */ 73 /* raced with isolation */
73 unlock_page(page); 74 unlock_page(page);
74 continue; 75 continue;
75 } 76 }
77#endif
76 spin_lock_irqsave(&b_dev_info->pages_lock, flags); 78 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
77 balloon_page_delete(page); 79 balloon_page_delete(page);
78 __count_vm_event(BALLOON_DEFLATE); 80 __count_vm_event(BALLOON_DEFLATE);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 8a000cebb0d7..477be696511d 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -243,13 +243,10 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
243 243
244static int reset_managed_pages_done __initdata; 244static int reset_managed_pages_done __initdata;
245 245
246static inline void __init reset_node_managed_pages(pg_data_t *pgdat) 246void reset_node_managed_pages(pg_data_t *pgdat)
247{ 247{
248 struct zone *z; 248 struct zone *z;
249 249
250 if (reset_managed_pages_done)
251 return;
252
253 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) 250 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
254 z->managed_pages = 0; 251 z->managed_pages = 0;
255} 252}
@@ -258,8 +255,12 @@ void __init reset_all_zones_managed_pages(void)
258{ 255{
259 struct pglist_data *pgdat; 256 struct pglist_data *pgdat;
260 257
258 if (reset_managed_pages_done)
259 return;
260
261 for_each_online_pgdat(pgdat) 261 for_each_online_pgdat(pgdat)
262 reset_node_managed_pages(pgdat); 262 reset_node_managed_pages(pgdat);
263
263 reset_managed_pages_done = 1; 264 reset_managed_pages_done = 1;
264} 265}
265 266
diff --git a/mm/cma.c b/mm/cma.c
index 963bc4add9af..fde706e1284f 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -124,6 +124,7 @@ static int __init cma_activate_area(struct cma *cma)
124 124
125err: 125err:
126 kfree(cma->bitmap); 126 kfree(cma->bitmap);
127 cma->count = 0;
127 return -EINVAL; 128 return -EINVAL;
128} 129}
129 130
@@ -217,9 +218,8 @@ int __init cma_declare_contiguous(phys_addr_t base,
217 phys_addr_t highmem_start = __pa(high_memory); 218 phys_addr_t highmem_start = __pa(high_memory);
218 int ret = 0; 219 int ret = 0;
219 220
220 pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", 221 pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
221 __func__, (unsigned long)size, (unsigned long)base, 222 __func__, &size, &base, &limit, &alignment);
222 (unsigned long)limit, (unsigned long)alignment);
223 223
224 if (cma_area_count == ARRAY_SIZE(cma_areas)) { 224 if (cma_area_count == ARRAY_SIZE(cma_areas)) {
225 pr_err("Not enough slots for CMA reserved regions!\n"); 225 pr_err("Not enough slots for CMA reserved regions!\n");
@@ -244,52 +244,72 @@ int __init cma_declare_contiguous(phys_addr_t base,
244 size = ALIGN(size, alignment); 244 size = ALIGN(size, alignment);
245 limit &= ~(alignment - 1); 245 limit &= ~(alignment - 1);
246 246
247 if (!base)
248 fixed = false;
249
247 /* size should be aligned with order_per_bit */ 250 /* size should be aligned with order_per_bit */
248 if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) 251 if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
249 return -EINVAL; 252 return -EINVAL;
250 253
251 /* 254 /*
252 * adjust limit to avoid crossing low/high memory boundary for 255 * If allocating at a fixed base the request region must not cross the
253 * automatically allocated regions 256 * low/high memory boundary.
254 */ 257 */
255 if (((limit == 0 || limit > memblock_end) && 258 if (fixed && base < highmem_start && base + size > highmem_start) {
256 (memblock_end - size < highmem_start &&
257 memblock_end > highmem_start)) ||
258 (!fixed && limit > highmem_start && limit - size < highmem_start)) {
259 limit = highmem_start;
260 }
261
262 if (fixed && base < highmem_start && base+size > highmem_start) {
263 ret = -EINVAL; 259 ret = -EINVAL;
264 pr_err("Region at %08lx defined on low/high memory boundary (%08lx)\n", 260 pr_err("Region at %pa defined on low/high memory boundary (%pa)\n",
265 (unsigned long)base, (unsigned long)highmem_start); 261 &base, &highmem_start);
266 goto err; 262 goto err;
267 } 263 }
268 264
265 /*
266 * If the limit is unspecified or above the memblock end, its effective
267 * value will be the memblock end. Set it explicitly to simplify further
268 * checks.
269 */
270 if (limit == 0 || limit > memblock_end)
271 limit = memblock_end;
272
269 /* Reserve memory */ 273 /* Reserve memory */
270 if (base && fixed) { 274 if (fixed) {
271 if (memblock_is_region_reserved(base, size) || 275 if (memblock_is_region_reserved(base, size) ||
272 memblock_reserve(base, size) < 0) { 276 memblock_reserve(base, size) < 0) {
273 ret = -EBUSY; 277 ret = -EBUSY;
274 goto err; 278 goto err;
275 } 279 }
276 } else { 280 } else {
277 phys_addr_t addr = memblock_alloc_range(size, alignment, base, 281 phys_addr_t addr = 0;
278 limit); 282
283 /*
284 * All pages in the reserved area must come from the same zone.
285 * If the requested region crosses the low/high memory boundary,
286 * try allocating from high memory first and fall back to low
287 * memory in case of failure.
288 */
289 if (base < highmem_start && limit > highmem_start) {
290 addr = memblock_alloc_range(size, alignment,
291 highmem_start, limit);
292 limit = highmem_start;
293 }
294
279 if (!addr) { 295 if (!addr) {
280 ret = -ENOMEM; 296 addr = memblock_alloc_range(size, alignment, base,
281 goto err; 297 limit);
282 } else { 298 if (!addr) {
283 base = addr; 299 ret = -ENOMEM;
300 goto err;
301 }
284 } 302 }
303
304 base = addr;
285 } 305 }
286 306
287 ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma); 307 ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma);
288 if (ret) 308 if (ret)
289 goto err; 309 goto err;
290 310
291 pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, 311 pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
292 (unsigned long)base); 312 &base);
293 return 0; 313 return 0;
294 314
295err: 315err:
diff --git a/mm/compaction.c b/mm/compaction.c
index edba18aed173..f9792ba3537c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -479,6 +479,16 @@ isolate_freepages_range(struct compact_control *cc,
479 479
480 block_end_pfn = min(block_end_pfn, end_pfn); 480 block_end_pfn = min(block_end_pfn, end_pfn);
481 481
482 /*
483 * pfn could pass the block_end_pfn if isolated freepage
484 * is more than pageblock order. In this case, we adjust
485 * scanning range to right one.
486 */
487 if (pfn >= block_end_pfn) {
488 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
489 block_end_pfn = min(block_end_pfn, end_pfn);
490 }
491
482 if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) 492 if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
483 break; 493 break;
484 494
@@ -784,6 +794,9 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
784 cc->nr_migratepages = 0; 794 cc->nr_migratepages = 0;
785 break; 795 break;
786 } 796 }
797
798 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
799 break;
787 } 800 }
788 acct_isolated(cc->zone, cc); 801 acct_isolated(cc->zone, cc);
789 802
@@ -1026,8 +1039,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1026 } 1039 }
1027 1040
1028 acct_isolated(zone, cc); 1041 acct_isolated(zone, cc);
1029 /* Record where migration scanner will be restarted */ 1042 /*
1030 cc->migrate_pfn = low_pfn; 1043 * Record where migration scanner will be restarted. If we end up in
1044 * the same pageblock as the free scanner, make the scanners fully
1045 * meet so that compact_finished() terminates compaction.
1046 */
1047 cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;
1031 1048
1032 return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; 1049 return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
1033} 1050}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 74c78aa8bc2f..de984159cf0b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -200,7 +200,7 @@ retry:
200 preempt_disable(); 200 preempt_disable();
201 if (cmpxchg(&huge_zero_page, NULL, zero_page)) { 201 if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
202 preempt_enable(); 202 preempt_enable();
203 __free_page(zero_page); 203 __free_pages(zero_page, compound_order(zero_page));
204 goto retry; 204 goto retry;
205 } 205 }
206 206
@@ -232,7 +232,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
232 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { 232 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
233 struct page *zero_page = xchg(&huge_zero_page, NULL); 233 struct page *zero_page = xchg(&huge_zero_page, NULL);
234 BUG_ON(zero_page == NULL); 234 BUG_ON(zero_page == NULL);
235 __free_page(zero_page); 235 __free_pages(zero_page, compound_order(zero_page));
236 return HPAGE_PMD_NR; 236 return HPAGE_PMD_NR;
237 } 237 }
238 238
@@ -803,7 +803,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
803 return VM_FAULT_FALLBACK; 803 return VM_FAULT_FALLBACK;
804 if (unlikely(anon_vma_prepare(vma))) 804 if (unlikely(anon_vma_prepare(vma)))
805 return VM_FAULT_OOM; 805 return VM_FAULT_OOM;
806 if (unlikely(khugepaged_enter(vma))) 806 if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
807 return VM_FAULT_OOM; 807 return VM_FAULT_OOM;
808 if (!(flags & FAULT_FLAG_WRITE) && 808 if (!(flags & FAULT_FLAG_WRITE) &&
809 transparent_hugepage_use_zero_page()) { 809 transparent_hugepage_use_zero_page()) {
@@ -1970,7 +1970,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
1970 * register it here without waiting a page fault that 1970 * register it here without waiting a page fault that
1971 * may not happen any time soon. 1971 * may not happen any time soon.
1972 */ 1972 */
1973 if (unlikely(khugepaged_enter_vma_merge(vma))) 1973 if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags)))
1974 return -ENOMEM; 1974 return -ENOMEM;
1975 break; 1975 break;
1976 case MADV_NOHUGEPAGE: 1976 case MADV_NOHUGEPAGE:
@@ -2071,7 +2071,8 @@ int __khugepaged_enter(struct mm_struct *mm)
2071 return 0; 2071 return 0;
2072} 2072}
2073 2073
2074int khugepaged_enter_vma_merge(struct vm_area_struct *vma) 2074int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
2075 unsigned long vm_flags)
2075{ 2076{
2076 unsigned long hstart, hend; 2077 unsigned long hstart, hend;
2077 if (!vma->anon_vma) 2078 if (!vma->anon_vma)
@@ -2083,11 +2084,11 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
2083 if (vma->vm_ops) 2084 if (vma->vm_ops)
2084 /* khugepaged not yet working on file or special mappings */ 2085 /* khugepaged not yet working on file or special mappings */
2085 return 0; 2086 return 0;
2086 VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); 2087 VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma);
2087 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2088 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2088 hend = vma->vm_end & HPAGE_PMD_MASK; 2089 hend = vma->vm_end & HPAGE_PMD_MASK;
2089 if (hstart < hend) 2090 if (hstart < hend)
2090 return khugepaged_enter(vma); 2091 return khugepaged_enter(vma, vm_flags);
2091 return 0; 2092 return 0;
2092} 2093}
2093 2094
diff --git a/mm/internal.h b/mm/internal.h
index 829304090b90..a4f90ba7068e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -108,6 +108,31 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
108/* 108/*
109 * in mm/page_alloc.c 109 * in mm/page_alloc.c
110 */ 110 */
111
112/*
113 * Locate the struct page for both the matching buddy in our
114 * pair (buddy1) and the combined O(n+1) page they form (page).
115 *
116 * 1) Any buddy B1 will have an order O twin B2 which satisfies
117 * the following equation:
118 * B2 = B1 ^ (1 << O)
119 * For example, if the starting buddy (buddy2) is #8 its order
120 * 1 buddy is #10:
121 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
122 *
123 * 2) Any buddy B will have an order O+1 parent P which
124 * satisfies the following equation:
125 * P = B & ~(1 << O)
126 *
127 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
128 */
129static inline unsigned long
130__find_buddy_index(unsigned long page_idx, unsigned int order)
131{
132 return page_idx ^ (1 << order);
133}
134
135extern int __isolate_free_page(struct page *page, unsigned int order);
111extern void __free_pages_bootmem(struct page *page, unsigned int order); 136extern void __free_pages_bootmem(struct page *page, unsigned int order);
112extern void prep_compound_page(struct page *page, unsigned long order); 137extern void prep_compound_page(struct page *page, unsigned long order);
113#ifdef CONFIG_MEMORY_FAILURE 138#ifdef CONFIG_MEMORY_FAILURE
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index eafcf60f6b83..a1599ca4ab0e 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -3,95 +3,136 @@
3#include <linux/pagemap.h> 3#include <linux/pagemap.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <linux/vmalloc.h> 5#include <linux/vmalloc.h>
6 6#include <net/checksum.h>
7static size_t copy_to_iter_iovec(void *from, size_t bytes, struct iov_iter *i) 7
8{ 8#define iterate_iovec(i, n, __v, __p, skip, STEP) { \
9 size_t skip, copy, left, wanted; 9 size_t left; \
10 const struct iovec *iov; 10 size_t wanted = n; \
11 char __user *buf; 11 __p = i->iov; \
12 12 __v.iov_len = min(n, __p->iov_len - skip); \
13 if (unlikely(bytes > i->count)) 13 if (likely(__v.iov_len)) { \
14 bytes = i->count; 14 __v.iov_base = __p->iov_base + skip; \
15 15 left = (STEP); \
16 if (unlikely(!bytes)) 16 __v.iov_len -= left; \
17 return 0; 17 skip += __v.iov_len; \
18 18 n -= __v.iov_len; \
19 wanted = bytes; 19 } else { \
20 iov = i->iov; 20 left = 0; \
21 skip = i->iov_offset; 21 } \
22 buf = iov->iov_base + skip; 22 while (unlikely(!left && n)) { \
23 copy = min(bytes, iov->iov_len - skip); 23 __p++; \
24 24 __v.iov_len = min(n, __p->iov_len); \
25 left = __copy_to_user(buf, from, copy); 25 if (unlikely(!__v.iov_len)) \
26 copy -= left; 26 continue; \
27 skip += copy; 27 __v.iov_base = __p->iov_base; \
28 from += copy; 28 left = (STEP); \
29 bytes -= copy; 29 __v.iov_len -= left; \
30 while (unlikely(!left && bytes)) { 30 skip = __v.iov_len; \
31 iov++; 31 n -= __v.iov_len; \
32 buf = iov->iov_base; 32 } \
33 copy = min(bytes, iov->iov_len); 33 n = wanted - n; \
34 left = __copy_to_user(buf, from, copy); 34}
35 copy -= left; 35
36 skip = copy; 36#define iterate_kvec(i, n, __v, __p, skip, STEP) { \
37 from += copy; 37 size_t wanted = n; \
38 bytes -= copy; 38 __p = i->kvec; \
39 } 39 __v.iov_len = min(n, __p->iov_len - skip); \
40 40 if (likely(__v.iov_len)) { \
41 if (skip == iov->iov_len) { 41 __v.iov_base = __p->iov_base + skip; \
42 iov++; 42 (void)(STEP); \
43 skip = 0; 43 skip += __v.iov_len; \
44 } 44 n -= __v.iov_len; \
45 i->count -= wanted - bytes; 45 } \
46 i->nr_segs -= iov - i->iov; 46 while (unlikely(n)) { \
47 i->iov = iov; 47 __p++; \
48 i->iov_offset = skip; 48 __v.iov_len = min(n, __p->iov_len); \
49 return wanted - bytes; 49 if (unlikely(!__v.iov_len)) \
50} 50 continue; \
51 51 __v.iov_base = __p->iov_base; \
52static size_t copy_from_iter_iovec(void *to, size_t bytes, struct iov_iter *i) 52 (void)(STEP); \
53{ 53 skip = __v.iov_len; \
54 size_t skip, copy, left, wanted; 54 n -= __v.iov_len; \
55 const struct iovec *iov; 55 } \
56 char __user *buf; 56 n = wanted; \
57 57}
58 if (unlikely(bytes > i->count)) 58
59 bytes = i->count; 59#define iterate_bvec(i, n, __v, __p, skip, STEP) { \
60 60 size_t wanted = n; \
61 if (unlikely(!bytes)) 61 __p = i->bvec; \
62 return 0; 62 __v.bv_len = min_t(size_t, n, __p->bv_len - skip); \
63 63 if (likely(__v.bv_len)) { \
64 wanted = bytes; 64 __v.bv_page = __p->bv_page; \
65 iov = i->iov; 65 __v.bv_offset = __p->bv_offset + skip; \
66 skip = i->iov_offset; 66 (void)(STEP); \
67 buf = iov->iov_base + skip; 67 skip += __v.bv_len; \
68 copy = min(bytes, iov->iov_len - skip); 68 n -= __v.bv_len; \
69 69 } \
70 left = __copy_from_user(to, buf, copy); 70 while (unlikely(n)) { \
71 copy -= left; 71 __p++; \
72 skip += copy; 72 __v.bv_len = min_t(size_t, n, __p->bv_len); \
73 to += copy; 73 if (unlikely(!__v.bv_len)) \
74 bytes -= copy; 74 continue; \
75 while (unlikely(!left && bytes)) { 75 __v.bv_page = __p->bv_page; \
76 iov++; 76 __v.bv_offset = __p->bv_offset; \
77 buf = iov->iov_base; 77 (void)(STEP); \
78 copy = min(bytes, iov->iov_len); 78 skip = __v.bv_len; \
79 left = __copy_from_user(to, buf, copy); 79 n -= __v.bv_len; \
80 copy -= left; 80 } \
81 skip = copy; 81 n = wanted; \
82 to += copy; 82}
83 bytes -= copy; 83
84 } 84#define iterate_all_kinds(i, n, v, I, B, K) { \
85 85 size_t skip = i->iov_offset; \
86 if (skip == iov->iov_len) { 86 if (unlikely(i->type & ITER_BVEC)) { \
87 iov++; 87 const struct bio_vec *bvec; \
88 skip = 0; 88 struct bio_vec v; \
89 } 89 iterate_bvec(i, n, v, bvec, skip, (B)) \
90 i->count -= wanted - bytes; 90 } else if (unlikely(i->type & ITER_KVEC)) { \
91 i->nr_segs -= iov - i->iov; 91 const struct kvec *kvec; \
92 i->iov = iov; 92 struct kvec v; \
93 i->iov_offset = skip; 93 iterate_kvec(i, n, v, kvec, skip, (K)) \
94 return wanted - bytes; 94 } else { \
95 const struct iovec *iov; \
96 struct iovec v; \
97 iterate_iovec(i, n, v, iov, skip, (I)) \
98 } \
99}
100
101#define iterate_and_advance(i, n, v, I, B, K) { \
102 size_t skip = i->iov_offset; \
103 if (unlikely(i->type & ITER_BVEC)) { \
104 const struct bio_vec *bvec; \
105 struct bio_vec v; \
106 iterate_bvec(i, n, v, bvec, skip, (B)) \
107 if (skip == bvec->bv_len) { \
108 bvec++; \
109 skip = 0; \
110 } \
111 i->nr_segs -= bvec - i->bvec; \
112 i->bvec = bvec; \
113 } else if (unlikely(i->type & ITER_KVEC)) { \
114 const struct kvec *kvec; \
115 struct kvec v; \
116 iterate_kvec(i, n, v, kvec, skip, (K)) \
117 if (skip == kvec->iov_len) { \
118 kvec++; \
119 skip = 0; \
120 } \
121 i->nr_segs -= kvec - i->kvec; \
122 i->kvec = kvec; \
123 } else { \
124 const struct iovec *iov; \
125 struct iovec v; \
126 iterate_iovec(i, n, v, iov, skip, (I)) \
127 if (skip == iov->iov_len) { \
128 iov++; \
129 skip = 0; \
130 } \
131 i->nr_segs -= iov - i->iov; \
132 i->iov = iov; \
133 } \
134 i->count -= n; \
135 i->iov_offset = skip; \
95} 136}
96 137
97static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, 138static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
@@ -256,134 +297,6 @@ done:
256 return wanted - bytes; 297 return wanted - bytes;
257} 298}
258 299
259static size_t zero_iovec(size_t bytes, struct iov_iter *i)
260{
261 size_t skip, copy, left, wanted;
262 const struct iovec *iov;
263 char __user *buf;
264
265 if (unlikely(bytes > i->count))
266 bytes = i->count;
267
268 if (unlikely(!bytes))
269 return 0;
270
271 wanted = bytes;
272 iov = i->iov;
273 skip = i->iov_offset;
274 buf = iov->iov_base + skip;
275 copy = min(bytes, iov->iov_len - skip);
276
277 left = __clear_user(buf, copy);
278 copy -= left;
279 skip += copy;
280 bytes -= copy;
281
282 while (unlikely(!left && bytes)) {
283 iov++;
284 buf = iov->iov_base;
285 copy = min(bytes, iov->iov_len);
286 left = __clear_user(buf, copy);
287 copy -= left;
288 skip = copy;
289 bytes -= copy;
290 }
291
292 if (skip == iov->iov_len) {
293 iov++;
294 skip = 0;
295 }
296 i->count -= wanted - bytes;
297 i->nr_segs -= iov - i->iov;
298 i->iov = iov;
299 i->iov_offset = skip;
300 return wanted - bytes;
301}
302
303static size_t __iovec_copy_from_user_inatomic(char *vaddr,
304 const struct iovec *iov, size_t base, size_t bytes)
305{
306 size_t copied = 0, left = 0;
307
308 while (bytes) {
309 char __user *buf = iov->iov_base + base;
310 int copy = min(bytes, iov->iov_len - base);
311
312 base = 0;
313 left = __copy_from_user_inatomic(vaddr, buf, copy);
314 copied += copy;
315 bytes -= copy;
316 vaddr += copy;
317 iov++;
318
319 if (unlikely(left))
320 break;
321 }
322 return copied - left;
323}
324
325/*
326 * Copy as much as we can into the page and return the number of bytes which
327 * were successfully copied. If a fault is encountered then return the number of
328 * bytes which were copied.
329 */
330static size_t copy_from_user_atomic_iovec(struct page *page,
331 struct iov_iter *i, unsigned long offset, size_t bytes)
332{
333 char *kaddr;
334 size_t copied;
335
336 kaddr = kmap_atomic(page);
337 if (likely(i->nr_segs == 1)) {
338 int left;
339 char __user *buf = i->iov->iov_base + i->iov_offset;
340 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
341 copied = bytes - left;
342 } else {
343 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
344 i->iov, i->iov_offset, bytes);
345 }
346 kunmap_atomic(kaddr);
347
348 return copied;
349}
350
351static void advance_iovec(struct iov_iter *i, size_t bytes)
352{
353 BUG_ON(i->count < bytes);
354
355 if (likely(i->nr_segs == 1)) {
356 i->iov_offset += bytes;
357 i->count -= bytes;
358 } else {
359 const struct iovec *iov = i->iov;
360 size_t base = i->iov_offset;
361 unsigned long nr_segs = i->nr_segs;
362
363 /*
364 * The !iov->iov_len check ensures we skip over unlikely
365 * zero-length segments (without overruning the iovec).
366 */
367 while (bytes || unlikely(i->count && !iov->iov_len)) {
368 int copy;
369
370 copy = min(bytes, iov->iov_len - base);
371 BUG_ON(!i->count || i->count < copy);
372 i->count -= copy;
373 bytes -= copy;
374 base += copy;
375 if (iov->iov_len == base) {
376 iov++;
377 nr_segs--;
378 base = 0;
379 }
380 }
381 i->iov = iov;
382 i->iov_offset = base;
383 i->nr_segs = nr_segs;
384 }
385}
386
387/* 300/*
388 * Fault in the first iovec of the given iov_iter, to a maximum length 301 * Fault in the first iovec of the given iov_iter, to a maximum length
389 * of bytes. Returns 0 on success, or non-zero if the memory could not be 302 * of bytes. Returns 0 on success, or non-zero if the memory could not be
@@ -395,7 +308,7 @@ static void advance_iovec(struct iov_iter *i, size_t bytes)
395 */ 308 */
396int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) 309int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
397{ 310{
398 if (!(i->type & ITER_BVEC)) { 311 if (!(i->type & (ITER_BVEC|ITER_KVEC))) {
399 char __user *buf = i->iov->iov_base + i->iov_offset; 312 char __user *buf = i->iov->iov_base + i->iov_offset;
400 bytes = min(bytes, i->iov->iov_len - i->iov_offset); 313 bytes = min(bytes, i->iov->iov_len - i->iov_offset);
401 return fault_in_pages_readable(buf, bytes); 314 return fault_in_pages_readable(buf, bytes);
@@ -404,136 +317,25 @@ int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
404} 317}
405EXPORT_SYMBOL(iov_iter_fault_in_readable); 318EXPORT_SYMBOL(iov_iter_fault_in_readable);
406 319
407static unsigned long alignment_iovec(const struct iov_iter *i)
408{
409 const struct iovec *iov = i->iov;
410 unsigned long res;
411 size_t size = i->count;
412 size_t n;
413
414 if (!size)
415 return 0;
416
417 res = (unsigned long)iov->iov_base + i->iov_offset;
418 n = iov->iov_len - i->iov_offset;
419 if (n >= size)
420 return res | size;
421 size -= n;
422 res |= n;
423 while (size > (++iov)->iov_len) {
424 res |= (unsigned long)iov->iov_base | iov->iov_len;
425 size -= iov->iov_len;
426 }
427 res |= (unsigned long)iov->iov_base | size;
428 return res;
429}
430
431void iov_iter_init(struct iov_iter *i, int direction, 320void iov_iter_init(struct iov_iter *i, int direction,
432 const struct iovec *iov, unsigned long nr_segs, 321 const struct iovec *iov, unsigned long nr_segs,
433 size_t count) 322 size_t count)
434{ 323{
435 /* It will get better. Eventually... */ 324 /* It will get better. Eventually... */
436 if (segment_eq(get_fs(), KERNEL_DS)) 325 if (segment_eq(get_fs(), KERNEL_DS)) {
437 direction |= ITER_KVEC; 326 direction |= ITER_KVEC;
438 i->type = direction; 327 i->type = direction;
439 i->iov = iov; 328 i->kvec = (struct kvec *)iov;
329 } else {
330 i->type = direction;
331 i->iov = iov;
332 }
440 i->nr_segs = nr_segs; 333 i->nr_segs = nr_segs;
441 i->iov_offset = 0; 334 i->iov_offset = 0;
442 i->count = count; 335 i->count = count;
443} 336}
444EXPORT_SYMBOL(iov_iter_init); 337EXPORT_SYMBOL(iov_iter_init);
445 338
446static ssize_t get_pages_iovec(struct iov_iter *i,
447 struct page **pages, size_t maxsize, unsigned maxpages,
448 size_t *start)
449{
450 size_t offset = i->iov_offset;
451 const struct iovec *iov = i->iov;
452 size_t len;
453 unsigned long addr;
454 int n;
455 int res;
456
457 len = iov->iov_len - offset;
458 if (len > i->count)
459 len = i->count;
460 if (len > maxsize)
461 len = maxsize;
462 addr = (unsigned long)iov->iov_base + offset;
463 len += *start = addr & (PAGE_SIZE - 1);
464 if (len > maxpages * PAGE_SIZE)
465 len = maxpages * PAGE_SIZE;
466 addr &= ~(PAGE_SIZE - 1);
467 n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
468 res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
469 if (unlikely(res < 0))
470 return res;
471 return (res == n ? len : res * PAGE_SIZE) - *start;
472}
473
474static ssize_t get_pages_alloc_iovec(struct iov_iter *i,
475 struct page ***pages, size_t maxsize,
476 size_t *start)
477{
478 size_t offset = i->iov_offset;
479 const struct iovec *iov = i->iov;
480 size_t len;
481 unsigned long addr;
482 void *p;
483 int n;
484 int res;
485
486 len = iov->iov_len - offset;
487 if (len > i->count)
488 len = i->count;
489 if (len > maxsize)
490 len = maxsize;
491 addr = (unsigned long)iov->iov_base + offset;
492 len += *start = addr & (PAGE_SIZE - 1);
493 addr &= ~(PAGE_SIZE - 1);
494 n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
495
496 p = kmalloc(n * sizeof(struct page *), GFP_KERNEL);
497 if (!p)
498 p = vmalloc(n * sizeof(struct page *));
499 if (!p)
500 return -ENOMEM;
501
502 res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
503 if (unlikely(res < 0)) {
504 kvfree(p);
505 return res;
506 }
507 *pages = p;
508 return (res == n ? len : res * PAGE_SIZE) - *start;
509}
510
511static int iov_iter_npages_iovec(const struct iov_iter *i, int maxpages)
512{
513 size_t offset = i->iov_offset;
514 size_t size = i->count;
515 const struct iovec *iov = i->iov;
516 int npages = 0;
517 int n;
518
519 for (n = 0; size && n < i->nr_segs; n++, iov++) {
520 unsigned long addr = (unsigned long)iov->iov_base + offset;
521 size_t len = iov->iov_len - offset;
522 offset = 0;
523 if (unlikely(!len)) /* empty segment */
524 continue;
525 if (len > size)
526 len = size;
527 npages += (addr + len + PAGE_SIZE - 1) / PAGE_SIZE
528 - addr / PAGE_SIZE;
529 if (npages >= maxpages) /* don't bother going further */
530 return maxpages;
531 size -= len;
532 offset = 0;
533 }
534 return min(npages, maxpages);
535}
536
537static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) 339static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len)
538{ 340{
539 char *from = kmap_atomic(page); 341 char *from = kmap_atomic(page);
@@ -555,293 +357,78 @@ static void memzero_page(struct page *page, size_t offset, size_t len)
555 kunmap_atomic(addr); 357 kunmap_atomic(addr);
556} 358}
557 359
558static size_t copy_to_iter_bvec(void *from, size_t bytes, struct iov_iter *i) 360size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i)
559{ 361{
560 size_t skip, copy, wanted; 362 char *from = addr;
561 const struct bio_vec *bvec;
562
563 if (unlikely(bytes > i->count)) 363 if (unlikely(bytes > i->count))
564 bytes = i->count; 364 bytes = i->count;
565 365
566 if (unlikely(!bytes)) 366 if (unlikely(!bytes))
567 return 0; 367 return 0;
568 368
569 wanted = bytes; 369 iterate_and_advance(i, bytes, v,
570 bvec = i->bvec; 370 __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len,
571 skip = i->iov_offset; 371 v.iov_len),
572 copy = min_t(size_t, bytes, bvec->bv_len - skip); 372 memcpy_to_page(v.bv_page, v.bv_offset,
373 (from += v.bv_len) - v.bv_len, v.bv_len),
374 memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len)
375 )
573 376
574 memcpy_to_page(bvec->bv_page, skip + bvec->bv_offset, from, copy); 377 return bytes;
575 skip += copy;
576 from += copy;
577 bytes -= copy;
578 while (bytes) {
579 bvec++;
580 copy = min(bytes, (size_t)bvec->bv_len);
581 memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, copy);
582 skip = copy;
583 from += copy;
584 bytes -= copy;
585 }
586 if (skip == bvec->bv_len) {
587 bvec++;
588 skip = 0;
589 }
590 i->count -= wanted - bytes;
591 i->nr_segs -= bvec - i->bvec;
592 i->bvec = bvec;
593 i->iov_offset = skip;
594 return wanted - bytes;
595} 378}
379EXPORT_SYMBOL(copy_to_iter);
596 380
597static size_t copy_from_iter_bvec(void *to, size_t bytes, struct iov_iter *i) 381size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
598{ 382{
599 size_t skip, copy, wanted; 383 char *to = addr;
600 const struct bio_vec *bvec;
601
602 if (unlikely(bytes > i->count)) 384 if (unlikely(bytes > i->count))
603 bytes = i->count; 385 bytes = i->count;
604 386
605 if (unlikely(!bytes)) 387 if (unlikely(!bytes))
606 return 0; 388 return 0;
607 389
608 wanted = bytes; 390 iterate_and_advance(i, bytes, v,
609 bvec = i->bvec; 391 __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base,
610 skip = i->iov_offset; 392 v.iov_len),
611 393 memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
612 copy = min(bytes, bvec->bv_len - skip); 394 v.bv_offset, v.bv_len),
613 395 memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
614 memcpy_from_page(to, bvec->bv_page, bvec->bv_offset + skip, copy); 396 )
615
616 to += copy;
617 skip += copy;
618 bytes -= copy;
619
620 while (bytes) {
621 bvec++;
622 copy = min(bytes, (size_t)bvec->bv_len);
623 memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, copy);
624 skip = copy;
625 to += copy;
626 bytes -= copy;
627 }
628 if (skip == bvec->bv_len) {
629 bvec++;
630 skip = 0;
631 }
632 i->count -= wanted;
633 i->nr_segs -= bvec - i->bvec;
634 i->bvec = bvec;
635 i->iov_offset = skip;
636 return wanted;
637}
638
639static size_t copy_page_to_iter_bvec(struct page *page, size_t offset,
640 size_t bytes, struct iov_iter *i)
641{
642 void *kaddr = kmap_atomic(page);
643 size_t wanted = copy_to_iter_bvec(kaddr + offset, bytes, i);
644 kunmap_atomic(kaddr);
645 return wanted;
646}
647 397
648static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, 398 return bytes;
649 size_t bytes, struct iov_iter *i)
650{
651 void *kaddr = kmap_atomic(page);
652 size_t wanted = copy_from_iter_bvec(kaddr + offset, bytes, i);
653 kunmap_atomic(kaddr);
654 return wanted;
655} 399}
400EXPORT_SYMBOL(copy_from_iter);
656 401
657static size_t zero_bvec(size_t bytes, struct iov_iter *i) 402size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
658{ 403{
659 size_t skip, copy, wanted; 404 char *to = addr;
660 const struct bio_vec *bvec;
661
662 if (unlikely(bytes > i->count)) 405 if (unlikely(bytes > i->count))
663 bytes = i->count; 406 bytes = i->count;
664 407
665 if (unlikely(!bytes)) 408 if (unlikely(!bytes))
666 return 0; 409 return 0;
667 410
668 wanted = bytes; 411 iterate_and_advance(i, bytes, v,
669 bvec = i->bvec; 412 __copy_from_user_nocache((to += v.iov_len) - v.iov_len,
670 skip = i->iov_offset; 413 v.iov_base, v.iov_len),
671 copy = min_t(size_t, bytes, bvec->bv_len - skip); 414 memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
415 v.bv_offset, v.bv_len),
416 memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
417 )
672 418
673 memzero_page(bvec->bv_page, skip + bvec->bv_offset, copy);
674 skip += copy;
675 bytes -= copy;
676 while (bytes) {
677 bvec++;
678 copy = min(bytes, (size_t)bvec->bv_len);
679 memzero_page(bvec->bv_page, bvec->bv_offset, copy);
680 skip = copy;
681 bytes -= copy;
682 }
683 if (skip == bvec->bv_len) {
684 bvec++;
685 skip = 0;
686 }
687 i->count -= wanted - bytes;
688 i->nr_segs -= bvec - i->bvec;
689 i->bvec = bvec;
690 i->iov_offset = skip;
691 return wanted - bytes;
692}
693
694static size_t copy_from_user_bvec(struct page *page,
695 struct iov_iter *i, unsigned long offset, size_t bytes)
696{
697 char *kaddr;
698 size_t left;
699 const struct bio_vec *bvec;
700 size_t base = i->iov_offset;
701
702 kaddr = kmap_atomic(page);
703 for (left = bytes, bvec = i->bvec; left; bvec++, base = 0) {
704 size_t copy = min(left, bvec->bv_len - base);
705 if (!bvec->bv_len)
706 continue;
707 memcpy_from_page(kaddr + offset, bvec->bv_page,
708 bvec->bv_offset + base, copy);
709 offset += copy;
710 left -= copy;
711 }
712 kunmap_atomic(kaddr);
713 return bytes; 419 return bytes;
714} 420}
715 421EXPORT_SYMBOL(copy_from_iter_nocache);
716static void advance_bvec(struct iov_iter *i, size_t bytes)
717{
718 BUG_ON(i->count < bytes);
719
720 if (likely(i->nr_segs == 1)) {
721 i->iov_offset += bytes;
722 i->count -= bytes;
723 } else {
724 const struct bio_vec *bvec = i->bvec;
725 size_t base = i->iov_offset;
726 unsigned long nr_segs = i->nr_segs;
727
728 /*
729 * The !iov->iov_len check ensures we skip over unlikely
730 * zero-length segments (without overruning the iovec).
731 */
732 while (bytes || unlikely(i->count && !bvec->bv_len)) {
733 int copy;
734
735 copy = min(bytes, bvec->bv_len - base);
736 BUG_ON(!i->count || i->count < copy);
737 i->count -= copy;
738 bytes -= copy;
739 base += copy;
740 if (bvec->bv_len == base) {
741 bvec++;
742 nr_segs--;
743 base = 0;
744 }
745 }
746 i->bvec = bvec;
747 i->iov_offset = base;
748 i->nr_segs = nr_segs;
749 }
750}
751
752static unsigned long alignment_bvec(const struct iov_iter *i)
753{
754 const struct bio_vec *bvec = i->bvec;
755 unsigned long res;
756 size_t size = i->count;
757 size_t n;
758
759 if (!size)
760 return 0;
761
762 res = bvec->bv_offset + i->iov_offset;
763 n = bvec->bv_len - i->iov_offset;
764 if (n >= size)
765 return res | size;
766 size -= n;
767 res |= n;
768 while (size > (++bvec)->bv_len) {
769 res |= bvec->bv_offset | bvec->bv_len;
770 size -= bvec->bv_len;
771 }
772 res |= bvec->bv_offset | size;
773 return res;
774}
775
776static ssize_t get_pages_bvec(struct iov_iter *i,
777 struct page **pages, size_t maxsize, unsigned maxpages,
778 size_t *start)
779{
780 const struct bio_vec *bvec = i->bvec;
781 size_t len = bvec->bv_len - i->iov_offset;
782 if (len > i->count)
783 len = i->count;
784 if (len > maxsize)
785 len = maxsize;
786 /* can't be more than PAGE_SIZE */
787 *start = bvec->bv_offset + i->iov_offset;
788
789 get_page(*pages = bvec->bv_page);
790
791 return len;
792}
793
794static ssize_t get_pages_alloc_bvec(struct iov_iter *i,
795 struct page ***pages, size_t maxsize,
796 size_t *start)
797{
798 const struct bio_vec *bvec = i->bvec;
799 size_t len = bvec->bv_len - i->iov_offset;
800 if (len > i->count)
801 len = i->count;
802 if (len > maxsize)
803 len = maxsize;
804 *start = bvec->bv_offset + i->iov_offset;
805
806 *pages = kmalloc(sizeof(struct page *), GFP_KERNEL);
807 if (!*pages)
808 return -ENOMEM;
809
810 get_page(**pages = bvec->bv_page);
811
812 return len;
813}
814
815static int iov_iter_npages_bvec(const struct iov_iter *i, int maxpages)
816{
817 size_t offset = i->iov_offset;
818 size_t size = i->count;
819 const struct bio_vec *bvec = i->bvec;
820 int npages = 0;
821 int n;
822
823 for (n = 0; size && n < i->nr_segs; n++, bvec++) {
824 size_t len = bvec->bv_len - offset;
825 offset = 0;
826 if (unlikely(!len)) /* empty segment */
827 continue;
828 if (len > size)
829 len = size;
830 npages++;
831 if (npages >= maxpages) /* don't bother going further */
832 return maxpages;
833 size -= len;
834 offset = 0;
835 }
836 return min(npages, maxpages);
837}
838 422
839size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, 423size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
840 struct iov_iter *i) 424 struct iov_iter *i)
841{ 425{
842 if (i->type & ITER_BVEC) 426 if (i->type & (ITER_BVEC|ITER_KVEC)) {
843 return copy_page_to_iter_bvec(page, offset, bytes, i); 427 void *kaddr = kmap_atomic(page);
844 else 428 size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
429 kunmap_atomic(kaddr);
430 return wanted;
431 } else
845 return copy_page_to_iter_iovec(page, offset, bytes, i); 432 return copy_page_to_iter_iovec(page, offset, bytes, i);
846} 433}
847EXPORT_SYMBOL(copy_page_to_iter); 434EXPORT_SYMBOL(copy_page_to_iter);
@@ -849,57 +436,53 @@ EXPORT_SYMBOL(copy_page_to_iter);
849size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, 436size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
850 struct iov_iter *i) 437 struct iov_iter *i)
851{ 438{
852 if (i->type & ITER_BVEC) 439 if (i->type & (ITER_BVEC|ITER_KVEC)) {
853 return copy_page_from_iter_bvec(page, offset, bytes, i); 440 void *kaddr = kmap_atomic(page);
854 else 441 size_t wanted = copy_from_iter(kaddr + offset, bytes, i);
442 kunmap_atomic(kaddr);
443 return wanted;
444 } else
855 return copy_page_from_iter_iovec(page, offset, bytes, i); 445 return copy_page_from_iter_iovec(page, offset, bytes, i);
856} 446}
857EXPORT_SYMBOL(copy_page_from_iter); 447EXPORT_SYMBOL(copy_page_from_iter);
858 448
859size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i) 449size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
860{ 450{
861 if (i->type & ITER_BVEC) 451 if (unlikely(bytes > i->count))
862 return copy_to_iter_bvec(addr, bytes, i); 452 bytes = i->count;
863 else
864 return copy_to_iter_iovec(addr, bytes, i);
865}
866EXPORT_SYMBOL(copy_to_iter);
867 453
868size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) 454 if (unlikely(!bytes))
869{ 455 return 0;
870 if (i->type & ITER_BVEC)
871 return copy_from_iter_bvec(addr, bytes, i);
872 else
873 return copy_from_iter_iovec(addr, bytes, i);
874}
875EXPORT_SYMBOL(copy_from_iter);
876 456
877size_t iov_iter_zero(size_t bytes, struct iov_iter *i) 457 iterate_and_advance(i, bytes, v,
878{ 458 __clear_user(v.iov_base, v.iov_len),
879 if (i->type & ITER_BVEC) { 459 memzero_page(v.bv_page, v.bv_offset, v.bv_len),
880 return zero_bvec(bytes, i); 460 memset(v.iov_base, 0, v.iov_len)
881 } else { 461 )
882 return zero_iovec(bytes, i); 462
883 } 463 return bytes;
884} 464}
885EXPORT_SYMBOL(iov_iter_zero); 465EXPORT_SYMBOL(iov_iter_zero);
886 466
887size_t iov_iter_copy_from_user_atomic(struct page *page, 467size_t iov_iter_copy_from_user_atomic(struct page *page,
888 struct iov_iter *i, unsigned long offset, size_t bytes) 468 struct iov_iter *i, unsigned long offset, size_t bytes)
889{ 469{
890 if (i->type & ITER_BVEC) 470 char *kaddr = kmap_atomic(page), *p = kaddr + offset;
891 return copy_from_user_bvec(page, i, offset, bytes); 471 iterate_all_kinds(i, bytes, v,
892 else 472 __copy_from_user_inatomic((p += v.iov_len) - v.iov_len,
893 return copy_from_user_atomic_iovec(page, i, offset, bytes); 473 v.iov_base, v.iov_len),
474 memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
475 v.bv_offset, v.bv_len),
476 memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
477 )
478 kunmap_atomic(kaddr);
479 return bytes;
894} 480}
895EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); 481EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
896 482
897void iov_iter_advance(struct iov_iter *i, size_t size) 483void iov_iter_advance(struct iov_iter *i, size_t size)
898{ 484{
899 if (i->type & ITER_BVEC) 485 iterate_and_advance(i, size, v, 0, 0, 0)
900 advance_bvec(i, size);
901 else
902 advance_iovec(i, size);
903} 486}
904EXPORT_SYMBOL(iov_iter_advance); 487EXPORT_SYMBOL(iov_iter_advance);
905 488
@@ -911,18 +494,39 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i)
911 if (i->nr_segs == 1) 494 if (i->nr_segs == 1)
912 return i->count; 495 return i->count;
913 else if (i->type & ITER_BVEC) 496 else if (i->type & ITER_BVEC)
914 return min(i->count, i->iov->iov_len - i->iov_offset);
915 else
916 return min(i->count, i->bvec->bv_len - i->iov_offset); 497 return min(i->count, i->bvec->bv_len - i->iov_offset);
498 else
499 return min(i->count, i->iov->iov_len - i->iov_offset);
917} 500}
918EXPORT_SYMBOL(iov_iter_single_seg_count); 501EXPORT_SYMBOL(iov_iter_single_seg_count);
919 502
503void iov_iter_kvec(struct iov_iter *i, int direction,
504 const struct kvec *iov, unsigned long nr_segs,
505 size_t count)
506{
507 BUG_ON(!(direction & ITER_KVEC));
508 i->type = direction;
509 i->kvec = (struct kvec *)iov;
510 i->nr_segs = nr_segs;
511 i->iov_offset = 0;
512 i->count = count;
513}
514EXPORT_SYMBOL(iov_iter_kvec);
515
920unsigned long iov_iter_alignment(const struct iov_iter *i) 516unsigned long iov_iter_alignment(const struct iov_iter *i)
921{ 517{
922 if (i->type & ITER_BVEC) 518 unsigned long res = 0;
923 return alignment_bvec(i); 519 size_t size = i->count;
924 else 520
925 return alignment_iovec(i); 521 if (!size)
522 return 0;
523
524 iterate_all_kinds(i, size, v,
525 (res |= (unsigned long)v.iov_base | v.iov_len, 0),
526 res |= v.bv_offset | v.bv_len,
527 res |= (unsigned long)v.iov_base | v.iov_len
528 )
529 return res;
926} 530}
927EXPORT_SYMBOL(iov_iter_alignment); 531EXPORT_SYMBOL(iov_iter_alignment);
928 532
@@ -930,29 +534,207 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
930 struct page **pages, size_t maxsize, unsigned maxpages, 534 struct page **pages, size_t maxsize, unsigned maxpages,
931 size_t *start) 535 size_t *start)
932{ 536{
933 if (i->type & ITER_BVEC) 537 if (maxsize > i->count)
934 return get_pages_bvec(i, pages, maxsize, maxpages, start); 538 maxsize = i->count;
935 else 539
936 return get_pages_iovec(i, pages, maxsize, maxpages, start); 540 if (!maxsize)
541 return 0;
542
543 iterate_all_kinds(i, maxsize, v, ({
544 unsigned long addr = (unsigned long)v.iov_base;
545 size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
546 int n;
547 int res;
548
549 if (len > maxpages * PAGE_SIZE)
550 len = maxpages * PAGE_SIZE;
551 addr &= ~(PAGE_SIZE - 1);
552 n = DIV_ROUND_UP(len, PAGE_SIZE);
553 res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
554 if (unlikely(res < 0))
555 return res;
556 return (res == n ? len : res * PAGE_SIZE) - *start;
557 0;}),({
558 /* can't be more than PAGE_SIZE */
559 *start = v.bv_offset;
560 get_page(*pages = v.bv_page);
561 return v.bv_len;
562 }),({
563 return -EFAULT;
564 })
565 )
566 return 0;
937} 567}
938EXPORT_SYMBOL(iov_iter_get_pages); 568EXPORT_SYMBOL(iov_iter_get_pages);
939 569
570static struct page **get_pages_array(size_t n)
571{
572 struct page **p = kmalloc(n * sizeof(struct page *), GFP_KERNEL);
573 if (!p)
574 p = vmalloc(n * sizeof(struct page *));
575 return p;
576}
577
940ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, 578ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
941 struct page ***pages, size_t maxsize, 579 struct page ***pages, size_t maxsize,
942 size_t *start) 580 size_t *start)
943{ 581{
944 if (i->type & ITER_BVEC) 582 struct page **p;
945 return get_pages_alloc_bvec(i, pages, maxsize, start); 583
946 else 584 if (maxsize > i->count)
947 return get_pages_alloc_iovec(i, pages, maxsize, start); 585 maxsize = i->count;
586
587 if (!maxsize)
588 return 0;
589
590 iterate_all_kinds(i, maxsize, v, ({
591 unsigned long addr = (unsigned long)v.iov_base;
592 size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
593 int n;
594 int res;
595
596 addr &= ~(PAGE_SIZE - 1);
597 n = DIV_ROUND_UP(len, PAGE_SIZE);
598 p = get_pages_array(n);
599 if (!p)
600 return -ENOMEM;
601 res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
602 if (unlikely(res < 0)) {
603 kvfree(p);
604 return res;
605 }
606 *pages = p;
607 return (res == n ? len : res * PAGE_SIZE) - *start;
608 0;}),({
609 /* can't be more than PAGE_SIZE */
610 *start = v.bv_offset;
611 *pages = p = get_pages_array(1);
612 if (!p)
613 return -ENOMEM;
614 get_page(*p = v.bv_page);
615 return v.bv_len;
616 }),({
617 return -EFAULT;
618 })
619 )
620 return 0;
948} 621}
949EXPORT_SYMBOL(iov_iter_get_pages_alloc); 622EXPORT_SYMBOL(iov_iter_get_pages_alloc);
950 623
624size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
625 struct iov_iter *i)
626{
627 char *to = addr;
628 __wsum sum, next;
629 size_t off = 0;
630 if (unlikely(bytes > i->count))
631 bytes = i->count;
632
633 if (unlikely(!bytes))
634 return 0;
635
636 sum = *csum;
637 iterate_and_advance(i, bytes, v, ({
638 int err = 0;
639 next = csum_and_copy_from_user(v.iov_base,
640 (to += v.iov_len) - v.iov_len,
641 v.iov_len, 0, &err);
642 if (!err) {
643 sum = csum_block_add(sum, next, off);
644 off += v.iov_len;
645 }
646 err ? v.iov_len : 0;
647 }), ({
648 char *p = kmap_atomic(v.bv_page);
649 next = csum_partial_copy_nocheck(p + v.bv_offset,
650 (to += v.bv_len) - v.bv_len,
651 v.bv_len, 0);
652 kunmap_atomic(p);
653 sum = csum_block_add(sum, next, off);
654 off += v.bv_len;
655 }),({
656 next = csum_partial_copy_nocheck(v.iov_base,
657 (to += v.iov_len) - v.iov_len,
658 v.iov_len, 0);
659 sum = csum_block_add(sum, next, off);
660 off += v.iov_len;
661 })
662 )
663 *csum = sum;
664 return bytes;
665}
666EXPORT_SYMBOL(csum_and_copy_from_iter);
667
668size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum,
669 struct iov_iter *i)
670{
671 char *from = addr;
672 __wsum sum, next;
673 size_t off = 0;
674 if (unlikely(bytes > i->count))
675 bytes = i->count;
676
677 if (unlikely(!bytes))
678 return 0;
679
680 sum = *csum;
681 iterate_and_advance(i, bytes, v, ({
682 int err = 0;
683 next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
684 v.iov_base,
685 v.iov_len, 0, &err);
686 if (!err) {
687 sum = csum_block_add(sum, next, off);
688 off += v.iov_len;
689 }
690 err ? v.iov_len : 0;
691 }), ({
692 char *p = kmap_atomic(v.bv_page);
693 next = csum_partial_copy_nocheck((from += v.bv_len) - v.bv_len,
694 p + v.bv_offset,
695 v.bv_len, 0);
696 kunmap_atomic(p);
697 sum = csum_block_add(sum, next, off);
698 off += v.bv_len;
699 }),({
700 next = csum_partial_copy_nocheck((from += v.iov_len) - v.iov_len,
701 v.iov_base,
702 v.iov_len, 0);
703 sum = csum_block_add(sum, next, off);
704 off += v.iov_len;
705 })
706 )
707 *csum = sum;
708 return bytes;
709}
710EXPORT_SYMBOL(csum_and_copy_to_iter);
711
951int iov_iter_npages(const struct iov_iter *i, int maxpages) 712int iov_iter_npages(const struct iov_iter *i, int maxpages)
952{ 713{
953 if (i->type & ITER_BVEC) 714 size_t size = i->count;
954 return iov_iter_npages_bvec(i, maxpages); 715 int npages = 0;
955 else 716
956 return iov_iter_npages_iovec(i, maxpages); 717 if (!size)
718 return 0;
719
720 iterate_all_kinds(i, size, v, ({
721 unsigned long p = (unsigned long)v.iov_base;
722 npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
723 - p / PAGE_SIZE;
724 if (npages >= maxpages)
725 return maxpages;
726 0;}),({
727 npages++;
728 if (npages >= maxpages)
729 return maxpages;
730 }),({
731 unsigned long p = (unsigned long)v.iov_base;
732 npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
733 - p / PAGE_SIZE;
734 if (npages >= maxpages)
735 return maxpages;
736 })
737 )
738 return npages;
957} 739}
958EXPORT_SYMBOL(iov_iter_npages); 740EXPORT_SYMBOL(iov_iter_npages);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8c3385181b16..ee48428cf8e3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1536,12 +1536,8 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1536 * start move here. 1536 * start move here.
1537 */ 1537 */
1538 1538
1539/* for quick checking without looking up memcg */
1540atomic_t memcg_moving __read_mostly;
1541
1542static void mem_cgroup_start_move(struct mem_cgroup *memcg) 1539static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1543{ 1540{
1544 atomic_inc(&memcg_moving);
1545 atomic_inc(&memcg->moving_account); 1541 atomic_inc(&memcg->moving_account);
1546 synchronize_rcu(); 1542 synchronize_rcu();
1547} 1543}
@@ -1552,10 +1548,8 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1552 * Now, mem_cgroup_clear_mc() may call this function with NULL. 1548 * Now, mem_cgroup_clear_mc() may call this function with NULL.
1553 * We check NULL in callee rather than caller. 1549 * We check NULL in callee rather than caller.
1554 */ 1550 */
1555 if (memcg) { 1551 if (memcg)
1556 atomic_dec(&memcg_moving);
1557 atomic_dec(&memcg->moving_account); 1552 atomic_dec(&memcg->moving_account);
1558 }
1559} 1553}
1560 1554
1561/* 1555/*
@@ -2204,41 +2198,52 @@ cleanup:
2204 return true; 2198 return true;
2205} 2199}
2206 2200
2207/* 2201/**
2208 * Used to update mapped file or writeback or other statistics. 2202 * mem_cgroup_begin_page_stat - begin a page state statistics transaction
2203 * @page: page that is going to change accounted state
2204 * @locked: &memcg->move_lock slowpath was taken
2205 * @flags: IRQ-state flags for &memcg->move_lock
2209 * 2206 *
2210 * Notes: Race condition 2207 * This function must mark the beginning of an accounted page state
2208 * change to prevent double accounting when the page is concurrently
2209 * being moved to another memcg:
2211 * 2210 *
2212 * Charging occurs during page instantiation, while the page is 2211 * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
2213 * unmapped and locked in page migration, or while the page table is 2212 * if (TestClearPageState(page))
2214 * locked in THP migration. No race is possible. 2213 * mem_cgroup_update_page_stat(memcg, state, -1);
2214 * mem_cgroup_end_page_stat(memcg, locked, flags);
2215 * 2215 *
2216 * Uncharge happens to pages with zero references, no race possible. 2216 * The RCU lock is held throughout the transaction. The fast path can
2217 * get away without acquiring the memcg->move_lock (@locked is false)
2218 * because page moving starts with an RCU grace period.
2217 * 2219 *
2218 * Charge moving between groups is protected by checking mm->moving 2220 * The RCU lock also protects the memcg from being freed when the page
2219 * account and taking the move_lock in the slowpath. 2221 * state that is going to change is the only thing preventing the page
2222 * from being uncharged. E.g. end-writeback clearing PageWriteback(),
2223 * which allows migration to go ahead and uncharge the page before the
2224 * account transaction might be complete.
2220 */ 2225 */
2221 2226struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
2222void __mem_cgroup_begin_update_page_stat(struct page *page, 2227 bool *locked,
2223 bool *locked, unsigned long *flags) 2228 unsigned long *flags)
2224{ 2229{
2225 struct mem_cgroup *memcg; 2230 struct mem_cgroup *memcg;
2226 struct page_cgroup *pc; 2231 struct page_cgroup *pc;
2227 2232
2233 rcu_read_lock();
2234
2235 if (mem_cgroup_disabled())
2236 return NULL;
2237
2228 pc = lookup_page_cgroup(page); 2238 pc = lookup_page_cgroup(page);
2229again: 2239again:
2230 memcg = pc->mem_cgroup; 2240 memcg = pc->mem_cgroup;
2231 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2241 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2232 return; 2242 return NULL;
2233 /* 2243
2234 * If this memory cgroup is not under account moving, we don't 2244 *locked = false;
2235 * need to take move_lock_mem_cgroup(). Because we already hold
2236 * rcu_read_lock(), any calls to move_account will be delayed until
2237 * rcu_read_unlock().
2238 */
2239 VM_BUG_ON(!rcu_read_lock_held());
2240 if (atomic_read(&memcg->moving_account) <= 0) 2245 if (atomic_read(&memcg->moving_account) <= 0)
2241 return; 2246 return memcg;
2242 2247
2243 move_lock_mem_cgroup(memcg, flags); 2248 move_lock_mem_cgroup(memcg, flags);
2244 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { 2249 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
@@ -2246,36 +2251,40 @@ again:
2246 goto again; 2251 goto again;
2247 } 2252 }
2248 *locked = true; 2253 *locked = true;
2254
2255 return memcg;
2249} 2256}
2250 2257
2251void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) 2258/**
2259 * mem_cgroup_end_page_stat - finish a page state statistics transaction
2260 * @memcg: the memcg that was accounted against
2261 * @locked: value received from mem_cgroup_begin_page_stat()
2262 * @flags: value received from mem_cgroup_begin_page_stat()
2263 */
2264void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
2265 unsigned long flags)
2252{ 2266{
2253 struct page_cgroup *pc = lookup_page_cgroup(page); 2267 if (memcg && locked)
2268 move_unlock_mem_cgroup(memcg, &flags);
2254 2269
2255 /* 2270 rcu_read_unlock();
2256 * It's guaranteed that pc->mem_cgroup never changes while
2257 * lock is held because a routine modifies pc->mem_cgroup
2258 * should take move_lock_mem_cgroup().
2259 */
2260 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
2261} 2271}
2262 2272
2263void mem_cgroup_update_page_stat(struct page *page, 2273/**
2274 * mem_cgroup_update_page_stat - update page state statistics
2275 * @memcg: memcg to account against
2276 * @idx: page state item to account
2277 * @val: number of pages (positive or negative)
2278 *
2279 * See mem_cgroup_begin_page_stat() for locking requirements.
2280 */
2281void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
2264 enum mem_cgroup_stat_index idx, int val) 2282 enum mem_cgroup_stat_index idx, int val)
2265{ 2283{
2266 struct mem_cgroup *memcg;
2267 struct page_cgroup *pc = lookup_page_cgroup(page);
2268 unsigned long uninitialized_var(flags);
2269
2270 if (mem_cgroup_disabled())
2271 return;
2272
2273 VM_BUG_ON(!rcu_read_lock_held()); 2284 VM_BUG_ON(!rcu_read_lock_held());
2274 memcg = pc->mem_cgroup;
2275 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2276 return;
2277 2285
2278 this_cpu_add(memcg->stat->count[idx], val); 2286 if (memcg)
2287 this_cpu_add(memcg->stat->count[idx], val);
2279} 2288}
2280 2289
2281/* 2290/*
diff --git a/mm/memory.c b/mm/memory.c
index 1cc6bfbd872e..3e503831e042 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1147,6 +1147,7 @@ again:
1147 print_bad_pte(vma, addr, ptent, page); 1147 print_bad_pte(vma, addr, ptent, page);
1148 if (unlikely(!__tlb_remove_page(tlb, page))) { 1148 if (unlikely(!__tlb_remove_page(tlb, page))) {
1149 force_flush = 1; 1149 force_flush = 1;
1150 addr += PAGE_SIZE;
1150 break; 1151 break;
1151 } 1152 }
1152 continue; 1153 continue;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 29d8693d0c61..1bf4807cb21e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -31,6 +31,7 @@
31#include <linux/stop_machine.h> 31#include <linux/stop_machine.h>
32#include <linux/hugetlb.h> 32#include <linux/hugetlb.h>
33#include <linux/memblock.h> 33#include <linux/memblock.h>
34#include <linux/bootmem.h>
34 35
35#include <asm/tlbflush.h> 36#include <asm/tlbflush.h>
36 37
@@ -1066,6 +1067,16 @@ out:
1066} 1067}
1067#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 1068#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
1068 1069
1070static void reset_node_present_pages(pg_data_t *pgdat)
1071{
1072 struct zone *z;
1073
1074 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
1075 z->present_pages = 0;
1076
1077 pgdat->node_present_pages = 0;
1078}
1079
1069/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1080/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
1070static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) 1081static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
1071{ 1082{
@@ -1096,6 +1107,21 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
1096 build_all_zonelists(pgdat, NULL); 1107 build_all_zonelists(pgdat, NULL);
1097 mutex_unlock(&zonelists_mutex); 1108 mutex_unlock(&zonelists_mutex);
1098 1109
1110 /*
1111 * zone->managed_pages is set to an approximate value in
1112 * free_area_init_core(), which will cause
1113 * /sys/device/system/node/nodeX/meminfo has wrong data.
1114 * So reset it to 0 before any memory is onlined.
1115 */
1116 reset_node_managed_pages(pgdat);
1117
1118 /*
1119 * When memory is hot-added, all the memory is in offline state. So
1120 * clear all zones' present_pages because they will be updated in
1121 * online_pages() and offline_pages().
1122 */
1123 reset_node_present_pages(pgdat);
1124
1099 return pgdat; 1125 return pgdat;
1100} 1126}
1101 1127
@@ -1912,7 +1938,6 @@ void try_offline_node(int nid)
1912 unsigned long start_pfn = pgdat->node_start_pfn; 1938 unsigned long start_pfn = pgdat->node_start_pfn;
1913 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; 1939 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
1914 unsigned long pfn; 1940 unsigned long pfn;
1915 struct page *pgdat_page = virt_to_page(pgdat);
1916 int i; 1941 int i;
1917 1942
1918 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1943 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
@@ -1941,10 +1966,6 @@ void try_offline_node(int nid)
1941 node_set_offline(nid); 1966 node_set_offline(nid);
1942 unregister_one_node(nid); 1967 unregister_one_node(nid);
1943 1968
1944 if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
1945 /* node data is allocated from boot memory */
1946 return;
1947
1948 /* free waittable in each zone */ 1969 /* free waittable in each zone */
1949 for (i = 0; i < MAX_NR_ZONES; i++) { 1970 for (i = 0; i < MAX_NR_ZONES; i++) {
1950 struct zone *zone = pgdat->node_zones + i; 1971 struct zone *zone = pgdat->node_zones + i;
diff --git a/mm/mmap.c b/mm/mmap.c
index 7f855206e7fb..87e82b38453c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1080,7 +1080,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
1080 end, prev->vm_pgoff, NULL); 1080 end, prev->vm_pgoff, NULL);
1081 if (err) 1081 if (err)
1082 return NULL; 1082 return NULL;
1083 khugepaged_enter_vma_merge(prev); 1083 khugepaged_enter_vma_merge(prev, vm_flags);
1084 return prev; 1084 return prev;
1085 } 1085 }
1086 1086
@@ -1099,7 +1099,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
1099 next->vm_pgoff - pglen, NULL); 1099 next->vm_pgoff - pglen, NULL);
1100 if (err) 1100 if (err)
1101 return NULL; 1101 return NULL;
1102 khugepaged_enter_vma_merge(area); 1102 khugepaged_enter_vma_merge(area, vm_flags);
1103 return area; 1103 return area;
1104 } 1104 }
1105 1105
@@ -2208,7 +2208,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2208 } 2208 }
2209 } 2209 }
2210 vma_unlock_anon_vma(vma); 2210 vma_unlock_anon_vma(vma);
2211 khugepaged_enter_vma_merge(vma); 2211 khugepaged_enter_vma_merge(vma, vma->vm_flags);
2212 validate_mm(vma->vm_mm); 2212 validate_mm(vma->vm_mm);
2213 return error; 2213 return error;
2214} 2214}
@@ -2277,7 +2277,7 @@ int expand_downwards(struct vm_area_struct *vma,
2277 } 2277 }
2278 } 2278 }
2279 vma_unlock_anon_vma(vma); 2279 vma_unlock_anon_vma(vma);
2280 khugepaged_enter_vma_merge(vma); 2280 khugepaged_enter_vma_merge(vma, vma->vm_flags);
2281 validate_mm(vma->vm_mm); 2281 validate_mm(vma->vm_mm);
2282 return error; 2282 return error;
2283} 2283}
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 7c7ab32ee503..90b50468333e 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -145,12 +145,10 @@ static unsigned long __init free_low_memory_core_early(void)
145 145
146static int reset_managed_pages_done __initdata; 146static int reset_managed_pages_done __initdata;
147 147
148static inline void __init reset_node_managed_pages(pg_data_t *pgdat) 148void reset_node_managed_pages(pg_data_t *pgdat)
149{ 149{
150 struct zone *z; 150 struct zone *z;
151 151
152 if (reset_managed_pages_done)
153 return;
154 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) 152 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
155 z->managed_pages = 0; 153 z->managed_pages = 0;
156} 154}
@@ -159,8 +157,12 @@ void __init reset_all_zones_managed_pages(void)
159{ 157{
160 struct pglist_data *pgdat; 158 struct pglist_data *pgdat;
161 159
160 if (reset_managed_pages_done)
161 return;
162
162 for_each_online_pgdat(pgdat) 163 for_each_online_pgdat(pgdat)
163 reset_node_managed_pages(pgdat); 164 reset_node_managed_pages(pgdat);
165
164 reset_managed_pages_done = 1; 166 reset_managed_pages_done = 1;
165} 167}
166 168
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ff24c9d83112..19ceae87522d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2116,23 +2116,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
2116EXPORT_SYMBOL(account_page_dirtied); 2116EXPORT_SYMBOL(account_page_dirtied);
2117 2117
2118/* 2118/*
2119 * Helper function for set_page_writeback family.
2120 *
2121 * The caller must hold mem_cgroup_begin/end_update_page_stat() lock
2122 * while calling this function.
2123 * See test_set_page_writeback for example.
2124 *
2125 * NOTE: Unlike account_page_dirtied this does not rely on being atomic
2126 * wrt interrupts.
2127 */
2128void account_page_writeback(struct page *page)
2129{
2130 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
2131 inc_zone_page_state(page, NR_WRITEBACK);
2132}
2133EXPORT_SYMBOL(account_page_writeback);
2134
2135/*
2136 * For address_spaces which do not use buffers. Just tag the page as dirty in 2119 * For address_spaces which do not use buffers. Just tag the page as dirty in
2137 * its radix tree. 2120 * its radix tree.
2138 * 2121 *
@@ -2344,11 +2327,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
2344int test_clear_page_writeback(struct page *page) 2327int test_clear_page_writeback(struct page *page)
2345{ 2328{
2346 struct address_space *mapping = page_mapping(page); 2329 struct address_space *mapping = page_mapping(page);
2347 int ret;
2348 bool locked;
2349 unsigned long memcg_flags; 2330 unsigned long memcg_flags;
2331 struct mem_cgroup *memcg;
2332 bool locked;
2333 int ret;
2350 2334
2351 mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); 2335 memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
2352 if (mapping) { 2336 if (mapping) {
2353 struct backing_dev_info *bdi = mapping->backing_dev_info; 2337 struct backing_dev_info *bdi = mapping->backing_dev_info;
2354 unsigned long flags; 2338 unsigned long flags;
@@ -2369,22 +2353,23 @@ int test_clear_page_writeback(struct page *page)
2369 ret = TestClearPageWriteback(page); 2353 ret = TestClearPageWriteback(page);
2370 } 2354 }
2371 if (ret) { 2355 if (ret) {
2372 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); 2356 mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
2373 dec_zone_page_state(page, NR_WRITEBACK); 2357 dec_zone_page_state(page, NR_WRITEBACK);
2374 inc_zone_page_state(page, NR_WRITTEN); 2358 inc_zone_page_state(page, NR_WRITTEN);
2375 } 2359 }
2376 mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); 2360 mem_cgroup_end_page_stat(memcg, locked, memcg_flags);
2377 return ret; 2361 return ret;
2378} 2362}
2379 2363
2380int __test_set_page_writeback(struct page *page, bool keep_write) 2364int __test_set_page_writeback(struct page *page, bool keep_write)
2381{ 2365{
2382 struct address_space *mapping = page_mapping(page); 2366 struct address_space *mapping = page_mapping(page);
2383 int ret;
2384 bool locked;
2385 unsigned long memcg_flags; 2367 unsigned long memcg_flags;
2368 struct mem_cgroup *memcg;
2369 bool locked;
2370 int ret;
2386 2371
2387 mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); 2372 memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
2388 if (mapping) { 2373 if (mapping) {
2389 struct backing_dev_info *bdi = mapping->backing_dev_info; 2374 struct backing_dev_info *bdi = mapping->backing_dev_info;
2390 unsigned long flags; 2375 unsigned long flags;
@@ -2410,9 +2395,11 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
2410 } else { 2395 } else {
2411 ret = TestSetPageWriteback(page); 2396 ret = TestSetPageWriteback(page);
2412 } 2397 }
2413 if (!ret) 2398 if (!ret) {
2414 account_page_writeback(page); 2399 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
2415 mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); 2400 inc_zone_page_state(page, NR_WRITEBACK);
2401 }
2402 mem_cgroup_end_page_stat(memcg, locked, memcg_flags);
2416 return ret; 2403 return ret;
2417 2404
2418} 2405}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9cd36b822444..616a2c956b4b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -467,29 +467,6 @@ static inline void rmv_page_order(struct page *page)
467} 467}
468 468
469/* 469/*
470 * Locate the struct page for both the matching buddy in our
471 * pair (buddy1) and the combined O(n+1) page they form (page).
472 *
473 * 1) Any buddy B1 will have an order O twin B2 which satisfies
474 * the following equation:
475 * B2 = B1 ^ (1 << O)
476 * For example, if the starting buddy (buddy2) is #8 its order
477 * 1 buddy is #10:
478 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
479 *
480 * 2) Any buddy B will have an order O+1 parent P which
481 * satisfies the following equation:
482 * P = B & ~(1 << O)
483 *
484 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
485 */
486static inline unsigned long
487__find_buddy_index(unsigned long page_idx, unsigned int order)
488{
489 return page_idx ^ (1 << order);
490}
491
492/*
493 * This function checks whether a page is free && is the buddy 470 * This function checks whether a page is free && is the buddy
494 * we can do coalesce a page and its buddy if 471 * we can do coalesce a page and its buddy if
495 * (a) the buddy is not in a hole && 472 * (a) the buddy is not in a hole &&
@@ -569,6 +546,7 @@ static inline void __free_one_page(struct page *page,
569 unsigned long combined_idx; 546 unsigned long combined_idx;
570 unsigned long uninitialized_var(buddy_idx); 547 unsigned long uninitialized_var(buddy_idx);
571 struct page *buddy; 548 struct page *buddy;
549 int max_order = MAX_ORDER;
572 550
573 VM_BUG_ON(!zone_is_initialized(zone)); 551 VM_BUG_ON(!zone_is_initialized(zone));
574 552
@@ -577,13 +555,24 @@ static inline void __free_one_page(struct page *page,
577 return; 555 return;
578 556
579 VM_BUG_ON(migratetype == -1); 557 VM_BUG_ON(migratetype == -1);
558 if (is_migrate_isolate(migratetype)) {
559 /*
560 * We restrict the max order of merging to prevent merging
561 * between free pages on an isolated pageblock and a normal
562 * pageblock. Without this, pageblock isolation
563 * could cause incorrect freepage accounting.
564 */
565 max_order = min(MAX_ORDER, pageblock_order + 1);
566 } else {
567 __mod_zone_freepage_state(zone, 1 << order, migratetype);
568 }
580 569
581 page_idx = pfn & ((1 << MAX_ORDER) - 1); 570 page_idx = pfn & ((1 << max_order) - 1);
582 571
583 VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); 572 VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
584 VM_BUG_ON_PAGE(bad_range(zone, page), page); 573 VM_BUG_ON_PAGE(bad_range(zone, page), page);
585 574
586 while (order < MAX_ORDER-1) { 575 while (order < max_order - 1) {
587 buddy_idx = __find_buddy_index(page_idx, order); 576 buddy_idx = __find_buddy_index(page_idx, order);
588 buddy = page + (buddy_idx - page_idx); 577 buddy = page + (buddy_idx - page_idx);
589 if (!page_is_buddy(page, buddy, order)) 578 if (!page_is_buddy(page, buddy, order))
@@ -594,9 +583,11 @@ static inline void __free_one_page(struct page *page,
594 */ 583 */
595 if (page_is_guard(buddy)) { 584 if (page_is_guard(buddy)) {
596 clear_page_guard_flag(buddy); 585 clear_page_guard_flag(buddy);
597 set_page_private(page, 0); 586 set_page_private(buddy, 0);
598 __mod_zone_freepage_state(zone, 1 << order, 587 if (!is_migrate_isolate(migratetype)) {
599 migratetype); 588 __mod_zone_freepage_state(zone, 1 << order,
589 migratetype);
590 }
600 } else { 591 } else {
601 list_del(&buddy->lru); 592 list_del(&buddy->lru);
602 zone->free_area[order].nr_free--; 593 zone->free_area[order].nr_free--;
@@ -715,14 +706,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
715 /* must delete as __free_one_page list manipulates */ 706 /* must delete as __free_one_page list manipulates */
716 list_del(&page->lru); 707 list_del(&page->lru);
717 mt = get_freepage_migratetype(page); 708 mt = get_freepage_migratetype(page);
709 if (unlikely(has_isolate_pageblock(zone)))
710 mt = get_pageblock_migratetype(page);
711
718 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 712 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
719 __free_one_page(page, page_to_pfn(page), zone, 0, mt); 713 __free_one_page(page, page_to_pfn(page), zone, 0, mt);
720 trace_mm_page_pcpu_drain(page, 0, mt); 714 trace_mm_page_pcpu_drain(page, 0, mt);
721 if (likely(!is_migrate_isolate_page(page))) {
722 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
723 if (is_migrate_cma(mt))
724 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
725 }
726 } while (--to_free && --batch_free && !list_empty(list)); 715 } while (--to_free && --batch_free && !list_empty(list));
727 } 716 }
728 spin_unlock(&zone->lock); 717 spin_unlock(&zone->lock);
@@ -739,9 +728,11 @@ static void free_one_page(struct zone *zone,
739 if (nr_scanned) 728 if (nr_scanned)
740 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); 729 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
741 730
731 if (unlikely(has_isolate_pageblock(zone) ||
732 is_migrate_isolate(migratetype))) {
733 migratetype = get_pfnblock_migratetype(page, pfn);
734 }
742 __free_one_page(page, pfn, zone, order, migratetype); 735 __free_one_page(page, pfn, zone, order, migratetype);
743 if (unlikely(!is_migrate_isolate(migratetype)))
744 __mod_zone_freepage_state(zone, 1 << order, migratetype);
745 spin_unlock(&zone->lock); 736 spin_unlock(&zone->lock);
746} 737}
747 738
@@ -1484,7 +1475,7 @@ void split_page(struct page *page, unsigned int order)
1484} 1475}
1485EXPORT_SYMBOL_GPL(split_page); 1476EXPORT_SYMBOL_GPL(split_page);
1486 1477
1487static int __isolate_free_page(struct page *page, unsigned int order) 1478int __isolate_free_page(struct page *page, unsigned int order)
1488{ 1479{
1489 unsigned long watermark; 1480 unsigned long watermark;
1490 struct zone *zone; 1481 struct zone *zone;
@@ -6408,13 +6399,12 @@ int alloc_contig_range(unsigned long start, unsigned long end,
6408 6399
6409 /* Make sure the range is really isolated. */ 6400 /* Make sure the range is really isolated. */
6410 if (test_pages_isolated(outer_start, end, false)) { 6401 if (test_pages_isolated(outer_start, end, false)) {
6411 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 6402 pr_info("%s: [%lx, %lx) PFNs busy\n",
6412 outer_start, end); 6403 __func__, outer_start, end);
6413 ret = -EBUSY; 6404 ret = -EBUSY;
6414 goto done; 6405 goto done;
6415 } 6406 }
6416 6407
6417
6418 /* Grab isolated pages from freelists. */ 6408 /* Grab isolated pages from freelists. */
6419 outer_end = isolate_freepages_range(&cc, outer_start, end); 6409 outer_end = isolate_freepages_range(&cc, outer_start, end);
6420 if (!outer_end) { 6410 if (!outer_end) {
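
The comment block deleted from page_alloc.c above is the only place the buddy arithmetic behind __find_buddy_index() was spelled out: the order-O buddy of block B is B ^ (1 << O), and the combined order-(O+1) parent is B & ~(1 << O). A small standalone program (illustrative, not part of the patch) reproduces the worked example from that comment:

#include <stdio.h>

/* Same formula as __find_buddy_index(): flip bit O of the block index. */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

/* The order-(O+1) parent that a merged buddy pair forms: clear bit O. */
static unsigned long parent_index(unsigned long page_idx, unsigned int order)
{
	return page_idx & ~(1UL << order);
}

int main(void)
{
	/* Example from the removed comment: block #8 at order 1 pairs with #10. */
	printf("buddy of 8 at order 1 : %lu\n", find_buddy_index(8, 1));  /* 10 */
	printf("buddy of 10 at order 1: %lu\n", find_buddy_index(10, 1)); /*  8 */
	printf("their order-2 parent  : %lu\n", parent_index(8, 1));      /*  8 */
	return 0;
}

The new max_order clamp in __free_one_page() simply stops this merging loop earlier for pages freed onto isolated pageblocks.
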
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3708264d2833..5331c2bd85a2 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -171,6 +171,7 @@ static void free_page_cgroup(void *addr)
171 sizeof(struct page_cgroup) * PAGES_PER_SECTION; 171 sizeof(struct page_cgroup) * PAGES_PER_SECTION;
172 172
173 BUG_ON(PageReserved(page)); 173 BUG_ON(PageReserved(page));
174 kmemleak_free(addr);
174 free_pages_exact(addr, table_size); 175 free_pages_exact(addr, table_size);
175 } 176 }
176} 177}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index d1473b2e9481..c8778f7e208e 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -60,6 +60,7 @@ out:
60 int migratetype = get_pageblock_migratetype(page); 60 int migratetype = get_pageblock_migratetype(page);
61 61
62 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 62 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
63 zone->nr_isolate_pageblock++;
63 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); 64 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
64 65
65 __mod_zone_freepage_state(zone, -nr_pages, migratetype); 66 __mod_zone_freepage_state(zone, -nr_pages, migratetype);
@@ -75,16 +76,54 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
75{ 76{
76 struct zone *zone; 77 struct zone *zone;
77 unsigned long flags, nr_pages; 78 unsigned long flags, nr_pages;
79 struct page *isolated_page = NULL;
80 unsigned int order;
81 unsigned long page_idx, buddy_idx;
82 struct page *buddy;
78 83
79 zone = page_zone(page); 84 zone = page_zone(page);
80 spin_lock_irqsave(&zone->lock, flags); 85 spin_lock_irqsave(&zone->lock, flags);
81 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 86 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
82 goto out; 87 goto out;
83 nr_pages = move_freepages_block(zone, page, migratetype); 88
84 __mod_zone_freepage_state(zone, nr_pages, migratetype); 89 /*
90 * Because a free page larger than pageblock_order is not allowed
91 * to merge onto an isolated pageblock (to keep the freepage
92 * accounting correct), a mergeable free buddy page may still exist.
93 * move_freepages_block() does not handle merging, so we need another
94 * approach: isolate the page and free it again so the buddy
95 * allocator can merge it.
96 */
97 if (PageBuddy(page)) {
98 order = page_order(page);
99 if (order >= pageblock_order) {
100 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
101 buddy_idx = __find_buddy_index(page_idx, order);
102 buddy = page + (buddy_idx - page_idx);
103
104 if (!is_migrate_isolate_page(buddy)) {
105 __isolate_free_page(page, order);
106 set_page_refcounted(page);
107 isolated_page = page;
108 }
109 }
110 }
111
112 /*
113 * If we isolated a free page of pageblock_order or above, there
114 * should be no other free pages in the range, so we can skip the
115 * costly pageblock scan done by move_freepages_block().
116 */
117 if (!isolated_page) {
118 nr_pages = move_freepages_block(zone, page, migratetype);
119 __mod_zone_freepage_state(zone, nr_pages, migratetype);
120 }
85 set_pageblock_migratetype(page, migratetype); 121 set_pageblock_migratetype(page, migratetype);
122 zone->nr_isolate_pageblock--;
86out: 123out:
87 spin_unlock_irqrestore(&zone->lock, flags); 124 spin_unlock_irqrestore(&zone->lock, flags);
125 if (isolated_page)
126 __free_pages(isolated_page, order);
88} 127}
89 128
90static inline struct page * 129static inline struct page *
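
The comments added to unset_migratetype_isolate() above, like the max_order clamp in __free_one_page(), rest on a size relationship: any free block above pageblock_order spans more than one pageblock, so unrestricted merging could fuse an isolated pageblock with a normal neighbour and skew the freepage counters. A quick arithmetic check, assuming the usual x86-64 defaults of pageblock_order = 9 and MAX_ORDER = 11 (values not stated in the patch):

#include <stdio.h>

int main(void)
{
	/* Assumed typical x86-64 values; not taken from the patch. */
	unsigned int pageblock_order = 9;              /* 512 pages, 2MB with 4K pages */
	unsigned int max_order = pageblock_order + 1;  /* the clamp used for isolated blocks */

	printf("one pageblock       : %u pages\n", 1U << pageblock_order);
	printf("order-%u free block : %u pages, i.e. two pageblocks\n",
	       pageblock_order + 1, 1U << (pageblock_order + 1));
	/* The merge loop runs while (order < max_order - 1), so the largest
	 * block it can produce is order max_order - 1 = pageblock_order,
	 * which never crosses a pageblock boundary.
	 */
	printf("largest merged block: order %u\n", max_order - 1);
	return 0;
}
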
diff --git a/mm/rmap.c b/mm/rmap.c
index 116a5053415b..19886fb2f13a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1042,15 +1042,46 @@ void page_add_new_anon_rmap(struct page *page,
1042 */ 1042 */
1043void page_add_file_rmap(struct page *page) 1043void page_add_file_rmap(struct page *page)
1044{ 1044{
1045 bool locked; 1045 struct mem_cgroup *memcg;
1046 unsigned long flags; 1046 unsigned long flags;
1047 bool locked;
1047 1048
1048 mem_cgroup_begin_update_page_stat(page, &locked, &flags); 1049 memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
1049 if (atomic_inc_and_test(&page->_mapcount)) { 1050 if (atomic_inc_and_test(&page->_mapcount)) {
1050 __inc_zone_page_state(page, NR_FILE_MAPPED); 1051 __inc_zone_page_state(page, NR_FILE_MAPPED);
1051 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); 1052 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
1052 } 1053 }
1053 mem_cgroup_end_update_page_stat(page, &locked, &flags); 1054 mem_cgroup_end_page_stat(memcg, locked, flags);
1055}
1056
1057static void page_remove_file_rmap(struct page *page)
1058{
1059 struct mem_cgroup *memcg;
1060 unsigned long flags;
1061 bool locked;
1062
1063 memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
1064
1065 /* page still mapped by someone else? */
1066 if (!atomic_add_negative(-1, &page->_mapcount))
1067 goto out;
1068
1069 /* Hugepages are not counted in NR_FILE_MAPPED for now. */
1070 if (unlikely(PageHuge(page)))
1071 goto out;
1072
1073 /*
1074 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
1075 * these counters are not modified in interrupt context, and
1076 * pte lock(a spinlock) is held, which implies preemption disabled.
1077 */
1078 __dec_zone_page_state(page, NR_FILE_MAPPED);
1079 mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
1080
1081 if (unlikely(PageMlocked(page)))
1082 clear_page_mlock(page);
1083out:
1084 mem_cgroup_end_page_stat(memcg, locked, flags);
1054} 1085}
1055 1086
1056/** 1087/**
@@ -1061,46 +1092,33 @@ void page_add_file_rmap(struct page *page)
1061 */ 1092 */
1062void page_remove_rmap(struct page *page) 1093void page_remove_rmap(struct page *page)
1063{ 1094{
1064 bool anon = PageAnon(page); 1095 if (!PageAnon(page)) {
1065 bool locked; 1096 page_remove_file_rmap(page);
1066 unsigned long flags; 1097 return;
1067 1098 }
1068 /*
1069 * The anon case has no mem_cgroup page_stat to update; but may
1070 * uncharge_page() below, where the lock ordering can deadlock if
1071 * we hold the lock against page_stat move: so avoid it on anon.
1072 */
1073 if (!anon)
1074 mem_cgroup_begin_update_page_stat(page, &locked, &flags);
1075 1099
1076 /* page still mapped by someone else? */ 1100 /* page still mapped by someone else? */
1077 if (!atomic_add_negative(-1, &page->_mapcount)) 1101 if (!atomic_add_negative(-1, &page->_mapcount))
1078 goto out; 1102 return;
1103
1104 /* Hugepages are not counted in NR_ANON_PAGES for now. */
1105 if (unlikely(PageHuge(page)))
1106 return;
1079 1107
1080 /* 1108 /*
1081 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
1082 * and not charged by memcg for now.
1083 *
1084 * We use the irq-unsafe __{inc|mod}_zone_page_stat because 1109 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
1085 * these counters are not modified in interrupt context, and 1110 * these counters are not modified in interrupt context, and
1086 * these counters are not modified in interrupt context, and
1087 * pte lock(a spinlock) is held, which implies preemption disabled. 1111 * pte lock(a spinlock) is held, which implies preemption disabled.
1088 */ 1112 */
1089 if (unlikely(PageHuge(page))) 1113 if (PageTransHuge(page))
1090 goto out; 1114 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1091 if (anon) { 1115
1092 if (PageTransHuge(page)) 1116 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
1093 __dec_zone_page_state(page, 1117 -hpage_nr_pages(page));
1094 NR_ANON_TRANSPARENT_HUGEPAGES); 1118
1095 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
1096 -hpage_nr_pages(page));
1097 } else {
1098 __dec_zone_page_state(page, NR_FILE_MAPPED);
1099 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
1100 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1101 }
1102 if (unlikely(PageMlocked(page))) 1119 if (unlikely(PageMlocked(page)))
1103 clear_page_mlock(page); 1120 clear_page_mlock(page);
1121
1104 /* 1122 /*
1105 * It would be tidy to reset the PageAnon mapping here, 1123 * It would be tidy to reset the PageAnon mapping here,
1106 * but that might overwrite a racing page_add_anon_rmap 1124 * but that might overwrite a racing page_add_anon_rmap
@@ -1110,10 +1128,6 @@ void page_remove_rmap(struct page *page)
1110 * Leaving it set also helps swapoff to reinstate ptes 1128 * Leaving it set also helps swapoff to reinstate ptes
1111 * faster for those pages still in swapcache. 1129 * faster for those pages still in swapcache.
1112 */ 1130 */
1113 return;
1114out:
1115 if (!anon)
1116 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1117} 1131}
1118 1132
1119/* 1133/*
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3a6e0cfdf03a..dcdab81bd240 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -93,16 +93,6 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
93 s->object_size); 93 s->object_size);
94 continue; 94 continue;
95 } 95 }
96
97#if !defined(CONFIG_SLUB)
98 if (!strcmp(s->name, name)) {
99 pr_err("%s (%s): Cache name already exists.\n",
100 __func__, name);
101 dump_stack();
102 s = NULL;
103 return -EINVAL;
104 }
105#endif
106 } 96 }
107 97
108 WARN_ON(strchr(name, ' ')); /* It confuses parsers */ 98 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
@@ -269,6 +259,10 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
269 if (s->size - size >= sizeof(void *)) 259 if (s->size - size >= sizeof(void *))
270 continue; 260 continue;
271 261
262 if (IS_ENABLED(CONFIG_SLAB) && align &&
263 (align > s->align || s->align % align))
264 continue;
265
272 return s; 266 return s;
273 } 267 }
274 return NULL; 268 return NULL;
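
The check added to find_mergeable() above (guarded by IS_ENABLED(CONFIG_SLAB), so SLUB behaviour is unchanged) skips a pre-existing cache whose alignment cannot satisfy an explicitly requested one. The predicate is easy to sanity-check on its own; the helper below is hypothetical and only mirrors the condition for illustration:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/*
 * Inverse of the new "continue" condition: merging is allowed when no
 * alignment was requested, or when the existing cache's alignment is
 * at least as strict as, and a multiple of, the requested one.
 */
static bool align_mergeable(size_t requested, size_t existing)
{
	if (!requested)
		return true;
	return requested <= existing && existing % requested == 0;
}

int main(void)
{
	printf("%d\n", align_mergeable(64, 32));  /* 0: existing cache is too lax */
	printf("%d\n", align_mergeable(64, 128)); /* 1: 128 is a multiple of 64   */
	printf("%d\n", align_mergeable(0, 32));   /* 1: nothing was requested     */
	return 0;
}
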
diff --git a/mm/truncate.c b/mm/truncate.c
index 261eaf6e5a19..f1e4d6052369 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -715,8 +715,9 @@ EXPORT_SYMBOL(truncate_pagecache);
715 * necessary) to @newsize. It will typically be called from the filesystem's 715 * setattr function when ATTR_SIZE is passed in.
716 * setattr function when ATTR_SIZE is passed in. 716 * setattr function when ATTR_SIZE is passed in.
717 * 717 *
718 * Must be called with inode_mutex held and before all filesystem specific 718 * Must be called with a lock serializing truncates and writes (generally
719 * block truncation has been performed. 719 * i_mutex but e.g. xfs uses a different lock) and before all filesystem
720 * specific block truncation has been performed.
720 */ 721 */
721void truncate_setsize(struct inode *inode, loff_t newsize) 722void truncate_setsize(struct inode *inode, loff_t newsize)
722{ 723{
@@ -755,7 +756,6 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
755 struct page *page; 756 struct page *page;
756 pgoff_t index; 757 pgoff_t index;
757 758
758 WARN_ON(!mutex_is_locked(&inode->i_mutex));
759 WARN_ON(to > inode->i_size); 759 WARN_ON(to > inode->i_size);
760 760
761 if (from >= to || bsize == PAGE_CACHE_SIZE) 761 if (from >= to || bsize == PAGE_CACHE_SIZE)