aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig7
-rw-r--r--mm/Makefile4
-rw-r--r--mm/bounce.c302
-rw-r--r--mm/filemap.c186
-rw-r--r--mm/fremap.c2
-rw-r--r--mm/highmem.c281
-rw-r--r--mm/memory.c21
-rw-r--r--mm/memory_hotplug.c71
-rw-r--r--mm/mempolicy.c3
-rw-r--r--mm/migrate.c4
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/nommu.c3
-rw-r--r--mm/oom_kill.c53
-rw-r--r--mm/page-writeback.c160
-rw-r--r--mm/shmem.c117
-rw-r--r--mm/shmem_acl.c197
-rw-r--r--mm/slab.c24
-rw-r--r--mm/swapfile.c7
-rw-r--r--mm/truncate.c60
-rw-r--r--mm/util.c18
21 files changed, 1014 insertions, 510 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 8f5b45615f7b..5d88489ef2de 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,12 +115,17 @@ config SPARSEMEM_EXTREME
115# eventually, we can have this option just 'select SPARSEMEM' 115# eventually, we can have this option just 'select SPARSEMEM'
116config MEMORY_HOTPLUG 116config MEMORY_HOTPLUG
117 bool "Allow for memory hot-add" 117 bool "Allow for memory hot-add"
118 depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG 118 depends on SPARSEMEM || X86_64_ACPI_NUMA
119 depends on HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG
119 depends on (IA64 || X86 || PPC64) 120 depends on (IA64 || X86 || PPC64)
120 121
121comment "Memory hotplug is currently incompatible with Software Suspend" 122comment "Memory hotplug is currently incompatible with Software Suspend"
122 depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND 123 depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
123 124
125config MEMORY_HOTPLUG_SPARSE
126 def_bool y
127 depends on SPARSEMEM && MEMORY_HOTPLUG
128
124# Heavily threaded applications may benefit from splitting the mm-wide 129# Heavily threaded applications may benefit from splitting the mm-wide
125# page_table_lock, so that faults on different parts of the user address 130# page_table_lock, so that faults on different parts of the user address
126# space can be handled with less contention: split it at this NR_CPUS. 131# space can be handled with less contention: split it at this NR_CPUS.
diff --git a/mm/Makefile b/mm/Makefile
index 60c56c0b5e10..12b3a4eee88d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,11 +12,15 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
12 readahead.o swap.o truncate.o vmscan.o \ 12 readahead.o swap.o truncate.o vmscan.o \
13 prio_tree.o util.o mmzone.o vmstat.o $(mmu-y) 13 prio_tree.o util.o mmzone.o vmstat.o $(mmu-y)
14 14
15ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy)
16obj-y += bounce.o
17endif
15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 18obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
16obj-$(CONFIG_HUGETLBFS) += hugetlb.o 19obj-$(CONFIG_HUGETLBFS) += hugetlb.o
17obj-$(CONFIG_NUMA) += mempolicy.o 20obj-$(CONFIG_NUMA) += mempolicy.o
18obj-$(CONFIG_SPARSEMEM) += sparse.o 21obj-$(CONFIG_SPARSEMEM) += sparse.o
19obj-$(CONFIG_SHMEM) += shmem.o 22obj-$(CONFIG_SHMEM) += shmem.o
23obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
20obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 24obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
21obj-$(CONFIG_SLOB) += slob.o 25obj-$(CONFIG_SLOB) += slob.o
22obj-$(CONFIG_SLAB) += slab.o 26obj-$(CONFIG_SLAB) += slab.o
diff --git a/mm/bounce.c b/mm/bounce.c
new file mode 100644
index 000000000000..e4b62d2a4024
--- /dev/null
+++ b/mm/bounce.c
@@ -0,0 +1,302 @@
1/* bounce buffer handling for block devices
2 *
3 * - Split from highmem.c
4 */
5
6#include <linux/mm.h>
7#include <linux/module.h>
8#include <linux/swap.h>
9#include <linux/bio.h>
10#include <linux/pagemap.h>
11#include <linux/mempool.h>
12#include <linux/blkdev.h>
13#include <linux/init.h>
14#include <linux/hash.h>
15#include <linux/highmem.h>
16#include <linux/blktrace_api.h>
17#include <asm/tlbflush.h>
18
19#define POOL_SIZE 64
20#define ISA_POOL_SIZE 16
21
22static mempool_t *page_pool, *isa_page_pool;
23
24#ifdef CONFIG_HIGHMEM
25static __init int init_emergency_pool(void)
26{
27 struct sysinfo i;
28 si_meminfo(&i);
29 si_swapinfo(&i);
30
31 if (!i.totalhigh)
32 return 0;
33
34 page_pool = mempool_create_page_pool(POOL_SIZE, 0);
35 BUG_ON(!page_pool);
36 printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
37
38 return 0;
39}
40
41__initcall(init_emergency_pool);
42
43/*
44 * highmem version, map in to vec
45 */
46static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
47{
48 unsigned long flags;
49 unsigned char *vto;
50
51 local_irq_save(flags);
52 vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
53 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
54 kunmap_atomic(vto, KM_BOUNCE_READ);
55 local_irq_restore(flags);
56}
57
58#else /* CONFIG_HIGHMEM */
59
60#define bounce_copy_vec(to, vfrom) \
61 memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
62
63#endif /* CONFIG_HIGHMEM */
64
65/*
66 * allocate pages in the DMA region for the ISA pool
67 */
68static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
69{
70 return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
71}
72
73/*
74 * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
75 * as the max address, so check if the pool has already been created.
76 */
77int init_emergency_isa_pool(void)
78{
79 if (isa_page_pool)
80 return 0;
81
82 isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
83 mempool_free_pages, (void *) 0);
84 BUG_ON(!isa_page_pool);
85
86 printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
87 return 0;
88}
89
90/*
91 * Simple bounce buffer support for highmem pages. Depending on the
92 * queue gfp mask set, *to may or may not be a highmem page. kmap it
93 * always, it will do the Right Thing
94 */
95static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
96{
97 unsigned char *vfrom;
98 struct bio_vec *tovec, *fromvec;
99 int i;
100
101 __bio_for_each_segment(tovec, to, i, 0) {
102 fromvec = from->bi_io_vec + i;
103
104 /*
105 * not bounced
106 */
107 if (tovec->bv_page == fromvec->bv_page)
108 continue;
109
110 /*
111 * fromvec->bv_offset and fromvec->bv_len might have been
112 * modified by the block layer, so use the original copy,
113 * bounce_copy_vec already uses tovec->bv_len
114 */
115 vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
116
117 flush_dcache_page(tovec->bv_page);
118 bounce_copy_vec(tovec, vfrom);
119 }
120}
121
122static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
123{
124 struct bio *bio_orig = bio->bi_private;
125 struct bio_vec *bvec, *org_vec;
126 int i;
127
128 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
129 set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
130
131 /*
132 * free up bounce indirect pages used
133 */
134 __bio_for_each_segment(bvec, bio, i, 0) {
135 org_vec = bio_orig->bi_io_vec + i;
136 if (bvec->bv_page == org_vec->bv_page)
137 continue;
138
139 dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
140 mempool_free(bvec->bv_page, pool);
141 }
142
143 bio_endio(bio_orig, bio_orig->bi_size, err);
144 bio_put(bio);
145}
146
147static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
148{
149 if (bio->bi_size)
150 return 1;
151
152 bounce_end_io(bio, page_pool, err);
153 return 0;
154}
155
156static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
157{
158 if (bio->bi_size)
159 return 1;
160
161 bounce_end_io(bio, isa_page_pool, err);
162 return 0;
163}
164
165static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
166{
167 struct bio *bio_orig = bio->bi_private;
168
169 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
170 copy_to_high_bio_irq(bio_orig, bio);
171
172 bounce_end_io(bio, pool, err);
173}
174
175static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
176{
177 if (bio->bi_size)
178 return 1;
179
180 __bounce_end_io_read(bio, page_pool, err);
181 return 0;
182}
183
184static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
185{
186 if (bio->bi_size)
187 return 1;
188
189 __bounce_end_io_read(bio, isa_page_pool, err);
190 return 0;
191}
192
193static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
194 mempool_t *pool)
195{
196 struct page *page;
197 struct bio *bio = NULL;
198 int i, rw = bio_data_dir(*bio_orig);
199 struct bio_vec *to, *from;
200
201 bio_for_each_segment(from, *bio_orig, i) {
202 page = from->bv_page;
203
204 /*
205 * is destination page below bounce pfn?
206 */
207 if (page_to_pfn(page) < q->bounce_pfn)
208 continue;
209
210 /*
211 * irk, bounce it
212 */
213 if (!bio)
214 bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
215
216 to = bio->bi_io_vec + i;
217
218 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
219 to->bv_len = from->bv_len;
220 to->bv_offset = from->bv_offset;
221 inc_zone_page_state(to->bv_page, NR_BOUNCE);
222
223 if (rw == WRITE) {
224 char *vto, *vfrom;
225
226 flush_dcache_page(from->bv_page);
227 vto = page_address(to->bv_page) + to->bv_offset;
228 vfrom = kmap(from->bv_page) + from->bv_offset;
229 memcpy(vto, vfrom, to->bv_len);
230 kunmap(from->bv_page);
231 }
232 }
233
234 /*
235 * no pages bounced
236 */
237 if (!bio)
238 return;
239
240 /*
241 * at least one page was bounced, fill in possible non-highmem
242 * pages
243 */
244 __bio_for_each_segment(from, *bio_orig, i, 0) {
245 to = bio_iovec_idx(bio, i);
246 if (!to->bv_page) {
247 to->bv_page = from->bv_page;
248 to->bv_len = from->bv_len;
249 to->bv_offset = from->bv_offset;
250 }
251 }
252
253 bio->bi_bdev = (*bio_orig)->bi_bdev;
254 bio->bi_flags |= (1 << BIO_BOUNCED);
255 bio->bi_sector = (*bio_orig)->bi_sector;
256 bio->bi_rw = (*bio_orig)->bi_rw;
257
258 bio->bi_vcnt = (*bio_orig)->bi_vcnt;
259 bio->bi_idx = (*bio_orig)->bi_idx;
260 bio->bi_size = (*bio_orig)->bi_size;
261
262 if (pool == page_pool) {
263 bio->bi_end_io = bounce_end_io_write;
264 if (rw == READ)
265 bio->bi_end_io = bounce_end_io_read;
266 } else {
267 bio->bi_end_io = bounce_end_io_write_isa;
268 if (rw == READ)
269 bio->bi_end_io = bounce_end_io_read_isa;
270 }
271
272 bio->bi_private = *bio_orig;
273 *bio_orig = bio;
274}
275
276void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
277{
278 mempool_t *pool;
279
280 /*
281 * for non-isa bounce case, just check if the bounce pfn is equal
282 * to or bigger than the highest pfn in the system -- in that case,
283 * don't waste time iterating over bio segments
284 */
285 if (!(q->bounce_gfp & GFP_DMA)) {
286 if (q->bounce_pfn >= blk_max_pfn)
287 return;
288 pool = page_pool;
289 } else {
290 BUG_ON(!isa_page_pool);
291 pool = isa_page_pool;
292 }
293
294 blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
295
296 /*
297 * slow path
298 */
299 __blk_queue_bounce(q, bio_orig, pool);
300}
301
302EXPORT_SYMBOL(blk_queue_bounce);
diff --git a/mm/filemap.c b/mm/filemap.c
index 87d4a398cd16..fef7d879ddf5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1149,13 +1149,14 @@ success:
1149 * that can use the page cache directly. 1149 * that can use the page cache directly.
1150 */ 1150 */
1151ssize_t 1151ssize_t
1152__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, 1152generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1153 unsigned long nr_segs, loff_t *ppos) 1153 unsigned long nr_segs, loff_t pos)
1154{ 1154{
1155 struct file *filp = iocb->ki_filp; 1155 struct file *filp = iocb->ki_filp;
1156 ssize_t retval; 1156 ssize_t retval;
1157 unsigned long seg; 1157 unsigned long seg;
1158 size_t count; 1158 size_t count;
1159 loff_t *ppos = &iocb->ki_pos;
1159 1160
1160 count = 0; 1161 count = 0;
1161 for (seg = 0; seg < nr_segs; seg++) { 1162 for (seg = 0; seg < nr_segs; seg++) {
@@ -1179,7 +1180,7 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1179 1180
1180 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 1181 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1181 if (filp->f_flags & O_DIRECT) { 1182 if (filp->f_flags & O_DIRECT) {
1182 loff_t pos = *ppos, size; 1183 loff_t size;
1183 struct address_space *mapping; 1184 struct address_space *mapping;
1184 struct inode *inode; 1185 struct inode *inode;
1185 1186
@@ -1225,33 +1226,8 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1225out: 1226out:
1226 return retval; 1227 return retval;
1227} 1228}
1228EXPORT_SYMBOL(__generic_file_aio_read);
1229
1230ssize_t
1231generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
1232{
1233 struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1234
1235 BUG_ON(iocb->ki_pos != pos);
1236 return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
1237}
1238EXPORT_SYMBOL(generic_file_aio_read); 1229EXPORT_SYMBOL(generic_file_aio_read);
1239 1230
1240ssize_t
1241generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
1242{
1243 struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1244 struct kiocb kiocb;
1245 ssize_t ret;
1246
1247 init_sync_kiocb(&kiocb, filp);
1248 ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
1249 if (-EIOCBQUEUED == ret)
1250 ret = wait_on_sync_kiocb(&kiocb);
1251 return ret;
1252}
1253EXPORT_SYMBOL(generic_file_read);
1254
1255int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) 1231int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1256{ 1232{
1257 ssize_t written; 1233 ssize_t written;
@@ -1473,7 +1449,7 @@ outside_data_content:
1473 * accessible.. 1449 * accessible..
1474 */ 1450 */
1475 if (area->vm_mm == current->mm) 1451 if (area->vm_mm == current->mm)
1476 return NULL; 1452 return NOPAGE_SIGBUS;
1477 /* Fall through to the non-read-ahead case */ 1453 /* Fall through to the non-read-ahead case */
1478no_cached_page: 1454no_cached_page:
1479 /* 1455 /*
@@ -1498,7 +1474,7 @@ no_cached_page:
1498 */ 1474 */
1499 if (error == -ENOMEM) 1475 if (error == -ENOMEM)
1500 return NOPAGE_OOM; 1476 return NOPAGE_OOM;
1501 return NULL; 1477 return NOPAGE_SIGBUS;
1502 1478
1503page_not_uptodate: 1479page_not_uptodate:
1504 if (!did_readaround) { 1480 if (!did_readaround) {
@@ -1567,7 +1543,7 @@ page_not_uptodate:
1567 */ 1543 */
1568 shrink_readahead_size_eio(file, ra); 1544 shrink_readahead_size_eio(file, ra);
1569 page_cache_release(page); 1545 page_cache_release(page);
1570 return NULL; 1546 return NOPAGE_SIGBUS;
1571} 1547}
1572EXPORT_SYMBOL(filemap_nopage); 1548EXPORT_SYMBOL(filemap_nopage);
1573 1549
@@ -2022,6 +1998,7 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
2022 if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) 1998 if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
2023 *count = inode->i_sb->s_maxbytes - *pos; 1999 *count = inode->i_sb->s_maxbytes - *pos;
2024 } else { 2000 } else {
2001#ifdef CONFIG_BLOCK
2025 loff_t isize; 2002 loff_t isize;
2026 if (bdev_read_only(I_BDEV(inode))) 2003 if (bdev_read_only(I_BDEV(inode)))
2027 return -EPERM; 2004 return -EPERM;
@@ -2033,6 +2010,9 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
2033 2010
2034 if (*pos + *count > isize) 2011 if (*pos + *count > isize)
2035 *count = isize - *pos; 2012 *count = isize - *pos;
2013#else
2014 return -EPERM;
2015#endif
2036 } 2016 }
2037 return 0; 2017 return 0;
2038} 2018}
@@ -2313,22 +2293,22 @@ out:
2313 current->backing_dev_info = NULL; 2293 current->backing_dev_info = NULL;
2314 return written ? written : err; 2294 return written ? written : err;
2315} 2295}
2316EXPORT_SYMBOL(generic_file_aio_write_nolock);
2317 2296
2318ssize_t 2297ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
2319generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, 2298 const struct iovec *iov, unsigned long nr_segs, loff_t pos)
2320 unsigned long nr_segs, loff_t *ppos)
2321{ 2299{
2322 struct file *file = iocb->ki_filp; 2300 struct file *file = iocb->ki_filp;
2323 struct address_space *mapping = file->f_mapping; 2301 struct address_space *mapping = file->f_mapping;
2324 struct inode *inode = mapping->host; 2302 struct inode *inode = mapping->host;
2325 ssize_t ret; 2303 ssize_t ret;
2326 loff_t pos = *ppos;
2327 2304
2328 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos); 2305 BUG_ON(iocb->ki_pos != pos);
2306
2307 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
2308 &iocb->ki_pos);
2329 2309
2330 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2310 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2331 int err; 2311 ssize_t err;
2332 2312
2333 err = sync_page_range_nolock(inode, mapping, pos, ret); 2313 err = sync_page_range_nolock(inode, mapping, pos, ret);
2334 if (err < 0) 2314 if (err < 0)
@@ -2336,51 +2316,21 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2336 } 2316 }
2337 return ret; 2317 return ret;
2338} 2318}
2319EXPORT_SYMBOL(generic_file_aio_write_nolock);
2339 2320
2340static ssize_t 2321ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2341__generic_file_write_nolock(struct file *file, const struct iovec *iov, 2322 unsigned long nr_segs, loff_t pos)
2342 unsigned long nr_segs, loff_t *ppos)
2343{
2344 struct kiocb kiocb;
2345 ssize_t ret;
2346
2347 init_sync_kiocb(&kiocb, file);
2348 ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2349 if (ret == -EIOCBQUEUED)
2350 ret = wait_on_sync_kiocb(&kiocb);
2351 return ret;
2352}
2353
2354ssize_t
2355generic_file_write_nolock(struct file *file, const struct iovec *iov,
2356 unsigned long nr_segs, loff_t *ppos)
2357{
2358 struct kiocb kiocb;
2359 ssize_t ret;
2360
2361 init_sync_kiocb(&kiocb, file);
2362 ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2363 if (-EIOCBQUEUED == ret)
2364 ret = wait_on_sync_kiocb(&kiocb);
2365 return ret;
2366}
2367EXPORT_SYMBOL(generic_file_write_nolock);
2368
2369ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
2370 size_t count, loff_t pos)
2371{ 2323{
2372 struct file *file = iocb->ki_filp; 2324 struct file *file = iocb->ki_filp;
2373 struct address_space *mapping = file->f_mapping; 2325 struct address_space *mapping = file->f_mapping;
2374 struct inode *inode = mapping->host; 2326 struct inode *inode = mapping->host;
2375 ssize_t ret; 2327 ssize_t ret;
2376 struct iovec local_iov = { .iov_base = (void __user *)buf,
2377 .iov_len = count };
2378 2328
2379 BUG_ON(iocb->ki_pos != pos); 2329 BUG_ON(iocb->ki_pos != pos);
2380 2330
2381 mutex_lock(&inode->i_mutex); 2331 mutex_lock(&inode->i_mutex);
2382 ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, 2332 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
2383 &iocb->ki_pos); 2333 &iocb->ki_pos);
2384 mutex_unlock(&inode->i_mutex); 2334 mutex_unlock(&inode->i_mutex);
2385 2335
2386 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2336 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -2394,66 +2344,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
2394} 2344}
2395EXPORT_SYMBOL(generic_file_aio_write); 2345EXPORT_SYMBOL(generic_file_aio_write);
2396 2346
2397ssize_t generic_file_write(struct file *file, const char __user *buf,
2398 size_t count, loff_t *ppos)
2399{
2400 struct address_space *mapping = file->f_mapping;
2401 struct inode *inode = mapping->host;
2402 ssize_t ret;
2403 struct iovec local_iov = { .iov_base = (void __user *)buf,
2404 .iov_len = count };
2405
2406 mutex_lock(&inode->i_mutex);
2407 ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
2408 mutex_unlock(&inode->i_mutex);
2409
2410 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2411 ssize_t err;
2412
2413 err = sync_page_range(inode, mapping, *ppos - ret, ret);
2414 if (err < 0)
2415 ret = err;
2416 }
2417 return ret;
2418}
2419EXPORT_SYMBOL(generic_file_write);
2420
2421ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
2422 unsigned long nr_segs, loff_t *ppos)
2423{
2424 struct kiocb kiocb;
2425 ssize_t ret;
2426
2427 init_sync_kiocb(&kiocb, filp);
2428 ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos);
2429 if (-EIOCBQUEUED == ret)
2430 ret = wait_on_sync_kiocb(&kiocb);
2431 return ret;
2432}
2433EXPORT_SYMBOL(generic_file_readv);
2434
2435ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
2436 unsigned long nr_segs, loff_t *ppos)
2437{
2438 struct address_space *mapping = file->f_mapping;
2439 struct inode *inode = mapping->host;
2440 ssize_t ret;
2441
2442 mutex_lock(&inode->i_mutex);
2443 ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
2444 mutex_unlock(&inode->i_mutex);
2445
2446 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2447 int err;
2448
2449 err = sync_page_range(inode, mapping, *ppos - ret, ret);
2450 if (err < 0)
2451 ret = err;
2452 }
2453 return ret;
2454}
2455EXPORT_SYMBOL(generic_file_writev);
2456
2457/* 2347/*
2458 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something 2348 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
2459 * went wrong during pagecache shootdown. 2349 * went wrong during pagecache shootdown.
@@ -2493,3 +2383,33 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2493 } 2383 }
2494 return retval; 2384 return retval;
2495} 2385}
2386
2387/**
2388 * try_to_release_page() - release old fs-specific metadata on a page
2389 *
2390 * @page: the page which the kernel is trying to free
2391 * @gfp_mask: memory allocation flags (and I/O mode)
2392 *
2393 * The address_space is to try to release any data against the page
2394 * (presumably at page->private). If the release was successful, return `1'.
2395 * Otherwise return zero.
2396 *
2397 * The @gfp_mask argument specifies whether I/O may be performed to release
2398 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
2399 *
2400 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
2401 */
2402int try_to_release_page(struct page *page, gfp_t gfp_mask)
2403{
2404 struct address_space * const mapping = page->mapping;
2405
2406 BUG_ON(!PageLocked(page));
2407 if (PageWriteback(page))
2408 return 0;
2409
2410 if (mapping && mapping->a_ops->releasepage)
2411 return mapping->a_ops->releasepage(page, gfp_mask);
2412 return try_to_free_buffers(page);
2413}
2414
2415EXPORT_SYMBOL(try_to_release_page);
diff --git a/mm/fremap.c b/mm/fremap.c
index aa30618ec6b2..7a9d0f5d246d 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -39,7 +39,7 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
39 } else { 39 } else {
40 if (!pte_file(pte)) 40 if (!pte_file(pte))
41 free_swap_and_cache(pte_to_swp_entry(pte)); 41 free_swap_and_cache(pte_to_swp_entry(pte));
42 pte_clear(mm, addr, ptep); 42 pte_clear_not_present_full(mm, addr, ptep, 0);
43 } 43 }
44 return !!page; 44 return !!page;
45} 45}
diff --git a/mm/highmem.c b/mm/highmem.c
index ee5519b176ee..0206e7e5018c 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -29,13 +29,6 @@
29#include <linux/blktrace_api.h> 29#include <linux/blktrace_api.h>
30#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
31 31
32static mempool_t *page_pool, *isa_page_pool;
33
34static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
35{
36 return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
37}
38
39/* 32/*
40 * Virtual_count is not a pure "count". 33 * Virtual_count is not a pure "count".
41 * 0 means that it is not mapped, and has not been mapped 34 * 0 means that it is not mapped, and has not been mapped
@@ -217,282 +210,8 @@ void fastcall kunmap_high(struct page *page)
217} 210}
218 211
219EXPORT_SYMBOL(kunmap_high); 212EXPORT_SYMBOL(kunmap_high);
220
221#define POOL_SIZE 64
222
223static __init int init_emergency_pool(void)
224{
225 struct sysinfo i;
226 si_meminfo(&i);
227 si_swapinfo(&i);
228
229 if (!i.totalhigh)
230 return 0;
231
232 page_pool = mempool_create_page_pool(POOL_SIZE, 0);
233 BUG_ON(!page_pool);
234 printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
235
236 return 0;
237}
238
239__initcall(init_emergency_pool);
240
241/*
242 * highmem version, map in to vec
243 */
244static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
245{
246 unsigned long flags;
247 unsigned char *vto;
248
249 local_irq_save(flags);
250 vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
251 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
252 kunmap_atomic(vto, KM_BOUNCE_READ);
253 local_irq_restore(flags);
254}
255
256#else /* CONFIG_HIGHMEM */
257
258#define bounce_copy_vec(to, vfrom) \
259 memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
260
261#endif 213#endif
262 214
263#define ISA_POOL_SIZE 16
264
265/*
266 * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
267 * as the max address, so check if the pool has already been created.
268 */
269int init_emergency_isa_pool(void)
270{
271 if (isa_page_pool)
272 return 0;
273
274 isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
275 mempool_free_pages, (void *) 0);
276 BUG_ON(!isa_page_pool);
277
278 printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
279 return 0;
280}
281
282/*
283 * Simple bounce buffer support for highmem pages. Depending on the
284 * queue gfp mask set, *to may or may not be a highmem page. kmap it
285 * always, it will do the Right Thing
286 */
287static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
288{
289 unsigned char *vfrom;
290 struct bio_vec *tovec, *fromvec;
291 int i;
292
293 __bio_for_each_segment(tovec, to, i, 0) {
294 fromvec = from->bi_io_vec + i;
295
296 /*
297 * not bounced
298 */
299 if (tovec->bv_page == fromvec->bv_page)
300 continue;
301
302 /*
303 * fromvec->bv_offset and fromvec->bv_len might have been
304 * modified by the block layer, so use the original copy,
305 * bounce_copy_vec already uses tovec->bv_len
306 */
307 vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
308
309 flush_dcache_page(tovec->bv_page);
310 bounce_copy_vec(tovec, vfrom);
311 }
312}
313
314static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
315{
316 struct bio *bio_orig = bio->bi_private;
317 struct bio_vec *bvec, *org_vec;
318 int i;
319
320 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
321 set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
322
323 /*
324 * free up bounce indirect pages used
325 */
326 __bio_for_each_segment(bvec, bio, i, 0) {
327 org_vec = bio_orig->bi_io_vec + i;
328 if (bvec->bv_page == org_vec->bv_page)
329 continue;
330
331 dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
332 mempool_free(bvec->bv_page, pool);
333 }
334
335 bio_endio(bio_orig, bio_orig->bi_size, err);
336 bio_put(bio);
337}
338
339static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
340{
341 if (bio->bi_size)
342 return 1;
343
344 bounce_end_io(bio, page_pool, err);
345 return 0;
346}
347
348static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
349{
350 if (bio->bi_size)
351 return 1;
352
353 bounce_end_io(bio, isa_page_pool, err);
354 return 0;
355}
356
357static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
358{
359 struct bio *bio_orig = bio->bi_private;
360
361 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
362 copy_to_high_bio_irq(bio_orig, bio);
363
364 bounce_end_io(bio, pool, err);
365}
366
367static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
368{
369 if (bio->bi_size)
370 return 1;
371
372 __bounce_end_io_read(bio, page_pool, err);
373 return 0;
374}
375
376static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
377{
378 if (bio->bi_size)
379 return 1;
380
381 __bounce_end_io_read(bio, isa_page_pool, err);
382 return 0;
383}
384
385static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
386 mempool_t *pool)
387{
388 struct page *page;
389 struct bio *bio = NULL;
390 int i, rw = bio_data_dir(*bio_orig);
391 struct bio_vec *to, *from;
392
393 bio_for_each_segment(from, *bio_orig, i) {
394 page = from->bv_page;
395
396 /*
397 * is destination page below bounce pfn?
398 */
399 if (page_to_pfn(page) < q->bounce_pfn)
400 continue;
401
402 /*
403 * irk, bounce it
404 */
405 if (!bio)
406 bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
407
408 to = bio->bi_io_vec + i;
409
410 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
411 to->bv_len = from->bv_len;
412 to->bv_offset = from->bv_offset;
413 inc_zone_page_state(to->bv_page, NR_BOUNCE);
414
415 if (rw == WRITE) {
416 char *vto, *vfrom;
417
418 flush_dcache_page(from->bv_page);
419 vto = page_address(to->bv_page) + to->bv_offset;
420 vfrom = kmap(from->bv_page) + from->bv_offset;
421 memcpy(vto, vfrom, to->bv_len);
422 kunmap(from->bv_page);
423 }
424 }
425
426 /*
427 * no pages bounced
428 */
429 if (!bio)
430 return;
431
432 /*
433 * at least one page was bounced, fill in possible non-highmem
434 * pages
435 */
436 __bio_for_each_segment(from, *bio_orig, i, 0) {
437 to = bio_iovec_idx(bio, i);
438 if (!to->bv_page) {
439 to->bv_page = from->bv_page;
440 to->bv_len = from->bv_len;
441 to->bv_offset = from->bv_offset;
442 }
443 }
444
445 bio->bi_bdev = (*bio_orig)->bi_bdev;
446 bio->bi_flags |= (1 << BIO_BOUNCED);
447 bio->bi_sector = (*bio_orig)->bi_sector;
448 bio->bi_rw = (*bio_orig)->bi_rw;
449
450 bio->bi_vcnt = (*bio_orig)->bi_vcnt;
451 bio->bi_idx = (*bio_orig)->bi_idx;
452 bio->bi_size = (*bio_orig)->bi_size;
453
454 if (pool == page_pool) {
455 bio->bi_end_io = bounce_end_io_write;
456 if (rw == READ)
457 bio->bi_end_io = bounce_end_io_read;
458 } else {
459 bio->bi_end_io = bounce_end_io_write_isa;
460 if (rw == READ)
461 bio->bi_end_io = bounce_end_io_read_isa;
462 }
463
464 bio->bi_private = *bio_orig;
465 *bio_orig = bio;
466}
467
468void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
469{
470 mempool_t *pool;
471
472 /*
473 * for non-isa bounce case, just check if the bounce pfn is equal
474 * to or bigger than the highest pfn in the system -- in that case,
475 * don't waste time iterating over bio segments
476 */
477 if (!(q->bounce_gfp & GFP_DMA)) {
478 if (q->bounce_pfn >= blk_max_pfn)
479 return;
480 pool = page_pool;
481 } else {
482 BUG_ON(!isa_page_pool);
483 pool = isa_page_pool;
484 }
485
486 blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
487
488 /*
489 * slow path
490 */
491 __blk_queue_bounce(q, bio_orig, pool);
492}
493
494EXPORT_SYMBOL(blk_queue_bounce);
495
496#if defined(HASHED_PAGE_VIRTUAL) 215#if defined(HASHED_PAGE_VIRTUAL)
497 216
498#define PA_HASH_ORDER 7 217#define PA_HASH_ORDER 7
diff --git a/mm/memory.c b/mm/memory.c
index 601159a46ab6..9cf3f341a28a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -467,7 +467,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
467 */ 467 */
468 if (is_cow_mapping(vm_flags)) { 468 if (is_cow_mapping(vm_flags)) {
469 ptep_set_wrprotect(src_mm, addr, src_pte); 469 ptep_set_wrprotect(src_mm, addr, src_pte);
470 pte = *src_pte; 470 pte = pte_wrprotect(pte);
471 } 471 }
472 472
473 /* 473 /*
@@ -506,6 +506,7 @@ again:
506 src_pte = pte_offset_map_nested(src_pmd, addr); 506 src_pte = pte_offset_map_nested(src_pmd, addr);
507 src_ptl = pte_lockptr(src_mm, src_pmd); 507 src_ptl = pte_lockptr(src_mm, src_pmd);
508 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 508 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
509 arch_enter_lazy_mmu_mode();
509 510
510 do { 511 do {
511 /* 512 /*
@@ -527,6 +528,7 @@ again:
527 progress += 8; 528 progress += 8;
528 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 529 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
529 530
531 arch_leave_lazy_mmu_mode();
530 spin_unlock(src_ptl); 532 spin_unlock(src_ptl);
531 pte_unmap_nested(src_pte - 1); 533 pte_unmap_nested(src_pte - 1);
532 add_mm_rss(dst_mm, rss[0], rss[1]); 534 add_mm_rss(dst_mm, rss[0], rss[1]);
@@ -628,6 +630,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
628 int anon_rss = 0; 630 int anon_rss = 0;
629 631
630 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 632 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
633 arch_enter_lazy_mmu_mode();
631 do { 634 do {
632 pte_t ptent = *pte; 635 pte_t ptent = *pte;
633 if (pte_none(ptent)) { 636 if (pte_none(ptent)) {
@@ -690,10 +693,11 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
690 continue; 693 continue;
691 if (!pte_file(ptent)) 694 if (!pte_file(ptent))
692 free_swap_and_cache(pte_to_swp_entry(ptent)); 695 free_swap_and_cache(pte_to_swp_entry(ptent));
693 pte_clear_full(mm, addr, pte, tlb->fullmm); 696 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
694 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); 697 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
695 698
696 add_mm_rss(mm, file_rss, anon_rss); 699 add_mm_rss(mm, file_rss, anon_rss);
700 arch_leave_lazy_mmu_mode();
697 pte_unmap_unlock(pte - 1, ptl); 701 pte_unmap_unlock(pte - 1, ptl);
698 702
699 return addr; 703 return addr;
@@ -1109,6 +1113,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1109 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); 1113 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1110 if (!pte) 1114 if (!pte)
1111 return -ENOMEM; 1115 return -ENOMEM;
1116 arch_enter_lazy_mmu_mode();
1112 do { 1117 do {
1113 struct page *page = ZERO_PAGE(addr); 1118 struct page *page = ZERO_PAGE(addr);
1114 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); 1119 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
@@ -1118,6 +1123,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1118 BUG_ON(!pte_none(*pte)); 1123 BUG_ON(!pte_none(*pte));
1119 set_pte_at(mm, addr, pte, zero_pte); 1124 set_pte_at(mm, addr, pte, zero_pte);
1120 } while (pte++, addr += PAGE_SIZE, addr != end); 1125 } while (pte++, addr += PAGE_SIZE, addr != end);
1126 arch_leave_lazy_mmu_mode();
1121 pte_unmap_unlock(pte - 1, ptl); 1127 pte_unmap_unlock(pte - 1, ptl);
1122 return 0; 1128 return 0;
1123} 1129}
@@ -1275,11 +1281,13 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1275 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); 1281 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1276 if (!pte) 1282 if (!pte)
1277 return -ENOMEM; 1283 return -ENOMEM;
1284 arch_enter_lazy_mmu_mode();
1278 do { 1285 do {
1279 BUG_ON(!pte_none(*pte)); 1286 BUG_ON(!pte_none(*pte));
1280 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); 1287 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1281 pfn++; 1288 pfn++;
1282 } while (pte++, addr += PAGE_SIZE, addr != end); 1289 } while (pte++, addr += PAGE_SIZE, addr != end);
1290 arch_leave_lazy_mmu_mode();
1283 pte_unmap_unlock(pte - 1, ptl); 1291 pte_unmap_unlock(pte - 1, ptl);
1284 return 0; 1292 return 0;
1285} 1293}
@@ -1577,7 +1585,14 @@ gotten:
1577 entry = mk_pte(new_page, vma->vm_page_prot); 1585 entry = mk_pte(new_page, vma->vm_page_prot);
1578 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1586 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1579 lazy_mmu_prot_update(entry); 1587 lazy_mmu_prot_update(entry);
1580 ptep_establish(vma, address, page_table, entry); 1588 /*
1589 * Clear the pte entry and flush it first, before updating the
1590 * pte with the new entry. This will avoid a race condition
1591 * seen in the presence of one thread doing SMC and another
1592 * thread doing COW.
1593 */
1594 ptep_clear_flush(vma, address, page_table);
1595 set_pte_at(mm, address, page_table, entry);
1581 update_mmu_cache(vma, address, entry); 1596 update_mmu_cache(vma, address, entry);
1582 lru_cache_add_active(new_page); 1597 lru_cache_add_active(new_page);
1583 page_add_new_anon_rmap(new_page, vma, address); 1598 page_add_new_anon_rmap(new_page, vma, address);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c37319542b70..fd678a662eae 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -13,6 +13,7 @@
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/pagevec.h> 15#include <linux/pagevec.h>
16#include <linux/writeback.h>
16#include <linux/slab.h> 17#include <linux/slab.h>
17#include <linux/sysctl.h> 18#include <linux/sysctl.h>
18#include <linux/cpu.h> 19#include <linux/cpu.h>
@@ -21,11 +22,41 @@
21#include <linux/highmem.h> 22#include <linux/highmem.h>
22#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
23#include <linux/ioport.h> 24#include <linux/ioport.h>
25#include <linux/cpuset.h>
24 26
25#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
26 28
27extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, 29/* add this memory to iomem resource */
28 unsigned long size); 30static struct resource *register_memory_resource(u64 start, u64 size)
31{
32 struct resource *res;
33 res = kzalloc(sizeof(struct resource), GFP_KERNEL);
34 BUG_ON(!res);
35
36 res->name = "System RAM";
37 res->start = start;
38 res->end = start + size - 1;
39 res->flags = IORESOURCE_MEM;
40 if (request_resource(&iomem_resource, res) < 0) {
41 printk("System RAM resource %llx - %llx cannot be added\n",
42 (unsigned long long)res->start, (unsigned long long)res->end);
43 kfree(res);
44 res = NULL;
45 }
46 return res;
47}
48
49static void release_memory_resource(struct resource *res)
50{
51 if (!res)
52 return;
53 release_resource(res);
54 kfree(res);
55 return;
56}
57
58
59#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
29static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) 60static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
30{ 61{
31 struct pglist_data *pgdat = zone->zone_pgdat; 62 struct pglist_data *pgdat = zone->zone_pgdat;
@@ -45,8 +76,6 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
45 return 0; 76 return 0;
46} 77}
47 78
48extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
49 int nr_pages);
50static int __add_section(struct zone *zone, unsigned long phys_start_pfn) 79static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
51{ 80{
52 int nr_pages = PAGES_PER_SECTION; 81 int nr_pages = PAGES_PER_SECTION;
@@ -191,8 +220,10 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
191 if (need_zonelists_rebuild) 220 if (need_zonelists_rebuild)
192 build_all_zonelists(); 221 build_all_zonelists();
193 vm_total_pages = nr_free_pagecache_pages(); 222 vm_total_pages = nr_free_pagecache_pages();
223 writeback_set_ratelimit();
194 return 0; 224 return 0;
195} 225}
226#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
196 227
197static pg_data_t *hotadd_new_pgdat(int nid, u64 start) 228static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
198{ 229{
@@ -222,36 +253,6 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
222 return; 253 return;
223} 254}
224 255
225/* add this memory to iomem resource */
226static struct resource *register_memory_resource(u64 start, u64 size)
227{
228 struct resource *res;
229 res = kzalloc(sizeof(struct resource), GFP_KERNEL);
230 BUG_ON(!res);
231
232 res->name = "System RAM";
233 res->start = start;
234 res->end = start + size - 1;
235 res->flags = IORESOURCE_MEM;
236 if (request_resource(&iomem_resource, res) < 0) {
237 printk("System RAM resource %llx - %llx cannot be added\n",
238 (unsigned long long)res->start, (unsigned long long)res->end);
239 kfree(res);
240 res = NULL;
241 }
242 return res;
243}
244
245static void release_memory_resource(struct resource *res)
246{
247 if (!res)
248 return;
249 release_resource(res);
250 kfree(res);
251 return;
252}
253
254
255 256
256int add_memory(int nid, u64 start, u64 size) 257int add_memory(int nid, u64 start, u64 size)
257{ 258{
@@ -283,6 +284,8 @@ int add_memory(int nid, u64 start, u64 size)
283 /* we online node here. we can't roll back from here. */ 284 /* we online node here. we can't roll back from here. */
284 node_set_online(nid); 285 node_set_online(nid);
285 286
287 cpuset_track_online_nodes();
288
286 if (new_pgdat) { 289 if (new_pgdat) {
287 ret = register_one_node(nid); 290 ret = register_one_node(nid);
288 /* 291 /*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cf18f0942553..25788b1b7fcf 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1324,12 +1324,11 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
1324 atomic_set(&new->refcnt, 1); 1324 atomic_set(&new->refcnt, 1);
1325 if (new->policy == MPOL_BIND) { 1325 if (new->policy == MPOL_BIND) {
1326 int sz = ksize(old->v.zonelist); 1326 int sz = ksize(old->v.zonelist);
1327 new->v.zonelist = kmalloc(sz, SLAB_KERNEL); 1327 new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL);
1328 if (!new->v.zonelist) { 1328 if (!new->v.zonelist) {
1329 kmem_cache_free(policy_cache, new); 1329 kmem_cache_free(policy_cache, new);
1330 return ERR_PTR(-ENOMEM); 1330 return ERR_PTR(-ENOMEM);
1331 } 1331 }
1332 memcpy(new->v.zonelist, old->v.zonelist, sz);
1333 } 1332 }
1334 return new; 1333 return new;
1335} 1334}
diff --git a/mm/migrate.c b/mm/migrate.c
index 20a8c2687b1e..ba2453f9483d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -409,6 +409,7 @@ int migrate_page(struct address_space *mapping,
409} 409}
410EXPORT_SYMBOL(migrate_page); 410EXPORT_SYMBOL(migrate_page);
411 411
412#ifdef CONFIG_BLOCK
412/* 413/*
413 * Migration function for pages with buffers. This function can only be used 414 * Migration function for pages with buffers. This function can only be used
414 * if the underlying filesystem guarantees that no other references to "page" 415 * if the underlying filesystem guarantees that no other references to "page"
@@ -466,6 +467,7 @@ int buffer_migrate_page(struct address_space *mapping,
466 return 0; 467 return 0;
467} 468}
468EXPORT_SYMBOL(buffer_migrate_page); 469EXPORT_SYMBOL(buffer_migrate_page);
470#endif
469 471
470/* 472/*
471 * Writeback a page to clean the dirty state 473 * Writeback a page to clean the dirty state
@@ -525,7 +527,7 @@ static int fallback_migrate_page(struct address_space *mapping,
525 * Buffers may be managed in a filesystem specific way. 527 * Buffers may be managed in a filesystem specific way.
526 * We must have no buffers or drop them. 528 * We must have no buffers or drop them.
527 */ 529 */
528 if (page_has_buffers(page) && 530 if (PagePrivate(page) &&
529 !try_to_release_page(page, GFP_KERNEL)) 531 !try_to_release_page(page, GFP_KERNEL))
530 return -EAGAIN; 532 return -EAGAIN;
531 533
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 955f9d0e38aa..3b8f3c0c63f3 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -34,6 +34,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
34 spinlock_t *ptl; 34 spinlock_t *ptl;
35 35
36 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 36 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
37 arch_enter_lazy_mmu_mode();
37 do { 38 do {
38 oldpte = *pte; 39 oldpte = *pte;
39 if (pte_present(oldpte)) { 40 if (pte_present(oldpte)) {
@@ -70,6 +71,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
70 } 71 }
71 72
72 } while (pte++, addr += PAGE_SIZE, addr != end); 73 } while (pte++, addr += PAGE_SIZE, addr != end);
74 arch_leave_lazy_mmu_mode();
73 pte_unmap_unlock(pte - 1, ptl); 75 pte_unmap_unlock(pte - 1, ptl);
74} 76}
75 77
diff --git a/mm/mremap.c b/mm/mremap.c
index 7c15cf3373ad..9c769fa29f32 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -98,6 +98,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
98 new_ptl = pte_lockptr(mm, new_pmd); 98 new_ptl = pte_lockptr(mm, new_pmd);
99 if (new_ptl != old_ptl) 99 if (new_ptl != old_ptl)
100 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 100 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
101 arch_enter_lazy_mmu_mode();
101 102
102 for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, 103 for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
103 new_pte++, new_addr += PAGE_SIZE) { 104 new_pte++, new_addr += PAGE_SIZE) {
@@ -109,6 +110,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
109 set_pte_at(mm, new_addr, new_pte, pte); 110 set_pte_at(mm, new_addr, new_pte, pte);
110 } 111 }
111 112
113 arch_leave_lazy_mmu_mode();
112 if (new_ptl != old_ptl) 114 if (new_ptl != old_ptl)
113 spin_unlock(new_ptl); 115 spin_unlock(new_ptl);
114 pte_unmap_nested(new_pte - 1); 116 pte_unmap_nested(new_pte - 1);
diff --git a/mm/nommu.c b/mm/nommu.c
index 564540662192..365019599df8 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -948,7 +948,8 @@ unsigned long do_mmap_pgoff(struct file *file,
948 up_write(&nommu_vma_sem); 948 up_write(&nommu_vma_sem);
949 kfree(vml); 949 kfree(vml);
950 if (vma) { 950 if (vma) {
951 fput(vma->vm_file); 951 if (vma->vm_file)
952 fput(vma->vm_file);
952 kfree(vma); 953 kfree(vma);
953 } 954 }
954 return ret; 955 return ret;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index bada3d03119f..20f41b082e16 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -204,16 +204,30 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
204 do_posix_clock_monotonic_gettime(&uptime); 204 do_posix_clock_monotonic_gettime(&uptime);
205 do_each_thread(g, p) { 205 do_each_thread(g, p) {
206 unsigned long points; 206 unsigned long points;
207 int releasing;
208 207
209 /* skip kernel threads */ 208 /*
209 * skip kernel threads and tasks which have already released
210 * their mm.
211 */
210 if (!p->mm) 212 if (!p->mm)
211 continue; 213 continue;
212 /* skip the init task with pid == 1 */ 214 /* skip the init task */
213 if (p->pid == 1) 215 if (is_init(p))
214 continue; 216 continue;
215 217
216 /* 218 /*
219 * This task already has access to memory reserves and is
220 * being killed. Don't allow any other task access to the
221 * memory reserve.
222 *
223 * Note: this may have a chance of deadlock if it gets
224 * blocked waiting for another task which itself is waiting
225 * for memory. Is there a better alternative?
226 */
227 if (test_tsk_thread_flag(p, TIF_MEMDIE))
228 return ERR_PTR(-1UL);
229
230 /*
217 * This is in the process of releasing memory so wait for it 231 * This is in the process of releasing memory so wait for it
218 * to finish before killing some other task by mistake. 232 * to finish before killing some other task by mistake.
219 * 233 *
@@ -221,21 +235,16 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
221 * go ahead if it is exiting: this will simply set TIF_MEMDIE, 235 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
222 * which will allow it to gain access to memory reserves in 236 * which will allow it to gain access to memory reserves in
223 * the process of exiting and releasing its resources. 237 * the process of exiting and releasing its resources.
224 * Otherwise we could get an OOM deadlock. 238 * Otherwise we could get an easy OOM deadlock.
225 */ 239 */
226 releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || 240 if (p->flags & PF_EXITING) {
227 p->flags & PF_EXITING; 241 if (p != current)
228 if (releasing) { 242 return ERR_PTR(-1UL);
229 /* PF_DEAD tasks have already released their mm */ 243
230 if (p->flags & PF_DEAD) 244 chosen = p;
231 continue; 245 *ppoints = ULONG_MAX;
232 if (p->flags & PF_EXITING && p == current) {
233 chosen = p;
234 *ppoints = ULONG_MAX;
235 break;
236 }
237 return ERR_PTR(-1UL);
238 } 246 }
247
239 if (p->oomkilladj == OOM_DISABLE) 248 if (p->oomkilladj == OOM_DISABLE)
240 continue; 249 continue;
241 250
@@ -245,6 +254,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
245 *ppoints = points; 254 *ppoints = points;
246 } 255 }
247 } while_each_thread(g, p); 256 } while_each_thread(g, p);
257
248 return chosen; 258 return chosen;
249} 259}
250 260
@@ -255,20 +265,17 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
255 */ 265 */
256static void __oom_kill_task(struct task_struct *p, const char *message) 266static void __oom_kill_task(struct task_struct *p, const char *message)
257{ 267{
258 if (p->pid == 1) { 268 if (is_init(p)) {
259 WARN_ON(1); 269 WARN_ON(1);
260 printk(KERN_WARNING "tried to kill init!\n"); 270 printk(KERN_WARNING "tried to kill init!\n");
261 return; 271 return;
262 } 272 }
263 273
264 task_lock(p); 274 if (!p->mm) {
265 if (!p->mm || p->mm == &init_mm) {
266 WARN_ON(1); 275 WARN_ON(1);
267 printk(KERN_WARNING "tried to kill an mm-less task!\n"); 276 printk(KERN_WARNING "tried to kill an mm-less task!\n");
268 task_unlock(p);
269 return; 277 return;
270 } 278 }
271 task_unlock(p);
272 279
273 if (message) { 280 if (message) {
274 printk(KERN_ERR "%s: Killed process %d (%s).\n", 281 printk(KERN_ERR "%s: Killed process %d (%s).\n",
@@ -302,7 +309,7 @@ static int oom_kill_task(struct task_struct *p, const char *message)
302 * However, this is of no concern to us. 309 * However, this is of no concern to us.
303 */ 310 */
304 311
305 if (mm == NULL || mm == &init_mm) 312 if (mm == NULL)
306 return 1; 313 return 1;
307 314
308 __oom_kill_task(p, message); 315 __oom_kill_task(p, message);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 555752907dc3..c0d4ce144dec 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -30,6 +30,8 @@
30#include <linux/sysctl.h> 30#include <linux/sysctl.h>
31#include <linux/cpu.h> 31#include <linux/cpu.h>
32#include <linux/syscalls.h> 32#include <linux/syscalls.h>
33#include <linux/buffer_head.h>
34#include <linux/pagevec.h>
33 35
34/* 36/*
35 * The maximum number of pages to writeout in a single bdflush/kupdate 37 * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -46,7 +48,6 @@
46 */ 48 */
47static long ratelimit_pages = 32; 49static long ratelimit_pages = 32;
48 50
49static long total_pages; /* The total number of pages in the machine. */
50static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ 51static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
51 52
52/* 53/*
@@ -126,7 +127,7 @@ get_dirty_limits(long *pbackground, long *pdirty,
126 int unmapped_ratio; 127 int unmapped_ratio;
127 long background; 128 long background;
128 long dirty; 129 long dirty;
129 unsigned long available_memory = total_pages; 130 unsigned long available_memory = vm_total_pages;
130 struct task_struct *tsk; 131 struct task_struct *tsk;
131 132
132#ifdef CONFIG_HIGHMEM 133#ifdef CONFIG_HIGHMEM
@@ -141,7 +142,7 @@ get_dirty_limits(long *pbackground, long *pdirty,
141 142
142 unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) + 143 unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
143 global_page_state(NR_ANON_PAGES)) * 100) / 144 global_page_state(NR_ANON_PAGES)) * 100) /
144 total_pages; 145 vm_total_pages;
145 146
146 dirty_ratio = vm_dirty_ratio; 147 dirty_ratio = vm_dirty_ratio;
147 if (dirty_ratio > unmapped_ratio / 2) 148 if (dirty_ratio > unmapped_ratio / 2)
@@ -502,9 +503,9 @@ void laptop_sync_completion(void)
502 * will write six megabyte chunks, max. 503 * will write six megabyte chunks, max.
503 */ 504 */
504 505
505static void set_ratelimit(void) 506void writeback_set_ratelimit(void)
506{ 507{
507 ratelimit_pages = total_pages / (num_online_cpus() * 32); 508 ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
508 if (ratelimit_pages < 16) 509 if (ratelimit_pages < 16)
509 ratelimit_pages = 16; 510 ratelimit_pages = 16;
510 if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) 511 if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
@@ -514,7 +515,7 @@ static void set_ratelimit(void)
514static int __cpuinit 515static int __cpuinit
515ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) 516ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
516{ 517{
517 set_ratelimit(); 518 writeback_set_ratelimit();
518 return 0; 519 return 0;
519} 520}
520 521
@@ -533,9 +534,7 @@ void __init page_writeback_init(void)
533 long buffer_pages = nr_free_buffer_pages(); 534 long buffer_pages = nr_free_buffer_pages();
534 long correction; 535 long correction;
535 536
536 total_pages = nr_free_pagecache_pages(); 537 correction = (100 * 4 * buffer_pages) / vm_total_pages;
537
538 correction = (100 * 4 * buffer_pages) / total_pages;
539 538
540 if (correction < 100) { 539 if (correction < 100) {
541 dirty_background_ratio *= correction; 540 dirty_background_ratio *= correction;
@@ -549,10 +548,143 @@ void __init page_writeback_init(void)
549 vm_dirty_ratio = 1; 548 vm_dirty_ratio = 1;
550 } 549 }
551 mod_timer(&wb_timer, jiffies + dirty_writeback_interval); 550 mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
552 set_ratelimit(); 551 writeback_set_ratelimit();
553 register_cpu_notifier(&ratelimit_nb); 552 register_cpu_notifier(&ratelimit_nb);
554} 553}
555 554
555/**
556 * generic_writepages - walk the list of dirty pages of the given
557 * address space and writepage() all of them.
558 *
559 * @mapping: address space structure to write
560 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
561 *
562 * This is a library function, which implements the writepages()
563 * address_space_operation.
564 *
565 * If a page is already under I/O, generic_writepages() skips it, even
566 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
567 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
568 * and msync() need to guarantee that all the data which was dirty at the time
569 * the call was made get new I/O started against them. If wbc->sync_mode is
570 * WB_SYNC_ALL then we were called for data integrity and we must wait for
571 * existing IO to complete.
572 *
573 * Derived from mpage_writepages() - if you fix this you should check that
574 * also!
575 */
576int generic_writepages(struct address_space *mapping,
577 struct writeback_control *wbc)
578{
579 struct backing_dev_info *bdi = mapping->backing_dev_info;
580 int ret = 0;
581 int done = 0;
582 int (*writepage)(struct page *page, struct writeback_control *wbc);
583 struct pagevec pvec;
584 int nr_pages;
585 pgoff_t index;
586 pgoff_t end; /* Inclusive */
587 int scanned = 0;
588 int range_whole = 0;
589
590 if (wbc->nonblocking && bdi_write_congested(bdi)) {
591 wbc->encountered_congestion = 1;
592 return 0;
593 }
594
595 writepage = mapping->a_ops->writepage;
596
597 /* deal with chardevs and other special file */
598 if (!writepage)
599 return 0;
600
601 pagevec_init(&pvec, 0);
602 if (wbc->range_cyclic) {
603 index = mapping->writeback_index; /* Start from prev offset */
604 end = -1;
605 } else {
606 index = wbc->range_start >> PAGE_CACHE_SHIFT;
607 end = wbc->range_end >> PAGE_CACHE_SHIFT;
608 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
609 range_whole = 1;
610 scanned = 1;
611 }
612retry:
613 while (!done && (index <= end) &&
614 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
615 PAGECACHE_TAG_DIRTY,
616 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
617 unsigned i;
618
619 scanned = 1;
620 for (i = 0; i < nr_pages; i++) {
621 struct page *page = pvec.pages[i];
622
623 /*
624 * At this point we hold neither mapping->tree_lock nor
625 * lock on the page itself: the page may be truncated or
626 * invalidated (changing page->mapping to NULL), or even
627 * swizzled back from swapper_space to tmpfs file
628 * mapping
629 */
630 lock_page(page);
631
632 if (unlikely(page->mapping != mapping)) {
633 unlock_page(page);
634 continue;
635 }
636
637 if (!wbc->range_cyclic && page->index > end) {
638 done = 1;
639 unlock_page(page);
640 continue;
641 }
642
643 if (wbc->sync_mode != WB_SYNC_NONE)
644 wait_on_page_writeback(page);
645
646 if (PageWriteback(page) ||
647 !clear_page_dirty_for_io(page)) {
648 unlock_page(page);
649 continue;
650 }
651
652 ret = (*writepage)(page, wbc);
653 if (ret) {
654 if (ret == -ENOSPC)
655 set_bit(AS_ENOSPC, &mapping->flags);
656 else
657 set_bit(AS_EIO, &mapping->flags);
658 }
659
660 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
661 unlock_page(page);
662 if (ret || (--(wbc->nr_to_write) <= 0))
663 done = 1;
664 if (wbc->nonblocking && bdi_write_congested(bdi)) {
665 wbc->encountered_congestion = 1;
666 done = 1;
667 }
668 }
669 pagevec_release(&pvec);
670 cond_resched();
671 }
672 if (!scanned && !done) {
673 /*
674 * We hit the last page and there is more work to be done: wrap
675 * back to the start of the file
676 */
677 scanned = 1;
678 index = 0;
679 goto retry;
680 }
681 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
682 mapping->writeback_index = index;
683 return ret;
684}
685
686EXPORT_SYMBOL(generic_writepages);
687
556int do_writepages(struct address_space *mapping, struct writeback_control *wbc) 688int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
557{ 689{
558 int ret; 690 int ret;
@@ -675,9 +807,11 @@ int fastcall set_page_dirty(struct page *page)
675 807
676 if (likely(mapping)) { 808 if (likely(mapping)) {
677 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; 809 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
678 if (spd) 810#ifdef CONFIG_BLOCK
679 return (*spd)(page); 811 if (!spd)
680 return __set_page_dirty_buffers(page); 812 spd = __set_page_dirty_buffers;
813#endif
814 return (*spd)(page);
681 } 815 }
682 if (!PageDirty(page)) { 816 if (!PageDirty(page)) {
683 if (!TestSetPageDirty(page)) 817 if (!TestSetPageDirty(page))
diff --git a/mm/shmem.c b/mm/shmem.c
index eda907c3a86a..bb8ca7ef7094 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -26,6 +26,8 @@
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/xattr.h>
30#include <linux/generic_acl.h>
29#include <linux/mm.h> 31#include <linux/mm.h>
30#include <linux/mman.h> 32#include <linux/mman.h>
31#include <linux/file.h> 33#include <linux/file.h>
@@ -177,6 +179,7 @@ static const struct address_space_operations shmem_aops;
177static struct file_operations shmem_file_operations; 179static struct file_operations shmem_file_operations;
178static struct inode_operations shmem_inode_operations; 180static struct inode_operations shmem_inode_operations;
179static struct inode_operations shmem_dir_inode_operations; 181static struct inode_operations shmem_dir_inode_operations;
182static struct inode_operations shmem_special_inode_operations;
180static struct vm_operations_struct shmem_vm_ops; 183static struct vm_operations_struct shmem_vm_ops;
181 184
182static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 185static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
@@ -637,7 +640,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
637 struct page *page = NULL; 640 struct page *page = NULL;
638 int error; 641 int error;
639 642
640 if (attr->ia_valid & ATTR_SIZE) { 643 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
641 if (attr->ia_size < inode->i_size) { 644 if (attr->ia_size < inode->i_size) {
642 /* 645 /*
643 * If truncating down to a partial page, then 646 * If truncating down to a partial page, then
@@ -670,6 +673,10 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
670 error = inode_change_ok(inode, attr); 673 error = inode_change_ok(inode, attr);
671 if (!error) 674 if (!error)
672 error = inode_setattr(inode, attr); 675 error = inode_setattr(inode, attr);
676#ifdef CONFIG_TMPFS_POSIX_ACL
677 if (!error && (attr->ia_valid & ATTR_MODE))
678 error = generic_acl_chmod(inode, &shmem_acl_ops);
679#endif
673 if (page) 680 if (page)
674 page_cache_release(page); 681 page_cache_release(page);
675 return error; 682 return error;
@@ -1362,6 +1369,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1362 1369
1363 switch (mode & S_IFMT) { 1370 switch (mode & S_IFMT) {
1364 default: 1371 default:
1372 inode->i_op = &shmem_special_inode_operations;
1365 init_special_inode(inode, mode, dev); 1373 init_special_inode(inode, mode, dev);
1366 break; 1374 break;
1367 case S_IFREG: 1375 case S_IFREG:
@@ -1371,7 +1379,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1371 &sbinfo->policy_nodes); 1379 &sbinfo->policy_nodes);
1372 break; 1380 break;
1373 case S_IFDIR: 1381 case S_IFDIR:
1374 inode->i_nlink++; 1382 inc_nlink(inode);
1375 /* Some things misbehave if size == 0 on a directory */ 1383 /* Some things misbehave if size == 0 on a directory */
1376 inode->i_size = 2 * BOGO_DIRENT_SIZE; 1384 inode->i_size = 2 * BOGO_DIRENT_SIZE;
1377 inode->i_op = &shmem_dir_inode_operations; 1385 inode->i_op = &shmem_dir_inode_operations;
@@ -1682,7 +1690,11 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1682 iput(inode); 1690 iput(inode);
1683 return error; 1691 return error;
1684 } 1692 }
1685 error = 0; 1693 }
1694 error = shmem_acl_init(inode, dir);
1695 if (error) {
1696 iput(inode);
1697 return error;
1686 } 1698 }
1687 if (dir->i_mode & S_ISGID) { 1699 if (dir->i_mode & S_ISGID) {
1688 inode->i_gid = dir->i_gid; 1700 inode->i_gid = dir->i_gid;
@@ -1703,7 +1715,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1703 1715
1704 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) 1716 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
1705 return error; 1717 return error;
1706 dir->i_nlink++; 1718 inc_nlink(dir);
1707 return 0; 1719 return 0;
1708} 1720}
1709 1721
@@ -1738,7 +1750,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
1738 1750
1739 dir->i_size += BOGO_DIRENT_SIZE; 1751 dir->i_size += BOGO_DIRENT_SIZE;
1740 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1752 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1741 inode->i_nlink++; 1753 inc_nlink(inode);
1742 atomic_inc(&inode->i_count); /* New dentry reference */ 1754 atomic_inc(&inode->i_count); /* New dentry reference */
1743 dget(dentry); /* Extra pinning count for the created dentry */ 1755 dget(dentry); /* Extra pinning count for the created dentry */
1744 d_instantiate(dentry, inode); 1756 d_instantiate(dentry, inode);
@@ -1760,7 +1772,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
1760 1772
1761 dir->i_size -= BOGO_DIRENT_SIZE; 1773 dir->i_size -= BOGO_DIRENT_SIZE;
1762 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1774 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1763 inode->i_nlink--; 1775 drop_nlink(inode);
1764 dput(dentry); /* Undo the count from "create" - this does all the work */ 1776 dput(dentry); /* Undo the count from "create" - this does all the work */
1765 return 0; 1777 return 0;
1766} 1778}
@@ -1770,8 +1782,8 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
1770 if (!simple_empty(dentry)) 1782 if (!simple_empty(dentry))
1771 return -ENOTEMPTY; 1783 return -ENOTEMPTY;
1772 1784
1773 dentry->d_inode->i_nlink--; 1785 drop_nlink(dentry->d_inode);
1774 dir->i_nlink--; 1786 drop_nlink(dir);
1775 return shmem_unlink(dir, dentry); 1787 return shmem_unlink(dir, dentry);
1776} 1788}
1777 1789
@@ -1792,10 +1804,10 @@ static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct
1792 if (new_dentry->d_inode) { 1804 if (new_dentry->d_inode) {
1793 (void) shmem_unlink(new_dir, new_dentry); 1805 (void) shmem_unlink(new_dir, new_dentry);
1794 if (they_are_dirs) 1806 if (they_are_dirs)
1795 old_dir->i_nlink--; 1807 drop_nlink(old_dir);
1796 } else if (they_are_dirs) { 1808 } else if (they_are_dirs) {
1797 old_dir->i_nlink--; 1809 drop_nlink(old_dir);
1798 new_dir->i_nlink++; 1810 inc_nlink(new_dir);
1799 } 1811 }
1800 1812
1801 old_dir->i_size -= BOGO_DIRENT_SIZE; 1813 old_dir->i_size -= BOGO_DIRENT_SIZE;
@@ -1897,6 +1909,53 @@ static struct inode_operations shmem_symlink_inode_operations = {
1897 .put_link = shmem_put_link, 1909 .put_link = shmem_put_link,
1898}; 1910};
1899 1911
1912#ifdef CONFIG_TMPFS_POSIX_ACL
1913/**
1914 * Superblocks without xattr inode operations will get security.* xattr
1915 * support from the VFS "for free". As soon as we have any other xattrs
1916 * like ACLs, we also need to implement the security.* handlers at
1917 * filesystem level, though.
1918 */
1919
1920static size_t shmem_xattr_security_list(struct inode *inode, char *list,
1921 size_t list_len, const char *name,
1922 size_t name_len)
1923{
1924 return security_inode_listsecurity(inode, list, list_len);
1925}
1926
1927static int shmem_xattr_security_get(struct inode *inode, const char *name,
1928 void *buffer, size_t size)
1929{
1930 if (strcmp(name, "") == 0)
1931 return -EINVAL;
1932 return security_inode_getsecurity(inode, name, buffer, size,
1933 -EOPNOTSUPP);
1934}
1935
1936static int shmem_xattr_security_set(struct inode *inode, const char *name,
1937 const void *value, size_t size, int flags)
1938{
1939 if (strcmp(name, "") == 0)
1940 return -EINVAL;
1941 return security_inode_setsecurity(inode, name, value, size, flags);
1942}
1943
1944struct xattr_handler shmem_xattr_security_handler = {
1945 .prefix = XATTR_SECURITY_PREFIX,
1946 .list = shmem_xattr_security_list,
1947 .get = shmem_xattr_security_get,
1948 .set = shmem_xattr_security_set,
1949};
1950
1951static struct xattr_handler *shmem_xattr_handlers[] = {
1952 &shmem_xattr_acl_access_handler,
1953 &shmem_xattr_acl_default_handler,
1954 &shmem_xattr_security_handler,
1955 NULL
1956};
1957#endif
1958
1900static int shmem_parse_options(char *options, int *mode, uid_t *uid, 1959static int shmem_parse_options(char *options, int *mode, uid_t *uid,
1901 gid_t *gid, unsigned long *blocks, unsigned long *inodes, 1960 gid_t *gid, unsigned long *blocks, unsigned long *inodes,
1902 int *policy, nodemask_t *policy_nodes) 1961 int *policy, nodemask_t *policy_nodes)
@@ -2094,6 +2153,10 @@ static int shmem_fill_super(struct super_block *sb,
2094 sb->s_magic = TMPFS_MAGIC; 2153 sb->s_magic = TMPFS_MAGIC;
2095 sb->s_op = &shmem_ops; 2154 sb->s_op = &shmem_ops;
2096 sb->s_time_gran = 1; 2155 sb->s_time_gran = 1;
2156#ifdef CONFIG_TMPFS_POSIX_ACL
2157 sb->s_xattr = shmem_xattr_handlers;
2158 sb->s_flags |= MS_POSIXACL;
2159#endif
2097 2160
2098 inode = shmem_get_inode(sb, S_IFDIR | mode, 0); 2161 inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
2099 if (!inode) 2162 if (!inode)
@@ -2130,6 +2193,7 @@ static void shmem_destroy_inode(struct inode *inode)
2130 /* only struct inode is valid if it's an inline symlink */ 2193 /* only struct inode is valid if it's an inline symlink */
2131 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2194 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2132 } 2195 }
2196 shmem_acl_destroy_inode(inode);
2133 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2197 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2134} 2198}
2135 2199
@@ -2141,6 +2205,10 @@ static void init_once(void *foo, struct kmem_cache *cachep,
2141 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 2205 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2142 SLAB_CTOR_CONSTRUCTOR) { 2206 SLAB_CTOR_CONSTRUCTOR) {
2143 inode_init_once(&p->vfs_inode); 2207 inode_init_once(&p->vfs_inode);
2208#ifdef CONFIG_TMPFS_POSIX_ACL
2209 p->i_acl = NULL;
2210 p->i_default_acl = NULL;
2211#endif
2144 } 2212 }
2145} 2213}
2146 2214
@@ -2184,6 +2252,14 @@ static struct inode_operations shmem_inode_operations = {
2184 .truncate = shmem_truncate, 2252 .truncate = shmem_truncate,
2185 .setattr = shmem_notify_change, 2253 .setattr = shmem_notify_change,
2186 .truncate_range = shmem_truncate_range, 2254 .truncate_range = shmem_truncate_range,
2255#ifdef CONFIG_TMPFS_POSIX_ACL
2256 .setxattr = generic_setxattr,
2257 .getxattr = generic_getxattr,
2258 .listxattr = generic_listxattr,
2259 .removexattr = generic_removexattr,
2260 .permission = shmem_permission,
2261#endif
2262
2187}; 2263};
2188 2264
2189static struct inode_operations shmem_dir_inode_operations = { 2265static struct inode_operations shmem_dir_inode_operations = {
@@ -2198,6 +2274,25 @@ static struct inode_operations shmem_dir_inode_operations = {
2198 .mknod = shmem_mknod, 2274 .mknod = shmem_mknod,
2199 .rename = shmem_rename, 2275 .rename = shmem_rename,
2200#endif 2276#endif
2277#ifdef CONFIG_TMPFS_POSIX_ACL
2278 .setattr = shmem_notify_change,
2279 .setxattr = generic_setxattr,
2280 .getxattr = generic_getxattr,
2281 .listxattr = generic_listxattr,
2282 .removexattr = generic_removexattr,
2283 .permission = shmem_permission,
2284#endif
2285};
2286
2287static struct inode_operations shmem_special_inode_operations = {
2288#ifdef CONFIG_TMPFS_POSIX_ACL
2289 .setattr = shmem_notify_change,
2290 .setxattr = generic_setxattr,
2291 .getxattr = generic_getxattr,
2292 .listxattr = generic_listxattr,
2293 .removexattr = generic_removexattr,
2294 .permission = shmem_permission,
2295#endif
2201}; 2296};
2202 2297
2203static struct super_operations shmem_ops = { 2298static struct super_operations shmem_ops = {
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
new file mode 100644
index 000000000000..c946bf468718
--- /dev/null
+++ b/mm/shmem_acl.c
@@ -0,0 +1,197 @@
1/*
2 * mm/shmem_acl.c
3 *
4 * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
5 *
6 * This file is released under the GPL.
7 */
8
9#include <linux/fs.h>
10#include <linux/shmem_fs.h>
11#include <linux/xattr.h>
12#include <linux/generic_acl.h>
13
14/**
15 * shmem_get_acl - generic_acl_operations->getacl() operation
16 */
17static struct posix_acl *
18shmem_get_acl(struct inode *inode, int type)
19{
20 struct posix_acl *acl = NULL;
21
22 spin_lock(&inode->i_lock);
23 switch(type) {
24 case ACL_TYPE_ACCESS:
25 acl = posix_acl_dup(SHMEM_I(inode)->i_acl);
26 break;
27
28 case ACL_TYPE_DEFAULT:
29 acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl);
30 break;
31 }
32 spin_unlock(&inode->i_lock);
33
34 return acl;
35}
36
37/**
38 * shmem_set_acl - generic_acl_operations->setacl() operation
39 */
40static void
41shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
42{
43 struct posix_acl *free = NULL;
44
45 spin_lock(&inode->i_lock);
46 switch(type) {
47 case ACL_TYPE_ACCESS:
48 free = SHMEM_I(inode)->i_acl;
49 SHMEM_I(inode)->i_acl = posix_acl_dup(acl);
50 break;
51
52 case ACL_TYPE_DEFAULT:
53 free = SHMEM_I(inode)->i_default_acl;
54 SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl);
55 break;
56 }
57 spin_unlock(&inode->i_lock);
58 posix_acl_release(free);
59}
60
61struct generic_acl_operations shmem_acl_ops = {
62 .getacl = shmem_get_acl,
63 .setacl = shmem_set_acl,
64};
65
66/**
67 * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access,
68 * shmem_xattr_acl_access_handler - plumbing code to implement the
69 * system.posix_acl_access xattr using the generic acl functions.
70 */
71
72static size_t
73shmem_list_acl_access(struct inode *inode, char *list, size_t list_size,
74 const char *name, size_t name_len)
75{
76 return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS,
77 list, list_size);
78}
79
80static int
81shmem_get_acl_access(struct inode *inode, const char *name, void *buffer,
82 size_t size)
83{
84 if (strcmp(name, "") != 0)
85 return -EINVAL;
86 return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer,
87 size);
88}
89
90static int
91shmem_set_acl_access(struct inode *inode, const char *name, const void *value,
92 size_t size, int flags)
93{
94 if (strcmp(name, "") != 0)
95 return -EINVAL;
96 return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value,
97 size);
98}
99
100struct xattr_handler shmem_xattr_acl_access_handler = {
101 .prefix = POSIX_ACL_XATTR_ACCESS,
102 .list = shmem_list_acl_access,
103 .get = shmem_get_acl_access,
104 .set = shmem_set_acl_access,
105};
106
107/**
108 * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default,
109 * shmem_xattr_acl_default_handler - plumbing code to implement the
110 * system.posix_acl_default xattr using the generic acl functions.
111 */
112
113static size_t
114shmem_list_acl_default(struct inode *inode, char *list, size_t list_size,
115 const char *name, size_t name_len)
116{
117 return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT,
118 list, list_size);
119}
120
121static int
122shmem_get_acl_default(struct inode *inode, const char *name, void *buffer,
123 size_t size)
124{
125 if (strcmp(name, "") != 0)
126 return -EINVAL;
127 return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer,
128 size);
129}
130
131static int
132shmem_set_acl_default(struct inode *inode, const char *name, const void *value,
133 size_t size, int flags)
134{
135 if (strcmp(name, "") != 0)
136 return -EINVAL;
137 return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value,
138 size);
139}
140
141struct xattr_handler shmem_xattr_acl_default_handler = {
142 .prefix = POSIX_ACL_XATTR_DEFAULT,
143 .list = shmem_list_acl_default,
144 .get = shmem_get_acl_default,
145 .set = shmem_set_acl_default,
146};
147
148/**
149 * shmem_acl_init - Inizialize the acl(s) of a new inode
150 */
151int
152shmem_acl_init(struct inode *inode, struct inode *dir)
153{
154 return generic_acl_init(inode, dir, &shmem_acl_ops);
155}
156
157/**
158 * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode
159 *
160 * This is done before destroying the actual inode.
161 */
162
163void
164shmem_acl_destroy_inode(struct inode *inode)
165{
166 if (SHMEM_I(inode)->i_acl)
167 posix_acl_release(SHMEM_I(inode)->i_acl);
168 SHMEM_I(inode)->i_acl = NULL;
169 if (SHMEM_I(inode)->i_default_acl)
170 posix_acl_release(SHMEM_I(inode)->i_default_acl);
171 SHMEM_I(inode)->i_default_acl = NULL;
172}
173
174/**
175 * shmem_check_acl - check_acl() callback for generic_permission()
176 */
177static int
178shmem_check_acl(struct inode *inode, int mask)
179{
180 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
181
182 if (acl) {
183 int error = posix_acl_permission(inode, acl, mask);
184 posix_acl_release(acl);
185 return error;
186 }
187 return -EAGAIN;
188}
189
190/**
191 * shmem_permission - permission() inode operation
192 */
193int
194shmem_permission(struct inode *inode, int mask, struct nameidata *nd)
195{
196 return generic_permission(inode, mask, shmem_check_acl);
197}
diff --git a/mm/slab.c b/mm/slab.c
index 792bfe320a8b..3dbd6f4e7477 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1683,10 +1683,32 @@ static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1683static void dump_line(char *data, int offset, int limit) 1683static void dump_line(char *data, int offset, int limit)
1684{ 1684{
1685 int i; 1685 int i;
1686 unsigned char error = 0;
1687 int bad_count = 0;
1688
1686 printk(KERN_ERR "%03x:", offset); 1689 printk(KERN_ERR "%03x:", offset);
1687 for (i = 0; i < limit; i++) 1690 for (i = 0; i < limit; i++) {
1691 if (data[offset + i] != POISON_FREE) {
1692 error = data[offset + i];
1693 bad_count++;
1694 }
1688 printk(" %02x", (unsigned char)data[offset + i]); 1695 printk(" %02x", (unsigned char)data[offset + i]);
1696 }
1689 printk("\n"); 1697 printk("\n");
1698
1699 if (bad_count == 1) {
1700 error ^= POISON_FREE;
1701 if (!(error & (error - 1))) {
1702 printk(KERN_ERR "Single bit error detected. Probably "
1703 "bad RAM.\n");
1704#ifdef CONFIG_X86
1705 printk(KERN_ERR "Run memtest86+ or a similar memory "
1706 "test tool.\n");
1707#else
1708 printk(KERN_ERR "Run a memory test tool.\n");
1709#endif
1710 }
1711 }
1690} 1712}
1691#endif 1713#endif
1692 1714
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f1f5ec783781..a15def63f28f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1723,13 +1723,14 @@ get_swap_info_struct(unsigned type)
1723 */ 1723 */
1724int valid_swaphandles(swp_entry_t entry, unsigned long *offset) 1724int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
1725{ 1725{
1726 int ret = 0, i = 1 << page_cluster; 1726 int our_page_cluster = page_cluster;
1727 int ret = 0, i = 1 << our_page_cluster;
1727 unsigned long toff; 1728 unsigned long toff;
1728 struct swap_info_struct *swapdev = swp_type(entry) + swap_info; 1729 struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
1729 1730
1730 if (!page_cluster) /* no readahead */ 1731 if (!our_page_cluster) /* no readahead */
1731 return 0; 1732 return 0;
1732 toff = (swp_offset(entry) >> page_cluster) << page_cluster; 1733 toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster;
1733 if (!toff) /* first page is swap header */ 1734 if (!toff) /* first page is swap header */
1734 toff++, i--; 1735 toff++, i--;
1735 *offset = toff; 1736 *offset = toff;
diff --git a/mm/truncate.c b/mm/truncate.c
index a654928323dc..f4edbc179d14 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -17,6 +17,32 @@
17 do_invalidatepage */ 17 do_invalidatepage */
18 18
19 19
20/**
21 * do_invalidatepage - invalidate part or all of a page
22 * @page: the page which is affected
23 * @offset: the index of the truncation point
24 *
25 * do_invalidatepage() is called when all or part of the page has become
26 * invalidated by a truncate operation.
27 *
28 * do_invalidatepage() does not have to release all buffers, but it must
29 * ensure that no dirty buffer is left outside @offset and that no I/O
30 * is underway against any of the blocks which are outside the truncation
31 * point, because the caller is about to free (and possibly reuse) those
32 * blocks on-disk.
33 */
34void do_invalidatepage(struct page *page, unsigned long offset)
35{
36 void (*invalidatepage)(struct page *, unsigned long);
37 invalidatepage = page->mapping->a_ops->invalidatepage;
38#ifdef CONFIG_BLOCK
39 if (!invalidatepage)
40 invalidatepage = block_invalidatepage;
41#endif
42 if (invalidatepage)
43 (*invalidatepage)(page, offset);
44}
45
20static inline void truncate_partial_page(struct page *page, unsigned partial) 46static inline void truncate_partial_page(struct page *page, unsigned partial)
21{ 47{
22 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); 48 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
@@ -261,9 +287,39 @@ unsigned long invalidate_inode_pages(struct address_space *mapping)
261{ 287{
262 return invalidate_mapping_pages(mapping, 0, ~0UL); 288 return invalidate_mapping_pages(mapping, 0, ~0UL);
263} 289}
264
265EXPORT_SYMBOL(invalidate_inode_pages); 290EXPORT_SYMBOL(invalidate_inode_pages);
266 291
292/*
293 * This is like invalidate_complete_page(), except it ignores the page's
294 * refcount. We do this because invalidate_inode_pages2() needs stronger
295 * invalidation guarantees, and cannot afford to leave pages behind because
296 * shrink_list() has a temp ref on them, or because they're transiently sitting
297 * in the lru_cache_add() pagevecs.
298 */
299static int
300invalidate_complete_page2(struct address_space *mapping, struct page *page)
301{
302 if (page->mapping != mapping)
303 return 0;
304
305 if (PagePrivate(page) && !try_to_release_page(page, 0))
306 return 0;
307
308 write_lock_irq(&mapping->tree_lock);
309 if (PageDirty(page))
310 goto failed;
311
312 BUG_ON(PagePrivate(page));
313 __remove_from_page_cache(page);
314 write_unlock_irq(&mapping->tree_lock);
315 ClearPageUptodate(page);
316 page_cache_release(page); /* pagecache ref */
317 return 1;
318failed:
319 write_unlock_irq(&mapping->tree_lock);
320 return 0;
321}
322
267/** 323/**
268 * invalidate_inode_pages2_range - remove range of pages from an address_space 324 * invalidate_inode_pages2_range - remove range of pages from an address_space
269 * @mapping: the address_space 325 * @mapping: the address_space
@@ -330,7 +386,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
330 } 386 }
331 } 387 }
332 was_dirty = test_clear_page_dirty(page); 388 was_dirty = test_clear_page_dirty(page);
333 if (!invalidate_complete_page(mapping, page)) { 389 if (!invalidate_complete_page2(mapping, page)) {
334 if (was_dirty) 390 if (was_dirty)
335 set_page_dirty(page); 391 set_page_dirty(page);
336 ret = -EIO; 392 ret = -EIO;
diff --git a/mm/util.c b/mm/util.c
index 7368479220b3..e14fa84ef39a 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -40,6 +40,24 @@ char *kstrdup(const char *s, gfp_t gfp)
40} 40}
41EXPORT_SYMBOL(kstrdup); 41EXPORT_SYMBOL(kstrdup);
42 42
43/**
44 * kmemdup - duplicate region of memory
45 *
46 * @src: memory region to duplicate
47 * @len: memory region length
48 * @gfp: GFP mask to use
49 */
50void *kmemdup(const void *src, size_t len, gfp_t gfp)
51{
52 void *p;
53
54 p = ____kmalloc(len, gfp);
55 if (p)
56 memcpy(p, src, len);
57 return p;
58}
59EXPORT_SYMBOL(kmemdup);
60
43/* 61/*
44 * strndup_user - duplicate an existing string from user space 62 * strndup_user - duplicate an existing string from user space
45 * 63 *