aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/cachetlb.txt9
-rw-r--r--Documentation/kernel-parameters.txt2
-rw-r--r--Documentation/m68k/kernel-options.txt24
-rw-r--r--arch/alpha/mm/numa.c3
-rw-r--r--arch/alpha/mm/remap.c6
-rw-r--r--arch/arm/kernel/signal.c96
-rw-r--r--arch/arm/kernel/traps.c14
-rw-r--r--arch/arm/mm/consistent.c6
-rw-r--r--arch/arm/mm/fault-armv.c7
-rw-r--r--arch/arm/mm/ioremap.c4
-rw-r--r--arch/arm/mm/mm-armv.c15
-rw-r--r--arch/arm/oprofile/backtrace.c46
-rw-r--r--arch/arm26/mm/memc.c18
-rw-r--r--arch/cris/arch-v32/mm/tlb.c6
-rw-r--r--arch/cris/mm/ioremap.c4
-rw-r--r--arch/frv/mm/dma-alloc.c5
-rw-r--r--arch/frv/mm/pgalloc.c4
-rw-r--r--arch/i386/kernel/vm86.c17
-rw-r--r--arch/i386/mm/discontig.c4
-rw-r--r--arch/i386/mm/init.c62
-rw-r--r--arch/i386/mm/ioremap.c4
-rw-r--r--arch/i386/mm/pgtable.c11
-rw-r--r--arch/i386/oprofile/backtrace.c38
-rw-r--r--arch/ia64/kernel/perfmon.c3
-rw-r--r--arch/ia64/mm/discontig.c7
-rw-r--r--arch/ia64/mm/fault.c34
-rw-r--r--arch/ia64/mm/init.c13
-rw-r--r--arch/ia64/mm/tlb.c2
-rw-r--r--arch/m32r/mm/init.c9
-rw-r--r--arch/m32r/mm/ioremap.c4
-rw-r--r--arch/m68k/Kconfig24
-rw-r--r--arch/m68k/atari/stram.c918
-rw-r--r--arch/m68k/mm/kmap.c2
-rw-r--r--arch/m68k/sun3x/dvma.c2
-rw-r--r--arch/mips/kernel/irixelf.c1
-rw-r--r--arch/mips/mm/ioremap.c4
-rw-r--r--arch/parisc/kernel/cache.c24
-rw-r--r--arch/parisc/kernel/pci-dma.c2
-rw-r--r--arch/parisc/mm/init.c3
-rw-r--r--arch/parisc/mm/ioremap.c6
-rw-r--r--arch/ppc/kernel/dma-mapping.c6
-rw-r--r--arch/ppc/mm/4xx_mmu.c4
-rw-r--r--arch/ppc/mm/pgtable.c4
-rw-r--r--arch/ppc64/kernel/vdso.c12
-rw-r--r--arch/ppc64/mm/imalloc.c5
-rw-r--r--arch/ppc64/mm/init.c87
-rw-r--r--arch/s390/mm/ioremap.c4
-rw-r--r--arch/sh/mm/fault.c40
-rw-r--r--arch/sh/mm/hugetlbpage.c2
-rw-r--r--arch/sh/mm/ioremap.c4
-rw-r--r--arch/sh64/mm/cache.c68
-rw-r--r--arch/sh64/mm/hugetlbpage.c188
-rw-r--r--arch/sh64/mm/ioremap.c4
-rw-r--r--arch/sparc/mm/generic.c7
-rw-r--r--arch/sparc64/kernel/binfmt_aout32.c1
-rw-r--r--arch/sparc64/mm/generic.c9
-rw-r--r--arch/sparc64/mm/tlb.c7
-rw-r--r--arch/um/include/tlb.h1
-rw-r--r--arch/um/kernel/process_kern.c8
-rw-r--r--arch/um/kernel/skas/mmu.c4
-rw-r--r--arch/um/kernel/tt/tlb.c36
-rw-r--r--arch/x86_64/ia32/ia32_aout.c1
-rw-r--r--arch/x86_64/mm/ioremap.c4
-rw-r--r--drivers/acpi/acpi_memhotplug.c5
-rw-r--r--drivers/base/Makefile1
-rw-r--r--drivers/base/init.c2
-rw-r--r--drivers/base/memory.c452
-rw-r--r--drivers/scsi/sg.c12
-rw-r--r--drivers/scsi/st.c10
-rw-r--r--fs/afs/file.c4
-rw-r--r--fs/binfmt_aout.c1
-rw-r--r--fs/binfmt_elf.c1
-rw-r--r--fs/binfmt_elf_fdpic.c7
-rw-r--r--fs/binfmt_flat.c1
-rw-r--r--fs/binfmt_som.c1
-rw-r--r--fs/buffer.c2
-rw-r--r--fs/compat.c1
-rw-r--r--fs/direct-io.c4
-rw-r--r--fs/exec.c17
-rw-r--r--fs/hugetlbfs/inode.c206
-rw-r--r--fs/jfs/jfs_metapage.c12
-rw-r--r--fs/proc/array.c2
-rw-r--r--fs/proc/task_mmu.c51
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c7
-rw-r--r--include/asm-alpha/barrier.h2
-rw-r--r--include/asm-alpha/rwsem.h5
-rw-r--r--include/asm-arm/tlb.h23
-rw-r--r--include/asm-arm26/tlb.h47
-rw-r--r--include/asm-generic/4level-fixup.h11
-rw-r--r--include/asm-generic/pgtable.h2
-rw-r--r--include/asm-generic/tlb.h23
-rw-r--r--include/asm-i386/mmzone.h6
-rw-r--r--include/asm-i386/pgtable.h3
-rw-r--r--include/asm-i386/rwsem.h5
-rw-r--r--include/asm-ia64/rwsem.h5
-rw-r--r--include/asm-ia64/tlb.h19
-rw-r--r--include/asm-m32r/mmzone.h6
-rw-r--r--include/asm-parisc/cacheflush.h35
-rw-r--r--include/asm-parisc/mmzone.h6
-rw-r--r--include/asm-parisc/tlbflush.h3
-rw-r--r--include/asm-ppc/rwsem.h5
-rw-r--r--include/asm-ppc64/mmzone.h3
-rw-r--r--include/asm-ppc64/pgtable.h4
-rw-r--r--include/asm-ppc64/rwsem.h5
-rw-r--r--include/asm-s390/rwsem.h5
-rw-r--r--include/asm-sh/rwsem.h5
-rw-r--r--include/asm-sparc64/rwsem.h5
-rw-r--r--include/asm-sparc64/tlb.h29
-rw-r--r--include/asm-um/pgtable.h2
-rw-r--r--include/asm-x86_64/rwsem.h5
-rw-r--r--include/linux/buffer_head.h6
-rw-r--r--include/linux/hugetlb.h2
-rw-r--r--include/linux/memory.h94
-rw-r--r--include/linux/memory_hotplug.h104
-rw-r--r--include/linux/mempolicy.h7
-rw-r--r--include/linux/mm.h150
-rw-r--r--include/linux/mmzone.h28
-rw-r--r--include/linux/rmap.h4
-rw-r--r--include/linux/rwsem-spinlock.h5
-rw-r--r--include/linux/sched.h65
-rw-r--r--include/linux/vmalloc.h8
-rw-r--r--ipc/shm.c7
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/exit.c5
-rw-r--r--kernel/fork.c31
-rw-r--r--kernel/futex.c6
-rw-r--r--kernel/kexec.c4
-rw-r--r--kernel/power/swsusp.c25
-rw-r--r--kernel/sched.c2
-rw-r--r--kernel/timer.c9
-rw-r--r--mm/Kconfig21
-rw-r--r--mm/Makefile2
-rw-r--r--mm/bootmem.c1
-rw-r--r--mm/filemap.c12
-rw-r--r--mm/filemap_xip.c22
-rw-r--r--mm/fremap.c86
-rw-r--r--mm/hugetlb.c207
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memory.c993
-rw-r--r--mm/memory_hotplug.c138
-rw-r--r--mm/mempolicy.c393
-rw-r--r--mm/mmap.c126
-rw-r--r--mm/mprotect.c19
-rw-r--r--mm/mremap.c193
-rw-r--r--mm/msync.c78
-rw-r--r--mm/nommu.c18
-rw-r--r--mm/page_alloc.c207
-rw-r--r--mm/page_io.c6
-rw-r--r--mm/rmap.c146
-rw-r--r--mm/shmem.c28
-rw-r--r--mm/slab.c5
-rw-r--r--mm/sparse.c99
-rw-r--r--mm/swap.c6
-rw-r--r--mm/swap_state.c11
-rw-r--r--mm/swapfile.c41
-rw-r--r--mm/thrash.c2
-rw-r--r--mm/vmalloc.c77
-rw-r--r--mm/vmscan.c6
-rw-r--r--sound/core/pcm_native.c9
159 files changed, 3217 insertions, 3338 deletions
diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt
index e132fb1163b0..7eb715e07eda 100644
--- a/Documentation/cachetlb.txt
+++ b/Documentation/cachetlb.txt
@@ -49,9 +49,6 @@ changes occur:
49 page table operations such as what happens during 49 page table operations such as what happens during
50 fork, and exec. 50 fork, and exec.
51 51
52 Platform developers note that generic code will always
53 invoke this interface without mm->page_table_lock held.
54
553) void flush_tlb_range(struct vm_area_struct *vma, 523) void flush_tlb_range(struct vm_area_struct *vma,
56 unsigned long start, unsigned long end) 53 unsigned long start, unsigned long end)
57 54
@@ -72,9 +69,6 @@ changes occur:
72 call flush_tlb_page (see below) for each entry which may be 69 call flush_tlb_page (see below) for each entry which may be
73 modified. 70 modified.
74 71
75 Platform developers note that generic code will always
76 invoke this interface with mm->page_table_lock held.
77
784) void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) 724) void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
79 73
80 This time we need to remove the PAGE_SIZE sized translation 74 This time we need to remove the PAGE_SIZE sized translation
@@ -93,9 +87,6 @@ changes occur:
93 87
94 This is used primarily during fault processing. 88 This is used primarily during fault processing.
95 89
96 Platform developers note that generic code will always
97 invoke this interface with mm->page_table_lock held.
98
995) void flush_tlb_pgtables(struct mm_struct *mm, 905) void flush_tlb_pgtables(struct mm_struct *mm,
100 unsigned long start, unsigned long end) 91 unsigned long start, unsigned long end)
101 92
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 90766b75d1b7..5dffcfefc3c7 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1460,8 +1460,6 @@ running once the system is up.
1460 stifb= [HW] 1460 stifb= [HW]
1461 Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]] 1461 Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]]
1462 1462
1463 stram_swap= [HW,M68k]
1464
1465 swiotlb= [IA-64] Number of I/O TLB slabs 1463 swiotlb= [IA-64] Number of I/O TLB slabs
1466 1464
1467 switches= [HW,M68k] 1465 switches= [HW,M68k]
diff --git a/Documentation/m68k/kernel-options.txt b/Documentation/m68k/kernel-options.txt
index e191baad8308..d5d3f064f552 100644
--- a/Documentation/m68k/kernel-options.txt
+++ b/Documentation/m68k/kernel-options.txt
@@ -626,7 +626,7 @@ ignored (others aren't affected).
626 can be performed in optimal order. Not all SCSI devices support 626 can be performed in optimal order. Not all SCSI devices support
627 tagged queuing (:-(). 627 tagged queuing (:-().
628 628
6294.6 switches= 6294.5 switches=
630------------- 630-------------
631 631
632Syntax: switches=<list of switches> 632Syntax: switches=<list of switches>
@@ -661,28 +661,6 @@ correctly.
661earlier initialization ("ov_"-less) takes precedence. But the 661earlier initialization ("ov_"-less) takes precedence. But the
662switching-off on reset still happens in this case. 662switching-off on reset still happens in this case.
663 663
6644.5) stram_swap=
665----------------
666
667Syntax: stram_swap=<do_swap>[,<max_swap>]
668
669 This option is available only if the kernel has been compiled with
670CONFIG_STRAM_SWAP enabled. Normally, the kernel then determines
671dynamically whether to actually use ST-RAM as swap space. (Currently,
672the fraction of ST-RAM must be less or equal 1/3 of total memory to
673enable this swapping.) You can override the kernel's decision by
674specifying this option. 1 for <do_swap> means always enable the swap,
675even if you have less alternate RAM. 0 stands for never swap to
676ST-RAM, even if it's small enough compared to the rest of memory.
677
678 If ST-RAM swapping is enabled, the kernel usually uses all free
679ST-RAM as swap "device". If the kernel resides in ST-RAM, the region
680allocated by it is obviously never used for swapping :-) You can also
681limit this amount by specifying the second parameter, <max_swap>, if
682you want to use parts of ST-RAM as normal system memory. <max_swap> is
683in kBytes and the number should be a multiple of 4 (otherwise: rounded
684down).
685
6865) Options for Amiga Only: 6645) Options for Amiga Only:
687========================== 665==========================
688 666
diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index c7481d59b6df..6d5251254f68 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -371,6 +371,8 @@ show_mem(void)
371 show_free_areas(); 371 show_free_areas();
372 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); 372 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
373 for_each_online_node(nid) { 373 for_each_online_node(nid) {
374 unsigned long flags;
375 pgdat_resize_lock(NODE_DATA(nid), &flags);
374 i = node_spanned_pages(nid); 376 i = node_spanned_pages(nid);
375 while (i-- > 0) { 377 while (i-- > 0) {
376 struct page *page = nid_page_nr(nid, i); 378 struct page *page = nid_page_nr(nid, i);
@@ -384,6 +386,7 @@ show_mem(void)
384 else 386 else
385 shared += page_count(page) - 1; 387 shared += page_count(page) - 1;
386 } 388 }
389 pgdat_resize_unlock(NODE_DATA(nid), &flags);
387 } 390 }
388 printk("%ld pages of RAM\n",total); 391 printk("%ld pages of RAM\n",total);
389 printk("%ld free pages\n",free); 392 printk("%ld free pages\n",free);
diff --git a/arch/alpha/mm/remap.c b/arch/alpha/mm/remap.c
index 19817ad3d89b..a78356c3ead5 100644
--- a/arch/alpha/mm/remap.c
+++ b/arch/alpha/mm/remap.c
@@ -2,7 +2,6 @@
2#include <asm/pgalloc.h> 2#include <asm/pgalloc.h>
3#include <asm/cacheflush.h> 3#include <asm/cacheflush.h>
4 4
5/* called with the page_table_lock held */
6static inline void 5static inline void
7remap_area_pte(pte_t * pte, unsigned long address, unsigned long size, 6remap_area_pte(pte_t * pte, unsigned long address, unsigned long size,
8 unsigned long phys_addr, unsigned long flags) 7 unsigned long phys_addr, unsigned long flags)
@@ -31,7 +30,6 @@ remap_area_pte(pte_t * pte, unsigned long address, unsigned long size,
31 } while (address && (address < end)); 30 } while (address && (address < end));
32} 31}
33 32
34/* called with the page_table_lock held */
35static inline int 33static inline int
36remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, 34remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
37 unsigned long phys_addr, unsigned long flags) 35 unsigned long phys_addr, unsigned long flags)
@@ -46,7 +44,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
46 if (address >= end) 44 if (address >= end)
47 BUG(); 45 BUG();
48 do { 46 do {
49 pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); 47 pte_t * pte = pte_alloc_kernel(pmd, address);
50 if (!pte) 48 if (!pte)
51 return -ENOMEM; 49 return -ENOMEM;
52 remap_area_pte(pte, address, end - address, 50 remap_area_pte(pte, address, end - address,
@@ -70,7 +68,6 @@ __alpha_remap_area_pages(unsigned long address, unsigned long phys_addr,
70 flush_cache_all(); 68 flush_cache_all();
71 if (address >= end) 69 if (address >= end)
72 BUG(); 70 BUG();
73 spin_lock(&init_mm.page_table_lock);
74 do { 71 do {
75 pmd_t *pmd; 72 pmd_t *pmd;
76 pmd = pmd_alloc(&init_mm, dir, address); 73 pmd = pmd_alloc(&init_mm, dir, address);
@@ -84,7 +81,6 @@ __alpha_remap_area_pages(unsigned long address, unsigned long phys_addr,
84 address = (address + PGDIR_SIZE) & PGDIR_MASK; 81 address = (address + PGDIR_SIZE) & PGDIR_MASK;
85 dir++; 82 dir++;
86 } while (address && (address < end)); 83 } while (address && (address < end));
87 spin_unlock(&init_mm.page_table_lock);
88 return error; 84 return error;
89} 85}
90 86
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index a94d75fef598..a917e3dd3666 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -139,93 +139,33 @@ struct iwmmxt_sigframe {
139 unsigned long storage[0x98/4]; 139 unsigned long storage[0x98/4];
140}; 140};
141 141
142static int page_present(struct mm_struct *mm, void __user *uptr, int wr)
143{
144 unsigned long addr = (unsigned long)uptr;
145 pgd_t *pgd = pgd_offset(mm, addr);
146 if (pgd_present(*pgd)) {
147 pmd_t *pmd = pmd_offset(pgd, addr);
148 if (pmd_present(*pmd)) {
149 pte_t *pte = pte_offset_map(pmd, addr);
150 return (pte_present(*pte) && (!wr || pte_write(*pte)));
151 }
152 }
153 return 0;
154}
155
156static int copy_locked(void __user *uptr, void *kptr, size_t size, int write,
157 void (*copyfn)(void *, void __user *))
158{
159 unsigned char v, __user *userptr = uptr;
160 int err = 0;
161
162 do {
163 struct mm_struct *mm;
164
165 if (write) {
166 __put_user_error(0, userptr, err);
167 __put_user_error(0, userptr + size - 1, err);
168 } else {
169 __get_user_error(v, userptr, err);
170 __get_user_error(v, userptr + size - 1, err);
171 }
172
173 if (err)
174 break;
175
176 mm = current->mm;
177 spin_lock(&mm->page_table_lock);
178 if (page_present(mm, userptr, write) &&
179 page_present(mm, userptr + size - 1, write)) {
180 copyfn(kptr, uptr);
181 } else
182 err = 1;
183 spin_unlock(&mm->page_table_lock);
184 } while (err);
185
186 return err;
187}
188
189static int preserve_iwmmxt_context(struct iwmmxt_sigframe *frame) 142static int preserve_iwmmxt_context(struct iwmmxt_sigframe *frame)
190{ 143{
191 int err = 0; 144 char kbuf[sizeof(*frame) + 8];
145 struct iwmmxt_sigframe *kframe;
192 146
193 /* the iWMMXt context must be 64 bit aligned */ 147 /* the iWMMXt context must be 64 bit aligned */
194 WARN_ON((unsigned long)frame & 7); 148 kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7);
195 149 kframe->magic0 = IWMMXT_MAGIC0;
196 __put_user_error(IWMMXT_MAGIC0, &frame->magic0, err); 150 kframe->magic1 = IWMMXT_MAGIC1;
197 __put_user_error(IWMMXT_MAGIC1, &frame->magic1, err); 151 iwmmxt_task_copy(current_thread_info(), &kframe->storage);
198 152 return __copy_to_user(frame, kframe, sizeof(*frame));
199 /*
200 * iwmmxt_task_copy() doesn't check user permissions.
201 * Let's do a dummy write on the upper boundary to ensure
202 * access to user mem is OK all way up.
203 */
204 err |= copy_locked(&frame->storage, current_thread_info(),
205 sizeof(frame->storage), 1, iwmmxt_task_copy);
206 return err;
207} 153}
208 154
209static int restore_iwmmxt_context(struct iwmmxt_sigframe *frame) 155static int restore_iwmmxt_context(struct iwmmxt_sigframe *frame)
210{ 156{
211 unsigned long magic0, magic1; 157 char kbuf[sizeof(*frame) + 8];
212 int err = 0; 158 struct iwmmxt_sigframe *kframe;
213 159
214 /* the iWMMXt context is 64 bit aligned */ 160 /* the iWMMXt context must be 64 bit aligned */
215 WARN_ON((unsigned long)frame & 7); 161 kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7);
216 162 if (__copy_from_user(kframe, frame, sizeof(*frame)))
217 /* 163 return -1;
218 * Validate iWMMXt context signature. 164 if (kframe->magic0 != IWMMXT_MAGIC0 ||
219 * Also, iwmmxt_task_restore() doesn't check user permissions. 165 kframe->magic1 != IWMMXT_MAGIC1)
220 * Let's do a dummy write on the upper boundary to ensure 166 return -1;
221 * access to user mem is OK all way up. 167 iwmmxt_task_restore(current_thread_info(), &kframe->storage);
222 */ 168 return 0;
223 __get_user_error(magic0, &frame->magic0, err);
224 __get_user_error(magic1, &frame->magic1, err);
225 if (!err && magic0 == IWMMXT_MAGIC0 && magic1 == IWMMXT_MAGIC1)
226 err = copy_locked(&frame->storage, current_thread_info(),
227 sizeof(frame->storage), 0, iwmmxt_task_restore);
228 return err;
229} 169}
230 170
231#endif 171#endif
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index baa09601a64e..66e5a0516f23 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -483,29 +483,33 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs)
483 unsigned long addr = regs->ARM_r2; 483 unsigned long addr = regs->ARM_r2;
484 struct mm_struct *mm = current->mm; 484 struct mm_struct *mm = current->mm;
485 pgd_t *pgd; pmd_t *pmd; pte_t *pte; 485 pgd_t *pgd; pmd_t *pmd; pte_t *pte;
486 spinlock_t *ptl;
486 487
487 regs->ARM_cpsr &= ~PSR_C_BIT; 488 regs->ARM_cpsr &= ~PSR_C_BIT;
488 spin_lock(&mm->page_table_lock); 489 down_read(&mm->mmap_sem);
489 pgd = pgd_offset(mm, addr); 490 pgd = pgd_offset(mm, addr);
490 if (!pgd_present(*pgd)) 491 if (!pgd_present(*pgd))
491 goto bad_access; 492 goto bad_access;
492 pmd = pmd_offset(pgd, addr); 493 pmd = pmd_offset(pgd, addr);
493 if (!pmd_present(*pmd)) 494 if (!pmd_present(*pmd))
494 goto bad_access; 495 goto bad_access;
495 pte = pte_offset_map(pmd, addr); 496 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
496 if (!pte_present(*pte) || !pte_write(*pte)) 497 if (!pte_present(*pte) || !pte_write(*pte)) {
498 pte_unmap_unlock(pte, ptl);
497 goto bad_access; 499 goto bad_access;
500 }
498 val = *(unsigned long *)addr; 501 val = *(unsigned long *)addr;
499 val -= regs->ARM_r0; 502 val -= regs->ARM_r0;
500 if (val == 0) { 503 if (val == 0) {
501 *(unsigned long *)addr = regs->ARM_r1; 504 *(unsigned long *)addr = regs->ARM_r1;
502 regs->ARM_cpsr |= PSR_C_BIT; 505 regs->ARM_cpsr |= PSR_C_BIT;
503 } 506 }
504 spin_unlock(&mm->page_table_lock); 507 pte_unmap_unlock(pte, ptl);
508 up_read(&mm->mmap_sem);
505 return val; 509 return val;
506 510
507 bad_access: 511 bad_access:
508 spin_unlock(&mm->page_table_lock); 512 up_read(&mm->mmap_sem);
509 /* simulate a write access fault */ 513 /* simulate a write access fault */
510 do_DataAbort(addr, 15 + (1 << 11), regs); 514 do_DataAbort(addr, 15 + (1 << 11), regs);
511 return -1; 515 return -1;
diff --git a/arch/arm/mm/consistent.c b/arch/arm/mm/consistent.c
index 82f4d5e27c54..47b0b767f080 100644
--- a/arch/arm/mm/consistent.c
+++ b/arch/arm/mm/consistent.c
@@ -397,8 +397,6 @@ static int __init consistent_init(void)
397 pte_t *pte; 397 pte_t *pte;
398 int ret = 0; 398 int ret = 0;
399 399
400 spin_lock(&init_mm.page_table_lock);
401
402 do { 400 do {
403 pgd = pgd_offset(&init_mm, CONSISTENT_BASE); 401 pgd = pgd_offset(&init_mm, CONSISTENT_BASE);
404 pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE); 402 pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE);
@@ -409,7 +407,7 @@ static int __init consistent_init(void)
409 } 407 }
410 WARN_ON(!pmd_none(*pmd)); 408 WARN_ON(!pmd_none(*pmd));
411 409
412 pte = pte_alloc_kernel(&init_mm, pmd, CONSISTENT_BASE); 410 pte = pte_alloc_kernel(pmd, CONSISTENT_BASE);
413 if (!pte) { 411 if (!pte) {
414 printk(KERN_ERR "%s: no pte tables\n", __func__); 412 printk(KERN_ERR "%s: no pte tables\n", __func__);
415 ret = -ENOMEM; 413 ret = -ENOMEM;
@@ -419,8 +417,6 @@ static int __init consistent_init(void)
419 consistent_pte = pte; 417 consistent_pte = pte;
420 } while (0); 418 } while (0);
421 419
422 spin_unlock(&init_mm.page_table_lock);
423
424 return ret; 420 return ret;
425} 421}
426 422
diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index be4ab3d73c91..7fc1b35a6746 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -26,6 +26,11 @@ static unsigned long shared_pte_mask = L_PTE_CACHEABLE;
26/* 26/*
27 * We take the easy way out of this problem - we make the 27 * We take the easy way out of this problem - we make the
28 * PTE uncacheable. However, we leave the write buffer on. 28 * PTE uncacheable. However, we leave the write buffer on.
29 *
30 * Note that the pte lock held when calling update_mmu_cache must also
31 * guard the pte (somewhere else in the same mm) that we modify here.
32 * Therefore those configurations which might call adjust_pte (those
33 * without CONFIG_CPU_CACHE_VIPT) cannot support split page_table_lock.
29 */ 34 */
30static int adjust_pte(struct vm_area_struct *vma, unsigned long address) 35static int adjust_pte(struct vm_area_struct *vma, unsigned long address)
31{ 36{
@@ -127,7 +132,7 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page);
127 * 2. If we have multiple shared mappings of the same space in 132 * 2. If we have multiple shared mappings of the same space in
128 * an object, we need to deal with the cache aliasing issues. 133 * an object, we need to deal with the cache aliasing issues.
129 * 134 *
130 * Note that the page_table_lock will be held. 135 * Note that the pte lock will be held.
131 */ 136 */
132void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t pte) 137void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
133{ 138{
diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c
index 6fb1258df1b5..0f128c28fee4 100644
--- a/arch/arm/mm/ioremap.c
+++ b/arch/arm/mm/ioremap.c
@@ -75,7 +75,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
75 75
76 pgprot = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY | L_PTE_WRITE | flags); 76 pgprot = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY | L_PTE_WRITE | flags);
77 do { 77 do {
78 pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); 78 pte_t * pte = pte_alloc_kernel(pmd, address);
79 if (!pte) 79 if (!pte)
80 return -ENOMEM; 80 return -ENOMEM;
81 remap_area_pte(pte, address, end - address, address + phys_addr, pgprot); 81 remap_area_pte(pte, address, end - address, address + phys_addr, pgprot);
@@ -97,7 +97,6 @@ remap_area_pages(unsigned long start, unsigned long phys_addr,
97 phys_addr -= address; 97 phys_addr -= address;
98 dir = pgd_offset(&init_mm, address); 98 dir = pgd_offset(&init_mm, address);
99 BUG_ON(address >= end); 99 BUG_ON(address >= end);
100 spin_lock(&init_mm.page_table_lock);
101 do { 100 do {
102 pmd_t *pmd = pmd_alloc(&init_mm, dir, address); 101 pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
103 if (!pmd) { 102 if (!pmd) {
@@ -114,7 +113,6 @@ remap_area_pages(unsigned long start, unsigned long phys_addr,
114 dir++; 113 dir++;
115 } while (address && (address < end)); 114 } while (address && (address < end));
116 115
117 spin_unlock(&init_mm.page_table_lock);
118 flush_cache_vmap(start, end); 116 flush_cache_vmap(start, end);
119 return err; 117 return err;
120} 118}
diff --git a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c
index 61bc2fa0511e..1221fdde1769 100644
--- a/arch/arm/mm/mm-armv.c
+++ b/arch/arm/mm/mm-armv.c
@@ -180,11 +180,6 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
180 180
181 if (!vectors_high()) { 181 if (!vectors_high()) {
182 /* 182 /*
183 * This lock is here just to satisfy pmd_alloc and pte_lock
184 */
185 spin_lock(&mm->page_table_lock);
186
187 /*
188 * On ARM, first page must always be allocated since it 183 * On ARM, first page must always be allocated since it
189 * contains the machine vectors. 184 * contains the machine vectors.
190 */ 185 */
@@ -201,23 +196,14 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
201 set_pte(new_pte, *init_pte); 196 set_pte(new_pte, *init_pte);
202 pte_unmap_nested(init_pte); 197 pte_unmap_nested(init_pte);
203 pte_unmap(new_pte); 198 pte_unmap(new_pte);
204
205 spin_unlock(&mm->page_table_lock);
206 } 199 }
207 200
208 return new_pgd; 201 return new_pgd;
209 202
210no_pte: 203no_pte:
211 spin_unlock(&mm->page_table_lock);
212 pmd_free(new_pmd); 204 pmd_free(new_pmd);
213 free_pages((unsigned long)new_pgd, 2);
214 return NULL;
215
216no_pmd: 205no_pmd:
217 spin_unlock(&mm->page_table_lock);
218 free_pages((unsigned long)new_pgd, 2); 206 free_pages((unsigned long)new_pgd, 2);
219 return NULL;
220
221no_pgd: 207no_pgd:
222 return NULL; 208 return NULL;
223} 209}
@@ -243,6 +229,7 @@ void free_pgd_slow(pgd_t *pgd)
243 pte = pmd_page(*pmd); 229 pte = pmd_page(*pmd);
244 pmd_clear(pmd); 230 pmd_clear(pmd);
245 dec_page_state(nr_page_table_pages); 231 dec_page_state(nr_page_table_pages);
232 pte_lock_deinit(pte);
246 pte_free(pte); 233 pte_free(pte);
247 pmd_free(pmd); 234 pmd_free(pmd);
248free: 235free:
diff --git a/arch/arm/oprofile/backtrace.c b/arch/arm/oprofile/backtrace.c
index df35c452a8bf..7c22c12618cc 100644
--- a/arch/arm/oprofile/backtrace.c
+++ b/arch/arm/oprofile/backtrace.c
@@ -49,42 +49,22 @@ static struct frame_tail* kernel_backtrace(struct frame_tail *tail)
49 49
50static struct frame_tail* user_backtrace(struct frame_tail *tail) 50static struct frame_tail* user_backtrace(struct frame_tail *tail)
51{ 51{
52 struct frame_tail buftail; 52 struct frame_tail buftail[2];
53 53
54 /* hardware pte might not be valid due to dirty/accessed bit emulation 54 /* Also check accessibility of one struct frame_tail beyond */
55 * so we use copy_from_user and benefit from exception fixups */ 55 if (!access_ok(VERIFY_READ, tail, sizeof(buftail)))
56 if (copy_from_user(&buftail, tail, sizeof(struct frame_tail))) 56 return NULL;
57 if (__copy_from_user_inatomic(buftail, tail, sizeof(buftail)))
57 return NULL; 58 return NULL;
58 59
59 oprofile_add_trace(buftail.lr); 60 oprofile_add_trace(buftail[0].lr);
60 61
61 /* frame pointers should strictly progress back up the stack 62 /* frame pointers should strictly progress back up the stack
62 * (towards higher addresses) */ 63 * (towards higher addresses) */
63 if (tail >= buftail.fp) 64 if (tail >= buftail[0].fp)
64 return NULL; 65 return NULL;
65 66
66 return buftail.fp-1; 67 return buftail[0].fp-1;
67}
68
69/* Compare two addresses and see if they're on the same page */
70#define CMP_ADDR_EQUAL(x,y,offset) ((((unsigned long) x) >> PAGE_SHIFT) \
71 == ((((unsigned long) y) + offset) >> PAGE_SHIFT))
72
73/* check that the page(s) containing the frame tail are present */
74static int pages_present(struct frame_tail *tail)
75{
76 struct mm_struct * mm = current->mm;
77
78 if (!check_user_page_readable(mm, (unsigned long)tail))
79 return 0;
80
81 if (CMP_ADDR_EQUAL(tail, tail, 8))
82 return 1;
83
84 if (!check_user_page_readable(mm, ((unsigned long)tail) + 8))
85 return 0;
86
87 return 1;
88} 68}
89 69
90/* 70/*
@@ -118,7 +98,6 @@ static int valid_kernel_stack(struct frame_tail *tail, struct pt_regs *regs)
118void arm_backtrace(struct pt_regs * const regs, unsigned int depth) 98void arm_backtrace(struct pt_regs * const regs, unsigned int depth)
119{ 99{
120 struct frame_tail *tail; 100 struct frame_tail *tail;
121 unsigned long last_address = 0;
122 101
123 tail = ((struct frame_tail *) regs->ARM_fp) - 1; 102 tail = ((struct frame_tail *) regs->ARM_fp) - 1;
124 103
@@ -132,13 +111,6 @@ void arm_backtrace(struct pt_regs * const regs, unsigned int depth)
132 return; 111 return;
133 } 112 }
134 113
135 while (depth-- && tail && !((unsigned long) tail & 3)) { 114 while (depth-- && tail && !((unsigned long) tail & 3))
136 if ((!CMP_ADDR_EQUAL(last_address, tail, 0)
137 || !CMP_ADDR_EQUAL(last_address, tail, 8))
138 && !pages_present(tail))
139 return;
140 last_address = (unsigned long) tail;
141 tail = user_backtrace(tail); 115 tail = user_backtrace(tail);
142 }
143} 116}
144
diff --git a/arch/arm26/mm/memc.c b/arch/arm26/mm/memc.c
index 8e8a2bb2487d..34def6397c3c 100644
--- a/arch/arm26/mm/memc.c
+++ b/arch/arm26/mm/memc.c
@@ -79,12 +79,6 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
79 goto no_pgd; 79 goto no_pgd;
80 80
81 /* 81 /*
82 * This lock is here just to satisfy pmd_alloc and pte_lock
83 * FIXME: I bet we could avoid taking it pretty much altogether
84 */
85 spin_lock(&mm->page_table_lock);
86
87 /*
88 * On ARM, first page must always be allocated since it contains 82 * On ARM, first page must always be allocated since it contains
89 * the machine vectors. 83 * the machine vectors.
90 */ 84 */
@@ -92,7 +86,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
92 if (!new_pmd) 86 if (!new_pmd)
93 goto no_pmd; 87 goto no_pmd;
94 88
95 new_pte = pte_alloc_kernel(mm, new_pmd, 0); 89 new_pte = pte_alloc_map(mm, new_pmd, 0);
96 if (!new_pte) 90 if (!new_pte)
97 goto no_pte; 91 goto no_pte;
98 92
@@ -101,6 +95,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
101 init_pte = pte_offset(init_pmd, 0); 95 init_pte = pte_offset(init_pmd, 0);
102 96
103 set_pte(new_pte, *init_pte); 97 set_pte(new_pte, *init_pte);
98 pte_unmap(new_pte);
104 99
105 /* 100 /*
106 * the page table entries are zeroed 101 * the page table entries are zeroed
@@ -112,23 +107,14 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
112 memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR, 107 memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR,
113 (PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t)); 108 (PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t));
114 109
115 spin_unlock(&mm->page_table_lock);
116
117 /* update MEMC tables */ 110 /* update MEMC tables */
118 cpu_memc_update_all(new_pgd); 111 cpu_memc_update_all(new_pgd);
119 return new_pgd; 112 return new_pgd;
120 113
121no_pte: 114no_pte:
122 spin_unlock(&mm->page_table_lock);
123 pmd_free(new_pmd); 115 pmd_free(new_pmd);
124 free_pgd_slow(new_pgd);
125 return NULL;
126
127no_pmd: 116no_pmd:
128 spin_unlock(&mm->page_table_lock);
129 free_pgd_slow(new_pgd); 117 free_pgd_slow(new_pgd);
130 return NULL;
131
132no_pgd: 118no_pgd:
133 return NULL; 119 return NULL;
134} 120}
diff --git a/arch/cris/arch-v32/mm/tlb.c b/arch/cris/arch-v32/mm/tlb.c
index 8233406798d3..b08a28bb58ab 100644
--- a/arch/cris/arch-v32/mm/tlb.c
+++ b/arch/cris/arch-v32/mm/tlb.c
@@ -175,6 +175,8 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
175 return 0; 175 return 0;
176} 176}
177 177
178static DEFINE_SPINLOCK(mmu_context_lock);
179
178/* Called in schedule() just before actually doing the switch_to. */ 180/* Called in schedule() just before actually doing the switch_to. */
179void 181void
180switch_mm(struct mm_struct *prev, struct mm_struct *next, 182switch_mm(struct mm_struct *prev, struct mm_struct *next,
@@ -183,10 +185,10 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
183 int cpu = smp_processor_id(); 185 int cpu = smp_processor_id();
184 186
185 /* Make sure there is a MMU context. */ 187 /* Make sure there is a MMU context. */
186 spin_lock(&next->page_table_lock); 188 spin_lock(&mmu_context_lock);
187 get_mmu_context(next); 189 get_mmu_context(next);
188 cpu_set(cpu, next->cpu_vm_mask); 190 cpu_set(cpu, next->cpu_vm_mask);
189 spin_unlock(&next->page_table_lock); 191 spin_unlock(&mmu_context_lock);
190 192
191 /* 193 /*
192 * Remember the pgd for the fault handlers. Keep a seperate copy of it 194 * Remember the pgd for the fault handlers. Keep a seperate copy of it
diff --git a/arch/cris/mm/ioremap.c b/arch/cris/mm/ioremap.c
index ebba11e270fa..a92ac9877582 100644
--- a/arch/cris/mm/ioremap.c
+++ b/arch/cris/mm/ioremap.c
@@ -52,7 +52,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
52 if (address >= end) 52 if (address >= end)
53 BUG(); 53 BUG();
54 do { 54 do {
55 pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); 55 pte_t * pte = pte_alloc_kernel(pmd, address);
56 if (!pte) 56 if (!pte)
57 return -ENOMEM; 57 return -ENOMEM;
58 remap_area_pte(pte, address, end - address, address + phys_addr, prot); 58 remap_area_pte(pte, address, end - address, address + phys_addr, prot);
@@ -74,7 +74,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
74 flush_cache_all(); 74 flush_cache_all();
75 if (address >= end) 75 if (address >= end)
76 BUG(); 76 BUG();
77 spin_lock(&init_mm.page_table_lock);
78 do { 77 do {
79 pud_t *pud; 78 pud_t *pud;
80 pmd_t *pmd; 79 pmd_t *pmd;
@@ -94,7 +93,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
94 address = (address + PGDIR_SIZE) & PGDIR_MASK; 93 address = (address + PGDIR_SIZE) & PGDIR_MASK;
95 dir++; 94 dir++;
96 } while (address && (address < end)); 95 } while (address && (address < end));
97 spin_unlock(&init_mm.page_table_lock);
98 flush_tlb_all(); 96 flush_tlb_all();
99 return error; 97 return error;
100} 98}
diff --git a/arch/frv/mm/dma-alloc.c b/arch/frv/mm/dma-alloc.c
index cfc4f97490c6..342823aad758 100644
--- a/arch/frv/mm/dma-alloc.c
+++ b/arch/frv/mm/dma-alloc.c
@@ -55,21 +55,18 @@ static int map_page(unsigned long va, unsigned long pa, pgprot_t prot)
55 pte_t *pte; 55 pte_t *pte;
56 int err = -ENOMEM; 56 int err = -ENOMEM;
57 57
58 spin_lock(&init_mm.page_table_lock);
59
60 /* Use upper 10 bits of VA to index the first level map */ 58 /* Use upper 10 bits of VA to index the first level map */
61 pge = pgd_offset_k(va); 59 pge = pgd_offset_k(va);
62 pue = pud_offset(pge, va); 60 pue = pud_offset(pge, va);
63 pme = pmd_offset(pue, va); 61 pme = pmd_offset(pue, va);
64 62
65 /* Use middle 10 bits of VA to index the second-level map */ 63 /* Use middle 10 bits of VA to index the second-level map */
66 pte = pte_alloc_kernel(&init_mm, pme, va); 64 pte = pte_alloc_kernel(pme, va);
67 if (pte != 0) { 65 if (pte != 0) {
68 err = 0; 66 err = 0;
69 set_pte(pte, mk_pte_phys(pa & PAGE_MASK, prot)); 67 set_pte(pte, mk_pte_phys(pa & PAGE_MASK, prot));
70 } 68 }
71 69
72 spin_unlock(&init_mm.page_table_lock);
73 return err; 70 return err;
74} 71}
75 72
diff --git a/arch/frv/mm/pgalloc.c b/arch/frv/mm/pgalloc.c
index 4eaec0f3525b..2c67dfe5a6b3 100644
--- a/arch/frv/mm/pgalloc.c
+++ b/arch/frv/mm/pgalloc.c
@@ -87,14 +87,14 @@ static inline void pgd_list_add(pgd_t *pgd)
87 if (pgd_list) 87 if (pgd_list)
88 pgd_list->private = (unsigned long) &page->index; 88 pgd_list->private = (unsigned long) &page->index;
89 pgd_list = page; 89 pgd_list = page;
90 page->private = (unsigned long) &pgd_list; 90 set_page_private(page, (unsigned long)&pgd_list);
91} 91}
92 92
93static inline void pgd_list_del(pgd_t *pgd) 93static inline void pgd_list_del(pgd_t *pgd)
94{ 94{
95 struct page *next, **pprev, *page = virt_to_page(pgd); 95 struct page *next, **pprev, *page = virt_to_page(pgd);
96 next = (struct page *) page->index; 96 next = (struct page *) page->index;
97 pprev = (struct page **) page->private; 97 pprev = (struct page **)page_private(page);
98 *pprev = next; 98 *pprev = next;
99 if (next) 99 if (next)
100 next->private = (unsigned long) pprev; 100 next->private = (unsigned long) pprev;
diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index 16b485009622..fc1993564f98 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -134,17 +134,16 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
134 return ret; 134 return ret;
135} 135}
136 136
137static void mark_screen_rdonly(struct task_struct * tsk) 137static void mark_screen_rdonly(struct mm_struct *mm)
138{ 138{
139 pgd_t *pgd; 139 pgd_t *pgd;
140 pud_t *pud; 140 pud_t *pud;
141 pmd_t *pmd; 141 pmd_t *pmd;
142 pte_t *pte, *mapped; 142 pte_t *pte;
143 spinlock_t *ptl;
143 int i; 144 int i;
144 145
145 preempt_disable(); 146 pgd = pgd_offset(mm, 0xA0000);
146 spin_lock(&tsk->mm->page_table_lock);
147 pgd = pgd_offset(tsk->mm, 0xA0000);
148 if (pgd_none_or_clear_bad(pgd)) 147 if (pgd_none_or_clear_bad(pgd))
149 goto out; 148 goto out;
150 pud = pud_offset(pgd, 0xA0000); 149 pud = pud_offset(pgd, 0xA0000);
@@ -153,16 +152,14 @@ static void mark_screen_rdonly(struct task_struct * tsk)
153 pmd = pmd_offset(pud, 0xA0000); 152 pmd = pmd_offset(pud, 0xA0000);
154 if (pmd_none_or_clear_bad(pmd)) 153 if (pmd_none_or_clear_bad(pmd))
155 goto out; 154 goto out;
156 pte = mapped = pte_offset_map(pmd, 0xA0000); 155 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
157 for (i = 0; i < 32; i++) { 156 for (i = 0; i < 32; i++) {
158 if (pte_present(*pte)) 157 if (pte_present(*pte))
159 set_pte(pte, pte_wrprotect(*pte)); 158 set_pte(pte, pte_wrprotect(*pte));
160 pte++; 159 pte++;
161 } 160 }
162 pte_unmap(mapped); 161 pte_unmap_unlock(pte, ptl);
163out: 162out:
164 spin_unlock(&tsk->mm->page_table_lock);
165 preempt_enable();
166 flush_tlb(); 163 flush_tlb();
167} 164}
168 165
@@ -306,7 +303,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
306 303
307 tsk->thread.screen_bitmap = info->screen_bitmap; 304 tsk->thread.screen_bitmap = info->screen_bitmap;
308 if (info->flags & VM86_SCREEN_BITMAP) 305 if (info->flags & VM86_SCREEN_BITMAP)
309 mark_screen_rdonly(tsk); 306 mark_screen_rdonly(tsk->mm);
310 __asm__ __volatile__( 307 __asm__ __volatile__(
311 "xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t" 308 "xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t"
312 "movl %0,%%esp\n\t" 309 "movl %0,%%esp\n\t"
diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index 244d8ec66be2..c4af9638dbfa 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -98,7 +98,7 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
98 98
99extern unsigned long find_max_low_pfn(void); 99extern unsigned long find_max_low_pfn(void);
100extern void find_max_pfn(void); 100extern void find_max_pfn(void);
101extern void one_highpage_init(struct page *, int, int); 101extern void add_one_highpage_init(struct page *, int, int);
102 102
103extern struct e820map e820; 103extern struct e820map e820;
104extern unsigned long init_pg_tables_end; 104extern unsigned long init_pg_tables_end;
@@ -427,7 +427,7 @@ void __init set_highmem_pages_init(int bad_ppro)
427 if (!pfn_valid(node_pfn)) 427 if (!pfn_valid(node_pfn))
428 continue; 428 continue;
429 page = pfn_to_page(node_pfn); 429 page = pfn_to_page(node_pfn);
430 one_highpage_init(page, node_pfn, bad_ppro); 430 add_one_highpage_init(page, node_pfn, bad_ppro);
431 } 431 }
432 } 432 }
433 totalram_pages += totalhigh_pages; 433 totalram_pages += totalhigh_pages;
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 2ebaf75f732e..542d9298da5e 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -27,6 +27,7 @@
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
29#include <linux/efi.h> 29#include <linux/efi.h>
30#include <linux/memory_hotplug.h>
30 31
31#include <asm/processor.h> 32#include <asm/processor.h>
32#include <asm/system.h> 33#include <asm/system.h>
@@ -266,17 +267,46 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
266 pkmap_page_table = pte; 267 pkmap_page_table = pte;
267} 268}
268 269
269void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) 270void __devinit free_new_highpage(struct page *page)
271{
272 set_page_count(page, 1);
273 __free_page(page);
274 totalhigh_pages++;
275}
276
277void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
270{ 278{
271 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { 279 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
272 ClearPageReserved(page); 280 ClearPageReserved(page);
273 set_page_count(page, 1); 281 free_new_highpage(page);
274 __free_page(page);
275 totalhigh_pages++;
276 } else 282 } else
277 SetPageReserved(page); 283 SetPageReserved(page);
278} 284}
279 285
286static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
287{
288 free_new_highpage(page);
289 totalram_pages++;
290#ifdef CONFIG_FLATMEM
291 max_mapnr = max(pfn, max_mapnr);
292#endif
293 num_physpages++;
294 return 0;
295}
296
297/*
298 * Not currently handling the NUMA case.
299 * Assuming single node and all memory that
300 * has been added dynamically that would be
301 * onlined here is in HIGHMEM
302 */
303void online_page(struct page *page)
304{
305 ClearPageReserved(page);
306 add_one_highpage_hotplug(page, page_to_pfn(page));
307}
308
309
280#ifdef CONFIG_NUMA 310#ifdef CONFIG_NUMA
281extern void set_highmem_pages_init(int); 311extern void set_highmem_pages_init(int);
282#else 312#else
@@ -284,7 +314,7 @@ static void __init set_highmem_pages_init(int bad_ppro)
284{ 314{
285 int pfn; 315 int pfn;
286 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) 316 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
287 one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); 317 add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
288 totalram_pages += totalhigh_pages; 318 totalram_pages += totalhigh_pages;
289} 319}
290#endif /* CONFIG_FLATMEM */ 320#endif /* CONFIG_FLATMEM */
@@ -615,6 +645,28 @@ void __init mem_init(void)
615#endif 645#endif
616} 646}
617 647
648/*
649 * this is for the non-NUMA, single node SMP system case.
650 * Specifically, in the case of x86, we will always add
651 * memory to the highmem for now.
652 */
653#ifndef CONFIG_NEED_MULTIPLE_NODES
654int add_memory(u64 start, u64 size)
655{
656 struct pglist_data *pgdata = &contig_page_data;
657 struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
658 unsigned long start_pfn = start >> PAGE_SHIFT;
659 unsigned long nr_pages = size >> PAGE_SHIFT;
660
661 return __add_pages(zone, start_pfn, nr_pages);
662}
663
664int remove_memory(u64 start, u64 size)
665{
666 return -EINVAL;
667}
668#endif
669
618kmem_cache_t *pgd_cache; 670kmem_cache_t *pgd_cache;
619kmem_cache_t *pmd_cache; 671kmem_cache_t *pmd_cache;
620 672
diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c
index f379b8d67558..5d09de8d1c6b 100644
--- a/arch/i386/mm/ioremap.c
+++ b/arch/i386/mm/ioremap.c
@@ -28,7 +28,7 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
28 unsigned long pfn; 28 unsigned long pfn;
29 29
30 pfn = phys_addr >> PAGE_SHIFT; 30 pfn = phys_addr >> PAGE_SHIFT;
31 pte = pte_alloc_kernel(&init_mm, pmd, addr); 31 pte = pte_alloc_kernel(pmd, addr);
32 if (!pte) 32 if (!pte)
33 return -ENOMEM; 33 return -ENOMEM;
34 do { 34 do {
@@ -87,14 +87,12 @@ static int ioremap_page_range(unsigned long addr,
87 flush_cache_all(); 87 flush_cache_all();
88 phys_addr -= addr; 88 phys_addr -= addr;
89 pgd = pgd_offset_k(addr); 89 pgd = pgd_offset_k(addr);
90 spin_lock(&init_mm.page_table_lock);
91 do { 90 do {
92 next = pgd_addr_end(addr, end); 91 next = pgd_addr_end(addr, end);
93 err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, flags); 92 err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, flags);
94 if (err) 93 if (err)
95 break; 94 break;
96 } while (pgd++, addr = next, addr != end); 95 } while (pgd++, addr = next, addr != end);
97 spin_unlock(&init_mm.page_table_lock);
98 flush_tlb_all(); 96 flush_tlb_all();
99 return err; 97 return err;
100} 98}
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index dcdce2c6c532..9db3242103be 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -31,11 +31,13 @@ void show_mem(void)
31 pg_data_t *pgdat; 31 pg_data_t *pgdat;
32 unsigned long i; 32 unsigned long i;
33 struct page_state ps; 33 struct page_state ps;
34 unsigned long flags;
34 35
35 printk(KERN_INFO "Mem-info:\n"); 36 printk(KERN_INFO "Mem-info:\n");
36 show_free_areas(); 37 show_free_areas();
37 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); 38 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
38 for_each_pgdat(pgdat) { 39 for_each_pgdat(pgdat) {
40 pgdat_resize_lock(pgdat, &flags);
39 for (i = 0; i < pgdat->node_spanned_pages; ++i) { 41 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
40 page = pgdat_page_nr(pgdat, i); 42 page = pgdat_page_nr(pgdat, i);
41 total++; 43 total++;
@@ -48,6 +50,7 @@ void show_mem(void)
48 else if (page_count(page)) 50 else if (page_count(page))
49 shared += page_count(page) - 1; 51 shared += page_count(page) - 1;
50 } 52 }
53 pgdat_resize_unlock(pgdat, &flags);
51 } 54 }
52 printk(KERN_INFO "%d pages of RAM\n", total); 55 printk(KERN_INFO "%d pages of RAM\n", total);
53 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); 56 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
@@ -188,19 +191,19 @@ static inline void pgd_list_add(pgd_t *pgd)
188 struct page *page = virt_to_page(pgd); 191 struct page *page = virt_to_page(pgd);
189 page->index = (unsigned long)pgd_list; 192 page->index = (unsigned long)pgd_list;
190 if (pgd_list) 193 if (pgd_list)
191 pgd_list->private = (unsigned long)&page->index; 194 set_page_private(pgd_list, (unsigned long)&page->index);
192 pgd_list = page; 195 pgd_list = page;
193 page->private = (unsigned long)&pgd_list; 196 set_page_private(page, (unsigned long)&pgd_list);
194} 197}
195 198
196static inline void pgd_list_del(pgd_t *pgd) 199static inline void pgd_list_del(pgd_t *pgd)
197{ 200{
198 struct page *next, **pprev, *page = virt_to_page(pgd); 201 struct page *next, **pprev, *page = virt_to_page(pgd);
199 next = (struct page *)page->index; 202 next = (struct page *)page->index;
200 pprev = (struct page **)page->private; 203 pprev = (struct page **)page_private(page);
201 *pprev = next; 204 *pprev = next;
202 if (next) 205 if (next)
203 next->private = (unsigned long)pprev; 206 set_page_private(next, (unsigned long)pprev);
204} 207}
205 208
206void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) 209void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
diff --git a/arch/i386/oprofile/backtrace.c b/arch/i386/oprofile/backtrace.c
index 65dfd2edb671..21654be3f73f 100644
--- a/arch/i386/oprofile/backtrace.c
+++ b/arch/i386/oprofile/backtrace.c
@@ -12,6 +12,7 @@
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <asm/ptrace.h> 14#include <asm/ptrace.h>
15#include <asm/uaccess.h>
15 16
16struct frame_head { 17struct frame_head {
17 struct frame_head * ebp; 18 struct frame_head * ebp;
@@ -21,26 +22,22 @@ struct frame_head {
21static struct frame_head * 22static struct frame_head *
22dump_backtrace(struct frame_head * head) 23dump_backtrace(struct frame_head * head)
23{ 24{
24 oprofile_add_trace(head->ret); 25 struct frame_head bufhead[2];
25 26
26 /* frame pointers should strictly progress back up the stack 27 /* Also check accessibility of one struct frame_head beyond */
27 * (towards higher addresses) */ 28 if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
28 if (head >= head->ebp) 29 return NULL;
30 if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
29 return NULL; 31 return NULL;
30 32
31 return head->ebp; 33 oprofile_add_trace(bufhead[0].ret);
32}
33
34/* check that the page(s) containing the frame head are present */
35static int pages_present(struct frame_head * head)
36{
37 struct mm_struct * mm = current->mm;
38 34
39 /* FIXME: only necessary once per page */ 35 /* frame pointers should strictly progress back up the stack
40 if (!check_user_page_readable(mm, (unsigned long)head)) 36 * (towards higher addresses) */
41 return 0; 37 if (head >= bufhead[0].ebp)
38 return NULL;
42 39
43 return check_user_page_readable(mm, (unsigned long)(head + 1)); 40 return bufhead[0].ebp;
44} 41}
45 42
46/* 43/*
@@ -97,15 +94,6 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
97 return; 94 return;
98 } 95 }
99 96
100#ifdef CONFIG_SMP 97 while (depth-- && head)
101 if (!spin_trylock(&current->mm->page_table_lock))
102 return;
103#endif
104
105 while (depth-- && head && pages_present(head))
106 head = dump_backtrace(head); 98 head = dump_backtrace(head);
107
108#ifdef CONFIG_SMP
109 spin_unlock(&current->mm->page_table_lock);
110#endif
111} 99}
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index d71731ee5b61..f7dfc107cb7b 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2352,7 +2352,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon
2352 insert_vm_struct(mm, vma); 2352 insert_vm_struct(mm, vma);
2353 2353
2354 mm->total_vm += size >> PAGE_SHIFT; 2354 mm->total_vm += size >> PAGE_SHIFT;
2355 vm_stat_account(vma); 2355 vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
2356 vma_pages(vma));
2356 up_write(&task->mm->mmap_sem); 2357 up_write(&task->mm->mmap_sem);
2357 2358
2358 /* 2359 /*
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index a3788fb84809..a88cdb7232f8 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -555,9 +555,13 @@ void show_mem(void)
555 show_free_areas(); 555 show_free_areas();
556 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); 556 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
557 for_each_pgdat(pgdat) { 557 for_each_pgdat(pgdat) {
558 unsigned long present = pgdat->node_present_pages; 558 unsigned long present;
559 unsigned long flags;
559 int shared = 0, cached = 0, reserved = 0; 560 int shared = 0, cached = 0, reserved = 0;
561
560 printk("Node ID: %d\n", pgdat->node_id); 562 printk("Node ID: %d\n", pgdat->node_id);
563 pgdat_resize_lock(pgdat, &flags);
564 present = pgdat->node_present_pages;
561 for(i = 0; i < pgdat->node_spanned_pages; i++) { 565 for(i = 0; i < pgdat->node_spanned_pages; i++) {
562 struct page *page; 566 struct page *page;
563 if (pfn_valid(pgdat->node_start_pfn + i)) 567 if (pfn_valid(pgdat->node_start_pfn + i))
@@ -571,6 +575,7 @@ void show_mem(void)
571 else if (page_count(page)) 575 else if (page_count(page))
572 shared += page_count(page)-1; 576 shared += page_count(page)-1;
573 } 577 }
578 pgdat_resize_unlock(pgdat, &flags);
574 total_present += present; 579 total_present += present;
575 total_reserved += reserved; 580 total_reserved += reserved;
576 total_cached += cached; 581 total_cached += cached;
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 3c32af910d60..af7eb087dca7 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -20,32 +20,6 @@
20extern void die (char *, struct pt_regs *, long); 20extern void die (char *, struct pt_regs *, long);
21 21
22/* 22/*
23 * This routine is analogous to expand_stack() but instead grows the
24 * register backing store (which grows towards higher addresses).
25 * Since the register backing store is access sequentially, we
26 * disallow growing the RBS by more than a page at a time. Note that
27 * the VM_GROWSUP flag can be set on any VM area but that's fine
28 * because the total process size is still limited by RLIMIT_STACK and
29 * RLIMIT_AS.
30 */
31static inline long
32expand_backing_store (struct vm_area_struct *vma, unsigned long address)
33{
34 unsigned long grow;
35
36 grow = PAGE_SIZE >> PAGE_SHIFT;
37 if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur
38 || (((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->signal->rlim[RLIMIT_AS].rlim_cur))
39 return -ENOMEM;
40 vma->vm_end += PAGE_SIZE;
41 vma->vm_mm->total_vm += grow;
42 if (vma->vm_flags & VM_LOCKED)
43 vma->vm_mm->locked_vm += grow;
44 __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow);
45 return 0;
46}
47
48/*
49 * Return TRUE if ADDRESS points at a page in the kernel's mapped segment 23 * Return TRUE if ADDRESS points at a page in the kernel's mapped segment
50 * (inside region 5, on ia64) and that page is present. 24 * (inside region 5, on ia64) and that page is present.
51 */ 25 */
@@ -185,7 +159,13 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
185 if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) 159 if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
186 || REGION_OFFSET(address) >= RGN_MAP_LIMIT) 160 || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
187 goto bad_area; 161 goto bad_area;
188 if (expand_backing_store(vma, address)) 162 /*
163 * Since the register backing store is accessed sequentially,
164 * we disallow growing it by more than a page at a time.
165 */
166 if (address > vma->vm_end + PAGE_SIZE - sizeof(long))
167 goto bad_area;
168 if (expand_upwards(vma, address))
189 goto bad_area; 169 goto bad_area;
190 } 170 }
191 goto good_area; 171 goto good_area;
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 98246acd4991..e3215ba64ffd 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -158,7 +158,7 @@ ia64_init_addr_space (void)
158 vma->vm_start = current->thread.rbs_bot & PAGE_MASK; 158 vma->vm_start = current->thread.rbs_bot & PAGE_MASK;
159 vma->vm_end = vma->vm_start + PAGE_SIZE; 159 vma->vm_end = vma->vm_start + PAGE_SIZE;
160 vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7]; 160 vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7];
161 vma->vm_flags = VM_DATA_DEFAULT_FLAGS | VM_GROWSUP; 161 vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT;
162 down_write(&current->mm->mmap_sem); 162 down_write(&current->mm->mmap_sem);
163 if (insert_vm_struct(current->mm, vma)) { 163 if (insert_vm_struct(current->mm, vma)) {
164 up_write(&current->mm->mmap_sem); 164 up_write(&current->mm->mmap_sem);
@@ -275,26 +275,21 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
275 275
276 pgd = pgd_offset_k(address); /* note: this is NOT pgd_offset()! */ 276 pgd = pgd_offset_k(address); /* note: this is NOT pgd_offset()! */
277 277
278 spin_lock(&init_mm.page_table_lock);
279 { 278 {
280 pud = pud_alloc(&init_mm, pgd, address); 279 pud = pud_alloc(&init_mm, pgd, address);
281 if (!pud) 280 if (!pud)
282 goto out; 281 goto out;
283
284 pmd = pmd_alloc(&init_mm, pud, address); 282 pmd = pmd_alloc(&init_mm, pud, address);
285 if (!pmd) 283 if (!pmd)
286 goto out; 284 goto out;
287 pte = pte_alloc_map(&init_mm, pmd, address); 285 pte = pte_alloc_kernel(pmd, address);
288 if (!pte) 286 if (!pte)
289 goto out; 287 goto out;
290 if (!pte_none(*pte)) { 288 if (!pte_none(*pte))
291 pte_unmap(pte);
292 goto out; 289 goto out;
293 }
294 set_pte(pte, mk_pte(page, pgprot)); 290 set_pte(pte, mk_pte(page, pgprot));
295 pte_unmap(pte);
296 } 291 }
297 out: spin_unlock(&init_mm.page_table_lock); 292 out:
298 /* no need for flush_tlb */ 293 /* no need for flush_tlb */
299 return page; 294 return page;
300} 295}
diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c
index c93e0f2b5fea..c79a9b96d02b 100644
--- a/arch/ia64/mm/tlb.c
+++ b/arch/ia64/mm/tlb.c
@@ -158,10 +158,12 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long
158# ifdef CONFIG_SMP 158# ifdef CONFIG_SMP
159 platform_global_tlb_purge(mm, start, end, nbits); 159 platform_global_tlb_purge(mm, start, end, nbits);
160# else 160# else
161 preempt_disable();
161 do { 162 do {
162 ia64_ptcl(start, (nbits<<2)); 163 ia64_ptcl(start, (nbits<<2));
163 start += (1UL << nbits); 164 start += (1UL << nbits);
164 } while (start < end); 165 } while (start < end);
166 preempt_enable();
165# endif 167# endif
166 168
167 ia64_srlz_i(); /* srlz.i implies srlz.d */ 169 ia64_srlz_i(); /* srlz.i implies srlz.d */
diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c
index d9a40b1fe8ba..6facf15b04f3 100644
--- a/arch/m32r/mm/init.c
+++ b/arch/m32r/mm/init.c
@@ -48,6 +48,8 @@ void show_mem(void)
48 show_free_areas(); 48 show_free_areas();
49 printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); 49 printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10));
50 for_each_pgdat(pgdat) { 50 for_each_pgdat(pgdat) {
51 unsigned long flags;
52 pgdat_resize_lock(pgdat, &flags);
51 for (i = 0; i < pgdat->node_spanned_pages; ++i) { 53 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
52 page = pgdat_page_nr(pgdat, i); 54 page = pgdat_page_nr(pgdat, i);
53 total++; 55 total++;
@@ -60,6 +62,7 @@ void show_mem(void)
60 else if (page_count(page)) 62 else if (page_count(page))
61 shared += page_count(page) - 1; 63 shared += page_count(page) - 1;
62 } 64 }
65 pgdat_resize_unlock(pgdat, &flags);
63 } 66 }
64 printk("%d pages of RAM\n", total); 67 printk("%d pages of RAM\n", total);
65 printk("%d pages of HIGHMEM\n",highmem); 68 printk("%d pages of HIGHMEM\n",highmem);
@@ -150,10 +153,14 @@ int __init reservedpages_count(void)
150 int reservedpages, nid, i; 153 int reservedpages, nid, i;
151 154
152 reservedpages = 0; 155 reservedpages = 0;
153 for_each_online_node(nid) 156 for_each_online_node(nid) {
157 unsigned long flags;
158 pgdat_resize_lock(NODE_DATA(nid), &flags);
154 for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++) 159 for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++)
155 if (PageReserved(nid_page_nr(nid, i))) 160 if (PageReserved(nid_page_nr(nid, i)))
156 reservedpages++; 161 reservedpages++;
162 pgdat_resize_unlock(NODE_DATA(nid), &flags);
163 }
157 164
158 return reservedpages; 165 return reservedpages;
159} 166}
diff --git a/arch/m32r/mm/ioremap.c b/arch/m32r/mm/ioremap.c
index 70c59055c19c..a151849a605e 100644
--- a/arch/m32r/mm/ioremap.c
+++ b/arch/m32r/mm/ioremap.c
@@ -67,7 +67,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
67 if (address >= end) 67 if (address >= end)
68 BUG(); 68 BUG();
69 do { 69 do {
70 pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); 70 pte_t * pte = pte_alloc_kernel(pmd, address);
71 if (!pte) 71 if (!pte)
72 return -ENOMEM; 72 return -ENOMEM;
73 remap_area_pte(pte, address, end - address, address + phys_addr, flags); 73 remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -90,7 +90,6 @@ remap_area_pages(unsigned long address, unsigned long phys_addr,
90 flush_cache_all(); 90 flush_cache_all();
91 if (address >= end) 91 if (address >= end)
92 BUG(); 92 BUG();
93 spin_lock(&init_mm.page_table_lock);
94 do { 93 do {
95 pmd_t *pmd; 94 pmd_t *pmd;
96 pmd = pmd_alloc(&init_mm, dir, address); 95 pmd = pmd_alloc(&init_mm, dir, address);
@@ -104,7 +103,6 @@ remap_area_pages(unsigned long address, unsigned long phys_addr,
104 address = (address + PGDIR_SIZE) & PGDIR_MASK; 103 address = (address + PGDIR_SIZE) & PGDIR_MASK;
105 dir++; 104 dir++;
106 } while (address && (address < end)); 105 } while (address && (address < end));
107 spin_unlock(&init_mm.page_table_lock);
108 flush_tlb_all(); 106 flush_tlb_all();
109 return error; 107 return error;
110} 108}
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index ba960bbc8e6d..1dd5d18b2201 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -388,33 +388,11 @@ config AMIGA_PCMCIA
388 Include support in the kernel for pcmcia on Amiga 1200 and Amiga 388 Include support in the kernel for pcmcia on Amiga 1200 and Amiga
389 600. If you intend to use pcmcia cards say Y; otherwise say N. 389 600. If you intend to use pcmcia cards say Y; otherwise say N.
390 390
391config STRAM_SWAP
392 bool "Support for ST-RAM as swap space"
393 depends on ATARI && BROKEN
394 ---help---
395 Some Atari 68k machines (including the 520STF and 1020STE) divide
396 their addressable memory into ST and TT sections. The TT section
397 (up to 512MB) is the main memory; the ST section (up to 4MB) is
398 accessible to the built-in graphics board, runs slower, and is
399 present mainly for backward compatibility with older machines.
400
401 This enables support for using (parts of) ST-RAM as swap space,
402 instead of as normal system memory. This can first enhance system
403 performance if you have lots of alternate RAM (compared to the size
404 of ST-RAM), because executable code always will reside in faster
405 memory. ST-RAM will remain as ultra-fast swap space. On the other
406 hand, it allows much improved dynamic allocations of ST-RAM buffers
407 for device driver modules (e.g. floppy, ACSI, SLM printer, DMA
408 sound). The probability that such allocations at module load time
409 fail is drastically reduced.
410
411config STRAM_PROC 391config STRAM_PROC
412 bool "ST-RAM statistics in /proc" 392 bool "ST-RAM statistics in /proc"
413 depends on ATARI 393 depends on ATARI
414 help 394 help
415 Say Y here to report ST-RAM usage statistics in /proc/stram. See 395 Say Y here to report ST-RAM usage statistics in /proc/stram.
416 the help for CONFIG_STRAM_SWAP for discussion of ST-RAM and its
417 uses.
418 396
419config HEARTBEAT 397config HEARTBEAT
420 bool "Use power LED as a heartbeat" if AMIGA || APOLLO || ATARI || MAC ||Q40 398 bool "Use power LED as a heartbeat" if AMIGA || APOLLO || ATARI || MAC ||Q40
diff --git a/arch/m68k/atari/stram.c b/arch/m68k/atari/stram.c
index 5a3c106b40c8..22e0481a5f7b 100644
--- a/arch/m68k/atari/stram.c
+++ b/arch/m68k/atari/stram.c
@@ -15,11 +15,9 @@
15#include <linux/kdev_t.h> 15#include <linux/kdev_t.h>
16#include <linux/major.h> 16#include <linux/major.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/swap.h>
19#include <linux/slab.h> 18#include <linux/slab.h>
20#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
21#include <linux/pagemap.h> 20#include <linux/pagemap.h>
22#include <linux/shm.h>
23#include <linux/bootmem.h> 21#include <linux/bootmem.h>
24#include <linux/mount.h> 22#include <linux/mount.h>
25#include <linux/blkdev.h> 23#include <linux/blkdev.h>
@@ -33,8 +31,6 @@
33#include <asm/io.h> 31#include <asm/io.h>
34#include <asm/semaphore.h> 32#include <asm/semaphore.h>
35 33
36#include <linux/swapops.h>
37
38#undef DEBUG 34#undef DEBUG
39 35
40#ifdef DEBUG 36#ifdef DEBUG
@@ -49,8 +45,7 @@
49#include <linux/proc_fs.h> 45#include <linux/proc_fs.h>
50#endif 46#endif
51 47
52/* Pre-swapping comments: 48/*
53 *
54 * ++roman: 49 * ++roman:
55 * 50 *
56 * New version of ST-Ram buffer allocation. Instead of using the 51 * New version of ST-Ram buffer allocation. Instead of using the
@@ -75,76 +70,6 @@
75 * 70 *
76 */ 71 */
77 72
78/*
79 * New Nov 1997: Use ST-RAM as swap space!
80 *
81 * In the past, there were often problems with modules that require ST-RAM
82 * buffers. Such drivers have to use __get_dma_pages(), which unfortunately
83 * often isn't very successful in allocating more than 1 page :-( [1] The net
84 * result was that most of the time you couldn't insmod such modules (ataflop,
85 * ACSI, SCSI on Falcon, Atari internal framebuffer, not to speak of acsi_slm,
86 * which needs a 1 MB buffer... :-).
87 *
88 * To overcome this limitation, ST-RAM can now be turned into a very
89 * high-speed swap space. If a request for an ST-RAM buffer comes, the kernel
90 * now tries to unswap some pages on that swap device to make some free (and
91 * contiguous) space. This works much better in comparison to
92 * __get_dma_pages(), since used swap pages can be selectively freed by either
93 * moving them to somewhere else in swap space, or by reading them back into
94 * system memory. Ok, there operation of unswapping isn't really cheap (for
95 * each page, one has to go through the page tables of all processes), but it
96 * doesn't happen that often (only when allocation ST-RAM, i.e. when loading a
97 * module that needs ST-RAM). But it at least makes it possible to load such
98 * modules!
99 *
100 * It could also be that overall system performance increases a bit due to
101 * ST-RAM swapping, since slow ST-RAM isn't used anymore for holding data or
102 * executing code in. It's then just a (very fast, compared to disk) back
103 * storage for not-so-often needed data. (But this effect must be compared
104 * with the loss of total memory...) Don't know if the effect is already
105 * visible on a TT, where the speed difference between ST- and TT-RAM isn't
106 * that dramatic, but it should on machines where TT-RAM is really much faster
107 * (e.g. Afterburner).
108 *
109 * [1]: __get_free_pages() does a fine job if you only want one page, but if
110 * you want more (contiguous) pages, it can give you such a block only if
111 * there's already a free one. The algorithm can't try to free buffers or swap
112 * out something in order to make more free space, since all that page-freeing
113 * mechanisms work "target-less", i.e. they just free something, but not in a
114 * specific place. I.e., __get_free_pages() can't do anything to free
115 * *adjacent* pages :-( This situation becomes even worse for DMA memory,
116 * since the freeing algorithms are also blind to DMA capability of pages.
117 */
118
119/* 1998-10-20: ++andreas
120 unswap_by_move disabled because it does not handle swapped shm pages.
121*/
122
123/* 2000-05-01: ++andreas
124 Integrated with bootmem. Remove all traces of unswap_by_move.
125*/
126
127#ifdef CONFIG_STRAM_SWAP
128#define ALIGN_IF_SWAP(x) PAGE_ALIGN(x)
129#else
130#define ALIGN_IF_SWAP(x) (x)
131#endif
132
133/* get index of swap page at address 'addr' */
134#define SWAP_NR(addr) (((addr) - swap_start) >> PAGE_SHIFT)
135
136/* get address of swap page #'nr' */
137#define SWAP_ADDR(nr) (swap_start + ((nr) << PAGE_SHIFT))
138
139/* get number of pages for 'n' bytes (already page-aligned) */
140#define N_PAGES(n) ((n) >> PAGE_SHIFT)
141
142/* The following two numbers define the maximum fraction of ST-RAM in total
143 * memory, below that the kernel would automatically use ST-RAM as swap
144 * space. This decision can be overridden with stram_swap= */
145#define MAX_STRAM_FRACTION_NOM 1
146#define MAX_STRAM_FRACTION_DENOM 3
147
148/* Start and end (virtual) of ST-RAM */ 73/* Start and end (virtual) of ST-RAM */
149static void *stram_start, *stram_end; 74static void *stram_start, *stram_end;
150 75
@@ -164,10 +89,9 @@ typedef struct stram_block {
164} BLOCK; 89} BLOCK;
165 90
166/* values for flags field */ 91/* values for flags field */
167#define BLOCK_FREE 0x01 /* free structure in the BLOCKs pool */ 92#define BLOCK_FREE 0x01 /* free structure in the BLOCKs pool */
168#define BLOCK_KMALLOCED 0x02 /* structure allocated by kmalloc() */ 93#define BLOCK_KMALLOCED 0x02 /* structure allocated by kmalloc() */
169#define BLOCK_GFP 0x08 /* block allocated with __get_dma_pages() */ 94#define BLOCK_GFP 0x08 /* block allocated with __get_dma_pages() */
170#define BLOCK_INSWAP 0x10 /* block allocated in swap space */
171 95
172/* list of allocated blocks */ 96/* list of allocated blocks */
173static BLOCK *alloc_list; 97static BLOCK *alloc_list;
@@ -179,60 +103,8 @@ static BLOCK *alloc_list;
179#define N_STATIC_BLOCKS 20 103#define N_STATIC_BLOCKS 20
180static BLOCK static_blocks[N_STATIC_BLOCKS]; 104static BLOCK static_blocks[N_STATIC_BLOCKS];
181 105
182#ifdef CONFIG_STRAM_SWAP
183/* max. number of bytes to use for swapping
184 * 0 = no ST-RAM swapping
185 * -1 = do swapping (to whole ST-RAM) if it's less than MAX_STRAM_FRACTION of
186 * total memory
187 */
188static int max_swap_size = -1;
189
190/* start and end of swapping area */
191static void *swap_start, *swap_end;
192
193/* The ST-RAM's swap info structure */
194static struct swap_info_struct *stram_swap_info;
195
196/* The ST-RAM's swap type */
197static int stram_swap_type;
198
199/* Semaphore for get_stram_region. */
200static DECLARE_MUTEX(stram_swap_sem);
201
202/* major and minor device number of the ST-RAM device; for the major, we use
203 * the same as Amiga z2ram, which is really similar and impossible on Atari,
204 * and for the minor a relatively odd number to avoid the user creating and
205 * using that device. */
206#define STRAM_MAJOR Z2RAM_MAJOR
207#define STRAM_MINOR 13
208
209/* Some impossible pointer value */
210#define MAGIC_FILE_P (struct file *)0xffffdead
211
212#ifdef DO_PROC
213static unsigned stat_swap_read;
214static unsigned stat_swap_write;
215static unsigned stat_swap_force;
216#endif /* DO_PROC */
217
218#endif /* CONFIG_STRAM_SWAP */
219
220/***************************** Prototypes *****************************/ 106/***************************** Prototypes *****************************/
221 107
222#ifdef CONFIG_STRAM_SWAP
223static int swap_init(void *start_mem, void *swap_data);
224static void *get_stram_region( unsigned long n_pages );
225static void free_stram_region( unsigned long offset, unsigned long n_pages
226 );
227static int in_some_region(void *addr);
228static unsigned long find_free_region( unsigned long n_pages, unsigned long
229 *total_free, unsigned long
230 *region_free );
231static void do_stram_request(request_queue_t *);
232static int stram_open( struct inode *inode, struct file *filp );
233static int stram_release( struct inode *inode, struct file *filp );
234static void reserve_region(void *start, void *end);
235#endif
236static BLOCK *add_region( void *addr, unsigned long size ); 108static BLOCK *add_region( void *addr, unsigned long size );
237static BLOCK *find_region( void *addr ); 109static BLOCK *find_region( void *addr );
238static int remove_region( BLOCK *block ); 110static int remove_region( BLOCK *block );
@@ -279,84 +151,11 @@ void __init atari_stram_init(void)
279 */ 151 */
280void __init atari_stram_reserve_pages(void *start_mem) 152void __init atari_stram_reserve_pages(void *start_mem)
281{ 153{
282#ifdef CONFIG_STRAM_SWAP
283 /* if max_swap_size is negative (i.e. no stram_swap= option given),
284 * determine at run time whether to use ST-RAM swapping */
285 if (max_swap_size < 0)
286 /* Use swapping if ST-RAM doesn't make up more than MAX_STRAM_FRACTION
287 * of total memory. In that case, the max. size is set to 16 MB,
288 * because ST-RAM can never be bigger than that.
289 * Also, never use swapping on a Hades, there's no separate ST-RAM in
290 * that machine. */
291 max_swap_size =
292 (!MACH_IS_HADES &&
293 (N_PAGES(stram_end-stram_start)*MAX_STRAM_FRACTION_DENOM <=
294 ((unsigned long)high_memory>>PAGE_SHIFT)*MAX_STRAM_FRACTION_NOM)) ? 16*1024*1024 : 0;
295 DPRINTK( "atari_stram_reserve_pages: max_swap_size = %d\n", max_swap_size );
296#endif
297
298 /* always reserve first page of ST-RAM, the first 2 kB are 154 /* always reserve first page of ST-RAM, the first 2 kB are
299 * supervisor-only! */ 155 * supervisor-only! */
300 if (!kernel_in_stram) 156 if (!kernel_in_stram)
301 reserve_bootmem (0, PAGE_SIZE); 157 reserve_bootmem (0, PAGE_SIZE);
302 158
303#ifdef CONFIG_STRAM_SWAP
304 {
305 void *swap_data;
306
307 start_mem = (void *) PAGE_ALIGN ((unsigned long) start_mem);
308 /* determine first page to use as swap: if the kernel is
309 in TT-RAM, this is the first page of (usable) ST-RAM;
310 otherwise just use the end of kernel data (= start_mem) */
311 swap_start = !kernel_in_stram ? stram_start + PAGE_SIZE : start_mem;
312 /* decrement by one page, rest of kernel assumes that first swap page
313 * is always reserved and maybe doesn't handle swp_entry == 0
314 * correctly */
315 swap_start -= PAGE_SIZE;
316 swap_end = stram_end;
317 if (swap_end-swap_start > max_swap_size)
318 swap_end = swap_start + max_swap_size;
319 DPRINTK( "atari_stram_reserve_pages: swapping enabled; "
320 "swap=%p-%p\n", swap_start, swap_end);
321
322 /* reserve some amount of memory for maintainance of
323 * swapping itself: one page for each 2048 (PAGE_SIZE/2)
324 * swap pages. (2 bytes for each page) */
325 swap_data = start_mem;
326 start_mem += ((SWAP_NR(swap_end) + PAGE_SIZE/2 - 1)
327 >> (PAGE_SHIFT-1)) << PAGE_SHIFT;
328 /* correct swap_start if necessary */
329 if (swap_start + PAGE_SIZE == swap_data)
330 swap_start = start_mem - PAGE_SIZE;
331
332 if (!swap_init( start_mem, swap_data )) {
333 printk( KERN_ERR "ST-RAM swap space initialization failed\n" );
334 max_swap_size = 0;
335 return;
336 }
337 /* reserve region for swapping meta-data */
338 reserve_region(swap_data, start_mem);
339 /* reserve swapping area itself */
340 reserve_region(swap_start + PAGE_SIZE, swap_end);
341
342 /*
343 * If the whole ST-RAM is used for swapping, there are no allocatable
344 * dma pages left. But unfortunately, some shared parts of the kernel
345 * (particularly the SCSI mid-level) call __get_dma_pages()
346 * unconditionally :-( These calls then fail, and scsi.c even doesn't
347 * check for NULL return values and just crashes. The quick fix for
348 * this (instead of doing much clean up work in the SCSI code) is to
349 * pretend all pages are DMA-able by setting mach_max_dma_address to
350 * ULONG_MAX. This doesn't change any functionality so far, since
351 * get_dma_pages() shouldn't be used on Atari anyway anymore (better
352 * use atari_stram_alloc()), and the Atari SCSI drivers don't need DMA
353 * memory. But unfortunately there's now no kind of warning (even not
354 * a NULL return value) if you use get_dma_pages() nevertheless :-(
355 * You just will get non-DMA-able memory...
356 */
357 mach_max_dma_address = 0xffffffff;
358 }
359#endif
360} 159}
361 160
362void atari_stram_mem_init_hook (void) 161void atari_stram_mem_init_hook (void)
@@ -367,7 +166,6 @@ void atari_stram_mem_init_hook (void)
367 166
368/* 167/*
369 * This is main public interface: somehow allocate a ST-RAM block 168 * This is main public interface: somehow allocate a ST-RAM block
370 * There are three strategies:
371 * 169 *
372 * - If we're before mem_init(), we have to make a static allocation. The 170 * - If we're before mem_init(), we have to make a static allocation. The
373 * region is taken in the kernel data area (if the kernel is in ST-RAM) or 171 * region is taken in the kernel data area (if the kernel is in ST-RAM) or
@@ -375,14 +173,9 @@ void atari_stram_mem_init_hook (void)
375 * rsvd_stram_* region. The ST-RAM is somewhere in the middle of kernel 173 * rsvd_stram_* region. The ST-RAM is somewhere in the middle of kernel
376 * address space in the latter case. 174 * address space in the latter case.
377 * 175 *
378 * - If mem_init() already has been called and ST-RAM swapping is enabled, 176 * - If mem_init() already has been called, try with __get_dma_pages().
379 * try to get the memory from the (pseudo) swap-space, either free already 177 * This has the disadvantage that it's very hard to get more than 1 page,
380 * or by moving some other pages out of the swap. 178 * and it is likely to fail :-(
381 *
382 * - If mem_init() already has been called, and ST-RAM swapping is not
383 * enabled, the only possibility is to try with __get_dma_pages(). This has
384 * the disadvantage that it's very hard to get more than 1 page, and it is
385 * likely to fail :-(
386 * 179 *
387 */ 180 */
388void *atari_stram_alloc(long size, const char *owner) 181void *atari_stram_alloc(long size, const char *owner)
@@ -393,27 +186,13 @@ void *atari_stram_alloc(long size, const char *owner)
393 186
394 DPRINTK("atari_stram_alloc(size=%08lx,owner=%s)\n", size, owner); 187 DPRINTK("atari_stram_alloc(size=%08lx,owner=%s)\n", size, owner);
395 188
396 size = ALIGN_IF_SWAP(size);
397 DPRINTK( "atari_stram_alloc: rounded size = %08lx\n", size );
398#ifdef CONFIG_STRAM_SWAP
399 if (max_swap_size) {
400 /* If swapping is active: make some free space in the swap
401 "device". */
402 DPRINTK( "atari_stram_alloc: after mem_init, swapping ok, "
403 "calling get_region\n" );
404 addr = get_stram_region( N_PAGES(size) );
405 flags = BLOCK_INSWAP;
406 }
407 else
408#endif
409 if (!mem_init_done) 189 if (!mem_init_done)
410 return alloc_bootmem_low(size); 190 return alloc_bootmem_low(size);
411 else { 191 else {
412 /* After mem_init() and no swapping: can only resort to 192 /* After mem_init(): can only resort to __get_dma_pages() */
413 * __get_dma_pages() */
414 addr = (void *)__get_dma_pages(GFP_KERNEL, get_order(size)); 193 addr = (void *)__get_dma_pages(GFP_KERNEL, get_order(size));
415 flags = BLOCK_GFP; 194 flags = BLOCK_GFP;
416 DPRINTK( "atari_stram_alloc: after mem_init, swapping off, " 195 DPRINTK( "atari_stram_alloc: after mem_init, "
417 "get_pages=%p\n", addr ); 196 "get_pages=%p\n", addr );
418 } 197 }
419 198
@@ -422,12 +201,7 @@ void *atari_stram_alloc(long size, const char *owner)
422 /* out of memory for BLOCK structure :-( */ 201 /* out of memory for BLOCK structure :-( */
423 DPRINTK( "atari_stram_alloc: out of mem for BLOCK -- " 202 DPRINTK( "atari_stram_alloc: out of mem for BLOCK -- "
424 "freeing again\n" ); 203 "freeing again\n" );
425#ifdef CONFIG_STRAM_SWAP 204 free_pages((unsigned long)addr, get_order(size));
426 if (flags == BLOCK_INSWAP)
427 free_stram_region( SWAP_NR(addr), N_PAGES(size) );
428 else
429#endif
430 free_pages((unsigned long)addr, get_order(size));
431 return( NULL ); 205 return( NULL );
432 } 206 }
433 block->owner = owner; 207 block->owner = owner;
@@ -451,25 +225,12 @@ void atari_stram_free( void *addr )
451 DPRINTK( "atari_stram_free: found block (%p): size=%08lx, owner=%s, " 225 DPRINTK( "atari_stram_free: found block (%p): size=%08lx, owner=%s, "
452 "flags=%02x\n", block, block->size, block->owner, block->flags ); 226 "flags=%02x\n", block, block->size, block->owner, block->flags );
453 227
454#ifdef CONFIG_STRAM_SWAP 228 if (!(block->flags & BLOCK_GFP))
455 if (!max_swap_size) {
456#endif
457 if (block->flags & BLOCK_GFP) {
458 DPRINTK("atari_stram_free: is kmalloced, order_size=%d\n",
459 get_order(block->size));
460 free_pages((unsigned long)addr, get_order(block->size));
461 }
462 else
463 goto fail;
464#ifdef CONFIG_STRAM_SWAP
465 }
466 else if (block->flags & BLOCK_INSWAP) {
467 DPRINTK( "atari_stram_free: is swap-alloced\n" );
468 free_stram_region( SWAP_NR(block->start), N_PAGES(block->size) );
469 }
470 else
471 goto fail; 229 goto fail;
472#endif 230
231 DPRINTK("atari_stram_free: is kmalloced, order_size=%d\n",
232 get_order(block->size));
233 free_pages((unsigned long)addr, get_order(block->size));
473 remove_region( block ); 234 remove_region( block );
474 return; 235 return;
475 236
@@ -478,612 +239,6 @@ void atari_stram_free( void *addr )
478 "(called from %p)\n", addr, __builtin_return_address(0) ); 239 "(called from %p)\n", addr, __builtin_return_address(0) );
479} 240}
480 241
481
482#ifdef CONFIG_STRAM_SWAP
483
484
485/* ------------------------------------------------------------------------ */
486/* Main Swapping Functions */
487/* ------------------------------------------------------------------------ */
488
489
490/*
491 * Initialize ST-RAM swap device
492 * (lots copied and modified from sys_swapon() in mm/swapfile.c)
493 */
494static int __init swap_init(void *start_mem, void *swap_data)
495{
496 static struct dentry fake_dentry;
497 static struct vfsmount fake_vfsmnt;
498 struct swap_info_struct *p;
499 struct inode swap_inode;
500 unsigned int type;
501 void *addr;
502 int i, j, k, prev;
503
504 DPRINTK("swap_init(start_mem=%p, swap_data=%p)\n",
505 start_mem, swap_data);
506
507 /* need at least one page for swapping to (and this also isn't very
508 * much... :-) */
509 if (swap_end - swap_start < 2*PAGE_SIZE) {
510 printk( KERN_WARNING "stram_swap_init: swap space too small\n" );
511 return( 0 );
512 }
513
514 /* find free slot in swap_info */
515 for( p = swap_info, type = 0; type < nr_swapfiles; type++, p++ )
516 if (!(p->flags & SWP_USED))
517 break;
518 if (type >= MAX_SWAPFILES) {
519 printk( KERN_WARNING "stram_swap_init: max. number of "
520 "swap devices exhausted\n" );
521 return( 0 );
522 }
523 if (type >= nr_swapfiles)
524 nr_swapfiles = type+1;
525
526 stram_swap_info = p;
527 stram_swap_type = type;
528
529 /* fake some dir cache entries to give us some name in /dev/swaps */
530 fake_dentry.d_parent = &fake_dentry;
531 fake_dentry.d_name.name = "stram (internal)";
532 fake_dentry.d_name.len = 16;
533 fake_vfsmnt.mnt_parent = &fake_vfsmnt;
534
535 p->flags = SWP_USED;
536 p->swap_file = &fake_dentry;
537 p->swap_vfsmnt = &fake_vfsmnt;
538 p->swap_map = swap_data;
539 p->cluster_nr = 0;
540 p->next = -1;
541 p->prio = 0x7ff0; /* a rather high priority, but not the higest
542 * to give the user a chance to override */
543
544 /* call stram_open() directly, avoids at least the overhead in
545 * constructing a dummy file structure... */
546 swap_inode.i_rdev = MKDEV( STRAM_MAJOR, STRAM_MINOR );
547 stram_open( &swap_inode, MAGIC_FILE_P );
548 p->max = SWAP_NR(swap_end);
549
550 /* initialize swap_map: set regions that are already allocated or belong
551 * to kernel data space to SWAP_MAP_BAD, otherwise to free */
552 j = 0; /* # of free pages */
553 k = 0; /* # of already allocated pages (from pre-mem_init stram_alloc()) */
554 p->lowest_bit = 0;
555 p->highest_bit = 0;
556 for( i = 1, addr = SWAP_ADDR(1); i < p->max;
557 i++, addr += PAGE_SIZE ) {
558 if (in_some_region( addr )) {
559 p->swap_map[i] = SWAP_MAP_BAD;
560 ++k;
561 }
562 else if (kernel_in_stram && addr < start_mem ) {
563 p->swap_map[i] = SWAP_MAP_BAD;
564 }
565 else {
566 p->swap_map[i] = 0;
567 ++j;
568 if (!p->lowest_bit) p->lowest_bit = i;
569 p->highest_bit = i;
570 }
571 }
572 /* first page always reserved (and doesn't really belong to swap space) */
573 p->swap_map[0] = SWAP_MAP_BAD;
574
575 /* now swapping to this device ok */
576 p->pages = j + k;
577 swap_list_lock();
578 nr_swap_pages += j;
579 p->flags = SWP_WRITEOK;
580
581 /* insert swap space into swap_list */
582 prev = -1;
583 for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
584 if (p->prio >= swap_info[i].prio) {
585 break;
586 }
587 prev = i;
588 }
589 p->next = i;
590 if (prev < 0) {
591 swap_list.head = swap_list.next = p - swap_info;
592 } else {
593 swap_info[prev].next = p - swap_info;
594 }
595 swap_list_unlock();
596
597 printk( KERN_INFO "Using %dk (%d pages) of ST-RAM as swap space.\n",
598 p->pages << 2, p->pages );
599 return( 1 );
600}
601
602
603/*
604 * The swap entry has been read in advance, and we return 1 to indicate
605 * that the page has been used or is no longer needed.
606 *
607 * Always set the resulting pte to be nowrite (the same as COW pages
608 * after one process has exited). We don't know just how many PTEs will
609 * share this swap entry, so be cautious and let do_wp_page work out
610 * what to do if a write is requested later.
611 */
612static inline void unswap_pte(struct vm_area_struct * vma, unsigned long
613 address, pte_t *dir, swp_entry_t entry,
614 struct page *page)
615{
616 pte_t pte = *dir;
617
618 if (pte_none(pte))
619 return;
620 if (pte_present(pte)) {
621 /* If this entry is swap-cached, then page must already
622 hold the right address for any copies in physical
623 memory */
624 if (pte_page(pte) != page)
625 return;
626 /* We will be removing the swap cache in a moment, so... */
627 set_pte(dir, pte_mkdirty(pte));
628 return;
629 }
630 if (pte_val(pte) != entry.val)
631 return;
632
633 DPRINTK("unswap_pte: replacing entry %08lx by new page %p",
634 entry.val, page);
635 set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
636 swap_free(entry);
637 get_page(page);
638 inc_mm_counter(vma->vm_mm, rss);
639}
640
641static inline void unswap_pmd(struct vm_area_struct * vma, pmd_t *dir,
642 unsigned long address, unsigned long size,
643 unsigned long offset, swp_entry_t entry,
644 struct page *page)
645{
646 pte_t * pte;
647 unsigned long end;
648
649 if (pmd_none(*dir))
650 return;
651 if (pmd_bad(*dir)) {
652 pmd_ERROR(*dir);
653 pmd_clear(dir);
654 return;
655 }
656 pte = pte_offset_kernel(dir, address);
657 offset += address & PMD_MASK;
658 address &= ~PMD_MASK;
659 end = address + size;
660 if (end > PMD_SIZE)
661 end = PMD_SIZE;
662 do {
663 unswap_pte(vma, offset+address-vma->vm_start, pte, entry, page);
664 address += PAGE_SIZE;
665 pte++;
666 } while (address < end);
667}
668
669static inline void unswap_pgd(struct vm_area_struct * vma, pgd_t *dir,
670 unsigned long address, unsigned long size,
671 swp_entry_t entry, struct page *page)
672{
673 pmd_t * pmd;
674 unsigned long offset, end;
675
676 if (pgd_none(*dir))
677 return;
678 if (pgd_bad(*dir)) {
679 pgd_ERROR(*dir);
680 pgd_clear(dir);
681 return;
682 }
683 pmd = pmd_offset(dir, address);
684 offset = address & PGDIR_MASK;
685 address &= ~PGDIR_MASK;
686 end = address + size;
687 if (end > PGDIR_SIZE)
688 end = PGDIR_SIZE;
689 do {
690 unswap_pmd(vma, pmd, address, end - address, offset, entry,
691 page);
692 address = (address + PMD_SIZE) & PMD_MASK;
693 pmd++;
694 } while (address < end);
695}
696
697static void unswap_vma(struct vm_area_struct * vma, pgd_t *pgdir,
698 swp_entry_t entry, struct page *page)
699{
700 unsigned long start = vma->vm_start, end = vma->vm_end;
701
702 do {
703 unswap_pgd(vma, pgdir, start, end - start, entry, page);
704 start = (start + PGDIR_SIZE) & PGDIR_MASK;
705 pgdir++;
706 } while (start < end);
707}
708
709static void unswap_process(struct mm_struct * mm, swp_entry_t entry,
710 struct page *page)
711{
712 struct vm_area_struct* vma;
713
714 /*
715 * Go through process' page directory.
716 */
717 if (!mm)
718 return;
719 for (vma = mm->mmap; vma; vma = vma->vm_next) {
720 pgd_t * pgd = pgd_offset(mm, vma->vm_start);
721 unswap_vma(vma, pgd, entry, page);
722 }
723}
724
725
726static int unswap_by_read(unsigned short *map, unsigned long max,
727 unsigned long start, unsigned long n_pages)
728{
729 struct task_struct *p;
730 struct page *page;
731 swp_entry_t entry;
732 unsigned long i;
733
734 DPRINTK( "unswapping %lu..%lu by reading in\n",
735 start, start+n_pages-1 );
736
737 for( i = start; i < start+n_pages; ++i ) {
738 if (map[i] == SWAP_MAP_BAD) {
739 printk( KERN_ERR "get_stram_region: page %lu already "
740 "reserved??\n", i );
741 continue;
742 }
743
744 if (map[i]) {
745 entry = swp_entry(stram_swap_type, i);
746 DPRINTK("unswap: map[i=%lu]=%u nr_swap=%ld\n",
747 i, map[i], nr_swap_pages);
748
749 swap_device_lock(stram_swap_info);
750 map[i]++;
751 swap_device_unlock(stram_swap_info);
752 /* Get a page for the entry, using the existing
753 swap cache page if there is one. Otherwise,
754 get a clean page and read the swap into it. */
755 page = read_swap_cache_async(entry, NULL, 0);
756 if (!page) {
757 swap_free(entry);
758 return -ENOMEM;
759 }
760 read_lock(&tasklist_lock);
761 for_each_process(p)
762 unswap_process(p->mm, entry, page);
763 read_unlock(&tasklist_lock);
764 shmem_unuse(entry, page);
765 /* Now get rid of the extra reference to the
766 temporary page we've been using. */
767 if (PageSwapCache(page))
768 delete_from_swap_cache(page);
769 __free_page(page);
770 #ifdef DO_PROC
771 stat_swap_force++;
772 #endif
773 }
774
775 DPRINTK( "unswap: map[i=%lu]=%u nr_swap=%ld\n",
776 i, map[i], nr_swap_pages );
777 swap_list_lock();
778 swap_device_lock(stram_swap_info);
779 map[i] = SWAP_MAP_BAD;
780 if (stram_swap_info->lowest_bit == i)
781 stram_swap_info->lowest_bit++;
782 if (stram_swap_info->highest_bit == i)
783 stram_swap_info->highest_bit--;
784 --nr_swap_pages;
785 swap_device_unlock(stram_swap_info);
786 swap_list_unlock();
787 }
788
789 return 0;
790}
791
792/*
793 * reserve a region in ST-RAM swap space for an allocation
794 */
795static void *get_stram_region( unsigned long n_pages )
796{
797 unsigned short *map = stram_swap_info->swap_map;
798 unsigned long max = stram_swap_info->max;
799 unsigned long start, total_free, region_free;
800 int err;
801 void *ret = NULL;
802
803 DPRINTK( "get_stram_region(n_pages=%lu)\n", n_pages );
804
805 down(&stram_swap_sem);
806
807 /* disallow writing to the swap device now */
808 stram_swap_info->flags = SWP_USED;
809
810 /* find a region of n_pages pages in the swap space including as much free
811 * pages as possible (and excluding any already-reserved pages). */
812 if (!(start = find_free_region( n_pages, &total_free, &region_free )))
813 goto end;
814 DPRINTK( "get_stram_region: region starts at %lu, has %lu free pages\n",
815 start, region_free );
816
817 err = unswap_by_read(map, max, start, n_pages);
818 if (err)
819 goto end;
820
821 ret = SWAP_ADDR(start);
822 end:
823 /* allow using swap device again */
824 stram_swap_info->flags = SWP_WRITEOK;
825 up(&stram_swap_sem);
826 DPRINTK( "get_stram_region: returning %p\n", ret );
827 return( ret );
828}
829
830
831/*
832 * free a reserved region in ST-RAM swap space
833 */
834static void free_stram_region( unsigned long offset, unsigned long n_pages )
835{
836 unsigned short *map = stram_swap_info->swap_map;
837
838 DPRINTK( "free_stram_region(offset=%lu,n_pages=%lu)\n", offset, n_pages );
839
840 if (offset < 1 || offset + n_pages > stram_swap_info->max) {
841 printk( KERN_ERR "free_stram_region: Trying to free non-ST-RAM\n" );
842 return;
843 }
844
845 swap_list_lock();
846 swap_device_lock(stram_swap_info);
847 /* un-reserve the freed pages */
848 for( ; n_pages > 0; ++offset, --n_pages ) {
849 if (map[offset] != SWAP_MAP_BAD)
850 printk( KERN_ERR "free_stram_region: Swap page %lu was not "
851 "reserved\n", offset );
852 map[offset] = 0;
853 }
854
855 /* update swapping meta-data */
856 if (offset < stram_swap_info->lowest_bit)
857 stram_swap_info->lowest_bit = offset;
858 if (offset+n_pages-1 > stram_swap_info->highest_bit)
859 stram_swap_info->highest_bit = offset+n_pages-1;
860 if (stram_swap_info->prio > swap_info[swap_list.next].prio)
861 swap_list.next = swap_list.head;
862 nr_swap_pages += n_pages;
863 swap_device_unlock(stram_swap_info);
864 swap_list_unlock();
865}
866
867
868/* ------------------------------------------------------------------------ */
869/* Utility Functions for Swapping */
870/* ------------------------------------------------------------------------ */
871
872
873/* is addr in some of the allocated regions? */
874static int in_some_region(void *addr)
875{
876 BLOCK *p;
877
878 for( p = alloc_list; p; p = p->next ) {
879 if (p->start <= addr && addr < p->start + p->size)
880 return( 1 );
881 }
882 return( 0 );
883}
884
885
886static unsigned long find_free_region(unsigned long n_pages,
887 unsigned long *total_free,
888 unsigned long *region_free)
889{
890 unsigned short *map = stram_swap_info->swap_map;
891 unsigned long max = stram_swap_info->max;
892 unsigned long head, tail, max_start;
893 long nfree, max_free;
894
895 /* first scan the swap space for a suitable place for the allocation */
896 head = 1;
897 max_start = 0;
898 max_free = -1;
899 *total_free = 0;
900
901 start_over:
902 /* increment tail until final window size reached, and count free pages */
903 nfree = 0;
904 for( tail = head; tail-head < n_pages && tail < max; ++tail ) {
905 if (map[tail] == SWAP_MAP_BAD) {
906 head = tail+1;
907 goto start_over;
908 }
909 if (!map[tail]) {
910 ++nfree;
911 ++*total_free;
912 }
913 }
914 if (tail-head < n_pages)
915 goto out;
916 if (nfree > max_free) {
917 max_start = head;
918 max_free = nfree;
919 if (max_free >= n_pages)
920 /* don't need more free pages... :-) */
921 goto out;
922 }
923
924 /* now shift the window and look for the area where as much pages as
925 * possible are free */
926 while( tail < max ) {
927 nfree -= (map[head++] == 0);
928 if (map[tail] == SWAP_MAP_BAD) {
929 head = tail+1;
930 goto start_over;
931 }
932 if (!map[tail]) {
933 ++nfree;
934 ++*total_free;
935 }
936 ++tail;
937 if (nfree > max_free) {
938 max_start = head;
939 max_free = nfree;
940 if (max_free >= n_pages)
941 /* don't need more free pages... :-) */
942 goto out;
943 }
944 }
945
946 out:
947 if (max_free < 0) {
948 printk( KERN_NOTICE "get_stram_region: ST-RAM too full or fragmented "
949 "-- can't allocate %lu pages\n", n_pages );
950 return( 0 );
951 }
952
953 *region_free = max_free;
954 return( max_start );
955}
956
957
958/* setup parameters from command line */
959void __init stram_swap_setup(char *str, int *ints)
960{
961 if (ints[0] >= 1)
962 max_swap_size = ((ints[1] < 0 ? 0 : ints[1]) * 1024) & PAGE_MASK;
963}
964
965
966/* ------------------------------------------------------------------------ */
967/* ST-RAM device */
968/* ------------------------------------------------------------------------ */
969
970static int refcnt;
971
972static void do_stram_request(request_queue_t *q)
973{
974 struct request *req;
975
976 while ((req = elv_next_request(q)) != NULL) {
977 void *start = swap_start + (req->sector << 9);
978 unsigned long len = req->current_nr_sectors << 9;
979 if ((start + len) > swap_end) {
980 printk( KERN_ERR "stram: bad access beyond end of device: "
981 "block=%ld, count=%d\n",
982 req->sector,
983 req->current_nr_sectors );
984 end_request(req, 0);
985 continue;
986 }
987
988 if (req->cmd == READ) {
989 memcpy(req->buffer, start, len);
990#ifdef DO_PROC
991 stat_swap_read += N_PAGES(len);
992#endif
993 }
994 else {
995 memcpy(start, req->buffer, len);
996#ifdef DO_PROC
997 stat_swap_write += N_PAGES(len);
998#endif
999 }
1000 end_request(req, 1);
1001 }
1002}
1003
1004
1005static int stram_open( struct inode *inode, struct file *filp )
1006{
1007 if (filp != MAGIC_FILE_P) {
1008 printk( KERN_NOTICE "Only kernel can open ST-RAM device\n" );
1009 return( -EPERM );
1010 }
1011 if (refcnt)
1012 return( -EBUSY );
1013 ++refcnt;
1014 return( 0 );
1015}
1016
1017static int stram_release( struct inode *inode, struct file *filp )
1018{
1019 if (filp != MAGIC_FILE_P) {
1020 printk( KERN_NOTICE "Only kernel can close ST-RAM device\n" );
1021 return( -EPERM );
1022 }
1023 if (refcnt > 0)
1024 --refcnt;
1025 return( 0 );
1026}
1027
1028
1029static struct block_device_operations stram_fops = {
1030 .open = stram_open,
1031 .release = stram_release,
1032};
1033
1034static struct gendisk *stram_disk;
1035static struct request_queue *stram_queue;
1036static DEFINE_SPINLOCK(stram_lock);
1037
1038int __init stram_device_init(void)
1039{
1040 if (!MACH_IS_ATARI)
1041 /* no point in initializing this, I hope */
1042 return -ENXIO;
1043
1044 if (!max_swap_size)
1045 /* swapping not enabled */
1046 return -ENXIO;
1047 stram_disk = alloc_disk(1);
1048 if (!stram_disk)
1049 return -ENOMEM;
1050
1051 if (register_blkdev(STRAM_MAJOR, "stram")) {
1052 put_disk(stram_disk);
1053 return -ENXIO;
1054 }
1055
1056 stram_queue = blk_init_queue(do_stram_request, &stram_lock);
1057 if (!stram_queue) {
1058 unregister_blkdev(STRAM_MAJOR, "stram");
1059 put_disk(stram_disk);
1060 return -ENOMEM;
1061 }
1062
1063 stram_disk->major = STRAM_MAJOR;
1064 stram_disk->first_minor = STRAM_MINOR;
1065 stram_disk->fops = &stram_fops;
1066 stram_disk->queue = stram_queue;
1067 sprintf(stram_disk->disk_name, "stram");
1068 set_capacity(stram_disk, (swap_end - swap_start)/512);
1069 add_disk(stram_disk);
1070 return 0;
1071}
1072
1073
1074
1075/* ------------------------------------------------------------------------ */
1076/* Misc Utility Functions */
1077/* ------------------------------------------------------------------------ */
1078
1079/* reserve a range of pages */
1080static void reserve_region(void *start, void *end)
1081{
1082 reserve_bootmem (virt_to_phys(start), end - start);
1083}
1084
1085#endif /* CONFIG_STRAM_SWAP */
1086
1087 242
1088/* ------------------------------------------------------------------------ */ 243/* ------------------------------------------------------------------------ */
1089/* Region Management */ 244/* Region Management */
@@ -1173,50 +328,9 @@ int get_stram_list( char *buf )
1173{ 328{
1174 int len = 0; 329 int len = 0;
1175 BLOCK *p; 330 BLOCK *p;
1176#ifdef CONFIG_STRAM_SWAP
1177 int i;
1178 unsigned short *map = stram_swap_info->swap_map;
1179 unsigned long max = stram_swap_info->max;
1180 unsigned free = 0, used = 0, rsvd = 0;
1181#endif
1182 331
1183#ifdef CONFIG_STRAM_SWAP 332 PRINT_PROC("Total ST-RAM: %8u kB\n",
1184 if (max_swap_size) {
1185 for( i = 1; i < max; ++i ) {
1186 if (!map[i])
1187 ++free;
1188 else if (map[i] == SWAP_MAP_BAD)
1189 ++rsvd;
1190 else
1191 ++used;
1192 }
1193 PRINT_PROC(
1194 "Total ST-RAM: %8u kB\n"
1195 "Total ST-RAM swap: %8lu kB\n"
1196 "Free swap: %8u kB\n"
1197 "Used swap: %8u kB\n"
1198 "Allocated swap: %8u kB\n"
1199 "Swap Reads: %8u\n"
1200 "Swap Writes: %8u\n"
1201 "Swap Forced Reads: %8u\n",
1202 (stram_end - stram_start) >> 10,
1203 (max-1) << (PAGE_SHIFT-10),
1204 free << (PAGE_SHIFT-10),
1205 used << (PAGE_SHIFT-10),
1206 rsvd << (PAGE_SHIFT-10),
1207 stat_swap_read,
1208 stat_swap_write,
1209 stat_swap_force );
1210 }
1211 else {
1212#endif
1213 PRINT_PROC( "ST-RAM swapping disabled\n" );
1214 PRINT_PROC("Total ST-RAM: %8u kB\n",
1215 (stram_end - stram_start) >> 10); 333 (stram_end - stram_start) >> 10);
1216#ifdef CONFIG_STRAM_SWAP
1217 }
1218#endif
1219
1220 PRINT_PROC( "Allocated regions:\n" ); 334 PRINT_PROC( "Allocated regions:\n" );
1221 for( p = alloc_list; p; p = p->next ) { 335 for( p = alloc_list; p; p = p->next ) {
1222 if (len + 50 >= PAGE_SIZE) 336 if (len + 50 >= PAGE_SIZE)
@@ -1227,8 +341,6 @@ int get_stram_list( char *buf )
1227 p->owner); 341 p->owner);
1228 if (p->flags & BLOCK_GFP) 342 if (p->flags & BLOCK_GFP)
1229 PRINT_PROC( "page-alloced)\n" ); 343 PRINT_PROC( "page-alloced)\n" );
1230 else if (p->flags & BLOCK_INSWAP)
1231 PRINT_PROC( "in swap)\n" );
1232 else 344 else
1233 PRINT_PROC( "??)\n" ); 345 PRINT_PROC( "??)\n" );
1234 } 346 }
diff --git a/arch/m68k/mm/kmap.c b/arch/m68k/mm/kmap.c
index 5dcb3fa35ea9..fe2383e36b06 100644
--- a/arch/m68k/mm/kmap.c
+++ b/arch/m68k/mm/kmap.c
@@ -201,7 +201,7 @@ void *__ioremap(unsigned long physaddr, unsigned long size, int cacheflag)
201 virtaddr += PTRTREESIZE; 201 virtaddr += PTRTREESIZE;
202 size -= PTRTREESIZE; 202 size -= PTRTREESIZE;
203 } else { 203 } else {
204 pte_dir = pte_alloc_kernel(&init_mm, pmd_dir, virtaddr); 204 pte_dir = pte_alloc_kernel(pmd_dir, virtaddr);
205 if (!pte_dir) { 205 if (!pte_dir) {
206 printk("ioremap: no mem for pte_dir\n"); 206 printk("ioremap: no mem for pte_dir\n");
207 return NULL; 207 return NULL;
diff --git a/arch/m68k/sun3x/dvma.c b/arch/m68k/sun3x/dvma.c
index 32e55adfeb8e..117481e86305 100644
--- a/arch/m68k/sun3x/dvma.c
+++ b/arch/m68k/sun3x/dvma.c
@@ -116,7 +116,7 @@ inline int dvma_map_cpu(unsigned long kaddr,
116 pte_t *pte; 116 pte_t *pte;
117 unsigned long end3; 117 unsigned long end3;
118 118
119 if((pte = pte_alloc_kernel(&init_mm, pmd, vaddr)) == NULL) { 119 if((pte = pte_alloc_kernel(pmd, vaddr)) == NULL) {
120 ret = -ENOMEM; 120 ret = -ENOMEM;
121 goto out; 121 goto out;
122 } 122 }
diff --git a/arch/mips/kernel/irixelf.c b/arch/mips/kernel/irixelf.c
index 99262fe64560..7ce34d4aa220 100644
--- a/arch/mips/kernel/irixelf.c
+++ b/arch/mips/kernel/irixelf.c
@@ -697,7 +697,6 @@ static int load_irix_binary(struct linux_binprm * bprm, struct pt_regs * regs)
697 /* Do this so that we can load the interpreter, if need be. We will 697 /* Do this so that we can load the interpreter, if need be. We will
698 * change some of these later. 698 * change some of these later.
699 */ 699 */
700 set_mm_counter(current->mm, rss, 0);
701 setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); 700 setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
702 current->mm->start_stack = bprm->p; 701 current->mm->start_stack = bprm->p;
703 702
diff --git a/arch/mips/mm/ioremap.c b/arch/mips/mm/ioremap.c
index 9c44ca70befa..3101d1db5592 100644
--- a/arch/mips/mm/ioremap.c
+++ b/arch/mips/mm/ioremap.c
@@ -55,7 +55,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address,
55 if (address >= end) 55 if (address >= end)
56 BUG(); 56 BUG();
57 do { 57 do {
58 pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); 58 pte_t * pte = pte_alloc_kernel(pmd, address);
59 if (!pte) 59 if (!pte)
60 return -ENOMEM; 60 return -ENOMEM;
61 remap_area_pte(pte, address, end - address, address + phys_addr, flags); 61 remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -77,7 +77,6 @@ static int remap_area_pages(unsigned long address, phys_t phys_addr,
77 flush_cache_all(); 77 flush_cache_all();
78 if (address >= end) 78 if (address >= end)
79 BUG(); 79 BUG();
80 spin_lock(&init_mm.page_table_lock);
81 do { 80 do {
82 pud_t *pud; 81 pud_t *pud;
83 pmd_t *pmd; 82 pmd_t *pmd;
@@ -96,7 +95,6 @@ static int remap_area_pages(unsigned long address, phys_t phys_addr,
96 address = (address + PGDIR_SIZE) & PGDIR_MASK; 95 address = (address + PGDIR_SIZE) & PGDIR_MASK;
97 dir++; 96 dir++;
98 } while (address && (address < end)); 97 } while (address && (address < end));
99 spin_unlock(&init_mm.page_table_lock);
100 flush_tlb_all(); 98 flush_tlb_all();
101 return error; 99 return error;
102} 100}
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index e15f09eaed12..a065349aee37 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -270,7 +270,6 @@ void flush_dcache_page(struct page *page)
270 unsigned long offset; 270 unsigned long offset;
271 unsigned long addr; 271 unsigned long addr;
272 pgoff_t pgoff; 272 pgoff_t pgoff;
273 pte_t *pte;
274 unsigned long pfn = page_to_pfn(page); 273 unsigned long pfn = page_to_pfn(page);
275 274
276 275
@@ -301,21 +300,16 @@ void flush_dcache_page(struct page *page)
301 * taking a page fault if the pte doesn't exist. 300 * taking a page fault if the pte doesn't exist.
302 * This is just for speed. If the page translation 301 * This is just for speed. If the page translation
303 * isn't there, there's no point exciting the 302 * isn't there, there's no point exciting the
304 * nadtlb handler into a nullification frenzy */ 303 * nadtlb handler into a nullification frenzy.
305 304 *
306 305 * Make sure we really have this page: the private
307 if(!(pte = translation_exists(mpnt, addr)))
308 continue;
309
310 /* make sure we really have this page: the private
311 * mappings may cover this area but have COW'd this 306 * mappings may cover this area but have COW'd this
312 * particular page */ 307 * particular page.
313 if(pte_pfn(*pte) != pfn) 308 */
314 continue; 309 if (translation_exists(mpnt, addr, pfn)) {
315 310 __flush_cache_page(mpnt, addr);
316 __flush_cache_page(mpnt, addr); 311 break;
317 312 }
318 break;
319 } 313 }
320 flush_dcache_mmap_unlock(mapping); 314 flush_dcache_mmap_unlock(mapping);
321} 315}
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index ae6213d71670..f94a02ef3d95 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@@ -114,7 +114,7 @@ static inline int map_pmd_uncached(pmd_t * pmd, unsigned long vaddr,
114 if (end > PGDIR_SIZE) 114 if (end > PGDIR_SIZE)
115 end = PGDIR_SIZE; 115 end = PGDIR_SIZE;
116 do { 116 do {
117 pte_t * pte = pte_alloc_kernel(&init_mm, pmd, vaddr); 117 pte_t * pte = pte_alloc_kernel(pmd, vaddr);
118 if (!pte) 118 if (!pte)
119 return -ENOMEM; 119 return -ENOMEM;
120 if (map_pte_uncached(pte, orig_vaddr, end - vaddr, paddr_ptr)) 120 if (map_pte_uncached(pte, orig_vaddr, end - vaddr, paddr_ptr))
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 2886ad70db48..29b998e430e6 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -505,7 +505,9 @@ void show_mem(void)
505 505
506 for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { 506 for (j = node_start_pfn(i); j < node_end_pfn(i); j++) {
507 struct page *p; 507 struct page *p;
508 unsigned long flags;
508 509
510 pgdat_resize_lock(NODE_DATA(i), &flags);
509 p = nid_page_nr(i, j) - node_start_pfn(i); 511 p = nid_page_nr(i, j) - node_start_pfn(i);
510 512
511 total++; 513 total++;
@@ -517,6 +519,7 @@ void show_mem(void)
517 free++; 519 free++;
518 else 520 else
519 shared += page_count(p) - 1; 521 shared += page_count(p) - 1;
522 pgdat_resize_unlock(NODE_DATA(i), &flags);
520 } 523 }
521 } 524 }
522#endif 525#endif
diff --git a/arch/parisc/mm/ioremap.c b/arch/parisc/mm/ioremap.c
index f2df502cdae3..5c7a1b3b9326 100644
--- a/arch/parisc/mm/ioremap.c
+++ b/arch/parisc/mm/ioremap.c
@@ -52,7 +52,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
52 if (address >= end) 52 if (address >= end)
53 BUG(); 53 BUG();
54 do { 54 do {
55 pte_t * pte = pte_alloc_kernel(NULL, pmd, address); 55 pte_t * pte = pte_alloc_kernel(pmd, address);
56 if (!pte) 56 if (!pte)
57 return -ENOMEM; 57 return -ENOMEM;
58 remap_area_pte(pte, address, end - address, address + phys_addr, flags); 58 remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -75,10 +75,9 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
75 flush_cache_all(); 75 flush_cache_all();
76 if (address >= end) 76 if (address >= end)
77 BUG(); 77 BUG();
78 spin_lock(&init_mm.page_table_lock);
79 do { 78 do {
80 pmd_t *pmd; 79 pmd_t *pmd;
81 pmd = pmd_alloc(dir, address); 80 pmd = pmd_alloc(&init_mm, dir, address);
82 error = -ENOMEM; 81 error = -ENOMEM;
83 if (!pmd) 82 if (!pmd)
84 break; 83 break;
@@ -89,7 +88,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
89 address = (address + PGDIR_SIZE) & PGDIR_MASK; 88 address = (address + PGDIR_SIZE) & PGDIR_MASK;
90 dir++; 89 dir++;
91 } while (address && (address < end)); 90 } while (address && (address < end));
92 spin_unlock(&init_mm.page_table_lock);
93 flush_tlb_all(); 91 flush_tlb_all();
94 return error; 92 return error;
95} 93}
diff --git a/arch/ppc/kernel/dma-mapping.c b/arch/ppc/kernel/dma-mapping.c
index 0f710d2baec6..685fd0defe23 100644
--- a/arch/ppc/kernel/dma-mapping.c
+++ b/arch/ppc/kernel/dma-mapping.c
@@ -335,8 +335,6 @@ static int __init dma_alloc_init(void)
335 pte_t *pte; 335 pte_t *pte;
336 int ret = 0; 336 int ret = 0;
337 337
338 spin_lock(&init_mm.page_table_lock);
339
340 do { 338 do {
341 pgd = pgd_offset(&init_mm, CONSISTENT_BASE); 339 pgd = pgd_offset(&init_mm, CONSISTENT_BASE);
342 pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE); 340 pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE);
@@ -347,7 +345,7 @@ static int __init dma_alloc_init(void)
347 } 345 }
348 WARN_ON(!pmd_none(*pmd)); 346 WARN_ON(!pmd_none(*pmd));
349 347
350 pte = pte_alloc_kernel(&init_mm, pmd, CONSISTENT_BASE); 348 pte = pte_alloc_kernel(pmd, CONSISTENT_BASE);
351 if (!pte) { 349 if (!pte) {
352 printk(KERN_ERR "%s: no pte tables\n", __func__); 350 printk(KERN_ERR "%s: no pte tables\n", __func__);
353 ret = -ENOMEM; 351 ret = -ENOMEM;
@@ -357,8 +355,6 @@ static int __init dma_alloc_init(void)
357 consistent_pte = pte; 355 consistent_pte = pte;
358 } while (0); 356 } while (0);
359 357
360 spin_unlock(&init_mm.page_table_lock);
361
362 return ret; 358 return ret;
363} 359}
364 360
diff --git a/arch/ppc/mm/4xx_mmu.c b/arch/ppc/mm/4xx_mmu.c
index b7bcbc232f39..4d006aa1a0d1 100644
--- a/arch/ppc/mm/4xx_mmu.c
+++ b/arch/ppc/mm/4xx_mmu.c
@@ -110,13 +110,11 @@ unsigned long __init mmu_mapin_ram(void)
110 pmd_t *pmdp; 110 pmd_t *pmdp;
111 unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE; 111 unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE;
112 112
113 spin_lock(&init_mm.page_table_lock);
114 pmdp = pmd_offset(pgd_offset_k(v), v); 113 pmdp = pmd_offset(pgd_offset_k(v), v);
115 pmd_val(*pmdp++) = val; 114 pmd_val(*pmdp++) = val;
116 pmd_val(*pmdp++) = val; 115 pmd_val(*pmdp++) = val;
117 pmd_val(*pmdp++) = val; 116 pmd_val(*pmdp++) = val;
118 pmd_val(*pmdp++) = val; 117 pmd_val(*pmdp++) = val;
119 spin_unlock(&init_mm.page_table_lock);
120 118
121 v += LARGE_PAGE_SIZE_16M; 119 v += LARGE_PAGE_SIZE_16M;
122 p += LARGE_PAGE_SIZE_16M; 120 p += LARGE_PAGE_SIZE_16M;
@@ -127,10 +125,8 @@ unsigned long __init mmu_mapin_ram(void)
127 pmd_t *pmdp; 125 pmd_t *pmdp;
128 unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE; 126 unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE;
129 127
130 spin_lock(&init_mm.page_table_lock);
131 pmdp = pmd_offset(pgd_offset_k(v), v); 128 pmdp = pmd_offset(pgd_offset_k(v), v);
132 pmd_val(*pmdp) = val; 129 pmd_val(*pmdp) = val;
133 spin_unlock(&init_mm.page_table_lock);
134 130
135 v += LARGE_PAGE_SIZE_4M; 131 v += LARGE_PAGE_SIZE_4M;
136 p += LARGE_PAGE_SIZE_4M; 132 p += LARGE_PAGE_SIZE_4M;
diff --git a/arch/ppc/mm/pgtable.c b/arch/ppc/mm/pgtable.c
index 43505b1fc5d8..6ea9185fd120 100644
--- a/arch/ppc/mm/pgtable.c
+++ b/arch/ppc/mm/pgtable.c
@@ -280,18 +280,16 @@ map_page(unsigned long va, phys_addr_t pa, int flags)
280 pte_t *pg; 280 pte_t *pg;
281 int err = -ENOMEM; 281 int err = -ENOMEM;
282 282
283 spin_lock(&init_mm.page_table_lock);
284 /* Use upper 10 bits of VA to index the first level map */ 283 /* Use upper 10 bits of VA to index the first level map */
285 pd = pmd_offset(pgd_offset_k(va), va); 284 pd = pmd_offset(pgd_offset_k(va), va);
286 /* Use middle 10 bits of VA to index the second-level map */ 285 /* Use middle 10 bits of VA to index the second-level map */
287 pg = pte_alloc_kernel(&init_mm, pd, va); 286 pg = pte_alloc_kernel(pd, va);
288 if (pg != 0) { 287 if (pg != 0) {
289 err = 0; 288 err = 0;
290 set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); 289 set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags)));
291 if (mem_init_done) 290 if (mem_init_done)
292 flush_HPTE(0, va, pmd_val(*pd)); 291 flush_HPTE(0, va, pmd_val(*pd));
293 } 292 }
294 spin_unlock(&init_mm.page_table_lock);
295 return err; 293 return err;
296} 294}
297 295
diff --git a/arch/ppc64/kernel/vdso.c b/arch/ppc64/kernel/vdso.c
index efa985f05aca..4aacf521e3e4 100644
--- a/arch/ppc64/kernel/vdso.c
+++ b/arch/ppc64/kernel/vdso.c
@@ -176,13 +176,13 @@ static struct page * vdso_vma_nopage(struct vm_area_struct * vma,
176 return NOPAGE_SIGBUS; 176 return NOPAGE_SIGBUS;
177 177
178 /* 178 /*
179 * Last page is systemcfg, special handling here, no get_page() a 179 * Last page is systemcfg.
180 * this is a reserved page
181 */ 180 */
182 if ((vma->vm_end - address) <= PAGE_SIZE) 181 if ((vma->vm_end - address) <= PAGE_SIZE)
183 return virt_to_page(systemcfg); 182 pg = virt_to_page(systemcfg);
183 else
184 pg = virt_to_page(vbase + offset);
184 185
185 pg = virt_to_page(vbase + offset);
186 get_page(pg); 186 get_page(pg);
187 DBG(" ->page count: %d\n", page_count(pg)); 187 DBG(" ->page count: %d\n", page_count(pg));
188 188
@@ -259,7 +259,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int executable_stack)
259 * gettimeofday will be totally dead. It's fine to use that for setting 259 * gettimeofday will be totally dead. It's fine to use that for setting
260 * breakpoints in the vDSO code pages though 260 * breakpoints in the vDSO code pages though
261 */ 261 */
262 vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 262 vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_RESERVED;
263 vma->vm_flags |= mm->def_flags; 263 vma->vm_flags |= mm->def_flags;
264 vma->vm_page_prot = protection_map[vma->vm_flags & 0x7]; 264 vma->vm_page_prot = protection_map[vma->vm_flags & 0x7];
265 vma->vm_ops = &vdso_vmops; 265 vma->vm_ops = &vdso_vmops;
@@ -603,6 +603,8 @@ void __init vdso_init(void)
603 ClearPageReserved(pg); 603 ClearPageReserved(pg);
604 get_page(pg); 604 get_page(pg);
605 } 605 }
606
607 get_page(virt_to_page(systemcfg));
606} 608}
607 609
608int in_gate_area_no_task(unsigned long addr) 610int in_gate_area_no_task(unsigned long addr)
diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c
index c65b87b92756..f4ca29cf5364 100644
--- a/arch/ppc64/mm/imalloc.c
+++ b/arch/ppc64/mm/imalloc.c
@@ -300,12 +300,7 @@ void im_free(void * addr)
300 for (p = &imlist ; (tmp = *p) ; p = &tmp->next) { 300 for (p = &imlist ; (tmp = *p) ; p = &tmp->next) {
301 if (tmp->addr == addr) { 301 if (tmp->addr == addr) {
302 *p = tmp->next; 302 *p = tmp->next;
303
304 /* XXX: do we need the lock? */
305 spin_lock(&init_mm.page_table_lock);
306 unmap_vm_area(tmp); 303 unmap_vm_area(tmp);
307 spin_unlock(&init_mm.page_table_lock);
308
309 kfree(tmp); 304 kfree(tmp);
310 up(&imlist_sem); 305 up(&imlist_sem);
311 return; 306 return;
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
index be64b157afce..e2bd7776622f 100644
--- a/arch/ppc64/mm/init.c
+++ b/arch/ppc64/mm/init.c
@@ -104,6 +104,8 @@ void show_mem(void)
104 show_free_areas(); 104 show_free_areas();
105 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); 105 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
106 for_each_pgdat(pgdat) { 106 for_each_pgdat(pgdat) {
107 unsigned long flags;
108 pgdat_resize_lock(pgdat, &flags);
107 for (i = 0; i < pgdat->node_spanned_pages; i++) { 109 for (i = 0; i < pgdat->node_spanned_pages; i++) {
108 page = pgdat_page_nr(pgdat, i); 110 page = pgdat_page_nr(pgdat, i);
109 total++; 111 total++;
@@ -114,6 +116,7 @@ void show_mem(void)
114 else if (page_count(page)) 116 else if (page_count(page))
115 shared += page_count(page) - 1; 117 shared += page_count(page) - 1;
116 } 118 }
119 pgdat_resize_unlock(pgdat, &flags);
117 } 120 }
118 printk("%ld pages of RAM\n", total); 121 printk("%ld pages of RAM\n", total);
119 printk("%ld reserved pages\n", reserved); 122 printk("%ld reserved pages\n", reserved);
@@ -155,7 +158,6 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
155 unsigned long vsid; 158 unsigned long vsid;
156 159
157 if (mem_init_done) { 160 if (mem_init_done) {
158 spin_lock(&init_mm.page_table_lock);
159 pgdp = pgd_offset_k(ea); 161 pgdp = pgd_offset_k(ea);
160 pudp = pud_alloc(&init_mm, pgdp, ea); 162 pudp = pud_alloc(&init_mm, pgdp, ea);
161 if (!pudp) 163 if (!pudp)
@@ -163,12 +165,11 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
163 pmdp = pmd_alloc(&init_mm, pudp, ea); 165 pmdp = pmd_alloc(&init_mm, pudp, ea);
164 if (!pmdp) 166 if (!pmdp)
165 return -ENOMEM; 167 return -ENOMEM;
166 ptep = pte_alloc_kernel(&init_mm, pmdp, ea); 168 ptep = pte_alloc_kernel(pmdp, ea);
167 if (!ptep) 169 if (!ptep)
168 return -ENOMEM; 170 return -ENOMEM;
169 set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, 171 set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
170 __pgprot(flags))); 172 __pgprot(flags)));
171 spin_unlock(&init_mm.page_table_lock);
172 } else { 173 } else {
173 unsigned long va, vpn, hash, hpteg; 174 unsigned long va, vpn, hash, hpteg;
174 175
@@ -649,11 +650,14 @@ void __init mem_init(void)
649#endif 650#endif
650 651
651 for_each_pgdat(pgdat) { 652 for_each_pgdat(pgdat) {
653 unsigned long flags;
654 pgdat_resize_lock(pgdat, &flags);
652 for (i = 0; i < pgdat->node_spanned_pages; i++) { 655 for (i = 0; i < pgdat->node_spanned_pages; i++) {
653 page = pgdat_page_nr(pgdat, i); 656 page = pgdat_page_nr(pgdat, i);
654 if (PageReserved(page)) 657 if (PageReserved(page))
655 reservedpages++; 658 reservedpages++;
656 } 659 }
660 pgdat_resize_unlock(pgdat, &flags);
657 } 661 }
658 662
659 codesize = (unsigned long)&_etext - (unsigned long)&_stext; 663 codesize = (unsigned long)&_etext - (unsigned long)&_stext;
@@ -867,3 +871,80 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
867 return vma_prot; 871 return vma_prot;
868} 872}
869EXPORT_SYMBOL(phys_mem_access_prot); 873EXPORT_SYMBOL(phys_mem_access_prot);
874
875#ifdef CONFIG_MEMORY_HOTPLUG
876
877void online_page(struct page *page)
878{
879 ClearPageReserved(page);
880 free_cold_page(page);
881 totalram_pages++;
882 num_physpages++;
883}
884
885/*
886 * This works only for the non-NUMA case. Later, we'll need a lookup
887 * to convert from real physical addresses to nid, that doesn't use
888 * pfn_to_nid().
889 */
890int __devinit add_memory(u64 start, u64 size)
891{
892 struct pglist_data *pgdata = NODE_DATA(0);
893 struct zone *zone;
894 unsigned long start_pfn = start >> PAGE_SHIFT;
895 unsigned long nr_pages = size >> PAGE_SHIFT;
896
897 /* this should work for most non-highmem platforms */
898 zone = pgdata->node_zones;
899
900 return __add_pages(zone, start_pfn, nr_pages);
901
902 return 0;
903}
904
905/*
906 * First pass at this code will check to determine if the remove
907 * request is within the RMO. Do not allow removal within the RMO.
908 */
909int __devinit remove_memory(u64 start, u64 size)
910{
911 struct zone *zone;
912 unsigned long start_pfn, end_pfn, nr_pages;
913
914 start_pfn = start >> PAGE_SHIFT;
915 nr_pages = size >> PAGE_SHIFT;
916 end_pfn = start_pfn + nr_pages;
917
918 printk("%s(): Attempting to remove memoy in range "
919 "%lx to %lx\n", __func__, start, start+size);
920 /*
921 * check for range within RMO
922 */
923 zone = page_zone(pfn_to_page(start_pfn));
924
925 printk("%s(): memory will be removed from "
926 "the %s zone\n", __func__, zone->name);
927
928 /*
929 * not handling removing memory ranges that
930 * overlap multiple zones yet
931 */
932 if (end_pfn > (zone->zone_start_pfn + zone->spanned_pages))
933 goto overlap;
934
935 /* make sure it is NOT in RMO */
936 if ((start < lmb.rmo_size) || ((start+size) < lmb.rmo_size)) {
937 printk("%s(): range to be removed must NOT be in RMO!\n",
938 __func__);
939 goto in_rmo;
940 }
941
942 return __remove_pages(zone, start_pfn, nr_pages);
943
944overlap:
945 printk("%s(): memory range to be removed overlaps "
946 "multiple zones!!!\n", __func__);
947in_rmo:
948 return -1;
949}
950#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/s390/mm/ioremap.c b/arch/s390/mm/ioremap.c
index c6c39d868bc8..0f6e9ecbefe2 100644
--- a/arch/s390/mm/ioremap.c
+++ b/arch/s390/mm/ioremap.c
@@ -58,7 +58,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
58 if (address >= end) 58 if (address >= end)
59 BUG(); 59 BUG();
60 do { 60 do {
61 pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); 61 pte_t * pte = pte_alloc_kernel(pmd, address);
62 if (!pte) 62 if (!pte)
63 return -ENOMEM; 63 return -ENOMEM;
64 remap_area_pte(pte, address, end - address, address + phys_addr, flags); 64 remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -80,7 +80,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
80 flush_cache_all(); 80 flush_cache_all();
81 if (address >= end) 81 if (address >= end)
82 BUG(); 82 BUG();
83 spin_lock(&init_mm.page_table_lock);
84 do { 83 do {
85 pmd_t *pmd; 84 pmd_t *pmd;
86 pmd = pmd_alloc(&init_mm, dir, address); 85 pmd = pmd_alloc(&init_mm, dir, address);
@@ -94,7 +93,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
94 address = (address + PGDIR_SIZE) & PGDIR_MASK; 93 address = (address + PGDIR_SIZE) & PGDIR_MASK;
95 dir++; 94 dir++;
96 } while (address && (address < end)); 95 } while (address && (address < end));
97 spin_unlock(&init_mm.page_table_lock);
98 flush_tlb_all(); 96 flush_tlb_all();
99 return 0; 97 return 0;
100} 98}
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 7abba2161da6..775f86cd3fe8 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -194,10 +194,13 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
194 unsigned long address) 194 unsigned long address)
195{ 195{
196 unsigned long addrmax = P4SEG; 196 unsigned long addrmax = P4SEG;
197 pgd_t *dir; 197 pgd_t *pgd;
198 pmd_t *pmd; 198 pmd_t *pmd;
199 pte_t *pte; 199 pte_t *pte;
200 pte_t entry; 200 pte_t entry;
201 struct mm_struct *mm;
202 spinlock_t *ptl;
203 int ret = 1;
201 204
202#ifdef CONFIG_SH_KGDB 205#ifdef CONFIG_SH_KGDB
203 if (kgdb_nofault && kgdb_bus_err_hook) 206 if (kgdb_nofault && kgdb_bus_err_hook)
@@ -208,28 +211,28 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
208 addrmax = P4SEG_STORE_QUE + 0x04000000; 211 addrmax = P4SEG_STORE_QUE + 0x04000000;
209#endif 212#endif
210 213
211 if (address >= P3SEG && address < addrmax) 214 if (address >= P3SEG && address < addrmax) {
212 dir = pgd_offset_k(address); 215 pgd = pgd_offset_k(address);
213 else if (address >= TASK_SIZE) 216 mm = NULL;
217 } else if (address >= TASK_SIZE)
214 return 1; 218 return 1;
215 else if (!current->mm) 219 else if (!(mm = current->mm))
216 return 1; 220 return 1;
217 else 221 else
218 dir = pgd_offset(current->mm, address); 222 pgd = pgd_offset(mm, address);
219 223
220 pmd = pmd_offset(dir, address); 224 pmd = pmd_offset(pgd, address);
221 if (pmd_none(*pmd)) 225 if (pmd_none_or_clear_bad(pmd))
222 return 1;
223 if (pmd_bad(*pmd)) {
224 pmd_ERROR(*pmd);
225 pmd_clear(pmd);
226 return 1; 226 return 1;
227 } 227 if (mm)
228 pte = pte_offset_kernel(pmd, address); 228 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
229 else
230 pte = pte_offset_kernel(pmd, address);
231
229 entry = *pte; 232 entry = *pte;
230 if (pte_none(entry) || pte_not_present(entry) 233 if (pte_none(entry) || pte_not_present(entry)
231 || (writeaccess && !pte_write(entry))) 234 || (writeaccess && !pte_write(entry)))
232 return 1; 235 goto unlock;
233 236
234 if (writeaccess) 237 if (writeaccess)
235 entry = pte_mkdirty(entry); 238 entry = pte_mkdirty(entry);
@@ -251,8 +254,11 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
251 254
252 set_pte(pte, entry); 255 set_pte(pte, entry);
253 update_mmu_cache(NULL, address, entry); 256 update_mmu_cache(NULL, address, entry);
254 257 ret = 0;
255 return 0; 258unlock:
259 if (mm)
260 pte_unmap_unlock(pte, ptl);
261 return ret;
256} 262}
257 263
258void flush_tlb_page(struct vm_area_struct *vma, unsigned long page) 264void flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 95bb1a6c6060..6b7a7688c98e 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -54,8 +54,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
54 return pte; 54 return pte;
55} 55}
56 56
57#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0)
58
59void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 57void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
60 pte_t *ptep, pte_t entry) 58 pte_t *ptep, pte_t entry)
61{ 59{
diff --git a/arch/sh/mm/ioremap.c b/arch/sh/mm/ioremap.c
index 9f490c2742f0..e794e27a72f1 100644
--- a/arch/sh/mm/ioremap.c
+++ b/arch/sh/mm/ioremap.c
@@ -57,7 +57,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address,
57 if (address >= end) 57 if (address >= end)
58 BUG(); 58 BUG();
59 do { 59 do {
60 pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); 60 pte_t * pte = pte_alloc_kernel(pmd, address);
61 if (!pte) 61 if (!pte)
62 return -ENOMEM; 62 return -ENOMEM;
63 remap_area_pte(pte, address, end - address, address + phys_addr, flags); 63 remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -79,7 +79,6 @@ int remap_area_pages(unsigned long address, unsigned long phys_addr,
79 flush_cache_all(); 79 flush_cache_all();
80 if (address >= end) 80 if (address >= end)
81 BUG(); 81 BUG();
82 spin_lock(&init_mm.page_table_lock);
83 do { 82 do {
84 pmd_t *pmd; 83 pmd_t *pmd;
85 pmd = pmd_alloc(&init_mm, dir, address); 84 pmd = pmd_alloc(&init_mm, dir, address);
@@ -93,7 +92,6 @@ int remap_area_pages(unsigned long address, unsigned long phys_addr,
93 address = (address + PGDIR_SIZE) & PGDIR_MASK; 92 address = (address + PGDIR_SIZE) & PGDIR_MASK;
94 dir++; 93 dir++;
95 } while (address && (address < end)); 94 } while (address && (address < end));
96 spin_unlock(&init_mm.page_table_lock);
97 flush_tlb_all(); 95 flush_tlb_all();
98 return error; 96 return error;
99} 97}
diff --git a/arch/sh64/mm/cache.c b/arch/sh64/mm/cache.c
index 3b87e25ea773..c0c1b21350d8 100644
--- a/arch/sh64/mm/cache.c
+++ b/arch/sh64/mm/cache.c
@@ -584,32 +584,36 @@ static void sh64_dcache_purge_phy_page(unsigned long paddr)
584 } 584 }
585} 585}
586 586
587static void sh64_dcache_purge_user_page(struct mm_struct *mm, unsigned long eaddr) 587static void sh64_dcache_purge_user_pages(struct mm_struct *mm,
588 unsigned long addr, unsigned long end)
588{ 589{
589 pgd_t *pgd; 590 pgd_t *pgd;
590 pmd_t *pmd; 591 pmd_t *pmd;
591 pte_t *pte; 592 pte_t *pte;
592 pte_t entry; 593 pte_t entry;
594 spinlock_t *ptl;
593 unsigned long paddr; 595 unsigned long paddr;
594 596
595 /* NOTE : all the callers of this have mm->page_table_lock held, so the 597 if (!mm)
596 following page table traversal is safe even on SMP/pre-emptible. */ 598 return; /* No way to find physical address of page */
597 599
598 if (!mm) return; /* No way to find physical address of page */ 600 pgd = pgd_offset(mm, addr);
599 pgd = pgd_offset(mm, eaddr); 601 if (pgd_bad(*pgd))
600 if (pgd_bad(*pgd)) return; 602 return;
601 603
602 pmd = pmd_offset(pgd, eaddr); 604 pmd = pmd_offset(pgd, addr);
603 if (pmd_none(*pmd) || pmd_bad(*pmd)) return; 605 if (pmd_none(*pmd) || pmd_bad(*pmd))
604 606 return;
605 pte = pte_offset_kernel(pmd, eaddr); 607
606 entry = *pte; 608 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
607 if (pte_none(entry) || !pte_present(entry)) return; 609 do {
608 610 entry = *pte;
609 paddr = pte_val(entry) & PAGE_MASK; 611 if (pte_none(entry) || !pte_present(entry))
610 612 continue;
611 sh64_dcache_purge_coloured_phy_page(paddr, eaddr); 613 paddr = pte_val(entry) & PAGE_MASK;
612 614 sh64_dcache_purge_coloured_phy_page(paddr, addr);
615 } while (pte++, addr += PAGE_SIZE, addr != end);
616 pte_unmap_unlock(pte - 1, ptl);
613} 617}
614/****************************************************************************/ 618/****************************************************************************/
615 619
@@ -668,7 +672,7 @@ static void sh64_dcache_purge_user_range(struct mm_struct *mm,
668 int n_pages; 672 int n_pages;
669 673
670 n_pages = ((end - start) >> PAGE_SHIFT); 674 n_pages = ((end - start) >> PAGE_SHIFT);
671 if (n_pages >= 64) { 675 if (n_pages >= 64 || ((start ^ (end - 1)) & PMD_MASK)) {
672#if 1 676#if 1
673 sh64_dcache_purge_all(); 677 sh64_dcache_purge_all();
674#else 678#else
@@ -707,20 +711,10 @@ static void sh64_dcache_purge_user_range(struct mm_struct *mm,
707 } 711 }
708#endif 712#endif
709 } else { 713 } else {
710 /* 'Small' range */ 714 /* Small range, covered by a single page table page */
711 unsigned long aligned_start; 715 start &= PAGE_MASK; /* should already be so */
712 unsigned long eaddr; 716 end = PAGE_ALIGN(end); /* should already be so */
713 unsigned long last_page_start; 717 sh64_dcache_purge_user_pages(mm, start, end);
714
715 aligned_start = start & PAGE_MASK;
716 /* 'end' is 1 byte beyond the end of the range */
717 last_page_start = (end - 1) & PAGE_MASK;
718
719 eaddr = aligned_start;
720 while (eaddr <= last_page_start) {
721 sh64_dcache_purge_user_page(mm, eaddr);
722 eaddr += PAGE_SIZE;
723 }
724 } 718 }
725 return; 719 return;
726} 720}
@@ -880,9 +874,7 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
880 addresses from the user address space specified by mm, after writing 874 addresses from the user address space specified by mm, after writing
881 back any dirty data. 875 back any dirty data.
882 876
883 Note(1), 'end' is 1 byte beyond the end of the range to flush. 877 Note, 'end' is 1 byte beyond the end of the range to flush. */
884
885 Note(2), this is called with mm->page_table_lock held.*/
886 878
887 sh64_dcache_purge_user_range(mm, start, end); 879 sh64_dcache_purge_user_range(mm, start, end);
888 sh64_icache_inv_user_page_range(mm, start, end); 880 sh64_icache_inv_user_page_range(mm, start, end);
@@ -898,7 +890,7 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long eaddr, unsigned
898 the I-cache must be searched too in case the page in question is 890 the I-cache must be searched too in case the page in question is
899 both writable and being executed from (e.g. stack trampolines.) 891 both writable and being executed from (e.g. stack trampolines.)
900 892
901 Note(1), this is called with mm->page_table_lock held. 893 Note, this is called with pte lock held.
902 */ 894 */
903 895
904 sh64_dcache_purge_phy_page(pfn << PAGE_SHIFT); 896 sh64_dcache_purge_phy_page(pfn << PAGE_SHIFT);
diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c
index dcd9c8a8baf8..ed6a505b3ee2 100644
--- a/arch/sh64/mm/hugetlbpage.c
+++ b/arch/sh64/mm/hugetlbpage.c
@@ -54,41 +54,31 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
54 return pte; 54 return pte;
55} 55}
56 56
57#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0) 57void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
58 58 pte_t *ptep, pte_t entry)
59static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
60 struct page *page, pte_t * page_table, int write_access)
61{ 59{
62 unsigned long i; 60 int i;
63 pte_t entry;
64
65 add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
66
67 if (write_access)
68 entry = pte_mkwrite(pte_mkdirty(mk_pte(page,
69 vma->vm_page_prot)));
70 else
71 entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
72 entry = pte_mkyoung(entry);
73 mk_pte_huge(entry);
74 61
75 for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { 62 for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
76 set_pte(page_table, entry); 63 set_pte_at(mm, addr, ptep, entry);
77 page_table++; 64 ptep++;
78 65 addr += PAGE_SIZE;
79 pte_val(entry) += PAGE_SIZE; 66 pte_val(entry) += PAGE_SIZE;
80 } 67 }
81} 68}
82 69
83pte_t huge_ptep_get_and_clear(pte_t *ptep) 70pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
71 pte_t *ptep)
84{ 72{
85 pte_t entry; 73 pte_t entry;
74 int i;
86 75
87 entry = *ptep; 76 entry = *ptep;
88 77
89 for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { 78 for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
90 pte_clear(pte); 79 pte_clear(mm, addr, ptep);
91 pte++; 80 addr += PAGE_SIZE;
81 ptep++;
92 } 82 }
93 83
94 return entry; 84 return entry;
@@ -106,79 +96,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
106 return 0; 96 return 0;
107} 97}
108 98
109int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
110 struct vm_area_struct *vma)
111{
112 pte_t *src_pte, *dst_pte, entry;
113 struct page *ptepage;
114 unsigned long addr = vma->vm_start;
115 unsigned long end = vma->vm_end;
116 int i;
117
118 while (addr < end) {
119 dst_pte = huge_pte_alloc(dst, addr);
120 if (!dst_pte)
121 goto nomem;
122 src_pte = huge_pte_offset(src, addr);
123 BUG_ON(!src_pte || pte_none(*src_pte));
124 entry = *src_pte;
125 ptepage = pte_page(entry);
126 get_page(ptepage);
127 for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
128 set_pte(dst_pte, entry);
129 pte_val(entry) += PAGE_SIZE;
130 dst_pte++;
131 }
132 add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
133 addr += HPAGE_SIZE;
134 }
135 return 0;
136
137nomem:
138 return -ENOMEM;
139}
140
141int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
142 struct page **pages, struct vm_area_struct **vmas,
143 unsigned long *position, int *length, int i)
144{
145 unsigned long vaddr = *position;
146 int remainder = *length;
147
148 WARN_ON(!is_vm_hugetlb_page(vma));
149
150 while (vaddr < vma->vm_end && remainder) {
151 if (pages) {
152 pte_t *pte;
153 struct page *page;
154
155 pte = huge_pte_offset(mm, vaddr);
156
157 /* hugetlb should be locked, and hence, prefaulted */
158 BUG_ON(!pte || pte_none(*pte));
159
160 page = pte_page(*pte);
161
162 WARN_ON(!PageCompound(page));
163
164 get_page(page);
165 pages[i] = page;
166 }
167
168 if (vmas)
169 vmas[i] = vma;
170
171 vaddr += PAGE_SIZE;
172 --remainder;
173 ++i;
174 }
175
176 *length = remainder;
177 *position = vaddr;
178
179 return i;
180}
181
182struct page *follow_huge_addr(struct mm_struct *mm, 99struct page *follow_huge_addr(struct mm_struct *mm,
183 unsigned long address, int write) 100 unsigned long address, int write)
184{ 101{
@@ -195,84 +112,3 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
195{ 112{
196 return NULL; 113 return NULL;
197} 114}
198
199void unmap_hugepage_range(struct vm_area_struct *vma,
200 unsigned long start, unsigned long end)
201{
202 struct mm_struct *mm = vma->vm_mm;
203 unsigned long address;
204 pte_t *pte;
205 struct page *page;
206 int i;
207
208 BUG_ON(start & (HPAGE_SIZE - 1));
209 BUG_ON(end & (HPAGE_SIZE - 1));
210
211 for (address = start; address < end; address += HPAGE_SIZE) {
212 pte = huge_pte_offset(mm, address);
213 BUG_ON(!pte);
214 if (pte_none(*pte))
215 continue;
216 page = pte_page(*pte);
217 put_page(page);
218 for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
219 pte_clear(mm, address+(i*PAGE_SIZE), pte);
220 pte++;
221 }
222 }
223 add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
224 flush_tlb_range(vma, start, end);
225}
226
227int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
228{
229 struct mm_struct *mm = current->mm;
230 unsigned long addr;
231 int ret = 0;
232
233 BUG_ON(vma->vm_start & ~HPAGE_MASK);
234 BUG_ON(vma->vm_end & ~HPAGE_MASK);
235
236 spin_lock(&mm->page_table_lock);
237 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
238 unsigned long idx;
239 pte_t *pte = huge_pte_alloc(mm, addr);
240 struct page *page;
241
242 if (!pte) {
243 ret = -ENOMEM;
244 goto out;
245 }
246 if (!pte_none(*pte))
247 continue;
248
249 idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
250 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
251 page = find_get_page(mapping, idx);
252 if (!page) {
253 /* charge the fs quota first */
254 if (hugetlb_get_quota(mapping)) {
255 ret = -ENOMEM;
256 goto out;
257 }
258 page = alloc_huge_page();
259 if (!page) {
260 hugetlb_put_quota(mapping);
261 ret = -ENOMEM;
262 goto out;
263 }
264 ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
265 if (! ret) {
266 unlock_page(page);
267 } else {
268 hugetlb_put_quota(mapping);
269 free_huge_page(page);
270 goto out;
271 }
272 }
273 set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
274 }
275out:
276 spin_unlock(&mm->page_table_lock);
277 return ret;
278}
diff --git a/arch/sh64/mm/ioremap.c b/arch/sh64/mm/ioremap.c
index f4003da556bc..fb1866fa2c9d 100644
--- a/arch/sh64/mm/ioremap.c
+++ b/arch/sh64/mm/ioremap.c
@@ -79,7 +79,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
79 BUG(); 79 BUG();
80 80
81 do { 81 do {
82 pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); 82 pte_t * pte = pte_alloc_kernel(pmd, address);
83 if (!pte) 83 if (!pte)
84 return -ENOMEM; 84 return -ENOMEM;
85 remap_area_pte(pte, address, end - address, address + phys_addr, flags); 85 remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -101,7 +101,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
101 flush_cache_all(); 101 flush_cache_all();
102 if (address >= end) 102 if (address >= end)
103 BUG(); 103 BUG();
104 spin_lock(&init_mm.page_table_lock);
105 do { 104 do {
106 pmd_t *pmd = pmd_alloc(&init_mm, dir, address); 105 pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
107 error = -ENOMEM; 106 error = -ENOMEM;
@@ -115,7 +114,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
115 address = (address + PGDIR_SIZE) & PGDIR_MASK; 114 address = (address + PGDIR_SIZE) & PGDIR_MASK;
116 dir++; 115 dir++;
117 } while (address && (address < end)); 116 } while (address && (address < end));
118 spin_unlock(&init_mm.page_table_lock);
119 flush_tlb_all(); 117 flush_tlb_all();
120 return 0; 118 return 0;
121} 119}
diff --git a/arch/sparc/mm/generic.c b/arch/sparc/mm/generic.c
index 20ccb957fb77..9604893ffdbd 100644
--- a/arch/sparc/mm/generic.c
+++ b/arch/sparc/mm/generic.c
@@ -73,14 +73,16 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
73 int space = GET_IOSPACE(pfn); 73 int space = GET_IOSPACE(pfn);
74 unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; 74 unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
75 75
76 /* See comment in mm/memory.c remap_pfn_range */
77 vma->vm_flags |= VM_IO | VM_RESERVED;
78
76 prot = __pgprot(pg_iobits); 79 prot = __pgprot(pg_iobits);
77 offset -= from; 80 offset -= from;
78 dir = pgd_offset(mm, from); 81 dir = pgd_offset(mm, from);
79 flush_cache_range(vma, beg, end); 82 flush_cache_range(vma, beg, end);
80 83
81 spin_lock(&mm->page_table_lock);
82 while (from < end) { 84 while (from < end) {
83 pmd_t *pmd = pmd_alloc(current->mm, dir, from); 85 pmd_t *pmd = pmd_alloc(mm, dir, from);
84 error = -ENOMEM; 86 error = -ENOMEM;
85 if (!pmd) 87 if (!pmd)
86 break; 88 break;
@@ -90,7 +92,6 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
90 from = (from + PGDIR_SIZE) & PGDIR_MASK; 92 from = (from + PGDIR_SIZE) & PGDIR_MASK;
91 dir++; 93 dir++;
92 } 94 }
93 spin_unlock(&mm->page_table_lock);
94 95
95 flush_tlb_range(vma, beg, end); 96 flush_tlb_range(vma, beg, end);
96 return error; 97 return error;
diff --git a/arch/sparc64/kernel/binfmt_aout32.c b/arch/sparc64/kernel/binfmt_aout32.c
index b2854ef221d0..edf52d06b280 100644
--- a/arch/sparc64/kernel/binfmt_aout32.c
+++ b/arch/sparc64/kernel/binfmt_aout32.c
@@ -241,7 +241,6 @@ static int load_aout32_binary(struct linux_binprm * bprm, struct pt_regs * regs)
241 current->mm->brk = ex.a_bss + 241 current->mm->brk = ex.a_bss +
242 (current->mm->start_brk = N_BSSADDR(ex)); 242 (current->mm->start_brk = N_BSSADDR(ex));
243 243
244 set_mm_counter(current->mm, rss, 0);
245 current->mm->mmap = NULL; 244 current->mm->mmap = NULL;
246 compute_creds(bprm); 245 compute_creds(bprm);
247 current->flags &= ~PF_FORKNOEXEC; 246 current->flags &= ~PF_FORKNOEXEC;
diff --git a/arch/sparc64/mm/generic.c b/arch/sparc64/mm/generic.c
index c954d91f01d0..112c316e7cd2 100644
--- a/arch/sparc64/mm/generic.c
+++ b/arch/sparc64/mm/generic.c
@@ -127,14 +127,16 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
127 int space = GET_IOSPACE(pfn); 127 int space = GET_IOSPACE(pfn);
128 unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; 128 unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
129 129
130 /* See comment in mm/memory.c remap_pfn_range */
131 vma->vm_flags |= VM_IO | VM_RESERVED;
132
130 prot = __pgprot(pg_iobits); 133 prot = __pgprot(pg_iobits);
131 offset -= from; 134 offset -= from;
132 dir = pgd_offset(mm, from); 135 dir = pgd_offset(mm, from);
133 flush_cache_range(vma, beg, end); 136 flush_cache_range(vma, beg, end);
134 137
135 spin_lock(&mm->page_table_lock);
136 while (from < end) { 138 while (from < end) {
137 pud_t *pud = pud_alloc(current->mm, dir, from); 139 pud_t *pud = pud_alloc(mm, dir, from);
138 error = -ENOMEM; 140 error = -ENOMEM;
139 if (!pud) 141 if (!pud)
140 break; 142 break;
@@ -144,8 +146,7 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
144 from = (from + PGDIR_SIZE) & PGDIR_MASK; 146 from = (from + PGDIR_SIZE) & PGDIR_MASK;
145 dir++; 147 dir++;
146 } 148 }
147 flush_tlb_range(vma, beg, end);
148 spin_unlock(&mm->page_table_lock);
149 149
150 flush_tlb_range(vma, beg, end);
150 return error; 151 return error;
151} 152}
diff --git a/arch/sparc64/mm/tlb.c b/arch/sparc64/mm/tlb.c
index 90ca99d0b89c..8b104be4662b 100644
--- a/arch/sparc64/mm/tlb.c
+++ b/arch/sparc64/mm/tlb.c
@@ -18,8 +18,7 @@
18 18
19/* Heavily inspired by the ppc64 code. */ 19/* Heavily inspired by the ppc64 code. */
20 20
21DEFINE_PER_CPU(struct mmu_gather, mmu_gathers) = 21DEFINE_PER_CPU(struct mmu_gather, mmu_gathers) = { 0, };
22 { NULL, 0, 0, 0, 0, 0, { 0 }, { NULL }, };
23 22
24void flush_tlb_pending(void) 23void flush_tlb_pending(void)
25{ 24{
@@ -72,7 +71,7 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t
72 71
73no_cache_flush: 72no_cache_flush:
74 73
75 if (mp->tlb_frozen) 74 if (mp->fullmm)
76 return; 75 return;
77 76
78 nr = mp->tlb_nr; 77 nr = mp->tlb_nr;
@@ -97,7 +96,7 @@ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long
97 unsigned long nr = mp->tlb_nr; 96 unsigned long nr = mp->tlb_nr;
98 long s = start, e = end, vpte_base; 97 long s = start, e = end, vpte_base;
99 98
100 if (mp->tlb_frozen) 99 if (mp->fullmm)
101 return; 100 return;
102 101
103 /* If start is greater than end, that is a real problem. */ 102 /* If start is greater than end, that is a real problem. */
diff --git a/arch/um/include/tlb.h b/arch/um/include/tlb.h
index 45d7da6c3b2c..8efc1e0f1b84 100644
--- a/arch/um/include/tlb.h
+++ b/arch/um/include/tlb.h
@@ -34,7 +34,6 @@ struct host_vm_op {
34 } u; 34 } u;
35}; 35};
36 36
37extern void mprotect_kernel_vm(int w);
38extern void force_flush_all(void); 37extern void force_flush_all(void);
39extern void fix_range_common(struct mm_struct *mm, unsigned long start_addr, 38extern void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
40 unsigned long end_addr, int force, 39 unsigned long end_addr, int force,
diff --git a/arch/um/kernel/process_kern.c b/arch/um/kernel/process_kern.c
index 0d73ceeece72..34b54a3e2132 100644
--- a/arch/um/kernel/process_kern.c
+++ b/arch/um/kernel/process_kern.c
@@ -222,6 +222,7 @@ void *um_virt_to_phys(struct task_struct *task, unsigned long addr,
222 pud_t *pud; 222 pud_t *pud;
223 pmd_t *pmd; 223 pmd_t *pmd;
224 pte_t *pte; 224 pte_t *pte;
225 pte_t ptent;
225 226
226 if(task->mm == NULL) 227 if(task->mm == NULL)
227 return(ERR_PTR(-EINVAL)); 228 return(ERR_PTR(-EINVAL));
@@ -238,12 +239,13 @@ void *um_virt_to_phys(struct task_struct *task, unsigned long addr,
238 return(ERR_PTR(-EINVAL)); 239 return(ERR_PTR(-EINVAL));
239 240
240 pte = pte_offset_kernel(pmd, addr); 241 pte = pte_offset_kernel(pmd, addr);
241 if(!pte_present(*pte)) 242 ptent = *pte;
243 if(!pte_present(ptent))
242 return(ERR_PTR(-EINVAL)); 244 return(ERR_PTR(-EINVAL));
243 245
244 if(pte_out != NULL) 246 if(pte_out != NULL)
245 *pte_out = *pte; 247 *pte_out = ptent;
246 return((void *) (pte_val(*pte) & PAGE_MASK) + (addr & ~PAGE_MASK)); 248 return((void *) (pte_val(ptent) & PAGE_MASK) + (addr & ~PAGE_MASK));
247} 249}
248 250
249char *current_cmd(void) 251char *current_cmd(void)
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 240143b616a2..9e5e39cea821 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -28,7 +28,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
28 pmd_t *pmd; 28 pmd_t *pmd;
29 pte_t *pte; 29 pte_t *pte;
30 30
31 spin_lock(&mm->page_table_lock);
32 pgd = pgd_offset(mm, proc); 31 pgd = pgd_offset(mm, proc);
33 pud = pud_alloc(mm, pgd, proc); 32 pud = pud_alloc(mm, pgd, proc);
34 if (!pud) 33 if (!pud)
@@ -63,7 +62,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
63 *pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT)); 62 *pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT));
64 *pte = pte_mkexec(*pte); 63 *pte = pte_mkexec(*pte);
65 *pte = pte_wrprotect(*pte); 64 *pte = pte_wrprotect(*pte);
66 spin_unlock(&mm->page_table_lock);
67 return(0); 65 return(0);
68 66
69 out_pmd: 67 out_pmd:
@@ -71,7 +69,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
71 out_pte: 69 out_pte:
72 pmd_free(pmd); 70 pmd_free(pmd);
73 out: 71 out:
74 spin_unlock(&mm->page_table_lock);
75 return(-ENOMEM); 72 return(-ENOMEM);
76} 73}
77 74
@@ -147,6 +144,7 @@ void destroy_context_skas(struct mm_struct *mm)
147 144
148 if(!proc_mm || !ptrace_faultinfo){ 145 if(!proc_mm || !ptrace_faultinfo){
149 free_page(mmu->id.stack); 146 free_page(mmu->id.stack);
147 pte_lock_deinit(virt_to_page(mmu->last_page_table));
150 pte_free_kernel((pte_t *) mmu->last_page_table); 148 pte_free_kernel((pte_t *) mmu->last_page_table);
151 dec_page_state(nr_page_table_pages); 149 dec_page_state(nr_page_table_pages);
152#ifdef CONFIG_3_LEVEL_PGTABLES 150#ifdef CONFIG_3_LEVEL_PGTABLES
diff --git a/arch/um/kernel/tt/tlb.c b/arch/um/kernel/tt/tlb.c
index f1d85dbb45b9..ae6217c86135 100644
--- a/arch/um/kernel/tt/tlb.c
+++ b/arch/um/kernel/tt/tlb.c
@@ -74,42 +74,6 @@ void flush_tlb_kernel_range_tt(unsigned long start, unsigned long end)
74 atomic_inc(&vmchange_seq); 74 atomic_inc(&vmchange_seq);
75} 75}
76 76
77static void protect_vm_page(unsigned long addr, int w, int must_succeed)
78{
79 int err;
80
81 err = protect_memory(addr, PAGE_SIZE, 1, w, 1, must_succeed);
82 if(err == 0) return;
83 else if((err == -EFAULT) || (err == -ENOMEM)){
84 flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
85 protect_vm_page(addr, w, 1);
86 }
87 else panic("protect_vm_page : protect failed, errno = %d\n", err);
88}
89
90void mprotect_kernel_vm(int w)
91{
92 struct mm_struct *mm;
93 pgd_t *pgd;
94 pud_t *pud;
95 pmd_t *pmd;
96 pte_t *pte;
97 unsigned long addr;
98
99 mm = &init_mm;
100 for(addr = start_vm; addr < end_vm;){
101 pgd = pgd_offset(mm, addr);
102 pud = pud_offset(pgd, addr);
103 pmd = pmd_offset(pud, addr);
104 if(pmd_present(*pmd)){
105 pte = pte_offset_kernel(pmd, addr);
106 if(pte_present(*pte)) protect_vm_page(addr, w, 0);
107 addr += PAGE_SIZE;
108 }
109 else addr += PMD_SIZE;
110 }
111}
112
113void flush_tlb_kernel_vm_tt(void) 77void flush_tlb_kernel_vm_tt(void)
114{ 78{
115 flush_tlb_kernel_range(start_vm, end_vm); 79 flush_tlb_kernel_range(start_vm, end_vm);
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
index 3e6780fa0186..93c60f4aa47a 100644
--- a/arch/x86_64/ia32/ia32_aout.c
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -314,7 +314,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
314 current->mm->free_area_cache = TASK_UNMAPPED_BASE; 314 current->mm->free_area_cache = TASK_UNMAPPED_BASE;
315 current->mm->cached_hole_size = 0; 315 current->mm->cached_hole_size = 0;
316 316
317 set_mm_counter(current->mm, rss, 0);
318 current->mm->mmap = NULL; 317 current->mm->mmap = NULL;
319 compute_creds(bprm); 318 compute_creds(bprm);
320 current->flags &= ~PF_FORKNOEXEC; 319 current->flags &= ~PF_FORKNOEXEC;
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c
index 6972df480d2b..ecf7acb5db9b 100644
--- a/arch/x86_64/mm/ioremap.c
+++ b/arch/x86_64/mm/ioremap.c
@@ -60,7 +60,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
60 if (address >= end) 60 if (address >= end)
61 BUG(); 61 BUG();
62 do { 62 do {
63 pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); 63 pte_t * pte = pte_alloc_kernel(pmd, address);
64 if (!pte) 64 if (!pte)
65 return -ENOMEM; 65 return -ENOMEM;
66 remap_area_pte(pte, address, end - address, address + phys_addr, flags); 66 remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -105,7 +105,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
105 flush_cache_all(); 105 flush_cache_all();
106 if (address >= end) 106 if (address >= end)
107 BUG(); 107 BUG();
108 spin_lock(&init_mm.page_table_lock);
109 do { 108 do {
110 pud_t *pud; 109 pud_t *pud;
111 pud = pud_alloc(&init_mm, pgd, address); 110 pud = pud_alloc(&init_mm, pgd, address);
@@ -119,7 +118,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
119 address = (address + PGDIR_SIZE) & PGDIR_MASK; 118 address = (address + PGDIR_SIZE) & PGDIR_MASK;
120 pgd++; 119 pgd++;
121 } while (address && (address < end)); 120 } while (address && (address < end));
122 spin_unlock(&init_mm.page_table_lock);
123 flush_tlb_all(); 121 flush_tlb_all();
124 return error; 122 return error;
125} 123}
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 01a1bd239263..2143609d2936 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -200,8 +200,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
200 * Note: Assume that this function returns zero on success 200 * Note: Assume that this function returns zero on success
201 */ 201 */
202 result = add_memory(mem_device->start_addr, 202 result = add_memory(mem_device->start_addr,
203 (mem_device->end_addr - mem_device->start_addr) + 1, 203 (mem_device->end_addr - mem_device->start_addr) + 1);
204 mem_device->read_write_attribute);
205 if (result) { 204 if (result) {
206 ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "\nadd_memory failed\n")); 205 ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "\nadd_memory failed\n"));
207 mem_device->state = MEMORY_INVALID_STATE; 206 mem_device->state = MEMORY_INVALID_STATE;
@@ -259,7 +258,7 @@ static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
259 * Ask the VM to offline this memory range. 258 * Ask the VM to offline this memory range.
260 * Note: Assume that this function returns zero on success 259 * Note: Assume that this function returns zero on success
261 */ 260 */
262 result = remove_memory(start, len, attr); 261 result = remove_memory(start, len);
263 if (result) { 262 if (result) {
264 ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Hot-Remove failed.\n")); 263 ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Hot-Remove failed.\n"));
265 return_VALUE(result); 264 return_VALUE(result);
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 66d9c4643fc1..f12898d53078 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -7,6 +7,7 @@ obj-y := core.o sys.o bus.o dd.o \
7obj-y += power/ 7obj-y += power/
8obj-$(CONFIG_FW_LOADER) += firmware_class.o 8obj-$(CONFIG_FW_LOADER) += firmware_class.o
9obj-$(CONFIG_NUMA) += node.o 9obj-$(CONFIG_NUMA) += node.o
10obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o
10 11
11ifeq ($(CONFIG_DEBUG_DRIVER),y) 12ifeq ($(CONFIG_DEBUG_DRIVER),y)
12EXTRA_CFLAGS += -DDEBUG 13EXTRA_CFLAGS += -DDEBUG
diff --git a/drivers/base/init.c b/drivers/base/init.c
index 84e604e25c4f..c648914b9cde 100644
--- a/drivers/base/init.c
+++ b/drivers/base/init.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/device.h> 10#include <linux/device.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/memory.h>
12 13
13#include "base.h" 14#include "base.h"
14 15
@@ -33,5 +34,6 @@ void __init driver_init(void)
33 platform_bus_init(); 34 platform_bus_init();
34 system_bus_init(); 35 system_bus_init();
35 cpu_dev_init(); 36 cpu_dev_init();
37 memory_dev_init();
36 attribute_container_init(); 38 attribute_container_init();
37} 39}
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
new file mode 100644
index 000000000000..b7ddd651d664
--- /dev/null
+++ b/drivers/base/memory.c
@@ -0,0 +1,452 @@
1/*
2 * drivers/base/memory.c - basic Memory class support
3 *
4 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
5 * Dave Hansen <haveblue@us.ibm.com>
6 *
7 * This file provides the necessary infrastructure to represent
8 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
9 * All arch-independent code that assumes MEMORY_HOTPLUG requires
10 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
11 */
12
13#include <linux/sysdev.h>
14#include <linux/module.h>
15#include <linux/init.h>
16#include <linux/sched.h> /* capable() */
17#include <linux/topology.h>
18#include <linux/device.h>
19#include <linux/memory.h>
20#include <linux/kobject.h>
21#include <linux/memory_hotplug.h>
22#include <linux/mm.h>
23#include <asm/atomic.h>
24#include <asm/uaccess.h>
25
26#define MEMORY_CLASS_NAME "memory"
27
28static struct sysdev_class memory_sysdev_class = {
29 set_kset_name(MEMORY_CLASS_NAME),
30};
31EXPORT_SYMBOL(memory_sysdev_class);
32
33static char *memory_hotplug_name(struct kset *kset, struct kobject *kobj)
34{
35 return MEMORY_CLASS_NAME;
36}
37
38static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
39 int num_envp, char *buffer, int buffer_size)
40{
41 int retval = 0;
42
43 return retval;
44}
45
46static struct kset_hotplug_ops memory_hotplug_ops = {
47 .name = memory_hotplug_name,
48 .hotplug = memory_hotplug,
49};
50
51static struct notifier_block *memory_chain;
52
53static int register_memory_notifier(struct notifier_block *nb)
54{
55 return notifier_chain_register(&memory_chain, nb);
56}
57
58static void unregister_memory_notifier(struct notifier_block *nb)
59{
60 notifier_chain_unregister(&memory_chain, nb);
61}
62
63/*
64 * register_memory - Setup a sysfs device for a memory block
65 */
66static int
67register_memory(struct memory_block *memory, struct mem_section *section,
68 struct node *root)
69{
70 int error;
71
72 memory->sysdev.cls = &memory_sysdev_class;
73 memory->sysdev.id = __section_nr(section);
74
75 error = sysdev_register(&memory->sysdev);
76
77 if (root && !error)
78 error = sysfs_create_link(&root->sysdev.kobj,
79 &memory->sysdev.kobj,
80 kobject_name(&memory->sysdev.kobj));
81
82 return error;
83}
84
85static void
86unregister_memory(struct memory_block *memory, struct mem_section *section,
87 struct node *root)
88{
89 BUG_ON(memory->sysdev.cls != &memory_sysdev_class);
90 BUG_ON(memory->sysdev.id != __section_nr(section));
91
92 sysdev_unregister(&memory->sysdev);
93 if (root)
94 sysfs_remove_link(&root->sysdev.kobj,
95 kobject_name(&memory->sysdev.kobj));
96}
97
98/*
99 * use this as the physical section index that this memsection
100 * uses.
101 */
102
103static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf)
104{
105 struct memory_block *mem =
106 container_of(dev, struct memory_block, sysdev);
107 return sprintf(buf, "%08lx\n", mem->phys_index);
108}
109
110/*
111 * online, offline, going offline, etc.
112 */
113static ssize_t show_mem_state(struct sys_device *dev, char *buf)
114{
115 struct memory_block *mem =
116 container_of(dev, struct memory_block, sysdev);
117 ssize_t len = 0;
118
119 /*
120 * We can probably put these states in a nice little array
121 * so that they're not open-coded
122 */
123 switch (mem->state) {
124 case MEM_ONLINE:
125 len = sprintf(buf, "online\n");
126 break;
127 case MEM_OFFLINE:
128 len = sprintf(buf, "offline\n");
129 break;
130 case MEM_GOING_OFFLINE:
131 len = sprintf(buf, "going-offline\n");
132 break;
133 default:
134 len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
135 mem->state);
136 WARN_ON(1);
137 break;
138 }
139
140 return len;
141}
142
143static inline int memory_notify(unsigned long val, void *v)
144{
145 return notifier_call_chain(&memory_chain, val, v);
146}
147
148/*
149 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
150 * OK to have direct references to sparsemem variables in here.
151 */
152static int
153memory_block_action(struct memory_block *mem, unsigned long action)
154{
155 int i;
156 unsigned long psection;
157 unsigned long start_pfn, start_paddr;
158 struct page *first_page;
159 int ret;
160 int old_state = mem->state;
161
162 psection = mem->phys_index;
163 first_page = pfn_to_page(psection << PFN_SECTION_SHIFT);
164
165 /*
166 * The probe routines leave the pages reserved, just
167 * as the bootmem code does. Make sure they're still
168 * that way.
169 */
170 if (action == MEM_ONLINE) {
171 for (i = 0; i < PAGES_PER_SECTION; i++) {
172 if (PageReserved(first_page+i))
173 continue;
174
175 printk(KERN_WARNING "section number %ld page number %d "
176 "not reserved, was it already online? \n",
177 psection, i);
178 return -EBUSY;
179 }
180 }
181
182 switch (action) {
183 case MEM_ONLINE:
184 start_pfn = page_to_pfn(first_page);
185 ret = online_pages(start_pfn, PAGES_PER_SECTION);
186 break;
187 case MEM_OFFLINE:
188 mem->state = MEM_GOING_OFFLINE;
189 memory_notify(MEM_GOING_OFFLINE, NULL);
190 start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
191 ret = remove_memory(start_paddr,
192 PAGES_PER_SECTION << PAGE_SHIFT);
193 if (ret) {
194 mem->state = old_state;
195 break;
196 }
197 memory_notify(MEM_MAPPING_INVALID, NULL);
198 break;
199 default:
200 printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n",
201 __FUNCTION__, mem, action, action);
202 WARN_ON(1);
203 ret = -EINVAL;
204 }
205 /*
206 * For now, only notify on successful memory operations
207 */
208 if (!ret)
209 memory_notify(action, NULL);
210
211 return ret;
212}
213
214static int memory_block_change_state(struct memory_block *mem,
215 unsigned long to_state, unsigned long from_state_req)
216{
217 int ret = 0;
218 down(&mem->state_sem);
219
220 if (mem->state != from_state_req) {
221 ret = -EINVAL;
222 goto out;
223 }
224
225 ret = memory_block_action(mem, to_state);
226 if (!ret)
227 mem->state = to_state;
228
229out:
230 up(&mem->state_sem);
231 return ret;
232}
233
234static ssize_t
235store_mem_state(struct sys_device *dev, const char *buf, size_t count)
236{
237 struct memory_block *mem;
238 unsigned int phys_section_nr;
239 int ret = -EINVAL;
240
241 mem = container_of(dev, struct memory_block, sysdev);
242 phys_section_nr = mem->phys_index;
243
244 if (!valid_section_nr(phys_section_nr))
245 goto out;
246
247 if (!strncmp(buf, "online", min((int)count, 6)))
248 ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
249 else if(!strncmp(buf, "offline", min((int)count, 7)))
250 ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
251out:
252 if (ret)
253 return ret;
254 return count;
255}
256
257/*
258 * phys_device is a bad name for this. What I really want
259 * is a way to differentiate between memory ranges that
260 * are part of physical devices that constitute
261 * a complete removable unit or fru.
262 * i.e. do these ranges belong to the same physical device,
263 * s.t. if I offline all of these sections I can then
264 * remove the physical device?
265 */
266static ssize_t show_phys_device(struct sys_device *dev, char *buf)
267{
268 struct memory_block *mem =
269 container_of(dev, struct memory_block, sysdev);
270 return sprintf(buf, "%d\n", mem->phys_device);
271}
272
273static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL);
274static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
275static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
276
277#define mem_create_simple_file(mem, attr_name) \
278 sysdev_create_file(&mem->sysdev, &attr_##attr_name)
279#define mem_remove_simple_file(mem, attr_name) \
280 sysdev_remove_file(&mem->sysdev, &attr_##attr_name)
281
282/*
283 * Block size attribute stuff
284 */
285static ssize_t
286print_block_size(struct class *class, char *buf)
287{
288 return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE);
289}
290
291static CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);
292
293static int block_size_init(void)
294{
295 sysfs_create_file(&memory_sysdev_class.kset.kobj,
296 &class_attr_block_size_bytes.attr);
297 return 0;
298}
299
300/*
301 * Some architectures will have custom drivers to do this, and
302 * will not need to do it from userspace. The fake hot-add code
303 * as well as ppc64 will do all of their discovery in userspace
304 * and will require this interface.
305 */
306#ifdef CONFIG_ARCH_MEMORY_PROBE
307static ssize_t
308memory_probe_store(struct class *class, const char __user *buf, size_t count)
309{
310 u64 phys_addr;
311 int ret;
312
313 phys_addr = simple_strtoull(buf, NULL, 0);
314
315 ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
316
317 if (ret)
318 count = ret;
319
320 return count;
321}
322static CLASS_ATTR(probe, 0700, NULL, memory_probe_store);
323
324static int memory_probe_init(void)
325{
326 sysfs_create_file(&memory_sysdev_class.kset.kobj,
327 &class_attr_probe.attr);
328 return 0;
329}
330#else
331#define memory_probe_init(...) do {} while (0)
332#endif
333
334/*
335 * Note that phys_device is optional. It is here to allow for
336 * differentiation between which *physical* devices each
337 * section belongs to...
338 */
339
340static int add_memory_block(unsigned long node_id, struct mem_section *section,
341 unsigned long state, int phys_device)
342{
343 struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
344 int ret = 0;
345
346 if (!mem)
347 return -ENOMEM;
348
349 mem->phys_index = __section_nr(section);
350 mem->state = state;
351 init_MUTEX(&mem->state_sem);
352 mem->phys_device = phys_device;
353
354 ret = register_memory(mem, section, NULL);
355 if (!ret)
356 ret = mem_create_simple_file(mem, phys_index);
357 if (!ret)
358 ret = mem_create_simple_file(mem, state);
359 if (!ret)
360 ret = mem_create_simple_file(mem, phys_device);
361
362 return ret;
363}
364
365/*
366 * For now, we have a linear search to go find the appropriate
367 * memory_block corresponding to a particular phys_index. If
368 * this gets to be a real problem, we can always use a radix
369 * tree or something here.
370 *
371 * This could be made generic for all sysdev classes.
372 */
373static struct memory_block *find_memory_block(struct mem_section *section)
374{
375 struct kobject *kobj;
376 struct sys_device *sysdev;
377 struct memory_block *mem;
378 char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
379
380 /*
381 * This only works because we know that section == sysdev->id
382 * slightly redundant with sysdev_register()
383 */
384 sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section));
385
386 kobj = kset_find_obj(&memory_sysdev_class.kset, name);
387 if (!kobj)
388 return NULL;
389
390 sysdev = container_of(kobj, struct sys_device, kobj);
391 mem = container_of(sysdev, struct memory_block, sysdev);
392
393 return mem;
394}
395
396int remove_memory_block(unsigned long node_id, struct mem_section *section,
397 int phys_device)
398{
399 struct memory_block *mem;
400
401 mem = find_memory_block(section);
402 mem_remove_simple_file(mem, phys_index);
403 mem_remove_simple_file(mem, state);
404 mem_remove_simple_file(mem, phys_device);
405 unregister_memory(mem, section, NULL);
406
407 return 0;
408}
409
410/*
411 * need an interface for the VM to add new memory regions,
412 * but without onlining it.
413 */
414int register_new_memory(struct mem_section *section)
415{
416 return add_memory_block(0, section, MEM_OFFLINE, 0);
417}
418
419int unregister_memory_section(struct mem_section *section)
420{
421 if (!valid_section(section))
422 return -EINVAL;
423
424 return remove_memory_block(0, section, 0);
425}
426
427/*
428 * Initialize the sysfs support for memory devices...
429 */
430int __init memory_dev_init(void)
431{
432 unsigned int i;
433 int ret;
434
435 memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops;
436 ret = sysdev_class_register(&memory_sysdev_class);
437
438 /*
439 * Create entries for memory sections that were found
440 * during boot and have been initialized
441 */
442 for (i = 0; i < NR_MEM_SECTIONS; i++) {
443 if (!valid_section_nr(i))
444 continue;
445 add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0);
446 }
447
448 memory_probe_init();
449 block_size_init();
450
451 return ret;
452}
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 07fee811c09e..d86d5c26061d 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1887,13 +1887,17 @@ st_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_pages,
1887 int i; 1887 int i;
1888 1888
1889 for (i=0; i < nr_pages; i++) { 1889 for (i=0; i < nr_pages; i++) {
1890 if (dirtied && !PageReserved(sgl[i].page)) 1890 struct page *page = sgl[i].page;
1891 SetPageDirty(sgl[i].page); 1891
1892 /* unlock_page(sgl[i].page); */ 1892 /* XXX: just for debug. Remove when PageReserved is removed */
1893 BUG_ON(PageReserved(page));
1894 if (dirtied)
1895 SetPageDirty(page);
1896 /* unlock_page(page); */
1893 /* FIXME: cache flush missing for rw==READ 1897 /* FIXME: cache flush missing for rw==READ
1894 * FIXME: call the correct reference counting function 1898 * FIXME: call the correct reference counting function
1895 */ 1899 */
1896 page_cache_release(sgl[i].page); 1900 page_cache_release(page);
1897 } 1901 }
1898 1902
1899 return 0; 1903 return 0;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 5eb54d8019b4..da9766283bd7 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -4526,12 +4526,16 @@ static int sgl_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_p
4526 int i; 4526 int i;
4527 4527
4528 for (i=0; i < nr_pages; i++) { 4528 for (i=0; i < nr_pages; i++) {
4529 if (dirtied && !PageReserved(sgl[i].page)) 4529 struct page *page = sgl[i].page;
4530 SetPageDirty(sgl[i].page); 4530
4531 /* XXX: just for debug. Remove when PageReserved is removed */
4532 BUG_ON(PageReserved(page));
4533 if (dirtied)
4534 SetPageDirty(page);
4531 /* FIXME: cache flush missing for rw==READ 4535 /* FIXME: cache flush missing for rw==READ
4532 * FIXME: call the correct reference counting function 4536 * FIXME: call the correct reference counting function
4533 */ 4537 */
4534 page_cache_release(sgl[i].page); 4538 page_cache_release(page);
4535 } 4539 }
4536 4540
4537 return 0; 4541 return 0;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0d576987ec67..4975c9c193dd 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -291,8 +291,8 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
291 cachefs_uncache_page(vnode->cache, page); 291 cachefs_uncache_page(vnode->cache, page);
292#endif 292#endif
293 293
294 pageio = (struct cachefs_page *) page->private; 294 pageio = (struct cachefs_page *) page_private(page);
295 page->private = 0; 295 set_page_private(page, 0);
296 ClearPagePrivate(page); 296 ClearPagePrivate(page);
297 297
298 if (pageio) 298 if (pageio)
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index dd9baabaf016..72011826f0cb 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -318,7 +318,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
318 current->mm->free_area_cache = current->mm->mmap_base; 318 current->mm->free_area_cache = current->mm->mmap_base;
319 current->mm->cached_hole_size = 0; 319 current->mm->cached_hole_size = 0;
320 320
321 set_mm_counter(current->mm, rss, 0);
322 current->mm->mmap = NULL; 321 current->mm->mmap = NULL;
323 compute_creds(bprm); 322 compute_creds(bprm);
324 current->flags &= ~PF_FORKNOEXEC; 323 current->flags &= ~PF_FORKNOEXEC;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d4b15576e584..918ccc267e41 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -773,7 +773,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
773 773
774 /* Do this so that we can load the interpreter, if need be. We will 774 /* Do this so that we can load the interpreter, if need be. We will
775 change some of these later */ 775 change some of these later */
776 set_mm_counter(current->mm, rss, 0);
777 current->mm->free_area_cache = current->mm->mmap_base; 776 current->mm->free_area_cache = current->mm->mmap_base;
778 current->mm->cached_hole_size = 0; 777 current->mm->cached_hole_size = 0;
779 retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), 778 retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 134c9c0d1f54..dda87c4c82a3 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -294,14 +294,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs
294 &interp_params, 294 &interp_params,
295 &current->mm->start_stack, 295 &current->mm->start_stack,
296 &current->mm->start_brk); 296 &current->mm->start_brk);
297#endif
298
299 /* do this so that we can load the interpreter, if need be
300 * - we will change some of these later
301 */
302 set_mm_counter(current->mm, rss, 0);
303 297
304#ifdef CONFIG_MMU
305 retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack); 298 retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack);
306 if (retval < 0) { 299 if (retval < 0) {
307 send_sig(SIGKILL, current, 0); 300 send_sig(SIGKILL, current, 0);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 7974efa107bc..9d6625829b99 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -650,7 +650,6 @@ static int load_flat_file(struct linux_binprm * bprm,
650 current->mm->start_brk = datapos + data_len + bss_len; 650 current->mm->start_brk = datapos + data_len + bss_len;
651 current->mm->brk = (current->mm->start_brk + 3) & ~3; 651 current->mm->brk = (current->mm->start_brk + 3) & ~3;
652 current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len; 652 current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len;
653 set_mm_counter(current->mm, rss, 0);
654 } 653 }
655 654
656 if (flags & FLAT_FLAG_KTRACE) 655 if (flags & FLAT_FLAG_KTRACE)
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 227a2682d2bf..00a91dc25d16 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -259,7 +259,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
259 create_som_tables(bprm); 259 create_som_tables(bprm);
260 260
261 current->mm->start_stack = bprm->p; 261 current->mm->start_stack = bprm->p;
262 set_mm_counter(current->mm, rss, 0);
263 262
264#if 0 263#if 0
265 printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk); 264 printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk);
diff --git a/fs/buffer.c b/fs/buffer.c
index b1667986442f..2066e4cb700c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -96,7 +96,7 @@ static void
96__clear_page_buffers(struct page *page) 96__clear_page_buffers(struct page *page)
97{ 97{
98 ClearPagePrivate(page); 98 ClearPagePrivate(page);
99 page->private = 0; 99 set_page_private(page, 0);
100 page_cache_release(page); 100 page_cache_release(page);
101} 101}
102 102
diff --git a/fs/compat.c b/fs/compat.c
index a719e158e002..8e71cdbecc7c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1490,7 +1490,6 @@ int compat_do_execve(char * filename,
1490 /* execve success */ 1490 /* execve success */
1491 security_bprm_free(bprm); 1491 security_bprm_free(bprm);
1492 acct_update_integrals(current); 1492 acct_update_integrals(current);
1493 update_mem_hiwater(current);
1494 kfree(bprm); 1493 kfree(bprm);
1495 return retval; 1494 return retval;
1496 } 1495 }
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 0d06097bc995..3931e7f1e6bf 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -162,6 +162,7 @@ static int dio_refill_pages(struct dio *dio)
162 up_read(&current->mm->mmap_sem); 162 up_read(&current->mm->mmap_sem);
163 163
164 if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) { 164 if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
165 struct page *page = ZERO_PAGE(dio->curr_user_address);
165 /* 166 /*
166 * A memory fault, but the filesystem has some outstanding 167 * A memory fault, but the filesystem has some outstanding
167 * mapped blocks. We need to use those blocks up to avoid 168 * mapped blocks. We need to use those blocks up to avoid
@@ -169,7 +170,8 @@ static int dio_refill_pages(struct dio *dio)
169 */ 170 */
170 if (dio->page_errors == 0) 171 if (dio->page_errors == 0)
171 dio->page_errors = ret; 172 dio->page_errors = ret;
172 dio->pages[0] = ZERO_PAGE(dio->curr_user_address); 173 page_cache_get(page);
174 dio->pages[0] = page;
173 dio->head = 0; 175 dio->head = 0;
174 dio->tail = 1; 176 dio->tail = 1;
175 ret = 0; 177 ret = 0;
diff --git a/fs/exec.c b/fs/exec.c
index d2208f7c87db..ba73797eb4cb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -309,40 +309,36 @@ void install_arg_page(struct vm_area_struct *vma,
309 pud_t * pud; 309 pud_t * pud;
310 pmd_t * pmd; 310 pmd_t * pmd;
311 pte_t * pte; 311 pte_t * pte;
312 spinlock_t *ptl;
312 313
313 if (unlikely(anon_vma_prepare(vma))) 314 if (unlikely(anon_vma_prepare(vma)))
314 goto out_sig; 315 goto out;
315 316
316 flush_dcache_page(page); 317 flush_dcache_page(page);
317 pgd = pgd_offset(mm, address); 318 pgd = pgd_offset(mm, address);
318
319 spin_lock(&mm->page_table_lock);
320 pud = pud_alloc(mm, pgd, address); 319 pud = pud_alloc(mm, pgd, address);
321 if (!pud) 320 if (!pud)
322 goto out; 321 goto out;
323 pmd = pmd_alloc(mm, pud, address); 322 pmd = pmd_alloc(mm, pud, address);
324 if (!pmd) 323 if (!pmd)
325 goto out; 324 goto out;
326 pte = pte_alloc_map(mm, pmd, address); 325 pte = pte_alloc_map_lock(mm, pmd, address, &ptl);
327 if (!pte) 326 if (!pte)
328 goto out; 327 goto out;
329 if (!pte_none(*pte)) { 328 if (!pte_none(*pte)) {
330 pte_unmap(pte); 329 pte_unmap_unlock(pte, ptl);
331 goto out; 330 goto out;
332 } 331 }
333 inc_mm_counter(mm, rss); 332 inc_mm_counter(mm, anon_rss);
334 lru_cache_add_active(page); 333 lru_cache_add_active(page);
335 set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( 334 set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
336 page, vma->vm_page_prot)))); 335 page, vma->vm_page_prot))));
337 page_add_anon_rmap(page, vma, address); 336 page_add_anon_rmap(page, vma, address);
338 pte_unmap(pte); 337 pte_unmap_unlock(pte, ptl);
339 spin_unlock(&mm->page_table_lock);
340 338
341 /* no need for flush_tlb */ 339 /* no need for flush_tlb */
342 return; 340 return;
343out: 341out:
344 spin_unlock(&mm->page_table_lock);
345out_sig:
346 __free_page(page); 342 __free_page(page);
347 force_sig(SIGKILL, current); 343 force_sig(SIGKILL, current);
348} 344}
@@ -1207,7 +1203,6 @@ int do_execve(char * filename,
1207 /* execve success */ 1203 /* execve success */
1208 security_bprm_free(bprm); 1204 security_bprm_free(bprm);
1209 acct_update_integrals(current); 1205 acct_update_integrals(current);
1210 update_mem_hiwater(current);
1211 kfree(bprm); 1206 kfree(bprm);
1212 return retval; 1207 return retval;
1213 } 1208 }
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3a9b6d179cbd..e026c807e6b3 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -45,10 +45,58 @@ static struct backing_dev_info hugetlbfs_backing_dev_info = {
45 45
46int sysctl_hugetlb_shm_group; 46int sysctl_hugetlb_shm_group;
47 47
48static void huge_pagevec_release(struct pagevec *pvec)
49{
50 int i;
51
52 for (i = 0; i < pagevec_count(pvec); ++i)
53 put_page(pvec->pages[i]);
54
55 pagevec_reinit(pvec);
56}
57
58/*
59 * huge_pages_needed tries to determine the number of new huge pages that
60 * will be required to fully populate this VMA. This will be equal to
61 * the size of the VMA in huge pages minus the number of huge pages
62 * (covered by this VMA) that are found in the page cache.
63 *
64 * Result is in bytes to be compatible with is_hugepage_mem_enough()
65 */
66unsigned long
67huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
68{
69 int i;
70 struct pagevec pvec;
71 unsigned long start = vma->vm_start;
72 unsigned long end = vma->vm_end;
73 unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
74 pgoff_t next = vma->vm_pgoff;
75 pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT);
76
77 pagevec_init(&pvec, 0);
78 while (next < endpg) {
79 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
80 break;
81 for (i = 0; i < pagevec_count(&pvec); i++) {
82 struct page *page = pvec.pages[i];
83 if (page->index > next)
84 next = page->index;
85 if (page->index >= endpg)
86 break;
87 next++;
88 hugepages--;
89 }
90 huge_pagevec_release(&pvec);
91 }
92 return hugepages << HPAGE_SHIFT;
93}
94
48static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) 95static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
49{ 96{
50 struct inode *inode = file->f_dentry->d_inode; 97 struct inode *inode = file->f_dentry->d_inode;
51 struct address_space *mapping = inode->i_mapping; 98 struct address_space *mapping = inode->i_mapping;
99 unsigned long bytes;
52 loff_t len, vma_len; 100 loff_t len, vma_len;
53 int ret; 101 int ret;
54 102
@@ -67,6 +115,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
67 if (vma->vm_end - vma->vm_start < HPAGE_SIZE) 115 if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
68 return -EINVAL; 116 return -EINVAL;
69 117
118 bytes = huge_pages_needed(mapping, vma);
119 if (!is_hugepage_mem_enough(bytes))
120 return -ENOMEM;
121
70 vma_len = (loff_t)(vma->vm_end - vma->vm_start); 122 vma_len = (loff_t)(vma->vm_end - vma->vm_start);
71 123
72 down(&inode->i_sem); 124 down(&inode->i_sem);
@@ -79,10 +131,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
79 if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) 131 if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
80 goto out; 132 goto out;
81 133
82 ret = hugetlb_prefault(mapping, vma); 134 ret = 0;
83 if (ret) 135 hugetlb_prefault_arch_hook(vma->vm_mm);
84 goto out;
85
86 if (inode->i_size < len) 136 if (inode->i_size < len)
87 inode->i_size = len; 137 inode->i_size = len;
88out: 138out:
@@ -92,7 +142,7 @@ out:
92} 142}
93 143
94/* 144/*
95 * Called under down_write(mmap_sem), page_table_lock is not held 145 * Called under down_write(mmap_sem).
96 */ 146 */
97 147
98#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA 148#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
@@ -171,16 +221,6 @@ static int hugetlbfs_commit_write(struct file *file,
171 return -EINVAL; 221 return -EINVAL;
172} 222}
173 223
174static void huge_pagevec_release(struct pagevec *pvec)
175{
176 int i;
177
178 for (i = 0; i < pagevec_count(pvec); ++i)
179 put_page(pvec->pages[i]);
180
181 pagevec_reinit(pvec);
182}
183
184static void truncate_huge_page(struct page *page) 224static void truncate_huge_page(struct page *page)
185{ 225{
186 clear_page_dirty(page); 226 clear_page_dirty(page);
@@ -224,52 +264,35 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
224 264
225static void hugetlbfs_delete_inode(struct inode *inode) 265static void hugetlbfs_delete_inode(struct inode *inode)
226{ 266{
227 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb);
228
229 hlist_del_init(&inode->i_hash);
230 list_del_init(&inode->i_list);
231 list_del_init(&inode->i_sb_list);
232 inode->i_state |= I_FREEING;
233 inodes_stat.nr_inodes--;
234 spin_unlock(&inode_lock);
235
236 if (inode->i_data.nrpages) 267 if (inode->i_data.nrpages)
237 truncate_hugepages(&inode->i_data, 0); 268 truncate_hugepages(&inode->i_data, 0);
238
239 security_inode_delete(inode);
240
241 if (sbinfo->free_inodes >= 0) {
242 spin_lock(&sbinfo->stat_lock);
243 sbinfo->free_inodes++;
244 spin_unlock(&sbinfo->stat_lock);
245 }
246
247 clear_inode(inode); 269 clear_inode(inode);
248 destroy_inode(inode);
249} 270}
250 271
251static void hugetlbfs_forget_inode(struct inode *inode) 272static void hugetlbfs_forget_inode(struct inode *inode)
252{ 273{
253 struct super_block *super_block = inode->i_sb; 274 struct super_block *sb = inode->i_sb;
254 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(super_block);
255 275
256 if (hlist_unhashed(&inode->i_hash)) 276 if (!hlist_unhashed(&inode->i_hash)) {
257 goto out_truncate; 277 if (!(inode->i_state & (I_DIRTY|I_LOCK)))
258 278 list_move(&inode->i_list, &inode_unused);
259 if (!(inode->i_state & (I_DIRTY|I_LOCK))) { 279 inodes_stat.nr_unused++;
260 list_del(&inode->i_list); 280 if (!sb || (sb->s_flags & MS_ACTIVE)) {
261 list_add(&inode->i_list, &inode_unused); 281 spin_unlock(&inode_lock);
262 } 282 return;
263 inodes_stat.nr_unused++; 283 }
264 if (!super_block || (super_block->s_flags & MS_ACTIVE)) { 284 inode->i_state |= I_WILL_FREE;
265 spin_unlock(&inode_lock); 285 spin_unlock(&inode_lock);
266 return; 286 /*
287 * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK
288 * in our backing_dev_info.
289 */
290 write_inode_now(inode, 1);
291 spin_lock(&inode_lock);
292 inode->i_state &= ~I_WILL_FREE;
293 inodes_stat.nr_unused--;
294 hlist_del_init(&inode->i_hash);
267 } 295 }
268
269 /* write_inode_now() ? */
270 inodes_stat.nr_unused--;
271 hlist_del_init(&inode->i_hash);
272out_truncate:
273 list_del_init(&inode->i_list); 296 list_del_init(&inode->i_list);
274 list_del_init(&inode->i_sb_list); 297 list_del_init(&inode->i_sb_list);
275 inode->i_state |= I_FREEING; 298 inode->i_state |= I_FREEING;
@@ -277,13 +300,6 @@ out_truncate:
277 spin_unlock(&inode_lock); 300 spin_unlock(&inode_lock);
278 if (inode->i_data.nrpages) 301 if (inode->i_data.nrpages)
279 truncate_hugepages(&inode->i_data, 0); 302 truncate_hugepages(&inode->i_data, 0);
280
281 if (sbinfo->free_inodes >= 0) {
282 spin_lock(&sbinfo->stat_lock);
283 sbinfo->free_inodes++;
284 spin_unlock(&sbinfo->stat_lock);
285 }
286
287 clear_inode(inode); 303 clear_inode(inode);
288 destroy_inode(inode); 304 destroy_inode(inode);
289} 305}
@@ -291,7 +307,7 @@ out_truncate:
291static void hugetlbfs_drop_inode(struct inode *inode) 307static void hugetlbfs_drop_inode(struct inode *inode)
292{ 308{
293 if (!inode->i_nlink) 309 if (!inode->i_nlink)
294 hugetlbfs_delete_inode(inode); 310 generic_delete_inode(inode);
295 else 311 else
296 hugetlbfs_forget_inode(inode); 312 hugetlbfs_forget_inode(inode);
297} 313}
@@ -308,7 +324,6 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
308 324
309 vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) { 325 vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) {
310 unsigned long h_vm_pgoff; 326 unsigned long h_vm_pgoff;
311 unsigned long v_length;
312 unsigned long v_offset; 327 unsigned long v_offset;
313 328
314 h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT); 329 h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
@@ -319,11 +334,8 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
319 if (h_vm_pgoff >= h_pgoff) 334 if (h_vm_pgoff >= h_pgoff)
320 v_offset = 0; 335 v_offset = 0;
321 336
322 v_length = vma->vm_end - vma->vm_start; 337 unmap_hugepage_range(vma,
323 338 vma->vm_start + v_offset, vma->vm_end);
324 zap_hugepage_range(vma,
325 vma->vm_start + v_offset,
326 v_length - v_offset);
327 } 339 }
328} 340}
329 341
@@ -379,17 +391,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
379 gid_t gid, int mode, dev_t dev) 391 gid_t gid, int mode, dev_t dev)
380{ 392{
381 struct inode *inode; 393 struct inode *inode;
382 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
383
384 if (sbinfo->free_inodes >= 0) {
385 spin_lock(&sbinfo->stat_lock);
386 if (!sbinfo->free_inodes) {
387 spin_unlock(&sbinfo->stat_lock);
388 return NULL;
389 }
390 sbinfo->free_inodes--;
391 spin_unlock(&sbinfo->stat_lock);
392 }
393 394
394 inode = new_inode(sb); 395 inode = new_inode(sb);
395 if (inode) { 396 if (inode) {
@@ -531,29 +532,51 @@ static void hugetlbfs_put_super(struct super_block *sb)
531 } 532 }
532} 533}
533 534
535static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
536{
537 if (sbinfo->free_inodes >= 0) {
538 spin_lock(&sbinfo->stat_lock);
539 if (unlikely(!sbinfo->free_inodes)) {
540 spin_unlock(&sbinfo->stat_lock);
541 return 0;
542 }
543 sbinfo->free_inodes--;
544 spin_unlock(&sbinfo->stat_lock);
545 }
546
547 return 1;
548}
549
550static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
551{
552 if (sbinfo->free_inodes >= 0) {
553 spin_lock(&sbinfo->stat_lock);
554 sbinfo->free_inodes++;
555 spin_unlock(&sbinfo->stat_lock);
556 }
557}
558
559
534static kmem_cache_t *hugetlbfs_inode_cachep; 560static kmem_cache_t *hugetlbfs_inode_cachep;
535 561
536static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) 562static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
537{ 563{
564 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
538 struct hugetlbfs_inode_info *p; 565 struct hugetlbfs_inode_info *p;
539 566
567 if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
568 return NULL;
540 p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL); 569 p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL);
541 if (!p) 570 if (unlikely(!p)) {
571 hugetlbfs_inc_free_inodes(sbinfo);
542 return NULL; 572 return NULL;
573 }
543 return &p->vfs_inode; 574 return &p->vfs_inode;
544} 575}
545 576
546static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
547{
548 struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
549
550 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
551 SLAB_CTOR_CONSTRUCTOR)
552 inode_init_once(&ei->vfs_inode);
553}
554
555static void hugetlbfs_destroy_inode(struct inode *inode) 577static void hugetlbfs_destroy_inode(struct inode *inode)
556{ 578{
579 hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
557 mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); 580 mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
558 kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); 581 kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
559} 582}
@@ -565,6 +588,16 @@ static struct address_space_operations hugetlbfs_aops = {
565 .set_page_dirty = hugetlbfs_set_page_dirty, 588 .set_page_dirty = hugetlbfs_set_page_dirty,
566}; 589};
567 590
591
592static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
593{
594 struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
595
596 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
597 SLAB_CTOR_CONSTRUCTOR)
598 inode_init_once(&ei->vfs_inode);
599}
600
568struct file_operations hugetlbfs_file_operations = { 601struct file_operations hugetlbfs_file_operations = {
569 .mmap = hugetlbfs_file_mmap, 602 .mmap = hugetlbfs_file_mmap,
570 .fsync = simple_sync_file, 603 .fsync = simple_sync_file,
@@ -592,6 +625,7 @@ static struct super_operations hugetlbfs_ops = {
592 .alloc_inode = hugetlbfs_alloc_inode, 625 .alloc_inode = hugetlbfs_alloc_inode,
593 .destroy_inode = hugetlbfs_destroy_inode, 626 .destroy_inode = hugetlbfs_destroy_inode,
594 .statfs = hugetlbfs_statfs, 627 .statfs = hugetlbfs_statfs,
628 .delete_inode = hugetlbfs_delete_inode,
595 .drop_inode = hugetlbfs_drop_inode, 629 .drop_inode = hugetlbfs_drop_inode,
596 .put_super = hugetlbfs_put_super, 630 .put_super = hugetlbfs_put_super,
597}; 631};
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 26091a5f88d4..8a53981f9f27 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -86,7 +86,7 @@ struct meta_anchor {
86 atomic_t io_count; 86 atomic_t io_count;
87 struct metapage *mp[MPS_PER_PAGE]; 87 struct metapage *mp[MPS_PER_PAGE];
88}; 88};
89#define mp_anchor(page) ((struct meta_anchor *)page->private) 89#define mp_anchor(page) ((struct meta_anchor *)page_private(page))
90 90
91static inline struct metapage *page_to_mp(struct page *page, uint offset) 91static inline struct metapage *page_to_mp(struct page *page, uint offset)
92{ 92{
@@ -108,7 +108,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
108 if (!a) 108 if (!a)
109 return -ENOMEM; 109 return -ENOMEM;
110 memset(a, 0, sizeof(struct meta_anchor)); 110 memset(a, 0, sizeof(struct meta_anchor));
111 page->private = (unsigned long)a; 111 set_page_private(page, (unsigned long)a);
112 SetPagePrivate(page); 112 SetPagePrivate(page);
113 kmap(page); 113 kmap(page);
114 } 114 }
@@ -136,7 +136,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp)
136 a->mp[index] = NULL; 136 a->mp[index] = NULL;
137 if (--a->mp_count == 0) { 137 if (--a->mp_count == 0) {
138 kfree(a); 138 kfree(a);
139 page->private = 0; 139 set_page_private(page, 0);
140 ClearPagePrivate(page); 140 ClearPagePrivate(page);
141 kunmap(page); 141 kunmap(page);
142 } 142 }
@@ -156,13 +156,13 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *))
156#else 156#else
157static inline struct metapage *page_to_mp(struct page *page, uint offset) 157static inline struct metapage *page_to_mp(struct page *page, uint offset)
158{ 158{
159 return PagePrivate(page) ? (struct metapage *)page->private : NULL; 159 return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
160} 160}
161 161
162static inline int insert_metapage(struct page *page, struct metapage *mp) 162static inline int insert_metapage(struct page *page, struct metapage *mp)
163{ 163{
164 if (mp) { 164 if (mp) {
165 page->private = (unsigned long)mp; 165 set_page_private(page, (unsigned long)mp);
166 SetPagePrivate(page); 166 SetPagePrivate(page);
167 kmap(page); 167 kmap(page);
168 } 168 }
@@ -171,7 +171,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
171 171
172static inline void remove_metapage(struct page *page, struct metapage *mp) 172static inline void remove_metapage(struct page *page, struct metapage *mp)
173{ 173{
174 page->private = 0; 174 set_page_private(page, 0);
175 ClearPagePrivate(page); 175 ClearPagePrivate(page);
176 kunmap(page); 176 kunmap(page);
177} 177}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d84eecacbeaf..3e1239e4b303 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -438,7 +438,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
438 jiffies_to_clock_t(it_real_value), 438 jiffies_to_clock_t(it_real_value),
439 start_time, 439 start_time,
440 vsize, 440 vsize,
441 mm ? get_mm_counter(mm, rss) : 0, /* you might want to shift this left 3 */ 441 mm ? get_mm_rss(mm) : 0,
442 rsslim, 442 rsslim,
443 mm ? mm->start_code : 0, 443 mm ? mm->start_code : 0,
444 mm ? mm->end_code : 0, 444 mm ? mm->end_code : 0,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c7ef3e48e35b..d2fa42006d8f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -14,22 +14,41 @@
14char *task_mem(struct mm_struct *mm, char *buffer) 14char *task_mem(struct mm_struct *mm, char *buffer)
15{ 15{
16 unsigned long data, text, lib; 16 unsigned long data, text, lib;
17 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
18
19 /*
20 * Note: to minimize their overhead, mm maintains hiwater_vm and
21 * hiwater_rss only when about to *lower* total_vm or rss. Any
22 * collector of these hiwater stats must therefore get total_vm
23 * and rss too, which will usually be the higher. Barriers? not
24 * worth the effort, such snapshots can always be inconsistent.
25 */
26 hiwater_vm = total_vm = mm->total_vm;
27 if (hiwater_vm < mm->hiwater_vm)
28 hiwater_vm = mm->hiwater_vm;
29 hiwater_rss = total_rss = get_mm_rss(mm);
30 if (hiwater_rss < mm->hiwater_rss)
31 hiwater_rss = mm->hiwater_rss;
17 32
18 data = mm->total_vm - mm->shared_vm - mm->stack_vm; 33 data = mm->total_vm - mm->shared_vm - mm->stack_vm;
19 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; 34 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
20 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; 35 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
21 buffer += sprintf(buffer, 36 buffer += sprintf(buffer,
37 "VmPeak:\t%8lu kB\n"
22 "VmSize:\t%8lu kB\n" 38 "VmSize:\t%8lu kB\n"
23 "VmLck:\t%8lu kB\n" 39 "VmLck:\t%8lu kB\n"
40 "VmHWM:\t%8lu kB\n"
24 "VmRSS:\t%8lu kB\n" 41 "VmRSS:\t%8lu kB\n"
25 "VmData:\t%8lu kB\n" 42 "VmData:\t%8lu kB\n"
26 "VmStk:\t%8lu kB\n" 43 "VmStk:\t%8lu kB\n"
27 "VmExe:\t%8lu kB\n" 44 "VmExe:\t%8lu kB\n"
28 "VmLib:\t%8lu kB\n" 45 "VmLib:\t%8lu kB\n"
29 "VmPTE:\t%8lu kB\n", 46 "VmPTE:\t%8lu kB\n",
30 (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), 47 hiwater_vm << (PAGE_SHIFT-10),
48 (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
31 mm->locked_vm << (PAGE_SHIFT-10), 49 mm->locked_vm << (PAGE_SHIFT-10),
32 get_mm_counter(mm, rss) << (PAGE_SHIFT-10), 50 hiwater_rss << (PAGE_SHIFT-10),
51 total_rss << (PAGE_SHIFT-10),
33 data << (PAGE_SHIFT-10), 52 data << (PAGE_SHIFT-10),
34 mm->stack_vm << (PAGE_SHIFT-10), text, lib, 53 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
35 (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); 54 (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
@@ -44,13 +63,11 @@ unsigned long task_vsize(struct mm_struct *mm)
44int task_statm(struct mm_struct *mm, int *shared, int *text, 63int task_statm(struct mm_struct *mm, int *shared, int *text,
45 int *data, int *resident) 64 int *data, int *resident)
46{ 65{
47 int rss = get_mm_counter(mm, rss); 66 *shared = get_mm_counter(mm, file_rss);
48
49 *shared = rss - get_mm_counter(mm, anon_rss);
50 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) 67 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
51 >> PAGE_SHIFT; 68 >> PAGE_SHIFT;
52 *data = mm->total_vm - mm->shared_vm; 69 *data = mm->total_vm - mm->shared_vm;
53 *resident = rss; 70 *resident = *shared + get_mm_counter(mm, anon_rss);
54 return mm->total_vm; 71 return mm->total_vm;
55} 72}
56 73
@@ -186,13 +203,14 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
186 struct mem_size_stats *mss) 203 struct mem_size_stats *mss)
187{ 204{
188 pte_t *pte, ptent; 205 pte_t *pte, ptent;
206 spinlock_t *ptl;
189 unsigned long pfn; 207 unsigned long pfn;
190 struct page *page; 208 struct page *page;
191 209
192 pte = pte_offset_map(pmd, addr); 210 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
193 do { 211 do {
194 ptent = *pte; 212 ptent = *pte;
195 if (pte_none(ptent) || !pte_present(ptent)) 213 if (!pte_present(ptent))
196 continue; 214 continue;
197 215
198 mss->resident += PAGE_SIZE; 216 mss->resident += PAGE_SIZE;
@@ -213,8 +231,8 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
213 mss->private_clean += PAGE_SIZE; 231 mss->private_clean += PAGE_SIZE;
214 } 232 }
215 } while (pte++, addr += PAGE_SIZE, addr != end); 233 } while (pte++, addr += PAGE_SIZE, addr != end);
216 pte_unmap(pte - 1); 234 pte_unmap_unlock(pte - 1, ptl);
217 cond_resched_lock(&vma->vm_mm->page_table_lock); 235 cond_resched();
218} 236}
219 237
220static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud, 238static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -268,17 +286,11 @@ static inline void smaps_pgd_range(struct vm_area_struct *vma,
268static int show_smap(struct seq_file *m, void *v) 286static int show_smap(struct seq_file *m, void *v)
269{ 287{
270 struct vm_area_struct *vma = v; 288 struct vm_area_struct *vma = v;
271 struct mm_struct *mm = vma->vm_mm;
272 struct mem_size_stats mss; 289 struct mem_size_stats mss;
273 290
274 memset(&mss, 0, sizeof mss); 291 memset(&mss, 0, sizeof mss);
275 292 if (vma->vm_mm)
276 if (mm) {
277 spin_lock(&mm->page_table_lock);
278 smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss); 293 smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss);
279 spin_unlock(&mm->page_table_lock);
280 }
281
282 return show_map_internal(m, v, &mss); 294 return show_map_internal(m, v, &mss);
283} 295}
284 296
@@ -407,7 +419,6 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
407 for_each_node(i) 419 for_each_node(i)
408 md->node[i] =0; 420 md->node[i] =0;
409 421
410 spin_lock(&mm->page_table_lock);
411 for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { 422 for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
412 page = follow_page(mm, vaddr, 0); 423 page = follow_page(mm, vaddr, 0);
413 if (page) { 424 if (page) {
@@ -422,8 +433,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
422 md->anon++; 433 md->anon++;
423 md->node[page_to_nid(page)]++; 434 md->node[page_to_nid(page)]++;
424 } 435 }
436 cond_resched();
425 } 437 }
426 spin_unlock(&mm->page_table_lock);
427 return md; 438 return md;
428} 439}
429 440
@@ -469,7 +480,7 @@ static int show_numa_map(struct seq_file *m, void *v)
469 seq_printf(m, " interleave={"); 480 seq_printf(m, " interleave={");
470 first = 1; 481 first = 1;
471 for_each_node(n) { 482 for_each_node(n) {
472 if (test_bit(n, pol->v.nodes)) { 483 if (node_isset(n, pol->v.nodes)) {
473 if (!first) 484 if (!first)
474 seq_putc(m,','); 485 seq_putc(m,',');
475 else 486 else
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ba4767c04adf..4cd46abe8434 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -181,8 +181,9 @@ set_page_region(
181 size_t offset, 181 size_t offset,
182 size_t length) 182 size_t length)
183{ 183{
184 page->private |= page_region_mask(offset, length); 184 set_page_private(page,
185 if (page->private == ~0UL) 185 page_private(page) | page_region_mask(offset, length));
186 if (page_private(page) == ~0UL)
186 SetPageUptodate(page); 187 SetPageUptodate(page);
187} 188}
188 189
@@ -194,7 +195,7 @@ test_page_region(
194{ 195{
195 unsigned long mask = page_region_mask(offset, length); 196 unsigned long mask = page_region_mask(offset, length);
196 197
197 return (mask && (page->private & mask) == mask); 198 return (mask && (page_private(page) & mask) == mask);
198} 199}
199 200
200/* 201/*
diff --git a/include/asm-alpha/barrier.h b/include/asm-alpha/barrier.h
index 229c83fe77cb..681ff581afa5 100644
--- a/include/asm-alpha/barrier.h
+++ b/include/asm-alpha/barrier.h
@@ -1,6 +1,8 @@
1#ifndef __BARRIER_H 1#ifndef __BARRIER_H
2#define __BARRIER_H 2#define __BARRIER_H
3 3
4#include <asm/compiler.h>
5
4#define mb() \ 6#define mb() \
5__asm__ __volatile__("mb": : :"memory") 7__asm__ __volatile__("mb": : :"memory")
6 8
diff --git a/include/asm-alpha/rwsem.h b/include/asm-alpha/rwsem.h
index 8e058a67c9a4..fafdd4f7010a 100644
--- a/include/asm-alpha/rwsem.h
+++ b/include/asm-alpha/rwsem.h
@@ -262,5 +262,10 @@ static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem)
262#endif 262#endif
263} 263}
264 264
265static inline int rwsem_is_locked(struct rw_semaphore *sem)
266{
267 return (sem->count != 0);
268}
269
265#endif /* __KERNEL__ */ 270#endif /* __KERNEL__ */
266#endif /* _ALPHA_RWSEM_H */ 271#endif /* _ALPHA_RWSEM_H */
diff --git a/include/asm-arm/tlb.h b/include/asm-arm/tlb.h
index 9bb325c54645..f49bfb78c221 100644
--- a/include/asm-arm/tlb.h
+++ b/include/asm-arm/tlb.h
@@ -27,11 +27,7 @@
27 */ 27 */
28struct mmu_gather { 28struct mmu_gather {
29 struct mm_struct *mm; 29 struct mm_struct *mm;
30 unsigned int freed;
31 unsigned int fullmm; 30 unsigned int fullmm;
32
33 unsigned int flushes;
34 unsigned int avoided_flushes;
35}; 31};
36 32
37DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); 33DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -39,11 +35,9 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
39static inline struct mmu_gather * 35static inline struct mmu_gather *
40tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) 36tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
41{ 37{
42 int cpu = smp_processor_id(); 38 struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
43 struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu);
44 39
45 tlb->mm = mm; 40 tlb->mm = mm;
46 tlb->freed = 0;
47 tlb->fullmm = full_mm_flush; 41 tlb->fullmm = full_mm_flush;
48 42
49 return tlb; 43 return tlb;
@@ -52,24 +46,13 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
52static inline void 46static inline void
53tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) 47tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
54{ 48{
55 struct mm_struct *mm = tlb->mm;
56 unsigned long freed = tlb->freed;
57 int rss = get_mm_counter(mm, rss);
58
59 if (rss < freed)
60 freed = rss;
61 add_mm_counter(mm, rss, -freed);
62
63 if (tlb->fullmm) 49 if (tlb->fullmm)
64 flush_tlb_mm(mm); 50 flush_tlb_mm(tlb->mm);
65 51
66 /* keep the page table cache within bounds */ 52 /* keep the page table cache within bounds */
67 check_pgt_cache(); 53 check_pgt_cache();
68}
69 54
70static inline unsigned int tlb_is_full_mm(struct mmu_gather *tlb) 55 put_cpu_var(mmu_gathers);
71{
72 return tlb->fullmm;
73} 56}
74 57
75#define tlb_remove_tlb_entry(tlb,ptep,address) do { } while (0) 58#define tlb_remove_tlb_entry(tlb,ptep,address) do { } while (0)
diff --git a/include/asm-arm26/tlb.h b/include/asm-arm26/tlb.h
index 1316352a58f3..08ddd85b8d35 100644
--- a/include/asm-arm26/tlb.h
+++ b/include/asm-arm26/tlb.h
@@ -10,24 +10,20 @@
10 */ 10 */
11struct mmu_gather { 11struct mmu_gather {
12 struct mm_struct *mm; 12 struct mm_struct *mm;
13 unsigned int freed; 13 unsigned int need_flush;
14 unsigned int fullmm; 14 unsigned int fullmm;
15
16 unsigned int flushes;
17 unsigned int avoided_flushes;
18}; 15};
19 16
20extern struct mmu_gather mmu_gathers[NR_CPUS]; 17DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
21 18
22static inline struct mmu_gather * 19static inline struct mmu_gather *
23tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) 20tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
24{ 21{
25 int cpu = smp_processor_id(); 22 struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
26 struct mmu_gather *tlb = &mmu_gathers[cpu];
27 23
28 tlb->mm = mm; 24 tlb->mm = mm;
29 tlb->freed = 0; 25 tlb->need_flush = 0;
30 tlb->fullmm = full_mm_flush; 26 tlb->fullmm = full_mm_flush;
31 27
32 return tlb; 28 return tlb;
33} 29}
@@ -35,30 +31,13 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
35static inline void 31static inline void
36tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) 32tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
37{ 33{
38 struct mm_struct *mm = tlb->mm; 34 if (tlb->need_flush)
39 unsigned long freed = tlb->freed; 35 flush_tlb_mm(tlb->mm);
40 int rss = get_mm_counter(mm, rss);
41
42 if (rss < freed)
43 freed = rss;
44 add_mm_counter(mm, rss, -freed);
45
46 if (freed) {
47 flush_tlb_mm(mm);
48 tlb->flushes++;
49 } else {
50 tlb->avoided_flushes++;
51 }
52 36
53 /* keep the page table cache within bounds */ 37 /* keep the page table cache within bounds */
54 check_pgt_cache(); 38 check_pgt_cache();
55}
56
57 39
58static inline unsigned int 40 put_cpu_var(mmu_gathers);
59tlb_is_full_mm(struct mmu_gather *tlb)
60{
61 return tlb->fullmm;
62} 41}
63 42
64#define tlb_remove_tlb_entry(tlb,ptep,address) do { } while (0) 43#define tlb_remove_tlb_entry(tlb,ptep,address) do { } while (0)
@@ -71,7 +50,13 @@ tlb_is_full_mm(struct mmu_gather *tlb)
71 } while (0) 50 } while (0)
72#define tlb_end_vma(tlb,vma) do { } while (0) 51#define tlb_end_vma(tlb,vma) do { } while (0)
73 52
74#define tlb_remove_page(tlb,page) free_page_and_swap_cache(page) 53static inline void
54tlb_remove_page(struct mmu_gather *tlb, struct page *page)
55{
56 tlb->need_flush = 1;
57 free_page_and_swap_cache(page);
58}
59
75#define pte_free_tlb(tlb,ptep) pte_free(ptep) 60#define pte_free_tlb(tlb,ptep) pte_free(ptep)
76#define pmd_free_tlb(tlb,pmdp) pmd_free(pmdp) 61#define pmd_free_tlb(tlb,pmdp) pmd_free(pmdp)
77 62
diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
index c20ec257ecc0..68c6fea994d9 100644
--- a/include/asm-generic/4level-fixup.h
+++ b/include/asm-generic/4level-fixup.h
@@ -10,14 +10,9 @@
10 10
11#define pud_t pgd_t 11#define pud_t pgd_t
12 12
13#define pmd_alloc(mm, pud, address) \ 13#define pmd_alloc(mm, pud, address) \
14({ pmd_t *ret; \ 14 ((unlikely(pgd_none(*(pud))) && __pmd_alloc(mm, pud, address))? \
15 if (pgd_none(*pud)) \ 15 NULL: pmd_offset(pud, address))
16 ret = __pmd_alloc(mm, pud, address); \
17 else \
18 ret = pmd_offset(pud, address); \
19 ret; \
20})
21 16
22#define pud_alloc(mm, pgd, address) (pgd) 17#define pud_alloc(mm, pgd, address) (pgd)
23#define pud_offset(pgd, start) (pgd) 18#define pud_offset(pgd, start) (pgd)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index ff28c8b31f58..7dca30a26c53 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -8,7 +8,7 @@
8 * - update the page tables 8 * - update the page tables
9 * - inform the TLB about the new one 9 * - inform the TLB about the new one
10 * 10 *
11 * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock. 11 * We hold the mm semaphore for reading, and the pte lock.
12 * 12 *
13 * Note: the old pte is known to not be writable, so we don't need to 13 * Note: the old pte is known to not be writable, so we don't need to
14 * worry about dirty bits etc getting lost. 14 * worry about dirty bits etc getting lost.
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 7d0298347ee7..cdd4145243cd 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -35,16 +35,13 @@
35#endif 35#endif
36 36
37/* struct mmu_gather is an opaque type used by the mm code for passing around 37/* struct mmu_gather is an opaque type used by the mm code for passing around
38 * any data needed by arch specific code for tlb_remove_page. This structure 38 * any data needed by arch specific code for tlb_remove_page.
39 * can be per-CPU or per-MM as the page table lock is held for the duration of
40 * TLB shootdown.
41 */ 39 */
42struct mmu_gather { 40struct mmu_gather {
43 struct mm_struct *mm; 41 struct mm_struct *mm;
44 unsigned int nr; /* set to ~0U means fast mode */ 42 unsigned int nr; /* set to ~0U means fast mode */
45 unsigned int need_flush;/* Really unmapped some ptes? */ 43 unsigned int need_flush;/* Really unmapped some ptes? */
46 unsigned int fullmm; /* non-zero means full mm flush */ 44 unsigned int fullmm; /* non-zero means full mm flush */
47 unsigned long freed;
48 struct page * pages[FREE_PTE_NR]; 45 struct page * pages[FREE_PTE_NR];
49}; 46};
50 47
@@ -57,7 +54,7 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
57static inline struct mmu_gather * 54static inline struct mmu_gather *
58tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) 55tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
59{ 56{
60 struct mmu_gather *tlb = &per_cpu(mmu_gathers, smp_processor_id()); 57 struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
61 58
62 tlb->mm = mm; 59 tlb->mm = mm;
63 60
@@ -65,7 +62,6 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
65 tlb->nr = num_online_cpus() > 1 ? 0U : ~0U; 62 tlb->nr = num_online_cpus() > 1 ? 0U : ~0U;
66 63
67 tlb->fullmm = full_mm_flush; 64 tlb->fullmm = full_mm_flush;
68 tlb->freed = 0;
69 65
70 return tlb; 66 return tlb;
71} 67}
@@ -85,28 +81,17 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
85 81
86/* tlb_finish_mmu 82/* tlb_finish_mmu
87 * Called at the end of the shootdown operation to free up any resources 83 * Called at the end of the shootdown operation to free up any resources
88 * that were required. The page table lock is still held at this point. 84 * that were required.
89 */ 85 */
90static inline void 86static inline void
91tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) 87tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
92{ 88{
93 int freed = tlb->freed;
94 struct mm_struct *mm = tlb->mm;
95 int rss = get_mm_counter(mm, rss);
96
97 if (rss < freed)
98 freed = rss;
99 add_mm_counter(mm, rss, -freed);
100 tlb_flush_mmu(tlb, start, end); 89 tlb_flush_mmu(tlb, start, end);
101 90
102 /* keep the page table cache within bounds */ 91 /* keep the page table cache within bounds */
103 check_pgt_cache(); 92 check_pgt_cache();
104}
105 93
106static inline unsigned int 94 put_cpu_var(mmu_gathers);
107tlb_is_full_mm(struct mmu_gather *tlb)
108{
109 return tlb->fullmm;
110} 95}
111 96
112/* tlb_remove_page 97/* tlb_remove_page
diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
index 348fe3a4879d..620a90641ea8 100644
--- a/include/asm-i386/mmzone.h
+++ b/include/asm-i386/mmzone.h
@@ -88,12 +88,6 @@ static inline int pfn_to_nid(unsigned long pfn)
88 __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ 88 __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \
89}) 89})
90 90
91#define local_mapnr(kvaddr) \
92({ \
93 unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \
94 (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \
95})
96
97/* XXX: FIXME -- wli */ 91/* XXX: FIXME -- wli */
98#define kern_addr_valid(kaddr) (0) 92#define kern_addr_valid(kaddr) (0)
99 93
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index d101ac414f07..0e3ec809352d 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -203,7 +203,8 @@ extern unsigned long pg0[];
203#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) 203#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
204#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) 204#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
205 205
206#define pmd_none(x) (!pmd_val(x)) 206/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
207#define pmd_none(x) (!(unsigned long)pmd_val(x))
207#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) 208#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT)
208#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) 209#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
209#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) 210#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
diff --git a/include/asm-i386/rwsem.h b/include/asm-i386/rwsem.h
index 7625a675852f..be4ab859238e 100644
--- a/include/asm-i386/rwsem.h
+++ b/include/asm-i386/rwsem.h
@@ -284,5 +284,10 @@ LOCK_PREFIX "xadd %0,(%2)"
284 return tmp+delta; 284 return tmp+delta;
285} 285}
286 286
287static inline int rwsem_is_locked(struct rw_semaphore *sem)
288{
289 return (sem->count != 0);
290}
291
287#endif /* __KERNEL__ */ 292#endif /* __KERNEL__ */
288#endif /* _I386_RWSEM_H */ 293#endif /* _I386_RWSEM_H */
diff --git a/include/asm-ia64/rwsem.h b/include/asm-ia64/rwsem.h
index e18b5ab0cb75..1327c91ea39c 100644
--- a/include/asm-ia64/rwsem.h
+++ b/include/asm-ia64/rwsem.h
@@ -186,4 +186,9 @@ __downgrade_write (struct rw_semaphore *sem)
186#define rwsem_atomic_add(delta, sem) atomic64_add(delta, (atomic64_t *)(&(sem)->count)) 186#define rwsem_atomic_add(delta, sem) atomic64_add(delta, (atomic64_t *)(&(sem)->count))
187#define rwsem_atomic_update(delta, sem) atomic64_add_return(delta, (atomic64_t *)(&(sem)->count)) 187#define rwsem_atomic_update(delta, sem) atomic64_add_return(delta, (atomic64_t *)(&(sem)->count))
188 188
189static inline int rwsem_is_locked(struct rw_semaphore *sem)
190{
191 return (sem->count != 0);
192}
193
189#endif /* _ASM_IA64_RWSEM_H */ 194#endif /* _ASM_IA64_RWSEM_H */
diff --git a/include/asm-ia64/tlb.h b/include/asm-ia64/tlb.h
index 3a9a6d1be75c..834370b9dea1 100644
--- a/include/asm-ia64/tlb.h
+++ b/include/asm-ia64/tlb.h
@@ -60,7 +60,6 @@ struct mmu_gather {
60 unsigned int nr; /* == ~0U => fast mode */ 60 unsigned int nr; /* == ~0U => fast mode */
61 unsigned char fullmm; /* non-zero means full mm flush */ 61 unsigned char fullmm; /* non-zero means full mm flush */
62 unsigned char need_flush; /* really unmapped some PTEs? */ 62 unsigned char need_flush; /* really unmapped some PTEs? */
63 unsigned long freed; /* number of pages freed */
64 unsigned long start_addr; 63 unsigned long start_addr;
65 unsigned long end_addr; 64 unsigned long end_addr;
66 struct page *pages[FREE_PTE_NR]; 65 struct page *pages[FREE_PTE_NR];
@@ -129,7 +128,7 @@ ia64_tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long e
129static inline struct mmu_gather * 128static inline struct mmu_gather *
130tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush) 129tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush)
131{ 130{
132 struct mmu_gather *tlb = &__get_cpu_var(mmu_gathers); 131 struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
133 132
134 tlb->mm = mm; 133 tlb->mm = mm;
135 /* 134 /*
@@ -147,25 +146,17 @@ tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush)
147 */ 146 */
148 tlb->nr = (num_online_cpus() == 1) ? ~0U : 0; 147 tlb->nr = (num_online_cpus() == 1) ? ~0U : 0;
149 tlb->fullmm = full_mm_flush; 148 tlb->fullmm = full_mm_flush;
150 tlb->freed = 0;
151 tlb->start_addr = ~0UL; 149 tlb->start_addr = ~0UL;
152 return tlb; 150 return tlb;
153} 151}
154 152
155/* 153/*
156 * Called at the end of the shootdown operation to free up any resources that were 154 * Called at the end of the shootdown operation to free up any resources that were
157 * collected. The page table lock is still held at this point. 155 * collected.
158 */ 156 */
159static inline void 157static inline void
160tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end) 158tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
161{ 159{
162 unsigned long freed = tlb->freed;
163 struct mm_struct *mm = tlb->mm;
164 unsigned long rss = get_mm_counter(mm, rss);
165
166 if (rss < freed)
167 freed = rss;
168 add_mm_counter(mm, rss, -freed);
169 /* 160 /*
170 * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and 161 * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and
171 * tlb->end_addr. 162 * tlb->end_addr.
@@ -174,12 +165,8 @@ tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
174 165
175 /* keep the page table cache within bounds */ 166 /* keep the page table cache within bounds */
176 check_pgt_cache(); 167 check_pgt_cache();
177}
178 168
179static inline unsigned int 169 put_cpu_var(mmu_gathers);
180tlb_is_full_mm(struct mmu_gather *tlb)
181{
182 return tlb->fullmm;
183} 170}
184 171
185/* 172/*
diff --git a/include/asm-m32r/mmzone.h b/include/asm-m32r/mmzone.h
index d58878ec899e..adc7970a77ec 100644
--- a/include/asm-m32r/mmzone.h
+++ b/include/asm-m32r/mmzone.h
@@ -21,12 +21,6 @@ extern struct pglist_data *node_data[];
21 __pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1; \ 21 __pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1; \
22}) 22})
23 23
24#define local_mapnr(kvaddr) \
25({ \
26 unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \
27 (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \
28})
29
30#define pfn_to_page(pfn) \ 24#define pfn_to_page(pfn) \
31({ \ 25({ \
32 unsigned long __pfn = pfn; \ 26 unsigned long __pfn = pfn; \
diff --git a/include/asm-parisc/cacheflush.h b/include/asm-parisc/cacheflush.h
index aa592d8c0e39..1bc3c83ee74b 100644
--- a/include/asm-parisc/cacheflush.h
+++ b/include/asm-parisc/cacheflush.h
@@ -100,30 +100,34 @@ static inline void flush_cache_range(struct vm_area_struct *vma,
100 100
101/* Simple function to work out if we have an existing address translation 101/* Simple function to work out if we have an existing address translation
102 * for a user space vma. */ 102 * for a user space vma. */
103static inline pte_t *__translation_exists(struct mm_struct *mm, 103static inline int translation_exists(struct vm_area_struct *vma,
104 unsigned long addr) 104 unsigned long addr, unsigned long pfn)
105{ 105{
106 pgd_t *pgd = pgd_offset(mm, addr); 106 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
107 pmd_t *pmd; 107 pmd_t *pmd;
108 pte_t *pte; 108 pte_t pte;
109 109
110 if(pgd_none(*pgd)) 110 if(pgd_none(*pgd))
111 return NULL; 111 return 0;
112 112
113 pmd = pmd_offset(pgd, addr); 113 pmd = pmd_offset(pgd, addr);
114 if(pmd_none(*pmd) || pmd_bad(*pmd)) 114 if(pmd_none(*pmd) || pmd_bad(*pmd))
115 return NULL; 115 return 0;
116 116
117 pte = pte_offset_map(pmd, addr); 117 /* We cannot take the pte lock here: flush_cache_page is usually
118 * called with pte lock already held. Whereas flush_dcache_page
119 * takes flush_dcache_mmap_lock, which is lower in the hierarchy:
120 * the vma itself is secure, but the pte might come or go racily.
121 */
122 pte = *pte_offset_map(pmd, addr);
123 /* But pte_unmap() does nothing on this architecture */
118 124
119 /* The PA flush mappings show up as pte_none, but they're 125 /* Filter out coincidental file entries and swap entries */
120 * valid none the less */ 126 if (!(pte_val(pte) & (_PAGE_FLUSH|_PAGE_PRESENT)))
121 if(pte_none(*pte) && ((pte_val(*pte) & _PAGE_FLUSH) == 0)) 127 return 0;
122 return NULL;
123 return pte;
124}
125#define translation_exists(vma, addr) __translation_exists((vma)->vm_mm, addr)
126 128
129 return pte_pfn(pte) == pfn;
130}
127 131
128/* Private function to flush a page from the cache of a non-current 132/* Private function to flush a page from the cache of a non-current
129 * process. cr25 contains the Page Directory of the current user 133 * process. cr25 contains the Page Directory of the current user
@@ -175,9 +179,8 @@ flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long
175{ 179{
176 BUG_ON(!vma->vm_mm->context); 180 BUG_ON(!vma->vm_mm->context);
177 181
178 if(likely(translation_exists(vma, vmaddr))) 182 if (likely(translation_exists(vma, vmaddr, pfn)))
179 __flush_cache_page(vma, vmaddr); 183 __flush_cache_page(vma, vmaddr);
180 184
181} 185}
182#endif 186#endif
183
diff --git a/include/asm-parisc/mmzone.h b/include/asm-parisc/mmzone.h
index 595d3dce120a..ae039f4fd711 100644
--- a/include/asm-parisc/mmzone.h
+++ b/include/asm-parisc/mmzone.h
@@ -27,12 +27,6 @@ extern struct node_map_data node_data[];
27}) 27})
28#define node_localnr(pfn, nid) ((pfn) - node_start_pfn(nid)) 28#define node_localnr(pfn, nid) ((pfn) - node_start_pfn(nid))
29 29
30#define local_mapnr(kvaddr) \
31({ \
32 unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \
33 (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \
34})
35
36#define pfn_to_page(pfn) \ 30#define pfn_to_page(pfn) \
37({ \ 31({ \
38 unsigned long __pfn = (pfn); \ 32 unsigned long __pfn = (pfn); \
diff --git a/include/asm-parisc/tlbflush.h b/include/asm-parisc/tlbflush.h
index 84af4ab1fe51..e97aa8d1eff5 100644
--- a/include/asm-parisc/tlbflush.h
+++ b/include/asm-parisc/tlbflush.h
@@ -88,7 +88,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
88 if (npages >= 512) /* 2MB of space: arbitrary, should be tuned */ 88 if (npages >= 512) /* 2MB of space: arbitrary, should be tuned */
89 flush_tlb_all(); 89 flush_tlb_all();
90 else { 90 else {
91 91 preempt_disable();
92 mtsp(vma->vm_mm->context,1); 92 mtsp(vma->vm_mm->context,1);
93 purge_tlb_start(); 93 purge_tlb_start();
94 if (split_tlb) { 94 if (split_tlb) {
@@ -102,6 +102,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
102 pdtlb(start); 102 pdtlb(start);
103 start += PAGE_SIZE; 103 start += PAGE_SIZE;
104 } 104 }
105 preempt_enable();
105 } 106 }
106 purge_tlb_end(); 107 purge_tlb_end();
107 } 108 }
diff --git a/include/asm-ppc/rwsem.h b/include/asm-ppc/rwsem.h
index 3e738f483c11..3501ea72f88c 100644
--- a/include/asm-ppc/rwsem.h
+++ b/include/asm-ppc/rwsem.h
@@ -168,5 +168,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
168 return atomic_add_return(delta, (atomic_t *)(&sem->count)); 168 return atomic_add_return(delta, (atomic_t *)(&sem->count));
169} 169}
170 170
171static inline int rwsem_is_locked(struct rw_semaphore *sem)
172{
173 return (sem->count != 0);
174}
175
171#endif /* __KERNEL__ */ 176#endif /* __KERNEL__ */
172#endif /* _PPC_RWSEM_XADD_H */ 177#endif /* _PPC_RWSEM_XADD_H */
diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h
index ed473f4b0152..80a708e7093a 100644
--- a/include/asm-ppc64/mmzone.h
+++ b/include/asm-ppc64/mmzone.h
@@ -67,9 +67,6 @@ static inline int pa_to_nid(unsigned long pa)
67#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) 67#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
68#define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) 68#define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn)
69 69
70#define local_mapnr(kvaddr) \
71 ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr))
72
73#ifdef CONFIG_DISCONTIGMEM 70#ifdef CONFIG_DISCONTIGMEM
74 71
75/* 72/*
diff --git a/include/asm-ppc64/pgtable.h b/include/asm-ppc64/pgtable.h
index c83679c9d2b0..2eb1778a3a15 100644
--- a/include/asm-ppc64/pgtable.h
+++ b/include/asm-ppc64/pgtable.h
@@ -478,10 +478,12 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
478#define __HAVE_ARCH_PTE_SAME 478#define __HAVE_ARCH_PTE_SAME
479#define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) 479#define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
480 480
481#define pte_ERROR(e) \
482 printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
481#define pmd_ERROR(e) \ 483#define pmd_ERROR(e) \
482 printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) 484 printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
483#define pud_ERROR(e) \ 485#define pud_ERROR(e) \
484 printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pud_val(e)) 486 printk("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e))
485#define pgd_ERROR(e) \ 487#define pgd_ERROR(e) \
486 printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) 488 printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
487 489
diff --git a/include/asm-ppc64/rwsem.h b/include/asm-ppc64/rwsem.h
index bd5c2f093575..7a647fae3765 100644
--- a/include/asm-ppc64/rwsem.h
+++ b/include/asm-ppc64/rwsem.h
@@ -163,5 +163,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
163 return atomic_add_return(delta, (atomic_t *)(&sem->count)); 163 return atomic_add_return(delta, (atomic_t *)(&sem->count));
164} 164}
165 165
166static inline int rwsem_is_locked(struct rw_semaphore *sem)
167{
168 return (sem->count != 0);
169}
170
166#endif /* __KERNEL__ */ 171#endif /* __KERNEL__ */
167#endif /* _PPC_RWSEM_XADD_H */ 172#endif /* _PPC_RWSEM_XADD_H */
diff --git a/include/asm-s390/rwsem.h b/include/asm-s390/rwsem.h
index 8c0cebbfc034..0422a085dd56 100644
--- a/include/asm-s390/rwsem.h
+++ b/include/asm-s390/rwsem.h
@@ -351,5 +351,10 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
351 return new; 351 return new;
352} 352}
353 353
354static inline int rwsem_is_locked(struct rw_semaphore *sem)
355{
356 return (sem->count != 0);
357}
358
354#endif /* __KERNEL__ */ 359#endif /* __KERNEL__ */
355#endif /* _S390_RWSEM_H */ 360#endif /* _S390_RWSEM_H */
diff --git a/include/asm-sh/rwsem.h b/include/asm-sh/rwsem.h
index 1be4337f5259..0262d3d1e5e0 100644
--- a/include/asm-sh/rwsem.h
+++ b/include/asm-sh/rwsem.h
@@ -166,5 +166,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
166 return atomic_add_return(delta, (atomic_t *)(&sem->count)); 166 return atomic_add_return(delta, (atomic_t *)(&sem->count));
167} 167}
168 168
169static inline int rwsem_is_locked(struct rw_semaphore *sem)
170{
171 return (sem->count != 0);
172}
173
169#endif /* __KERNEL__ */ 174#endif /* __KERNEL__ */
170#endif /* _ASM_SH_RWSEM_H */ 175#endif /* _ASM_SH_RWSEM_H */
diff --git a/include/asm-sparc64/rwsem.h b/include/asm-sparc64/rwsem.h
index 4568ee4022df..cef5e8270421 100644
--- a/include/asm-sparc64/rwsem.h
+++ b/include/asm-sparc64/rwsem.h
@@ -56,6 +56,11 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
56 atomic_add(delta, (atomic_t *)(&sem->count)); 56 atomic_add(delta, (atomic_t *)(&sem->count));
57} 57}
58 58
59static inline int rwsem_is_locked(struct rw_semaphore *sem)
60{
61 return (sem->count != 0);
62}
63
59#endif /* __KERNEL__ */ 64#endif /* __KERNEL__ */
60 65
61#endif /* _SPARC64_RWSEM_H */ 66#endif /* _SPARC64_RWSEM_H */
diff --git a/include/asm-sparc64/tlb.h b/include/asm-sparc64/tlb.h
index 9baf57db01d2..66138d959df5 100644
--- a/include/asm-sparc64/tlb.h
+++ b/include/asm-sparc64/tlb.h
@@ -25,9 +25,8 @@ struct mmu_gather {
25 struct mm_struct *mm; 25 struct mm_struct *mm;
26 unsigned int pages_nr; 26 unsigned int pages_nr;
27 unsigned int need_flush; 27 unsigned int need_flush;
28 unsigned int tlb_frozen; 28 unsigned int fullmm;
29 unsigned int tlb_nr; 29 unsigned int tlb_nr;
30 unsigned long freed;
31 unsigned long vaddrs[TLB_BATCH_NR]; 30 unsigned long vaddrs[TLB_BATCH_NR];
32 struct page *pages[FREE_PTE_NR]; 31 struct page *pages[FREE_PTE_NR];
33}; 32};
@@ -44,14 +43,13 @@ extern void flush_tlb_pending(void);
44 43
45static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) 44static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
46{ 45{
47 struct mmu_gather *mp = &__get_cpu_var(mmu_gathers); 46 struct mmu_gather *mp = &get_cpu_var(mmu_gathers);
48 47
49 BUG_ON(mp->tlb_nr); 48 BUG_ON(mp->tlb_nr);
50 49
51 mp->mm = mm; 50 mp->mm = mm;
52 mp->pages_nr = num_online_cpus() > 1 ? 0U : ~0U; 51 mp->pages_nr = num_online_cpus() > 1 ? 0U : ~0U;
53 mp->tlb_frozen = full_mm_flush; 52 mp->fullmm = full_mm_flush;
54 mp->freed = 0;
55 53
56 return mp; 54 return mp;
57} 55}
@@ -78,30 +76,19 @@ extern void smp_flush_tlb_mm(struct mm_struct *mm);
78 76
79static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, unsigned long end) 77static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, unsigned long end)
80{ 78{
81 unsigned long freed = mp->freed;
82 struct mm_struct *mm = mp->mm;
83 unsigned long rss = get_mm_counter(mm, rss);
84
85 if (rss < freed)
86 freed = rss;
87 add_mm_counter(mm, rss, -freed);
88
89 tlb_flush_mmu(mp); 79 tlb_flush_mmu(mp);
90 80
91 if (mp->tlb_frozen) { 81 if (mp->fullmm) {
92 if (CTX_VALID(mm->context)) 82 if (CTX_VALID(mp->mm->context))
93 do_flush_tlb_mm(mm); 83 do_flush_tlb_mm(mp->mm);
94 mp->tlb_frozen = 0; 84 mp->fullmm = 0;
95 } else 85 } else
96 flush_tlb_pending(); 86 flush_tlb_pending();
97 87
98 /* keep the page table cache within bounds */ 88 /* keep the page table cache within bounds */
99 check_pgt_cache(); 89 check_pgt_cache();
100}
101 90
102static inline unsigned int tlb_is_full_mm(struct mmu_gather *mp) 91 put_cpu_var(mmu_gathers);
103{
104 return mp->tlb_frozen;
105} 92}
106 93
107static inline void tlb_remove_page(struct mmu_gather *mp, struct page *page) 94static inline void tlb_remove_page(struct mmu_gather *mp, struct page *page)
diff --git a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h
index 616d02b57ea9..ac64eb955868 100644
--- a/include/asm-um/pgtable.h
+++ b/include/asm-um/pgtable.h
@@ -138,7 +138,7 @@ extern unsigned long pg0[1024];
138 138
139#define pte_clear(mm,addr,xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEWPAGE)) 139#define pte_clear(mm,addr,xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEWPAGE))
140 140
141#define pmd_none(x) (!(pmd_val(x) & ~_PAGE_NEWPAGE)) 141#define pmd_none(x) (!((unsigned long)pmd_val(x) & ~_PAGE_NEWPAGE))
142#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) 142#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
143#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) 143#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT)
144#define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0) 144#define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0)
diff --git a/include/asm-x86_64/rwsem.h b/include/asm-x86_64/rwsem.h
index c002175b6e82..46077e9c1910 100644
--- a/include/asm-x86_64/rwsem.h
+++ b/include/asm-x86_64/rwsem.h
@@ -274,5 +274,10 @@ LOCK_PREFIX "xaddl %0,(%2)"
274 return tmp+delta; 274 return tmp+delta;
275} 275}
276 276
277static inline int rwsem_is_locked(struct rw_semaphore *sem)
278{
279 return (sem->count != 0);
280}
281
277#endif /* __KERNEL__ */ 282#endif /* __KERNEL__ */
278#endif /* _X8664_RWSEM_H */ 283#endif /* _X8664_RWSEM_H */
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 88af42f5e04a..c937d6e65502 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -126,8 +126,8 @@ BUFFER_FNS(Eopnotsupp, eopnotsupp)
126/* If we *know* page->private refers to buffer_heads */ 126/* If we *know* page->private refers to buffer_heads */
127#define page_buffers(page) \ 127#define page_buffers(page) \
128 ({ \ 128 ({ \
129 BUG_ON(!PagePrivate(page)); \ 129 BUG_ON(!PagePrivate(page)); \
130 ((struct buffer_head *)(page)->private); \ 130 ((struct buffer_head *)page_private(page)); \
131 }) 131 })
132#define page_has_buffers(page) PagePrivate(page) 132#define page_has_buffers(page) PagePrivate(page)
133 133
@@ -219,7 +219,7 @@ static inline void attach_page_buffers(struct page *page,
219{ 219{
220 page_cache_get(page); 220 page_cache_get(page);
221 SetPagePrivate(page); 221 SetPagePrivate(page);
222 page->private = (unsigned long)head; 222 set_page_private(page, (unsigned long)head);
223} 223}
224 224
225static inline void get_bh(struct buffer_head *bh) 225static inline void get_bh(struct buffer_head *bh)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d664330d900e..0cea162b08c0 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -16,7 +16,6 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
16int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); 16int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
17int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); 17int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
18int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int); 18int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
19void zap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
20void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long); 19void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
21int hugetlb_prefault(struct address_space *, struct vm_area_struct *); 20int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
22int hugetlb_report_meminfo(char *); 21int hugetlb_report_meminfo(char *);
@@ -87,7 +86,6 @@ static inline unsigned long hugetlb_total_pages(void)
87#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) 86#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
88#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) 87#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
89#define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) 88#define hugetlb_prefault(mapping, vma) ({ BUG(); 0; })
90#define zap_hugepage_range(vma, start, len) BUG()
91#define unmap_hugepage_range(vma, start, end) BUG() 89#define unmap_hugepage_range(vma, start, end) BUG()
92#define is_hugepage_mem_enough(size) 0 90#define is_hugepage_mem_enough(size) 0
93#define hugetlb_report_meminfo(buf) 0 91#define hugetlb_report_meminfo(buf) 0
diff --git a/include/linux/memory.h b/include/linux/memory.h
new file mode 100644
index 000000000000..0def328ab5cf
--- /dev/null
+++ b/include/linux/memory.h
@@ -0,0 +1,94 @@
1/*
2 * include/linux/memory.h - generic memory definition
3 *
4 * This is mainly for topological representation. We define the
5 * basic "struct memory_block" here, which can be embedded in per-arch
6 * definitions or NUMA information.
7 *
8 * Basic handling of the devices is done in drivers/base/memory.c
9 * and system devices are handled in drivers/base/sys.c.
10 *
11 * Memory blocks are exported via sysfs in the class/memory/devices/
12 * directory.
13 *
14 */
15#ifndef _LINUX_MEMORY_H_
16#define _LINUX_MEMORY_H_
17
18#include <linux/sysdev.h>
19#include <linux/node.h>
20#include <linux/compiler.h>
21
22#include <asm/semaphore.h>
23
24struct memory_block {
25 unsigned long phys_index;
26 unsigned long state;
27 /*
28 * This serializes all state change requests. It isn't
29 * held during creation because the control files are
30 * created long after the critical areas during
31 * initialization.
32 */
33 struct semaphore state_sem;
34 int phys_device; /* to which fru does this belong? */
35 void *hw; /* optional pointer to fw/hw data */
36 int (*phys_callback)(struct memory_block *);
37 struct sys_device sysdev;
38};
39
40/* These states are exposed to userspace as text strings in sysfs */
41#define MEM_ONLINE (1<<0) /* exposed to userspace */
42#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */
43#define MEM_OFFLINE (1<<2) /* exposed to userspace */
44
45/*
46 * All of these states are currently kernel-internal for notifying
47 * kernel components and architectures.
48 *
49 * For MEM_MAPPING_INVALID, all notifier chains with priority >0
50 * are called before pfn_to_page() becomes invalid. The priority=0
51 * entry is reserved for the function that actually makes
52 * pfn_to_page() stop working. Any notifiers that want to be called
53 * after that should have priority <0.
54 */
55#define MEM_MAPPING_INVALID (1<<3)
56
57#ifndef CONFIG_MEMORY_HOTPLUG
58static inline int memory_dev_init(void)
59{
60 return 0;
61}
62static inline int register_memory_notifier(struct notifier_block *nb)
63{
64 return 0;
65}
66static inline void unregister_memory_notifier(struct notifier_block *nb)
67{
68}
69#else
70extern int register_memory(struct memory_block *, struct mem_section *section, struct node *);
71extern int register_new_memory(struct mem_section *);
72extern int unregister_memory_section(struct mem_section *);
73extern int memory_dev_init(void);
74extern int register_memory_notifier(struct notifier_block *nb);
75extern void unregister_memory_notifier(struct notifier_block *nb);
76
77#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT)
78
79extern int invalidate_phys_mapping(unsigned long, unsigned long);
80struct notifier_block;
81
82extern int register_memory_notifier(struct notifier_block *nb);
83extern void unregister_memory_notifier(struct notifier_block *nb);
84
85extern struct sysdev_class memory_sysdev_class;
86#endif /* CONFIG_MEMORY_HOTPLUG */
87
88#define hotplug_memory_notifier(fn, pri) { \
89 static struct notifier_block fn##_mem_nb = \
90 { .notifier_call = fn, .priority = pri }; \
91 register_memory_notifier(&fn##_mem_nb); \
92}
93
94#endif /* _LINUX_MEMORY_H_ */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
new file mode 100644
index 000000000000..01f03bc06eff
--- /dev/null
+++ b/include/linux/memory_hotplug.h
@@ -0,0 +1,104 @@
1#ifndef __LINUX_MEMORY_HOTPLUG_H
2#define __LINUX_MEMORY_HOTPLUG_H
3
4#include <linux/mmzone.h>
5#include <linux/spinlock.h>
6#include <linux/mmzone.h>
7#include <linux/notifier.h>
8
9#ifdef CONFIG_MEMORY_HOTPLUG
10/*
11 * pgdat resizing functions
12 */
13static inline
14void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags)
15{
16 spin_lock_irqsave(&pgdat->node_size_lock, *flags);
17}
18static inline
19void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags)
20{
21 spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
22}
23static inline
24void pgdat_resize_init(struct pglist_data *pgdat)
25{
26 spin_lock_init(&pgdat->node_size_lock);
27}
28/*
29 * Zone resizing functions
30 */
31static inline unsigned zone_span_seqbegin(struct zone *zone)
32{
33 return read_seqbegin(&zone->span_seqlock);
34}
35static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
36{
37 return read_seqretry(&zone->span_seqlock, iv);
38}
39static inline void zone_span_writelock(struct zone *zone)
40{
41 write_seqlock(&zone->span_seqlock);
42}
43static inline void zone_span_writeunlock(struct zone *zone)
44{
45 write_sequnlock(&zone->span_seqlock);
46}
47static inline void zone_seqlock_init(struct zone *zone)
48{
49 seqlock_init(&zone->span_seqlock);
50}
51extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
52extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
53extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
54/* need some defines for these for archs that don't support it */
55extern void online_page(struct page *page);
56/* VM interface that may be used by firmware interface */
57extern int add_memory(u64 start, u64 size);
58extern int remove_memory(u64 start, u64 size);
59extern int online_pages(unsigned long, unsigned long);
60
61/* reasonably generic interface to expand the physical pages in a zone */
62extern int __add_pages(struct zone *zone, unsigned long start_pfn,
63 unsigned long nr_pages);
64#else /* ! CONFIG_MEMORY_HOTPLUG */
65/*
66 * Stub functions for when hotplug is off
67 */
68static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {}
69static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {}
70static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
71
72static inline unsigned zone_span_seqbegin(struct zone *zone)
73{
74 return 0;
75}
76static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
77{
78 return 0;
79}
80static inline void zone_span_writelock(struct zone *zone) {}
81static inline void zone_span_writeunlock(struct zone *zone) {}
82static inline void zone_seqlock_init(struct zone *zone) {}
83
84static inline int mhp_notimplemented(const char *func)
85{
86 printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func);
87 dump_stack();
88 return -ENOSYS;
89}
90
91static inline int __add_pages(struct zone *zone, unsigned long start_pfn,
92 unsigned long nr_pages)
93{
94 return mhp_notimplemented(__FUNCTION__);
95}
96#endif /* ! CONFIG_MEMORY_HOTPLUG */
97static inline int __remove_pages(struct zone *zone, unsigned long start_pfn,
98 unsigned long nr_pages)
99{
100 printk(KERN_WARNING "%s() called, not yet supported\n", __FUNCTION__);
101 dump_stack();
102 return -ENOSYS;
103}
104#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 58385ee1c0ac..7af8cb836e78 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -27,10 +27,10 @@
27 27
28#include <linux/config.h> 28#include <linux/config.h>
29#include <linux/mmzone.h> 29#include <linux/mmzone.h>
30#include <linux/bitmap.h>
31#include <linux/slab.h> 30#include <linux/slab.h>
32#include <linux/rbtree.h> 31#include <linux/rbtree.h>
33#include <linux/spinlock.h> 32#include <linux/spinlock.h>
33#include <linux/nodemask.h>
34 34
35struct vm_area_struct; 35struct vm_area_struct;
36 36
@@ -47,8 +47,7 @@ struct vm_area_struct;
47 * Locking policy for interleave: 47 * Locking policy for interleave:
48 * In process context there is no locking because only the process accesses 48 * In process context there is no locking because only the process accesses
49 * its own state. All vma manipulation is somewhat protected by a down_read on 49 * its own state. All vma manipulation is somewhat protected by a down_read on
50 * mmap_sem. For allocating in the interleave policy the page_table_lock 50 * mmap_sem.
51 * must be also aquired to protect il_next.
52 * 51 *
53 * Freeing policy: 52 * Freeing policy:
54 * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd. 53 * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd.
@@ -63,7 +62,7 @@ struct mempolicy {
63 union { 62 union {
64 struct zonelist *zonelist; /* bind */ 63 struct zonelist *zonelist; /* bind */
65 short preferred_node; /* preferred */ 64 short preferred_node; /* preferred */
66 DECLARE_BITMAP(nodes, MAX_NUMNODES); /* interleave */ 65 nodemask_t nodes; /* interleave */
67 /* undefined for default */ 66 /* undefined for default */
68 } v; 67 } v;
69}; 68};
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e1649578fb0c..5c1fb0a2e806 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -157,7 +157,7 @@ extern unsigned int kobjsize(const void *objp);
157 157
158#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ 158#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
159#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ 159#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
160#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ 160#define VM_RESERVED 0x00080000 /* Pages managed in a special way */
161#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ 161#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
162#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ 162#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
163#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ 163#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
@@ -226,13 +226,18 @@ struct page {
226 * to show when page is mapped 226 * to show when page is mapped
227 * & limit reverse map searches. 227 * & limit reverse map searches.
228 */ 228 */
229 unsigned long private; /* Mapping-private opaque data: 229 union {
230 unsigned long private; /* Mapping-private opaque data:
230 * usually used for buffer_heads 231 * usually used for buffer_heads
231 * if PagePrivate set; used for 232 * if PagePrivate set; used for
232 * swp_entry_t if PageSwapCache 233 * swp_entry_t if PageSwapCache
233 * When page is free, this indicates 234 * When page is free, this indicates
234 * order in the buddy system. 235 * order in the buddy system.
235 */ 236 */
237#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
238 spinlock_t ptl;
239#endif
240 } u;
236 struct address_space *mapping; /* If low bit clear, points to 241 struct address_space *mapping; /* If low bit clear, points to
237 * inode address_space, or NULL. 242 * inode address_space, or NULL.
238 * If page mapped as anonymous 243 * If page mapped as anonymous
@@ -260,6 +265,9 @@ struct page {
260#endif /* WANT_PAGE_VIRTUAL */ 265#endif /* WANT_PAGE_VIRTUAL */
261}; 266};
262 267
268#define page_private(page) ((page)->u.private)
269#define set_page_private(page, v) ((page)->u.private = (v))
270
263/* 271/*
264 * FIXME: take this include out, include page-flags.h in 272 * FIXME: take this include out, include page-flags.h in
265 * files which need it (119 of them) 273 * files which need it (119 of them)
@@ -311,17 +319,17 @@ extern void FASTCALL(__page_cache_release(struct page *));
311 319
312#ifdef CONFIG_HUGETLB_PAGE 320#ifdef CONFIG_HUGETLB_PAGE
313 321
314static inline int page_count(struct page *p) 322static inline int page_count(struct page *page)
315{ 323{
316 if (PageCompound(p)) 324 if (PageCompound(page))
317 p = (struct page *)p->private; 325 page = (struct page *)page_private(page);
318 return atomic_read(&(p)->_count) + 1; 326 return atomic_read(&page->_count) + 1;
319} 327}
320 328
321static inline void get_page(struct page *page) 329static inline void get_page(struct page *page)
322{ 330{
323 if (unlikely(PageCompound(page))) 331 if (unlikely(PageCompound(page)))
324 page = (struct page *)page->private; 332 page = (struct page *)page_private(page);
325 atomic_inc(&page->_count); 333 atomic_inc(&page->_count);
326} 334}
327 335
@@ -338,7 +346,7 @@ static inline void get_page(struct page *page)
338 346
339static inline void put_page(struct page *page) 347static inline void put_page(struct page *page)
340{ 348{
341 if (!PageReserved(page) && put_page_testzero(page)) 349 if (put_page_testzero(page))
342 __page_cache_release(page); 350 __page_cache_release(page);
343} 351}
344 352
@@ -587,7 +595,7 @@ static inline int PageAnon(struct page *page)
587static inline pgoff_t page_index(struct page *page) 595static inline pgoff_t page_index(struct page *page)
588{ 596{
589 if (unlikely(PageSwapCache(page))) 597 if (unlikely(PageSwapCache(page)))
590 return page->private; 598 return page_private(page);
591 return page->index; 599 return page->index;
592} 600}
593 601
@@ -682,7 +690,7 @@ struct zap_details {
682 690
683unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, 691unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
684 unsigned long size, struct zap_details *); 692 unsigned long size, struct zap_details *);
685unsigned long unmap_vmas(struct mmu_gather **tlb, struct mm_struct *mm, 693unsigned long unmap_vmas(struct mmu_gather **tlb,
686 struct vm_area_struct *start_vma, unsigned long start_addr, 694 struct vm_area_struct *start_vma, unsigned long start_addr,
687 unsigned long end_addr, unsigned long *nr_accounted, 695 unsigned long end_addr, unsigned long *nr_accounted,
688 struct zap_details *); 696 struct zap_details *);
@@ -704,10 +712,6 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
704} 712}
705 713
706extern int vmtruncate(struct inode * inode, loff_t offset); 714extern int vmtruncate(struct inode * inode, loff_t offset);
707extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
708extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address));
709extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
710extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
711extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); 715extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
712extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot); 716extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
713extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); 717extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
@@ -723,6 +727,7 @@ void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);
723 727
724int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, 728int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
725 int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); 729 int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
730void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long);
726 731
727int __set_page_dirty_buffers(struct page *page); 732int __set_page_dirty_buffers(struct page *page);
728int __set_page_dirty_nobuffers(struct page *page); 733int __set_page_dirty_nobuffers(struct page *page);
@@ -759,38 +764,83 @@ struct shrinker;
759extern struct shrinker *set_shrinker(int, shrinker_t); 764extern struct shrinker *set_shrinker(int, shrinker_t);
760extern void remove_shrinker(struct shrinker *shrinker); 765extern void remove_shrinker(struct shrinker *shrinker);
761 766
762/* 767int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
763 * On a two-level or three-level page table, this ends up being trivial. Thus 768int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
764 * the inlining and the symmetry break with pte_alloc_map() that does all 769int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
765 * of this out-of-line. 770int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
766 */ 771
767/* 772/*
768 * The following ifdef needed to get the 4level-fixup.h header to work. 773 * The following ifdef needed to get the 4level-fixup.h header to work.
769 * Remove it when 4level-fixup.h has been removed. 774 * Remove it when 4level-fixup.h has been removed.
770 */ 775 */
771#ifdef CONFIG_MMU 776#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK)
772#ifndef __ARCH_HAS_4LEVEL_HACK
773static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) 777static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
774{ 778{
775 if (pgd_none(*pgd)) 779 return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))?
776 return __pud_alloc(mm, pgd, address); 780 NULL: pud_offset(pgd, address);
777 return pud_offset(pgd, address);
778} 781}
779 782
780static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 783static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
781{ 784{
782 if (pud_none(*pud)) 785 return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
783 return __pmd_alloc(mm, pud, address); 786 NULL: pmd_offset(pud, address);
784 return pmd_offset(pud, address);
785} 787}
786#endif 788#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
787#endif /* CONFIG_MMU */ 789
790#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
791/*
792 * We tuck a spinlock to guard each pagetable page into its struct page,
793 * at page->u.ptl (sharing a union with page->private), with BUILD_BUG_ON to make sure that this will not
794 * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
795 * When freeing, reset page->mapping so free_pages_check won't complain.
796 */
797#define __pte_lockptr(page) &((page)->u.ptl)
798#define pte_lock_init(_page) do { \
799 spin_lock_init(__pte_lockptr(_page)); \
800} while (0)
801#define pte_lock_deinit(page) ((page)->mapping = NULL)
802#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
803#else
804/*
805 * We use mm->page_table_lock to guard all pagetable pages of the mm.
806 */
807#define pte_lock_init(page) do {} while (0)
808#define pte_lock_deinit(page) do {} while (0)
809#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;})
810#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
811
812#define pte_offset_map_lock(mm, pmd, address, ptlp) \
813({ \
814 spinlock_t *__ptl = pte_lockptr(mm, pmd); \
815 pte_t *__pte = pte_offset_map(pmd, address); \
816 *(ptlp) = __ptl; \
817 spin_lock(__ptl); \
818 __pte; \
819})
820
821#define pte_unmap_unlock(pte, ptl) do { \
822 spin_unlock(ptl); \
823 pte_unmap(pte); \
824} while (0)
825
826#define pte_alloc_map(mm, pmd, address) \
827 ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
828 NULL: pte_offset_map(pmd, address))
829
830#define pte_alloc_map_lock(mm, pmd, address, ptlp) \
831 ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
832 NULL: pte_offset_map_lock(mm, pmd, address, ptlp))
833
834#define pte_alloc_kernel(pmd, address) \
835 ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
836 NULL: pte_offset_kernel(pmd, address))
788 837
789extern void free_area_init(unsigned long * zones_size); 838extern void free_area_init(unsigned long * zones_size);
790extern void free_area_init_node(int nid, pg_data_t *pgdat, 839extern void free_area_init_node(int nid, pg_data_t *pgdat,
791 unsigned long * zones_size, unsigned long zone_start_pfn, 840 unsigned long * zones_size, unsigned long zone_start_pfn,
792 unsigned long *zholes_size); 841 unsigned long *zholes_size);
793extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long); 842extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long);
843extern void setup_per_zone_pages_min(void);
794extern void mem_init(void); 844extern void mem_init(void);
795extern void show_mem(void); 845extern void show_mem(void);
796extern void si_meminfo(struct sysinfo * val); 846extern void si_meminfo(struct sysinfo * val);
@@ -834,6 +884,7 @@ extern int split_vma(struct mm_struct *,
834extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); 884extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
835extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, 885extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
836 struct rb_node **, struct rb_node *); 886 struct rb_node **, struct rb_node *);
887extern void unlink_file_vma(struct vm_area_struct *);
837extern struct vm_area_struct *copy_vma(struct vm_area_struct **, 888extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
838 unsigned long addr, unsigned long len, pgoff_t pgoff); 889 unsigned long addr, unsigned long len, pgoff_t pgoff);
839extern void exit_mmap(struct mm_struct *); 890extern void exit_mmap(struct mm_struct *);
@@ -894,7 +945,8 @@ void handle_ra_miss(struct address_space *mapping,
894unsigned long max_sane_readahead(unsigned long nr); 945unsigned long max_sane_readahead(unsigned long nr);
895 946
896/* Do stack extension */ 947/* Do stack extension */
897extern int expand_stack(struct vm_area_struct * vma, unsigned long address); 948extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
949extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
898 950
899/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ 951/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
900extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); 952extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
@@ -917,40 +969,28 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
917 return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 969 return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
918} 970}
919 971
920extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); 972struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
973struct page *vmalloc_to_page(void *addr);
974unsigned long vmalloc_to_pfn(void *addr);
975int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
976 unsigned long pfn, unsigned long size, pgprot_t);
921 977
922extern struct page * vmalloc_to_page(void *addr); 978struct page *follow_page(struct mm_struct *, unsigned long address,
923extern unsigned long vmalloc_to_pfn(void *addr); 979 unsigned int foll_flags);
924extern struct page * follow_page(struct mm_struct *mm, unsigned long address, 980#define FOLL_WRITE 0x01 /* check pte is writable */
925 int write); 981#define FOLL_TOUCH 0x02 /* mark page accessed */
926extern int check_user_page_readable(struct mm_struct *mm, unsigned long address); 982#define FOLL_GET 0x04 /* do get_page on page */
927int remap_pfn_range(struct vm_area_struct *, unsigned long, 983#define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */
928 unsigned long, unsigned long, pgprot_t);
929 984
930#ifdef CONFIG_PROC_FS 985#ifdef CONFIG_PROC_FS
931void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); 986void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
932#else 987#else
933static inline void __vm_stat_account(struct mm_struct *mm, 988static inline void vm_stat_account(struct mm_struct *mm,
934 unsigned long flags, struct file *file, long pages) 989 unsigned long flags, struct file *file, long pages)
935{ 990{
936} 991}
937#endif /* CONFIG_PROC_FS */ 992#endif /* CONFIG_PROC_FS */
938 993
939static inline void vm_stat_account(struct vm_area_struct *vma)
940{
941 __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
942 vma_pages(vma));
943}
944
945static inline void vm_stat_unaccount(struct vm_area_struct *vma)
946{
947 __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
948 -vma_pages(vma));
949}
950
951/* update per process rss and vm hiwater data */
952extern void update_mem_hiwater(struct task_struct *tsk);
953
954#ifndef CONFIG_DEBUG_PAGEALLOC 994#ifndef CONFIG_DEBUG_PAGEALLOC
955static inline void 995static inline void
956kernel_map_pages(struct page *page, int numpages, int enable) 996kernel_map_pages(struct page *page, int numpages, int enable)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7519eb4191e7..f5fa3082fd6a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -12,6 +12,7 @@
12#include <linux/threads.h> 12#include <linux/threads.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/seqlock.h>
15#include <asm/atomic.h> 16#include <asm/atomic.h>
16 17
17/* Free memory management - zoned buddy allocator. */ 18/* Free memory management - zoned buddy allocator. */
@@ -137,6 +138,10 @@ struct zone {
137 * free areas of different sizes 138 * free areas of different sizes
138 */ 139 */
139 spinlock_t lock; 140 spinlock_t lock;
141#ifdef CONFIG_MEMORY_HOTPLUG
142 /* see spanned/present_pages for more description */
143 seqlock_t span_seqlock;
144#endif
140 struct free_area free_area[MAX_ORDER]; 145 struct free_area free_area[MAX_ORDER];
141 146
142 147
@@ -220,6 +225,16 @@ struct zone {
220 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ 225 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
221 unsigned long zone_start_pfn; 226 unsigned long zone_start_pfn;
222 227
228 /*
229 * zone_start_pfn, spanned_pages and present_pages are all
230 * protected by span_seqlock. It is a seqlock because it has
231 * to be read outside of zone->lock, and it is done in the main
232 * allocator path. But, it is written quite infrequently.
233 *
234 * The lock is declared along with zone->lock because it is
235 * frequently read in proximity to zone->lock. It's good to
236 * give them a chance of being in the same cacheline.
237 */
223 unsigned long spanned_pages; /* total size, including holes */ 238 unsigned long spanned_pages; /* total size, including holes */
224 unsigned long present_pages; /* amount of memory (excluding holes) */ 239 unsigned long present_pages; /* amount of memory (excluding holes) */
225 240
@@ -273,6 +288,16 @@ typedef struct pglist_data {
273 struct page *node_mem_map; 288 struct page *node_mem_map;
274#endif 289#endif
275 struct bootmem_data *bdata; 290 struct bootmem_data *bdata;
291#ifdef CONFIG_MEMORY_HOTPLUG
292 /*
293 * Must be held any time you expect node_start_pfn, node_present_pages
294 * or node_spanned_pages to stay constant. Holding this will also
295 * guarantee that any pfn_valid() stays that way.
296 *
297 * Nests above zone->lock and zone->size_seqlock.
298 */
299 spinlock_t node_size_lock;
300#endif
276 unsigned long node_start_pfn; 301 unsigned long node_start_pfn;
277 unsigned long node_present_pages; /* total number of physical pages */ 302 unsigned long node_present_pages; /* total number of physical pages */
278 unsigned long node_spanned_pages; /* total size of physical page 303 unsigned long node_spanned_pages; /* total size of physical page
@@ -293,6 +318,8 @@ typedef struct pglist_data {
293#endif 318#endif
294#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) 319#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr))
295 320
321#include <linux/memory_hotplug.h>
322
296extern struct pglist_data *pgdat_list; 323extern struct pglist_data *pgdat_list;
297 324
298void __get_zone_counts(unsigned long *active, unsigned long *inactive, 325void __get_zone_counts(unsigned long *active, unsigned long *inactive,
@@ -509,6 +536,7 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
509 return NULL; 536 return NULL;
510 return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; 537 return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
511} 538}
539extern int __section_nr(struct mem_section* ms);
512 540
513/* 541/*
514 * We use the lower bits of the mem_map pointer to store 542 * We use the lower bits of the mem_map pointer to store
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index e80fb7ee6efd..35b30e6c8cf8 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -95,8 +95,8 @@ int try_to_unmap(struct page *);
95/* 95/*
96 * Called from mm/filemap_xip.c to unmap empty zero page 96 * Called from mm/filemap_xip.c to unmap empty zero page
97 */ 97 */
98pte_t *page_check_address(struct page *, struct mm_struct *, unsigned long); 98pte_t *page_check_address(struct page *, struct mm_struct *,
99 99 unsigned long, spinlock_t **);
100 100
101/* 101/*
102 * Used by swapoff to help locate where page is expected in vma. 102 * Used by swapoff to help locate where page is expected in vma.
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index b52a2af25f1f..f30f805080ae 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -61,5 +61,10 @@ extern void FASTCALL(__up_read(struct rw_semaphore *sem));
61extern void FASTCALL(__up_write(struct rw_semaphore *sem)); 61extern void FASTCALL(__up_write(struct rw_semaphore *sem));
62extern void FASTCALL(__downgrade_write(struct rw_semaphore *sem)); 62extern void FASTCALL(__downgrade_write(struct rw_semaphore *sem));
63 63
64static inline int rwsem_is_locked(struct rw_semaphore *sem)
65{
66 return (sem->activity != 0);
67}
68
64#endif /* __KERNEL__ */ 69#endif /* __KERNEL__ */
65#endif /* _LINUX_RWSEM_SPINLOCK_H */ 70#endif /* _LINUX_RWSEM_SPINLOCK_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 27519df0f987..1c30bc308ef1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -249,6 +249,36 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
249extern void arch_unmap_area(struct mm_struct *, unsigned long); 249extern void arch_unmap_area(struct mm_struct *, unsigned long);
250extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); 250extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
251 251
252#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
253/*
254 * The mm counters are not protected by its page_table_lock,
255 * so must be incremented atomically.
256 */
257#ifdef ATOMIC64_INIT
258#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value)
259#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member))
260#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member)
261#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member)
262#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member)
263typedef atomic64_t mm_counter_t;
264#else /* !ATOMIC64_INIT */
265/*
266 * The counters wrap back to 0 at 2^32 * PAGE_SIZE,
267 * that is, at 16TB if using 4kB page size.
268 */
269#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value)
270#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member))
271#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member)
272#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member)
273#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member)
274typedef atomic_t mm_counter_t;
275#endif /* !ATOMIC64_INIT */
276
277#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
278/*
279 * The mm counters are protected by its page_table_lock,
280 * so can be incremented directly.
281 */
252#define set_mm_counter(mm, member, value) (mm)->_##member = (value) 282#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
253#define get_mm_counter(mm, member) ((mm)->_##member) 283#define get_mm_counter(mm, member) ((mm)->_##member)
254#define add_mm_counter(mm, member, value) (mm)->_##member += (value) 284#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
@@ -256,6 +286,20 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
256#define dec_mm_counter(mm, member) (mm)->_##member-- 286#define dec_mm_counter(mm, member) (mm)->_##member--
257typedef unsigned long mm_counter_t; 287typedef unsigned long mm_counter_t;
258 288
289#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
290
291#define get_mm_rss(mm) \
292 (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
293#define update_hiwater_rss(mm) do { \
294 unsigned long _rss = get_mm_rss(mm); \
295 if ((mm)->hiwater_rss < _rss) \
296 (mm)->hiwater_rss = _rss; \
297} while (0)
298#define update_hiwater_vm(mm) do { \
299 if ((mm)->hiwater_vm < (mm)->total_vm) \
300 (mm)->hiwater_vm = (mm)->total_vm; \
301} while (0)
302
259struct mm_struct { 303struct mm_struct {
260 struct vm_area_struct * mmap; /* list of VMAs */ 304 struct vm_area_struct * mmap; /* list of VMAs */
261 struct rb_root mm_rb; 305 struct rb_root mm_rb;
@@ -279,15 +323,20 @@ struct mm_struct {
279 * by mmlist_lock 323 * by mmlist_lock
280 */ 324 */
281 325
326 /* Special counters, in some configurations protected by the
327 * page_table_lock, in other configurations by being atomic.
328 */
329 mm_counter_t _file_rss;
330 mm_counter_t _anon_rss;
331
332 unsigned long hiwater_rss; /* High-watermark of RSS usage */
333 unsigned long hiwater_vm; /* High-water virtual memory usage */
334
335 unsigned long total_vm, locked_vm, shared_vm, exec_vm;
336 unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
282 unsigned long start_code, end_code, start_data, end_data; 337 unsigned long start_code, end_code, start_data, end_data;
283 unsigned long start_brk, brk, start_stack; 338 unsigned long start_brk, brk, start_stack;
284 unsigned long arg_start, arg_end, env_start, env_end; 339 unsigned long arg_start, arg_end, env_start, env_end;
285 unsigned long total_vm, locked_vm, shared_vm;
286 unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes;
287
288 /* Special counters protected by the page_table_lock */
289 mm_counter_t _rss;
290 mm_counter_t _anon_rss;
291 340
292 unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ 341 unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
293 342
@@ -308,11 +357,7 @@ struct mm_struct {
308 /* aio bits */ 357 /* aio bits */
309 rwlock_t ioctx_list_lock; 358 rwlock_t ioctx_list_lock;
310 struct kioctx *ioctx_list; 359 struct kioctx *ioctx_list;
311
312 struct kioctx default_kioctx; 360 struct kioctx default_kioctx;
313
314 unsigned long hiwater_rss; /* High-water RSS usage */
315 unsigned long hiwater_vm; /* High-water virtual memory usage */
316}; 361};
317 362
318struct sighand_struct { 363struct sighand_struct {
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 3701a0673d2c..1d5577b2b752 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -32,10 +32,14 @@ struct vm_struct {
32 * Highlevel APIs for driver use 32 * Highlevel APIs for driver use
33 */ 33 */
34extern void *vmalloc(unsigned long size); 34extern void *vmalloc(unsigned long size);
35extern void *vmalloc_node(unsigned long size, int node);
35extern void *vmalloc_exec(unsigned long size); 36extern void *vmalloc_exec(unsigned long size);
36extern void *vmalloc_32(unsigned long size); 37extern void *vmalloc_32(unsigned long size);
37extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); 38extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
38extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot); 39extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask,
40 pgprot_t prot);
41extern void *__vmalloc_node(unsigned long size, gfp_t gfp_mask,
42 pgprot_t prot, int node);
39extern void vfree(void *addr); 43extern void vfree(void *addr);
40 44
41extern void *vmap(struct page **pages, unsigned int count, 45extern void *vmap(struct page **pages, unsigned int count,
@@ -48,6 +52,8 @@ extern void vunmap(void *addr);
48extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); 52extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags);
49extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 53extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
50 unsigned long start, unsigned long end); 54 unsigned long start, unsigned long end);
55extern struct vm_struct *get_vm_area_node(unsigned long size,
56 unsigned long flags, int node);
51extern struct vm_struct *remove_vm_area(void *addr); 57extern struct vm_struct *remove_vm_area(void *addr);
52extern struct vm_struct *__remove_vm_area(void *addr); 58extern struct vm_struct *__remove_vm_area(void *addr);
53extern int map_vm_area(struct vm_struct *area, pgprot_t prot, 59extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
diff --git a/ipc/shm.c b/ipc/shm.c
index dca90489e3b0..b58c651d31ae 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -233,10 +233,11 @@ static int newseg (key_t key, int shmflg, size_t size)
233 shp->id = shm_buildid(id,shp->shm_perm.seq); 233 shp->id = shm_buildid(id,shp->shm_perm.seq);
234 shp->shm_file = file; 234 shp->shm_file = file;
235 file->f_dentry->d_inode->i_ino = shp->id; 235 file->f_dentry->d_inode->i_ino = shp->id;
236 if (shmflg & SHM_HUGETLB) 236
237 set_file_hugepages(file); 237 /* Hugetlb ops would have already been assigned. */
238 else 238 if (!(shmflg & SHM_HUGETLB))
239 file->f_op = &shm_file_operations; 239 file->f_op = &shm_file_operations;
240
240 shm_tot += numpages; 241 shm_tot += numpages;
241 shm_unlock(shp); 242 shm_unlock(shp);
242 return shp->id; 243 return shp->id;
diff --git a/kernel/acct.c b/kernel/acct.c
index b756f527497e..2e3f4a47e7d0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -553,7 +553,7 @@ void acct_update_integrals(struct task_struct *tsk)
553 if (delta == 0) 553 if (delta == 0)
554 return; 554 return;
555 tsk->acct_stimexpd = tsk->stime; 555 tsk->acct_stimexpd = tsk->stime;
556 tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss); 556 tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
557 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; 557 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
558 } 558 }
559} 559}
diff --git a/kernel/exit.c b/kernel/exit.c
index 3b25b182d2be..79f52b85d6ed 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -839,7 +839,10 @@ fastcall NORET_TYPE void do_exit(long code)
839 preempt_count()); 839 preempt_count());
840 840
841 acct_update_integrals(tsk); 841 acct_update_integrals(tsk);
842 update_mem_hiwater(tsk); 842 if (tsk->mm) {
843 update_hiwater_rss(tsk->mm);
844 update_hiwater_vm(tsk->mm);
845 }
843 group_dead = atomic_dec_and_test(&tsk->signal->live); 846 group_dead = atomic_dec_and_test(&tsk->signal->live);
844 if (group_dead) { 847 if (group_dead) {
845 del_timer_sync(&tsk->signal->real_timer); 848 del_timer_sync(&tsk->signal->real_timer);
diff --git a/kernel/fork.c b/kernel/fork.c
index 280bd44ac441..8a069612eac3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -182,37 +182,37 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
182} 182}
183 183
184#ifdef CONFIG_MMU 184#ifdef CONFIG_MMU
185static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) 185static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
186{ 186{
187 struct vm_area_struct * mpnt, *tmp, **pprev; 187 struct vm_area_struct *mpnt, *tmp, **pprev;
188 struct rb_node **rb_link, *rb_parent; 188 struct rb_node **rb_link, *rb_parent;
189 int retval; 189 int retval;
190 unsigned long charge; 190 unsigned long charge;
191 struct mempolicy *pol; 191 struct mempolicy *pol;
192 192
193 down_write(&oldmm->mmap_sem); 193 down_write(&oldmm->mmap_sem);
194 flush_cache_mm(current->mm); 194 flush_cache_mm(oldmm);
195 down_write(&mm->mmap_sem);
196
195 mm->locked_vm = 0; 197 mm->locked_vm = 0;
196 mm->mmap = NULL; 198 mm->mmap = NULL;
197 mm->mmap_cache = NULL; 199 mm->mmap_cache = NULL;
198 mm->free_area_cache = oldmm->mmap_base; 200 mm->free_area_cache = oldmm->mmap_base;
199 mm->cached_hole_size = ~0UL; 201 mm->cached_hole_size = ~0UL;
200 mm->map_count = 0; 202 mm->map_count = 0;
201 set_mm_counter(mm, rss, 0);
202 set_mm_counter(mm, anon_rss, 0);
203 cpus_clear(mm->cpu_vm_mask); 203 cpus_clear(mm->cpu_vm_mask);
204 mm->mm_rb = RB_ROOT; 204 mm->mm_rb = RB_ROOT;
205 rb_link = &mm->mm_rb.rb_node; 205 rb_link = &mm->mm_rb.rb_node;
206 rb_parent = NULL; 206 rb_parent = NULL;
207 pprev = &mm->mmap; 207 pprev = &mm->mmap;
208 208
209 for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { 209 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
210 struct file *file; 210 struct file *file;
211 211
212 if (mpnt->vm_flags & VM_DONTCOPY) { 212 if (mpnt->vm_flags & VM_DONTCOPY) {
213 long pages = vma_pages(mpnt); 213 long pages = vma_pages(mpnt);
214 mm->total_vm -= pages; 214 mm->total_vm -= pages;
215 __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, 215 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
216 -pages); 216 -pages);
217 continue; 217 continue;
218 } 218 }
@@ -253,12 +253,8 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
253 } 253 }
254 254
255 /* 255 /*
256 * Link in the new vma and copy the page table entries: 256 * Link in the new vma and copy the page table entries.
257 * link in first so that swapoff can see swap entries.
258 * Note that, exceptionally, here the vma is inserted
259 * without holding mm->mmap_sem.
260 */ 257 */
261 spin_lock(&mm->page_table_lock);
262 *pprev = tmp; 258 *pprev = tmp;
263 pprev = &tmp->vm_next; 259 pprev = &tmp->vm_next;
264 260
@@ -267,8 +263,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
267 rb_parent = &tmp->vm_rb; 263 rb_parent = &tmp->vm_rb;
268 264
269 mm->map_count++; 265 mm->map_count++;
270 retval = copy_page_range(mm, current->mm, tmp); 266 retval = copy_page_range(mm, oldmm, tmp);
271 spin_unlock(&mm->page_table_lock);
272 267
273 if (tmp->vm_ops && tmp->vm_ops->open) 268 if (tmp->vm_ops && tmp->vm_ops->open)
274 tmp->vm_ops->open(tmp); 269 tmp->vm_ops->open(tmp);
@@ -277,9 +272,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
277 goto out; 272 goto out;
278 } 273 }
279 retval = 0; 274 retval = 0;
280
281out: 275out:
282 flush_tlb_mm(current->mm); 276 up_write(&mm->mmap_sem);
277 flush_tlb_mm(oldmm);
283 up_write(&oldmm->mmap_sem); 278 up_write(&oldmm->mmap_sem);
284 return retval; 279 return retval;
285fail_nomem_policy: 280fail_nomem_policy:
@@ -323,6 +318,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
323 INIT_LIST_HEAD(&mm->mmlist); 318 INIT_LIST_HEAD(&mm->mmlist);
324 mm->core_waiters = 0; 319 mm->core_waiters = 0;
325 mm->nr_ptes = 0; 320 mm->nr_ptes = 0;
321 set_mm_counter(mm, file_rss, 0);
322 set_mm_counter(mm, anon_rss, 0);
326 spin_lock_init(&mm->page_table_lock); 323 spin_lock_init(&mm->page_table_lock);
327 rwlock_init(&mm->ioctx_list_lock); 324 rwlock_init(&mm->ioctx_list_lock);
328 mm->ioctx_list = NULL; 325 mm->ioctx_list = NULL;
@@ -499,7 +496,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
499 if (retval) 496 if (retval)
500 goto free_pt; 497 goto free_pt;
501 498
502 mm->hiwater_rss = get_mm_counter(mm,rss); 499 mm->hiwater_rss = get_mm_rss(mm);
503 mm->hiwater_vm = mm->total_vm; 500 mm->hiwater_vm = mm->total_vm;
504 501
505good_mm: 502good_mm:
diff --git a/kernel/futex.c b/kernel/futex.c
index ca05fe6a70b2..3b4d5ad44cc6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -205,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
205 /* 205 /*
206 * Do a quick atomic lookup first - this is the fastpath. 206 * Do a quick atomic lookup first - this is the fastpath.
207 */ 207 */
208 spin_lock(&current->mm->page_table_lock); 208 page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET);
209 page = follow_page(mm, uaddr, 0);
210 if (likely(page != NULL)) { 209 if (likely(page != NULL)) {
211 key->shared.pgoff = 210 key->shared.pgoff =
212 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 211 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
213 spin_unlock(&current->mm->page_table_lock); 212 put_page(page);
214 return 0; 213 return 0;
215 } 214 }
216 spin_unlock(&current->mm->page_table_lock);
217 215
218 /* 216 /*
219 * Do it the general way. 217 * Do it the general way.
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 36c5d9cd4cc1..2c95848fbce8 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -334,7 +334,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
334 if (pages) { 334 if (pages) {
335 unsigned int count, i; 335 unsigned int count, i;
336 pages->mapping = NULL; 336 pages->mapping = NULL;
337 pages->private = order; 337 set_page_private(pages, order);
338 count = 1 << order; 338 count = 1 << order;
339 for (i = 0; i < count; i++) 339 for (i = 0; i < count; i++)
340 SetPageReserved(pages + i); 340 SetPageReserved(pages + i);
@@ -347,7 +347,7 @@ static void kimage_free_pages(struct page *page)
347{ 347{
348 unsigned int order, count, i; 348 unsigned int order, count, i;
349 349
350 order = page->private; 350 order = page_private(page);
351 count = 1 << order; 351 count = 1 << order;
352 for (i = 0; i < count; i++) 352 for (i = 0; i < count; i++)
353 ClearPageReserved(page + i); 353 ClearPageReserved(page + i);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 10bc5ec496d7..016504ccfccf 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -578,15 +578,23 @@ static int save_highmem_zone(struct zone *zone)
578 continue; 578 continue;
579 page = pfn_to_page(pfn); 579 page = pfn_to_page(pfn);
580 /* 580 /*
581 * This condition results from rvmalloc() sans vmalloc_32() 581 * PageReserved results from rvmalloc() sans vmalloc_32()
582 * and architectural memory reservations. This should be 582 * and architectural memory reservations.
583 * corrected eventually when the cases giving rise to this 583 *
584 * are better understood. 584 * rvmalloc should not cause this, because all implementations
585 * appear to always be using vmalloc_32 on architectures with
586 * highmem. This is a good thing, because we would like to save
587 * rvmalloc pages.
588 *
589 * It appears to be triggered by pages which do not point to
590 * valid memory (see arch/i386/mm/init.c:one_highpage_init(),
591 * which sets PageReserved if the page does not point to valid
592 * RAM.
593 *
594 * XXX: must remove usage of PageReserved!
585 */ 595 */
586 if (PageReserved(page)) { 596 if (PageReserved(page))
587 printk("highmem reserved page?!\n");
588 continue; 597 continue;
589 }
590 BUG_ON(PageNosave(page)); 598 BUG_ON(PageNosave(page));
591 if (PageNosaveFree(page)) 599 if (PageNosaveFree(page))
592 continue; 600 continue;
@@ -672,10 +680,9 @@ static int saveable(struct zone * zone, unsigned long * zone_pfn)
672 return 0; 680 return 0;
673 681
674 page = pfn_to_page(pfn); 682 page = pfn_to_page(pfn);
675 BUG_ON(PageReserved(page) && PageNosave(page));
676 if (PageNosave(page)) 683 if (PageNosave(page))
677 return 0; 684 return 0;
678 if (PageReserved(page) && pfn_is_nosave(pfn)) { 685 if (pfn_is_nosave(pfn)) {
679 pr_debug("[nosave pfn 0x%lx]", pfn); 686 pr_debug("[nosave pfn 0x%lx]", pfn);
680 return 0; 687 return 0;
681 } 688 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 1e5cafdf4e27..4f26c544d02c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2511,8 +2511,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
2511 cpustat->idle = cputime64_add(cpustat->idle, tmp); 2511 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2512 /* Account for system time used */ 2512 /* Account for system time used */
2513 acct_update_integrals(p); 2513 acct_update_integrals(p);
2514 /* Update rss highwater mark */
2515 update_mem_hiwater(p);
2516} 2514}
2517 2515
2518/* 2516/*
diff --git a/kernel/timer.c b/kernel/timer.c
index 3ba10fa35b60..6a2e5f8dc725 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -752,6 +752,15 @@ static void second_overflow(void)
752 else 752 else
753 time_adj += (time_adj >> 2) + (time_adj >> 5); 753 time_adj += (time_adj >> 2) + (time_adj >> 5);
754#endif 754#endif
755#if HZ == 250
756 /* Compensate for (HZ==250) != (1 << SHIFT_HZ).
757 * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14)
758 */
759 if (time_adj < 0)
760 time_adj -= (-time_adj >> 6) + (-time_adj >> 7);
761 else
762 time_adj += (time_adj >> 6) + (time_adj >> 7);
763#endif
755#if HZ == 1000 764#if HZ == 1000
756 /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). 765 /* Compensate for (HZ==1000) != (1 << SHIFT_HZ).
757 * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) 766 * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
diff --git a/mm/Kconfig b/mm/Kconfig
index 391ffc54d136..1a4473fcb2ca 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -111,3 +111,24 @@ config SPARSEMEM_STATIC
111config SPARSEMEM_EXTREME 111config SPARSEMEM_EXTREME
112 def_bool y 112 def_bool y
113 depends on SPARSEMEM && !SPARSEMEM_STATIC 113 depends on SPARSEMEM && !SPARSEMEM_STATIC
114
115# eventually, we can have this option just 'select SPARSEMEM'
116config MEMORY_HOTPLUG
117 bool "Allow for memory hot-add"
118 depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND
119
120comment "Memory hotplug is currently incompatible with Software Suspend"
121 depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
122
123# Heavily threaded applications may benefit from splitting the mm-wide
124# page_table_lock, so that faults on different parts of the user address
125# space can be handled with less contention: split it at this NR_CPUS.
126# Default to 4 for wider testing, though 8 might be more appropriate.
127# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
128# PA-RISC's debug spinlock_t is too large for the 32-bit struct page.
129#
130config SPLIT_PTLOCK_CPUS
131 int
132 default "4096" if ARM && !CPU_CACHE_VIPT
133 default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
134 default "4"
diff --git a/mm/Makefile b/mm/Makefile
index 4cd69e3ce421..2fa6d2ca9f28 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -18,5 +18,5 @@ obj-$(CONFIG_NUMA) += mempolicy.o
18obj-$(CONFIG_SPARSEMEM) += sparse.o 18obj-$(CONFIG_SPARSEMEM) += sparse.o
19obj-$(CONFIG_SHMEM) += shmem.o 19obj-$(CONFIG_SHMEM) += shmem.o
20obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 20obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
21 21obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
22obj-$(CONFIG_FS_XIP) += filemap_xip.o 22obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index a58699b6579e..e8c567177dcf 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -305,6 +305,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
305 if (j + 16 < BITS_PER_LONG) 305 if (j + 16 < BITS_PER_LONG)
306 prefetchw(page + j + 16); 306 prefetchw(page + j + 16);
307 __ClearPageReserved(page + j); 307 __ClearPageReserved(page + j);
308 set_page_count(page + j, 0);
308 } 309 }
309 __free_pages(page, order); 310 __free_pages(page, order);
310 i += BITS_PER_LONG; 311 i += BITS_PER_LONG;
diff --git a/mm/filemap.c b/mm/filemap.c
index 1c31b2fd2ca5..768687f1d46b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -66,7 +66,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
66 * 66 *
67 * ->mmap_sem 67 * ->mmap_sem
68 * ->i_mmap_lock 68 * ->i_mmap_lock
69 * ->page_table_lock (various places, mainly in mmap.c) 69 * ->page_table_lock or pte_lock (various, mainly in memory.c)
70 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) 70 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
71 * 71 *
72 * ->mmap_sem 72 * ->mmap_sem
@@ -86,9 +86,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
86 * ->anon_vma.lock (vma_adjust) 86 * ->anon_vma.lock (vma_adjust)
87 * 87 *
88 * ->anon_vma.lock 88 * ->anon_vma.lock
89 * ->page_table_lock (anon_vma_prepare and various) 89 * ->page_table_lock or pte_lock (anon_vma_prepare and various)
90 * 90 *
91 * ->page_table_lock 91 * ->page_table_lock or pte_lock
92 * ->swap_lock (try_to_unmap_one) 92 * ->swap_lock (try_to_unmap_one)
93 * ->private_lock (try_to_unmap_one) 93 * ->private_lock (try_to_unmap_one)
94 * ->tree_lock (try_to_unmap_one) 94 * ->tree_lock (try_to_unmap_one)
@@ -152,7 +152,7 @@ static int sync_page(void *word)
152 * in the ->sync_page() methods make essential use of the 152 * in the ->sync_page() methods make essential use of the
153 * page_mapping(), merely passing the page down to the backing 153 * page_mapping(), merely passing the page down to the backing
154 * device's unplug functions when it's non-NULL, which in turn 154 * device's unplug functions when it's non-NULL, which in turn
155 * ignore it for all cases but swap, where only page->private is 155 * ignore it for all cases but swap, where only page_private(page) is
156 * of interest. When page_mapping() does go NULL, the entire 156 * of interest. When page_mapping() does go NULL, the entire
157 * call stack gracefully ignores the page and returns. 157 * call stack gracefully ignores the page and returns.
158 * -- wli 158 * -- wli
@@ -1520,7 +1520,7 @@ repeat:
1520 page_cache_release(page); 1520 page_cache_release(page);
1521 return err; 1521 return err;
1522 } 1522 }
1523 } else { 1523 } else if (vma->vm_flags & VM_NONLINEAR) {
1524 /* No page was found just because we can't read it in now (being 1524 /* No page was found just because we can't read it in now (being
1525 * here implies nonblock != 0), but the page may exist, so set 1525 * here implies nonblock != 0), but the page may exist, so set
1526 * the PTE to fault it in later. */ 1526 * the PTE to fault it in later. */
@@ -1537,6 +1537,7 @@ repeat:
1537 1537
1538 return 0; 1538 return 0;
1539} 1539}
1540EXPORT_SYMBOL(filemap_populate);
1540 1541
1541struct vm_operations_struct generic_file_vm_ops = { 1542struct vm_operations_struct generic_file_vm_ops = {
1542 .nopage = filemap_nopage, 1543 .nopage = filemap_nopage,
@@ -1555,7 +1556,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1555 vma->vm_ops = &generic_file_vm_ops; 1556 vma->vm_ops = &generic_file_vm_ops;
1556 return 0; 1557 return 0;
1557} 1558}
1558EXPORT_SYMBOL(filemap_populate);
1559 1559
1560/* 1560/*
1561 * This is for filesystems which do not implement ->writepage. 1561 * This is for filesystems which do not implement ->writepage.
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 8c199f537732..9cf687e4a29a 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -174,6 +174,8 @@ __xip_unmap (struct address_space * mapping,
174 unsigned long address; 174 unsigned long address;
175 pte_t *pte; 175 pte_t *pte;
176 pte_t pteval; 176 pte_t pteval;
177 spinlock_t *ptl;
178 struct page *page;
177 179
178 spin_lock(&mapping->i_mmap_lock); 180 spin_lock(&mapping->i_mmap_lock);
179 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 181 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
@@ -181,19 +183,17 @@ __xip_unmap (struct address_space * mapping,
181 address = vma->vm_start + 183 address = vma->vm_start +
182 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 184 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
183 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 185 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
184 /* 186 page = ZERO_PAGE(address);
185 * We need the page_table_lock to protect us from page faults, 187 pte = page_check_address(page, mm, address, &ptl);
186 * munmap, fork, etc... 188 if (pte) {
187 */
188 pte = page_check_address(ZERO_PAGE(address), mm,
189 address);
190 if (!IS_ERR(pte)) {
191 /* Nuke the page table entry. */ 189 /* Nuke the page table entry. */
192 flush_cache_page(vma, address, pte_pfn(*pte)); 190 flush_cache_page(vma, address, pte_pfn(*pte));
193 pteval = ptep_clear_flush(vma, address, pte); 191 pteval = ptep_clear_flush(vma, address, pte);
192 page_remove_rmap(page);
193 dec_mm_counter(mm, file_rss);
194 BUG_ON(pte_dirty(pteval)); 194 BUG_ON(pte_dirty(pteval));
195 pte_unmap(pte); 195 pte_unmap_unlock(pte, ptl);
196 spin_unlock(&mm->page_table_lock); 196 page_cache_release(page);
197 } 197 }
198 } 198 }
199 spin_unlock(&mapping->i_mmap_lock); 199 spin_unlock(&mapping->i_mmap_lock);
@@ -228,7 +228,7 @@ xip_file_nopage(struct vm_area_struct * area,
228 228
229 page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0); 229 page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
230 if (!IS_ERR(page)) { 230 if (!IS_ERR(page)) {
231 return page; 231 goto out;
232 } 232 }
233 if (PTR_ERR(page) != -ENODATA) 233 if (PTR_ERR(page) != -ENODATA)
234 return NULL; 234 return NULL;
@@ -249,6 +249,8 @@ xip_file_nopage(struct vm_area_struct * area,
249 page = ZERO_PAGE(address); 249 page = ZERO_PAGE(address);
250 } 250 }
251 251
252out:
253 page_cache_get(page);
252 return page; 254 return page;
253} 255}
254 256
diff --git a/mm/fremap.c b/mm/fremap.c
index ab23a0673c35..d862be3bc3e3 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -20,33 +20,32 @@
20#include <asm/cacheflush.h> 20#include <asm/cacheflush.h>
21#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
22 22
23static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, 23static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
24 unsigned long addr, pte_t *ptep) 24 unsigned long addr, pte_t *ptep)
25{ 25{
26 pte_t pte = *ptep; 26 pte_t pte = *ptep;
27 struct page *page = NULL;
27 28
28 if (pte_none(pte))
29 return;
30 if (pte_present(pte)) { 29 if (pte_present(pte)) {
31 unsigned long pfn = pte_pfn(pte); 30 unsigned long pfn = pte_pfn(pte);
32
33 flush_cache_page(vma, addr, pfn); 31 flush_cache_page(vma, addr, pfn);
34 pte = ptep_clear_flush(vma, addr, ptep); 32 pte = ptep_clear_flush(vma, addr, ptep);
35 if (pfn_valid(pfn)) { 33 if (unlikely(!pfn_valid(pfn))) {
36 struct page *page = pfn_to_page(pfn); 34 print_bad_pte(vma, pte, addr);
37 if (!PageReserved(page)) { 35 goto out;
38 if (pte_dirty(pte))
39 set_page_dirty(page);
40 page_remove_rmap(page);
41 page_cache_release(page);
42 dec_mm_counter(mm, rss);
43 }
44 } 36 }
37 page = pfn_to_page(pfn);
38 if (pte_dirty(pte))
39 set_page_dirty(page);
40 page_remove_rmap(page);
41 page_cache_release(page);
45 } else { 42 } else {
46 if (!pte_file(pte)) 43 if (!pte_file(pte))
47 free_swap_and_cache(pte_to_swp_entry(pte)); 44 free_swap_and_cache(pte_to_swp_entry(pte));
48 pte_clear(mm, addr, ptep); 45 pte_clear(mm, addr, ptep);
49 } 46 }
47out:
48 return !!page;
50} 49}
51 50
52/* 51/*
@@ -64,21 +63,20 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
64 pud_t *pud; 63 pud_t *pud;
65 pgd_t *pgd; 64 pgd_t *pgd;
66 pte_t pte_val; 65 pte_t pte_val;
66 spinlock_t *ptl;
67
68 BUG_ON(vma->vm_flags & VM_RESERVED);
67 69
68 pgd = pgd_offset(mm, addr); 70 pgd = pgd_offset(mm, addr);
69 spin_lock(&mm->page_table_lock);
70
71 pud = pud_alloc(mm, pgd, addr); 71 pud = pud_alloc(mm, pgd, addr);
72 if (!pud) 72 if (!pud)
73 goto err_unlock; 73 goto out;
74
75 pmd = pmd_alloc(mm, pud, addr); 74 pmd = pmd_alloc(mm, pud, addr);
76 if (!pmd) 75 if (!pmd)
77 goto err_unlock; 76 goto out;
78 77 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
79 pte = pte_alloc_map(mm, pmd, addr);
80 if (!pte) 78 if (!pte)
81 goto err_unlock; 79 goto out;
82 80
83 /* 81 /*
84 * This page may have been truncated. Tell the 82 * This page may have been truncated. Tell the
@@ -88,29 +86,27 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
88 inode = vma->vm_file->f_mapping->host; 86 inode = vma->vm_file->f_mapping->host;
89 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 87 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
90 if (!page->mapping || page->index >= size) 88 if (!page->mapping || page->index >= size)
91 goto err_unlock; 89 goto unlock;
92 err = -ENOMEM; 90 err = -ENOMEM;
93 if (page_mapcount(page) > INT_MAX/2) 91 if (page_mapcount(page) > INT_MAX/2)
94 goto err_unlock; 92 goto unlock;
95 93
96 zap_pte(mm, vma, addr, pte); 94 if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
95 inc_mm_counter(mm, file_rss);
97 96
98 inc_mm_counter(mm,rss);
99 flush_icache_page(vma, page); 97 flush_icache_page(vma, page);
100 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 98 set_pte_at(mm, addr, pte, mk_pte(page, prot));
101 page_add_file_rmap(page); 99 page_add_file_rmap(page);
102 pte_val = *pte; 100 pte_val = *pte;
103 pte_unmap(pte);
104 update_mmu_cache(vma, addr, pte_val); 101 update_mmu_cache(vma, addr, pte_val);
105
106 err = 0; 102 err = 0;
107err_unlock: 103unlock:
108 spin_unlock(&mm->page_table_lock); 104 pte_unmap_unlock(pte, ptl);
105out:
109 return err; 106 return err;
110} 107}
111EXPORT_SYMBOL(install_page); 108EXPORT_SYMBOL(install_page);
112 109
113
114/* 110/*
115 * Install a file pte to a given virtual memory address, release any 111 * Install a file pte to a given virtual memory address, release any
116 * previously existing mapping. 112 * previously existing mapping.
@@ -124,37 +120,35 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
124 pud_t *pud; 120 pud_t *pud;
125 pgd_t *pgd; 121 pgd_t *pgd;
126 pte_t pte_val; 122 pte_t pte_val;
123 spinlock_t *ptl;
124
125 BUG_ON(vma->vm_flags & VM_RESERVED);
127 126
128 pgd = pgd_offset(mm, addr); 127 pgd = pgd_offset(mm, addr);
129 spin_lock(&mm->page_table_lock);
130
131 pud = pud_alloc(mm, pgd, addr); 128 pud = pud_alloc(mm, pgd, addr);
132 if (!pud) 129 if (!pud)
133 goto err_unlock; 130 goto out;
134
135 pmd = pmd_alloc(mm, pud, addr); 131 pmd = pmd_alloc(mm, pud, addr);
136 if (!pmd) 132 if (!pmd)
137 goto err_unlock; 133 goto out;
138 134 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
139 pte = pte_alloc_map(mm, pmd, addr);
140 if (!pte) 135 if (!pte)
141 goto err_unlock; 136 goto out;
142 137
143 zap_pte(mm, vma, addr, pte); 138 if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
139 update_hiwater_rss(mm);
140 dec_mm_counter(mm, file_rss);
141 }
144 142
145 set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); 143 set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
146 pte_val = *pte; 144 pte_val = *pte;
147 pte_unmap(pte);
148 update_mmu_cache(vma, addr, pte_val); 145 update_mmu_cache(vma, addr, pte_val);
149 spin_unlock(&mm->page_table_lock); 146 pte_unmap_unlock(pte, ptl);
150 return 0; 147 err = 0;
151 148out:
152err_unlock:
153 spin_unlock(&mm->page_table_lock);
154 return err; 149 return err;
155} 150}
156 151
157
158/*** 152/***
159 * sys_remap_file_pages - remap arbitrary pages of a shared backing store 153 * sys_remap_file_pages - remap arbitrary pages of a shared backing store
160 * file within an existing vma. 154 * file within an existing vma.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 61d380678030..c9b43360fd33 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -277,19 +277,23 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
277 unsigned long addr; 277 unsigned long addr;
278 278
279 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 279 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
280 src_pte = huge_pte_offset(src, addr);
281 if (!src_pte)
282 continue;
280 dst_pte = huge_pte_alloc(dst, addr); 283 dst_pte = huge_pte_alloc(dst, addr);
281 if (!dst_pte) 284 if (!dst_pte)
282 goto nomem; 285 goto nomem;
286 spin_lock(&dst->page_table_lock);
283 spin_lock(&src->page_table_lock); 287 spin_lock(&src->page_table_lock);
284 src_pte = huge_pte_offset(src, addr); 288 if (!pte_none(*src_pte)) {
285 if (src_pte && !pte_none(*src_pte)) {
286 entry = *src_pte; 289 entry = *src_pte;
287 ptepage = pte_page(entry); 290 ptepage = pte_page(entry);
288 get_page(ptepage); 291 get_page(ptepage);
289 add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); 292 add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
290 set_huge_pte_at(dst, addr, dst_pte, entry); 293 set_huge_pte_at(dst, addr, dst_pte, entry);
291 } 294 }
292 spin_unlock(&src->page_table_lock); 295 spin_unlock(&src->page_table_lock);
296 spin_unlock(&dst->page_table_lock);
293 } 297 }
294 return 0; 298 return 0;
295 299
@@ -310,12 +314,14 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
310 BUG_ON(start & ~HPAGE_MASK); 314 BUG_ON(start & ~HPAGE_MASK);
311 BUG_ON(end & ~HPAGE_MASK); 315 BUG_ON(end & ~HPAGE_MASK);
312 316
317 spin_lock(&mm->page_table_lock);
318
319 /* Update high watermark before we lower rss */
320 update_hiwater_rss(mm);
321
313 for (address = start; address < end; address += HPAGE_SIZE) { 322 for (address = start; address < end; address += HPAGE_SIZE) {
314 ptep = huge_pte_offset(mm, address); 323 ptep = huge_pte_offset(mm, address);
315 if (! ptep) 324 if (!ptep)
316 /* This can happen on truncate, or if an
317 * mmap() is aborted due to an error before
318 * the prefault */
319 continue; 325 continue;
320 326
321 pte = huge_ptep_get_and_clear(mm, address, ptep); 327 pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -324,96 +330,99 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
324 330
325 page = pte_page(pte); 331 page = pte_page(pte);
326 put_page(page); 332 put_page(page);
327 add_mm_counter(mm, rss, - (HPAGE_SIZE / PAGE_SIZE)); 333 add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
328 } 334 }
329 flush_tlb_range(vma, start, end);
330}
331
332void zap_hugepage_range(struct vm_area_struct *vma,
333 unsigned long start, unsigned long length)
334{
335 struct mm_struct *mm = vma->vm_mm;
336 335
337 spin_lock(&mm->page_table_lock);
338 unmap_hugepage_range(vma, start, start + length);
339 spin_unlock(&mm->page_table_lock); 336 spin_unlock(&mm->page_table_lock);
337 flush_tlb_range(vma, start, end);
340} 338}
341 339
342int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) 340static struct page *find_lock_huge_page(struct address_space *mapping,
341 unsigned long idx)
343{ 342{
344 struct mm_struct *mm = current->mm; 343 struct page *page;
345 unsigned long addr; 344 int err;
346 int ret = 0; 345 struct inode *inode = mapping->host;
347 346 unsigned long size;
348 WARN_ON(!is_vm_hugetlb_page(vma)); 347
349 BUG_ON(vma->vm_start & ~HPAGE_MASK); 348retry:
350 BUG_ON(vma->vm_end & ~HPAGE_MASK); 349 page = find_lock_page(mapping, idx);
351 350 if (page)
352 hugetlb_prefault_arch_hook(mm); 351 goto out;
353 352
354 spin_lock(&mm->page_table_lock); 353 /* Check to make sure the mapping hasn't been truncated */
355 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 354 size = i_size_read(inode) >> HPAGE_SHIFT;
356 unsigned long idx; 355 if (idx >= size)
357 pte_t *pte = huge_pte_alloc(mm, addr); 356 goto out;
358 struct page *page; 357
359 358 if (hugetlb_get_quota(mapping))
360 if (!pte) { 359 goto out;
361 ret = -ENOMEM; 360 page = alloc_huge_page();
362 goto out; 361 if (!page) {
363 } 362 hugetlb_put_quota(mapping);
363 goto out;
364 }
364 365
365 idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) 366 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
366 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); 367 if (err) {
367 page = find_get_page(mapping, idx); 368 put_page(page);
368 if (!page) { 369 hugetlb_put_quota(mapping);
369 /* charge the fs quota first */ 370 if (err == -EEXIST)
370 if (hugetlb_get_quota(mapping)) { 371 goto retry;
371 ret = -ENOMEM; 372 page = NULL;
372 goto out;
373 }
374 page = alloc_huge_page();
375 if (!page) {
376 hugetlb_put_quota(mapping);
377 ret = -ENOMEM;
378 goto out;
379 }
380 ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
381 if (! ret) {
382 unlock_page(page);
383 } else {
384 hugetlb_put_quota(mapping);
385 free_huge_page(page);
386 goto out;
387 }
388 }
389 add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
390 set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
391 } 373 }
392out: 374out:
393 spin_unlock(&mm->page_table_lock); 375 return page;
394 return ret;
395} 376}
396 377
397/*
398 * On ia64 at least, it is possible to receive a hugetlb fault from a
399 * stale zero entry left in the TLB from earlier hardware prefetching.
400 * Low-level arch code should already have flushed the stale entry as
401 * part of its fault handling, but we do need to accept this minor fault
402 * and return successfully. Whereas the "normal" case is that this is
403 * an access to a hugetlb page which has been truncated off since mmap.
404 */
405int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 378int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
406 unsigned long address, int write_access) 379 unsigned long address, int write_access)
407{ 380{
408 int ret = VM_FAULT_SIGBUS; 381 int ret = VM_FAULT_SIGBUS;
382 unsigned long idx;
383 unsigned long size;
409 pte_t *pte; 384 pte_t *pte;
385 struct page *page;
386 struct address_space *mapping;
387
388 pte = huge_pte_alloc(mm, address);
389 if (!pte)
390 goto out;
391
392 mapping = vma->vm_file->f_mapping;
393 idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
394 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
395
396 /*
397 * Use page lock to guard against racing truncation
398 * before we get page_table_lock.
399 */
400 page = find_lock_huge_page(mapping, idx);
401 if (!page)
402 goto out;
410 403
411 spin_lock(&mm->page_table_lock); 404 spin_lock(&mm->page_table_lock);
412 pte = huge_pte_offset(mm, address); 405 size = i_size_read(mapping->host) >> HPAGE_SHIFT;
413 if (pte && !pte_none(*pte)) 406 if (idx >= size)
414 ret = VM_FAULT_MINOR; 407 goto backout;
408
409 ret = VM_FAULT_MINOR;
410 if (!pte_none(*pte))
411 goto backout;
412
413 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
414 set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
415 spin_unlock(&mm->page_table_lock); 415 spin_unlock(&mm->page_table_lock);
416 unlock_page(page);
417out:
416 return ret; 418 return ret;
419
420backout:
421 spin_unlock(&mm->page_table_lock);
422 hugetlb_put_quota(mapping);
423 unlock_page(page);
424 put_page(page);
425 goto out;
417} 426}
418 427
419int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 428int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -423,34 +432,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
423 unsigned long vpfn, vaddr = *position; 432 unsigned long vpfn, vaddr = *position;
424 int remainder = *length; 433 int remainder = *length;
425 434
426 BUG_ON(!is_vm_hugetlb_page(vma));
427
428 vpfn = vaddr/PAGE_SIZE; 435 vpfn = vaddr/PAGE_SIZE;
429 spin_lock(&mm->page_table_lock); 436 spin_lock(&mm->page_table_lock);
430 while (vaddr < vma->vm_end && remainder) { 437 while (vaddr < vma->vm_end && remainder) {
438 pte_t *pte;
439 struct page *page;
431 440
432 if (pages) { 441 /*
433 pte_t *pte; 442 * Some archs (sparc64, sh*) have multiple pte_ts to
434 struct page *page; 443 * each hugepage. We have to make * sure we get the
435 444 * first, for the page indexing below to work.
436 /* Some archs (sparc64, sh*) have multiple 445 */
437 * pte_ts to each hugepage. We have to make 446 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
438 * sure we get the first, for the page
439 * indexing below to work. */
440 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
441
442 /* the hugetlb file might have been truncated */
443 if (!pte || pte_none(*pte)) {
444 remainder = 0;
445 if (!i)
446 i = -EFAULT;
447 break;
448 }
449 447
450 page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; 448 if (!pte || pte_none(*pte)) {
449 int ret;
451 450
452 WARN_ON(!PageCompound(page)); 451 spin_unlock(&mm->page_table_lock);
452 ret = hugetlb_fault(mm, vma, vaddr, 0);
453 spin_lock(&mm->page_table_lock);
454 if (ret == VM_FAULT_MINOR)
455 continue;
456
457 remainder = 0;
458 if (!i)
459 i = -EFAULT;
460 break;
461 }
453 462
463 if (pages) {
464 page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
454 get_page(page); 465 get_page(page);
455 pages[i] = page; 466 pages[i] = page;
456 } 467 }
diff --git a/mm/madvise.c b/mm/madvise.c
index 20e075d1c64c..17aaf3e16449 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
126 unsigned long start, unsigned long end) 126 unsigned long start, unsigned long end)
127{ 127{
128 *prev = vma; 128 *prev = vma;
129 if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) 129 if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED))
130 return -EINVAL; 130 return -EINVAL;
131 131
132 if (unlikely(vma->vm_flags & VM_NONLINEAR)) { 132 if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
diff --git a/mm/memory.c b/mm/memory.c
index 1db40e935e55..0f60baf6f69b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
114{ 114{
115 struct page *page = pmd_page(*pmd); 115 struct page *page = pmd_page(*pmd);
116 pmd_clear(pmd); 116 pmd_clear(pmd);
117 pte_lock_deinit(page);
117 pte_free_tlb(tlb, page); 118 pte_free_tlb(tlb, page);
118 dec_page_state(nr_page_table_pages); 119 dec_page_state(nr_page_table_pages);
119 tlb->mm->nr_ptes--; 120 tlb->mm->nr_ptes--;
@@ -249,7 +250,7 @@ void free_pgd_range(struct mmu_gather **tlb,
249 free_pud_range(*tlb, pgd, addr, next, floor, ceiling); 250 free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
250 } while (pgd++, addr = next, addr != end); 251 } while (pgd++, addr = next, addr != end);
251 252
252 if (!tlb_is_full_mm(*tlb)) 253 if (!(*tlb)->fullmm)
253 flush_tlb_pgtables((*tlb)->mm, start, end); 254 flush_tlb_pgtables((*tlb)->mm, start, end);
254} 255}
255 256
@@ -260,6 +261,12 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
260 struct vm_area_struct *next = vma->vm_next; 261 struct vm_area_struct *next = vma->vm_next;
261 unsigned long addr = vma->vm_start; 262 unsigned long addr = vma->vm_start;
262 263
264 /*
265 * Hide vma from rmap and vmtruncate before freeing pgtables
266 */
267 anon_vma_unlink(vma);
268 unlink_file_vma(vma);
269
263 if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { 270 if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
264 hugetlb_free_pgd_range(tlb, addr, vma->vm_end, 271 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
265 floor, next? next->vm_start: ceiling); 272 floor, next? next->vm_start: ceiling);
@@ -272,6 +279,8 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
272 HPAGE_SIZE)) { 279 HPAGE_SIZE)) {
273 vma = next; 280 vma = next;
274 next = vma->vm_next; 281 next = vma->vm_next;
282 anon_vma_unlink(vma);
283 unlink_file_vma(vma);
275 } 284 }
276 free_pgd_range(tlb, addr, vma->vm_end, 285 free_pgd_range(tlb, addr, vma->vm_end,
277 floor, next? next->vm_start: ceiling); 286 floor, next? next->vm_start: ceiling);
@@ -280,72 +289,78 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
280 } 289 }
281} 290}
282 291
283pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, 292int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
284 unsigned long address)
285{ 293{
286 if (!pmd_present(*pmd)) { 294 struct page *new = pte_alloc_one(mm, address);
287 struct page *new; 295 if (!new)
288 296 return -ENOMEM;
289 spin_unlock(&mm->page_table_lock); 297
290 new = pte_alloc_one(mm, address); 298 pte_lock_init(new);
291 spin_lock(&mm->page_table_lock); 299 spin_lock(&mm->page_table_lock);
292 if (!new) 300 if (pmd_present(*pmd)) { /* Another has populated it */
293 return NULL; 301 pte_lock_deinit(new);
294 /* 302 pte_free(new);
295 * Because we dropped the lock, we should re-check the 303 } else {
296 * entry, as somebody else could have populated it..
297 */
298 if (pmd_present(*pmd)) {
299 pte_free(new);
300 goto out;
301 }
302 mm->nr_ptes++; 304 mm->nr_ptes++;
303 inc_page_state(nr_page_table_pages); 305 inc_page_state(nr_page_table_pages);
304 pmd_populate(mm, pmd, new); 306 pmd_populate(mm, pmd, new);
305 } 307 }
306out: 308 spin_unlock(&mm->page_table_lock);
307 return pte_offset_map(pmd, address); 309 return 0;
308} 310}
309 311
310pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) 312int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
311{ 313{
312 if (!pmd_present(*pmd)) { 314 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
313 pte_t *new; 315 if (!new)
316 return -ENOMEM;
314 317
315 spin_unlock(&mm->page_table_lock); 318 spin_lock(&init_mm.page_table_lock);
316 new = pte_alloc_one_kernel(mm, address); 319 if (pmd_present(*pmd)) /* Another has populated it */
317 spin_lock(&mm->page_table_lock); 320 pte_free_kernel(new);
318 if (!new) 321 else
319 return NULL; 322 pmd_populate_kernel(&init_mm, pmd, new);
323 spin_unlock(&init_mm.page_table_lock);
324 return 0;
325}
320 326
321 /* 327static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
322 * Because we dropped the lock, we should re-check the 328{
323 * entry, as somebody else could have populated it.. 329 if (file_rss)
324 */ 330 add_mm_counter(mm, file_rss, file_rss);
325 if (pmd_present(*pmd)) { 331 if (anon_rss)
326 pte_free_kernel(new); 332 add_mm_counter(mm, anon_rss, anon_rss);
327 goto out; 333}
328 } 334
329 pmd_populate_kernel(mm, pmd, new); 335/*
330 } 336 * This function is called to print an error when a pte in a
331out: 337 * !VM_RESERVED region is found pointing to an invalid pfn (which
332 return pte_offset_kernel(pmd, address); 338 * is an error.
339 *
340 * The calling function must still handle the error.
341 */
342void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
343{
344 printk(KERN_ERR "Bad pte = %08llx, process = %s, "
345 "vm_flags = %lx, vaddr = %lx\n",
346 (long long)pte_val(pte),
347 (vma->vm_mm == current->mm ? current->comm : "???"),
348 vma->vm_flags, vaddr);
349 dump_stack();
333} 350}
334 351
335/* 352/*
336 * copy one vm_area from one task to the other. Assumes the page tables 353 * copy one vm_area from one task to the other. Assumes the page tables
337 * already present in the new task to be cleared in the whole range 354 * already present in the new task to be cleared in the whole range
338 * covered by this vma. 355 * covered by this vma.
339 *
340 * dst->page_table_lock is held on entry and exit,
341 * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
342 */ 356 */
343 357
344static inline void 358static inline void
345copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, 359copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
346 pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, 360 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
347 unsigned long addr) 361 unsigned long addr, int *rss)
348{ 362{
363 unsigned long vm_flags = vma->vm_flags;
349 pte_t pte = *src_pte; 364 pte_t pte = *src_pte;
350 struct page *page; 365 struct page *page;
351 unsigned long pfn; 366 unsigned long pfn;
@@ -357,29 +372,32 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
357 /* make sure dst_mm is on swapoff's mmlist. */ 372 /* make sure dst_mm is on swapoff's mmlist. */
358 if (unlikely(list_empty(&dst_mm->mmlist))) { 373 if (unlikely(list_empty(&dst_mm->mmlist))) {
359 spin_lock(&mmlist_lock); 374 spin_lock(&mmlist_lock);
360 list_add(&dst_mm->mmlist, &src_mm->mmlist); 375 if (list_empty(&dst_mm->mmlist))
376 list_add(&dst_mm->mmlist,
377 &src_mm->mmlist);
361 spin_unlock(&mmlist_lock); 378 spin_unlock(&mmlist_lock);
362 } 379 }
363 } 380 }
364 set_pte_at(dst_mm, addr, dst_pte, pte); 381 goto out_set_pte;
365 return;
366 } 382 }
367 383
368 pfn = pte_pfn(pte); 384 /* If the region is VM_RESERVED, the mapping is not
369 /* the pte points outside of valid memory, the 385 * mapped via rmap - duplicate the pte as is.
370 * mapping is assumed to be good, meaningful
371 * and not mapped via rmap - duplicate the
372 * mapping as is.
373 */ 386 */
374 page = NULL; 387 if (vm_flags & VM_RESERVED)
375 if (pfn_valid(pfn)) 388 goto out_set_pte;
376 page = pfn_to_page(pfn);
377 389
378 if (!page || PageReserved(page)) { 390 pfn = pte_pfn(pte);
379 set_pte_at(dst_mm, addr, dst_pte, pte); 391 /* If the pte points outside of valid memory but
380 return; 392 * the region is not VM_RESERVED, we have a problem.
393 */
394 if (unlikely(!pfn_valid(pfn))) {
395 print_bad_pte(vma, pte, addr);
396 goto out_set_pte; /* try to do something sane */
381 } 397 }
382 398
399 page = pfn_to_page(pfn);
400
383 /* 401 /*
384 * If it's a COW mapping, write protect it both 402 * If it's a COW mapping, write protect it both
385 * in the parent and the child 403 * in the parent and the child
@@ -397,11 +415,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
397 pte = pte_mkclean(pte); 415 pte = pte_mkclean(pte);
398 pte = pte_mkold(pte); 416 pte = pte_mkold(pte);
399 get_page(page); 417 get_page(page);
400 inc_mm_counter(dst_mm, rss);
401 if (PageAnon(page))
402 inc_mm_counter(dst_mm, anon_rss);
403 set_pte_at(dst_mm, addr, dst_pte, pte);
404 page_dup_rmap(page); 418 page_dup_rmap(page);
419 rss[!!PageAnon(page)]++;
420
421out_set_pte:
422 set_pte_at(dst_mm, addr, dst_pte, pte);
405} 423}
406 424
407static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 425static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -409,38 +427,44 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
409 unsigned long addr, unsigned long end) 427 unsigned long addr, unsigned long end)
410{ 428{
411 pte_t *src_pte, *dst_pte; 429 pte_t *src_pte, *dst_pte;
412 unsigned long vm_flags = vma->vm_flags; 430 spinlock_t *src_ptl, *dst_ptl;
413 int progress; 431 int progress = 0;
432 int rss[2];
414 433
415again: 434again:
416 dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); 435 rss[1] = rss[0] = 0;
436 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
417 if (!dst_pte) 437 if (!dst_pte)
418 return -ENOMEM; 438 return -ENOMEM;
419 src_pte = pte_offset_map_nested(src_pmd, addr); 439 src_pte = pte_offset_map_nested(src_pmd, addr);
440 src_ptl = pte_lockptr(src_mm, src_pmd);
441 spin_lock(src_ptl);
420 442
421 progress = 0;
422 spin_lock(&src_mm->page_table_lock);
423 do { 443 do {
424 /* 444 /*
425 * We are holding two locks at this point - either of them 445 * We are holding two locks at this point - either of them
426 * could generate latencies in another task on another CPU. 446 * could generate latencies in another task on another CPU.
427 */ 447 */
428 if (progress >= 32 && (need_resched() || 448 if (progress >= 32) {
429 need_lockbreak(&src_mm->page_table_lock) || 449 progress = 0;
430 need_lockbreak(&dst_mm->page_table_lock))) 450 if (need_resched() ||
431 break; 451 need_lockbreak(src_ptl) ||
452 need_lockbreak(dst_ptl))
453 break;
454 }
432 if (pte_none(*src_pte)) { 455 if (pte_none(*src_pte)) {
433 progress++; 456 progress++;
434 continue; 457 continue;
435 } 458 }
436 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr); 459 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
437 progress += 8; 460 progress += 8;
438 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 461 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
439 spin_unlock(&src_mm->page_table_lock);
440 462
463 spin_unlock(src_ptl);
441 pte_unmap_nested(src_pte - 1); 464 pte_unmap_nested(src_pte - 1);
442 pte_unmap(dst_pte - 1); 465 add_mm_rss(dst_mm, rss[0], rss[1]);
443 cond_resched_lock(&dst_mm->page_table_lock); 466 pte_unmap_unlock(dst_pte - 1, dst_ptl);
467 cond_resched();
444 if (addr != end) 468 if (addr != end)
445 goto again; 469 goto again;
446 return 0; 470 return 0;
@@ -525,24 +549,30 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
525 return 0; 549 return 0;
526} 550}
527 551
528static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, 552static void zap_pte_range(struct mmu_gather *tlb,
553 struct vm_area_struct *vma, pmd_t *pmd,
529 unsigned long addr, unsigned long end, 554 unsigned long addr, unsigned long end,
530 struct zap_details *details) 555 struct zap_details *details)
531{ 556{
557 struct mm_struct *mm = tlb->mm;
532 pte_t *pte; 558 pte_t *pte;
559 spinlock_t *ptl;
560 int file_rss = 0;
561 int anon_rss = 0;
533 562
534 pte = pte_offset_map(pmd, addr); 563 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
535 do { 564 do {
536 pte_t ptent = *pte; 565 pte_t ptent = *pte;
537 if (pte_none(ptent)) 566 if (pte_none(ptent))
538 continue; 567 continue;
539 if (pte_present(ptent)) { 568 if (pte_present(ptent)) {
540 struct page *page = NULL; 569 struct page *page = NULL;
541 unsigned long pfn = pte_pfn(ptent); 570 if (!(vma->vm_flags & VM_RESERVED)) {
542 if (pfn_valid(pfn)) { 571 unsigned long pfn = pte_pfn(ptent);
543 page = pfn_to_page(pfn); 572 if (unlikely(!pfn_valid(pfn)))
544 if (PageReserved(page)) 573 print_bad_pte(vma, ptent, addr);
545 page = NULL; 574 else
575 page = pfn_to_page(pfn);
546 } 576 }
547 if (unlikely(details) && page) { 577 if (unlikely(details) && page) {
548 /* 578 /*
@@ -562,7 +592,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
562 page->index > details->last_index)) 592 page->index > details->last_index))
563 continue; 593 continue;
564 } 594 }
565 ptent = ptep_get_and_clear_full(tlb->mm, addr, pte, 595 ptent = ptep_get_and_clear_full(mm, addr, pte,
566 tlb->fullmm); 596 tlb->fullmm);
567 tlb_remove_tlb_entry(tlb, pte, addr); 597 tlb_remove_tlb_entry(tlb, pte, addr);
568 if (unlikely(!page)) 598 if (unlikely(!page))
@@ -570,15 +600,17 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
570 if (unlikely(details) && details->nonlinear_vma 600 if (unlikely(details) && details->nonlinear_vma
571 && linear_page_index(details->nonlinear_vma, 601 && linear_page_index(details->nonlinear_vma,
572 addr) != page->index) 602 addr) != page->index)
573 set_pte_at(tlb->mm, addr, pte, 603 set_pte_at(mm, addr, pte,
574 pgoff_to_pte(page->index)); 604 pgoff_to_pte(page->index));
575 if (pte_dirty(ptent))
576 set_page_dirty(page);
577 if (PageAnon(page)) 605 if (PageAnon(page))
578 dec_mm_counter(tlb->mm, anon_rss); 606 anon_rss--;
579 else if (pte_young(ptent)) 607 else {
580 mark_page_accessed(page); 608 if (pte_dirty(ptent))
581 tlb->freed++; 609 set_page_dirty(page);
610 if (pte_young(ptent))
611 mark_page_accessed(page);
612 file_rss--;
613 }
582 page_remove_rmap(page); 614 page_remove_rmap(page);
583 tlb_remove_page(tlb, page); 615 tlb_remove_page(tlb, page);
584 continue; 616 continue;
@@ -591,12 +623,15 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
591 continue; 623 continue;
592 if (!pte_file(ptent)) 624 if (!pte_file(ptent))
593 free_swap_and_cache(pte_to_swp_entry(ptent)); 625 free_swap_and_cache(pte_to_swp_entry(ptent));
594 pte_clear_full(tlb->mm, addr, pte, tlb->fullmm); 626 pte_clear_full(mm, addr, pte, tlb->fullmm);
595 } while (pte++, addr += PAGE_SIZE, addr != end); 627 } while (pte++, addr += PAGE_SIZE, addr != end);
596 pte_unmap(pte - 1); 628
629 add_mm_rss(mm, file_rss, anon_rss);
630 pte_unmap_unlock(pte - 1, ptl);
597} 631}
598 632
599static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, 633static inline void zap_pmd_range(struct mmu_gather *tlb,
634 struct vm_area_struct *vma, pud_t *pud,
600 unsigned long addr, unsigned long end, 635 unsigned long addr, unsigned long end,
601 struct zap_details *details) 636 struct zap_details *details)
602{ 637{
@@ -608,11 +643,12 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
608 next = pmd_addr_end(addr, end); 643 next = pmd_addr_end(addr, end);
609 if (pmd_none_or_clear_bad(pmd)) 644 if (pmd_none_or_clear_bad(pmd))
610 continue; 645 continue;
611 zap_pte_range(tlb, pmd, addr, next, details); 646 zap_pte_range(tlb, vma, pmd, addr, next, details);
612 } while (pmd++, addr = next, addr != end); 647 } while (pmd++, addr = next, addr != end);
613} 648}
614 649
615static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 650static inline void zap_pud_range(struct mmu_gather *tlb,
651 struct vm_area_struct *vma, pgd_t *pgd,
616 unsigned long addr, unsigned long end, 652 unsigned long addr, unsigned long end,
617 struct zap_details *details) 653 struct zap_details *details)
618{ 654{
@@ -624,7 +660,7 @@ static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
624 next = pud_addr_end(addr, end); 660 next = pud_addr_end(addr, end);
625 if (pud_none_or_clear_bad(pud)) 661 if (pud_none_or_clear_bad(pud))
626 continue; 662 continue;
627 zap_pmd_range(tlb, pud, addr, next, details); 663 zap_pmd_range(tlb, vma, pud, addr, next, details);
628 } while (pud++, addr = next, addr != end); 664 } while (pud++, addr = next, addr != end);
629} 665}
630 666
@@ -645,7 +681,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
645 next = pgd_addr_end(addr, end); 681 next = pgd_addr_end(addr, end);
646 if (pgd_none_or_clear_bad(pgd)) 682 if (pgd_none_or_clear_bad(pgd))
647 continue; 683 continue;
648 zap_pud_range(tlb, pgd, addr, next, details); 684 zap_pud_range(tlb, vma, pgd, addr, next, details);
649 } while (pgd++, addr = next, addr != end); 685 } while (pgd++, addr = next, addr != end);
650 tlb_end_vma(tlb, vma); 686 tlb_end_vma(tlb, vma);
651} 687}
@@ -660,7 +696,6 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
660/** 696/**
661 * unmap_vmas - unmap a range of memory covered by a list of vma's 697 * unmap_vmas - unmap a range of memory covered by a list of vma's
662 * @tlbp: address of the caller's struct mmu_gather 698 * @tlbp: address of the caller's struct mmu_gather
663 * @mm: the controlling mm_struct
664 * @vma: the starting vma 699 * @vma: the starting vma
665 * @start_addr: virtual address at which to start unmapping 700 * @start_addr: virtual address at which to start unmapping
666 * @end_addr: virtual address at which to end unmapping 701 * @end_addr: virtual address at which to end unmapping
@@ -669,10 +704,10 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
669 * 704 *
670 * Returns the end address of the unmapping (restart addr if interrupted). 705 * Returns the end address of the unmapping (restart addr if interrupted).
671 * 706 *
672 * Unmap all pages in the vma list. Called under page_table_lock. 707 * Unmap all pages in the vma list.
673 * 708 *
674 * We aim to not hold page_table_lock for too long (for scheduling latency 709 * We aim to not hold locks for too long (for scheduling latency reasons).
675 * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to 710 * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
676 * return the ending mmu_gather to the caller. 711 * return the ending mmu_gather to the caller.
677 * 712 *
678 * Only addresses between `start' and `end' will be unmapped. 713 * Only addresses between `start' and `end' will be unmapped.
@@ -684,7 +719,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
684 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 719 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
685 * drops the lock and schedules. 720 * drops the lock and schedules.
686 */ 721 */
687unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, 722unsigned long unmap_vmas(struct mmu_gather **tlbp,
688 struct vm_area_struct *vma, unsigned long start_addr, 723 struct vm_area_struct *vma, unsigned long start_addr,
689 unsigned long end_addr, unsigned long *nr_accounted, 724 unsigned long end_addr, unsigned long *nr_accounted,
690 struct zap_details *details) 725 struct zap_details *details)
@@ -694,7 +729,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
694 int tlb_start_valid = 0; 729 int tlb_start_valid = 0;
695 unsigned long start = start_addr; 730 unsigned long start = start_addr;
696 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; 731 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
697 int fullmm = tlb_is_full_mm(*tlbp); 732 int fullmm = (*tlbp)->fullmm;
698 733
699 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { 734 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
700 unsigned long end; 735 unsigned long end;
@@ -734,19 +769,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
734 tlb_finish_mmu(*tlbp, tlb_start, start); 769 tlb_finish_mmu(*tlbp, tlb_start, start);
735 770
736 if (need_resched() || 771 if (need_resched() ||
737 need_lockbreak(&mm->page_table_lock) ||
738 (i_mmap_lock && need_lockbreak(i_mmap_lock))) { 772 (i_mmap_lock && need_lockbreak(i_mmap_lock))) {
739 if (i_mmap_lock) { 773 if (i_mmap_lock) {
740 /* must reset count of rss freed */ 774 *tlbp = NULL;
741 *tlbp = tlb_gather_mmu(mm, fullmm);
742 goto out; 775 goto out;
743 } 776 }
744 spin_unlock(&mm->page_table_lock);
745 cond_resched(); 777 cond_resched();
746 spin_lock(&mm->page_table_lock);
747 } 778 }
748 779
749 *tlbp = tlb_gather_mmu(mm, fullmm); 780 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
750 tlb_start_valid = 0; 781 tlb_start_valid = 0;
751 zap_bytes = ZAP_BLOCK_SIZE; 782 zap_bytes = ZAP_BLOCK_SIZE;
752 } 783 }
@@ -770,123 +801,93 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
770 unsigned long end = address + size; 801 unsigned long end = address + size;
771 unsigned long nr_accounted = 0; 802 unsigned long nr_accounted = 0;
772 803
773 if (is_vm_hugetlb_page(vma)) {
774 zap_hugepage_range(vma, address, size);
775 return end;
776 }
777
778 lru_add_drain(); 804 lru_add_drain();
779 spin_lock(&mm->page_table_lock);
780 tlb = tlb_gather_mmu(mm, 0); 805 tlb = tlb_gather_mmu(mm, 0);
781 end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); 806 update_hiwater_rss(mm);
782 tlb_finish_mmu(tlb, address, end); 807 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
783 spin_unlock(&mm->page_table_lock); 808 if (tlb)
809 tlb_finish_mmu(tlb, address, end);
784 return end; 810 return end;
785} 811}
786 812
787/* 813/*
788 * Do a quick page-table lookup for a single page. 814 * Do a quick page-table lookup for a single page.
789 * mm->page_table_lock must be held.
790 */ 815 */
791static struct page *__follow_page(struct mm_struct *mm, unsigned long address, 816struct page *follow_page(struct mm_struct *mm, unsigned long address,
792 int read, int write, int accessed) 817 unsigned int flags)
793{ 818{
794 pgd_t *pgd; 819 pgd_t *pgd;
795 pud_t *pud; 820 pud_t *pud;
796 pmd_t *pmd; 821 pmd_t *pmd;
797 pte_t *ptep, pte; 822 pte_t *ptep, pte;
823 spinlock_t *ptl;
798 unsigned long pfn; 824 unsigned long pfn;
799 struct page *page; 825 struct page *page;
800 826
801 page = follow_huge_addr(mm, address, write); 827 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
802 if (! IS_ERR(page)) 828 if (!IS_ERR(page)) {
803 return page; 829 BUG_ON(flags & FOLL_GET);
830 goto out;
831 }
804 832
833 page = NULL;
805 pgd = pgd_offset(mm, address); 834 pgd = pgd_offset(mm, address);
806 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 835 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
807 goto out; 836 goto no_page_table;
808 837
809 pud = pud_offset(pgd, address); 838 pud = pud_offset(pgd, address);
810 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 839 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
811 goto out; 840 goto no_page_table;
812 841
813 pmd = pmd_offset(pud, address); 842 pmd = pmd_offset(pud, address);
814 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 843 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
844 goto no_page_table;
845
846 if (pmd_huge(*pmd)) {
847 BUG_ON(flags & FOLL_GET);
848 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
815 goto out; 849 goto out;
816 if (pmd_huge(*pmd)) 850 }
817 return follow_huge_pmd(mm, address, pmd, write);
818 851
819 ptep = pte_offset_map(pmd, address); 852 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
820 if (!ptep) 853 if (!ptep)
821 goto out; 854 goto out;
822 855
823 pte = *ptep; 856 pte = *ptep;
824 pte_unmap(ptep); 857 if (!pte_present(pte))
825 if (pte_present(pte)) { 858 goto unlock;
826 if (write && !pte_write(pte)) 859 if ((flags & FOLL_WRITE) && !pte_write(pte))
827 goto out; 860 goto unlock;
828 if (read && !pte_read(pte)) 861 pfn = pte_pfn(pte);
829 goto out; 862 if (!pfn_valid(pfn))
830 pfn = pte_pfn(pte); 863 goto unlock;
831 if (pfn_valid(pfn)) { 864
832 page = pfn_to_page(pfn); 865 page = pfn_to_page(pfn);
833 if (accessed) { 866 if (flags & FOLL_GET)
834 if (write && !pte_dirty(pte) &&!PageDirty(page)) 867 get_page(page);
835 set_page_dirty(page); 868 if (flags & FOLL_TOUCH) {
836 mark_page_accessed(page); 869 if ((flags & FOLL_WRITE) &&
837 } 870 !pte_dirty(pte) && !PageDirty(page))
838 return page; 871 set_page_dirty(page);
839 } 872 mark_page_accessed(page);
840 } 873 }
841 874unlock:
875 pte_unmap_unlock(ptep, ptl);
842out: 876out:
843 return NULL; 877 return page;
844}
845
846inline struct page *
847follow_page(struct mm_struct *mm, unsigned long address, int write)
848{
849 return __follow_page(mm, address, 0, write, 1);
850}
851
852/*
853 * check_user_page_readable() can be called frm niterrupt context by oprofile,
854 * so we need to avoid taking any non-irq-safe locks
855 */
856int check_user_page_readable(struct mm_struct *mm, unsigned long address)
857{
858 return __follow_page(mm, address, 1, 0, 0) != NULL;
859}
860EXPORT_SYMBOL(check_user_page_readable);
861
862static inline int
863untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
864 unsigned long address)
865{
866 pgd_t *pgd;
867 pud_t *pud;
868 pmd_t *pmd;
869
870 /* Check if the vma is for an anonymous mapping. */
871 if (vma->vm_ops && vma->vm_ops->nopage)
872 return 0;
873
874 /* Check if page directory entry exists. */
875 pgd = pgd_offset(mm, address);
876 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
877 return 1;
878
879 pud = pud_offset(pgd, address);
880 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
881 return 1;
882
883 /* Check if page middle directory entry exists. */
884 pmd = pmd_offset(pud, address);
885 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
886 return 1;
887 878
888 /* There is a pte slot for 'address' in 'mm'. */ 879no_page_table:
889 return 0; 880 /*
881 * When core dumping an enormous anonymous area that nobody
882 * has touched so far, we don't want to allocate page tables.
883 */
884 if (flags & FOLL_ANON) {
885 page = ZERO_PAGE(address);
886 if (flags & FOLL_GET)
887 get_page(page);
888 BUG_ON(flags & FOLL_WRITE);
889 }
890 return page;
890} 891}
891 892
892int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 893int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -894,18 +895,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
894 struct page **pages, struct vm_area_struct **vmas) 895 struct page **pages, struct vm_area_struct **vmas)
895{ 896{
896 int i; 897 int i;
897 unsigned int flags; 898 unsigned int vm_flags;
898 899
899 /* 900 /*
900 * Require read or write permissions. 901 * Require read or write permissions.
901 * If 'force' is set, we only require the "MAY" flags. 902 * If 'force' is set, we only require the "MAY" flags.
902 */ 903 */
903 flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 904 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
904 flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 905 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
905 i = 0; 906 i = 0;
906 907
907 do { 908 do {
908 struct vm_area_struct * vma; 909 struct vm_area_struct *vma;
910 unsigned int foll_flags;
909 911
910 vma = find_extend_vma(mm, start); 912 vma = find_extend_vma(mm, start);
911 if (!vma && in_gate_area(tsk, start)) { 913 if (!vma && in_gate_area(tsk, start)) {
@@ -945,8 +947,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
945 continue; 947 continue;
946 } 948 }
947 949
948 if (!vma || (vma->vm_flags & VM_IO) 950 if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
949 || !(flags & vma->vm_flags)) 951 || !(vm_flags & vma->vm_flags))
950 return i ? : -EFAULT; 952 return i ? : -EFAULT;
951 953
952 if (is_vm_hugetlb_page(vma)) { 954 if (is_vm_hugetlb_page(vma)) {
@@ -954,29 +956,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
954 &start, &len, i); 956 &start, &len, i);
955 continue; 957 continue;
956 } 958 }
957 spin_lock(&mm->page_table_lock); 959
960 foll_flags = FOLL_TOUCH;
961 if (pages)
962 foll_flags |= FOLL_GET;
963 if (!write && !(vma->vm_flags & VM_LOCKED) &&
964 (!vma->vm_ops || !vma->vm_ops->nopage))
965 foll_flags |= FOLL_ANON;
966
958 do { 967 do {
959 int write_access = write;
960 struct page *page; 968 struct page *page;
961 969
962 cond_resched_lock(&mm->page_table_lock); 970 if (write)
963 while (!(page = follow_page(mm, start, write_access))) { 971 foll_flags |= FOLL_WRITE;
964 int ret;
965
966 /*
967 * Shortcut for anonymous pages. We don't want
968 * to force the creation of pages tables for
969 * insanely big anonymously mapped areas that
970 * nobody touched so far. This is important
971 * for doing a core dump for these mappings.
972 */
973 if (!write && untouched_anonymous_page(mm,vma,start)) {
974 page = ZERO_PAGE(start);
975 break;
976 }
977 spin_unlock(&mm->page_table_lock);
978 ret = __handle_mm_fault(mm, vma, start, write_access);
979 972
973 cond_resched();
974 while (!(page = follow_page(mm, start, foll_flags))) {
975 int ret;
976 ret = __handle_mm_fault(mm, vma, start,
977 foll_flags & FOLL_WRITE);
980 /* 978 /*
981 * The VM_FAULT_WRITE bit tells us that do_wp_page has 979 * The VM_FAULT_WRITE bit tells us that do_wp_page has
982 * broken COW when necessary, even if maybe_mkwrite 980 * broken COW when necessary, even if maybe_mkwrite
@@ -984,7 +982,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
984 * subsequent page lookups as if they were reads. 982 * subsequent page lookups as if they were reads.
985 */ 983 */
986 if (ret & VM_FAULT_WRITE) 984 if (ret & VM_FAULT_WRITE)
987 write_access = 0; 985 foll_flags &= ~FOLL_WRITE;
988 986
989 switch (ret & ~VM_FAULT_WRITE) { 987 switch (ret & ~VM_FAULT_WRITE) {
990 case VM_FAULT_MINOR: 988 case VM_FAULT_MINOR:
@@ -1000,13 +998,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1000 default: 998 default:
1001 BUG(); 999 BUG();
1002 } 1000 }
1003 spin_lock(&mm->page_table_lock);
1004 } 1001 }
1005 if (pages) { 1002 if (pages) {
1006 pages[i] = page; 1003 pages[i] = page;
1007 flush_dcache_page(page); 1004 flush_dcache_page(page);
1008 if (!PageReserved(page))
1009 page_cache_get(page);
1010 } 1005 }
1011 if (vmas) 1006 if (vmas)
1012 vmas[i] = vma; 1007 vmas[i] = vma;
@@ -1014,7 +1009,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1014 start += PAGE_SIZE; 1009 start += PAGE_SIZE;
1015 len--; 1010 len--;
1016 } while (len && start < vma->vm_end); 1011 } while (len && start < vma->vm_end);
1017 spin_unlock(&mm->page_table_lock);
1018 } while (len); 1012 } while (len);
1019 return i; 1013 return i;
1020} 1014}
@@ -1024,16 +1018,21 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1024 unsigned long addr, unsigned long end, pgprot_t prot) 1018 unsigned long addr, unsigned long end, pgprot_t prot)
1025{ 1019{
1026 pte_t *pte; 1020 pte_t *pte;
1021 spinlock_t *ptl;
1027 1022
1028 pte = pte_alloc_map(mm, pmd, addr); 1023 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1029 if (!pte) 1024 if (!pte)
1030 return -ENOMEM; 1025 return -ENOMEM;
1031 do { 1026 do {
1032 pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot)); 1027 struct page *page = ZERO_PAGE(addr);
1028 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
1029 page_cache_get(page);
1030 page_add_file_rmap(page);
1031 inc_mm_counter(mm, file_rss);
1033 BUG_ON(!pte_none(*pte)); 1032 BUG_ON(!pte_none(*pte));
1034 set_pte_at(mm, addr, pte, zero_pte); 1033 set_pte_at(mm, addr, pte, zero_pte);
1035 } while (pte++, addr += PAGE_SIZE, addr != end); 1034 } while (pte++, addr += PAGE_SIZE, addr != end);
1036 pte_unmap(pte - 1); 1035 pte_unmap_unlock(pte - 1, ptl);
1037 return 0; 1036 return 0;
1038} 1037}
1039 1038
@@ -1083,14 +1082,12 @@ int zeromap_page_range(struct vm_area_struct *vma,
1083 BUG_ON(addr >= end); 1082 BUG_ON(addr >= end);
1084 pgd = pgd_offset(mm, addr); 1083 pgd = pgd_offset(mm, addr);
1085 flush_cache_range(vma, addr, end); 1084 flush_cache_range(vma, addr, end);
1086 spin_lock(&mm->page_table_lock);
1087 do { 1085 do {
1088 next = pgd_addr_end(addr, end); 1086 next = pgd_addr_end(addr, end);
1089 err = zeromap_pud_range(mm, pgd, addr, next, prot); 1087 err = zeromap_pud_range(mm, pgd, addr, next, prot);
1090 if (err) 1088 if (err)
1091 break; 1089 break;
1092 } while (pgd++, addr = next, addr != end); 1090 } while (pgd++, addr = next, addr != end);
1093 spin_unlock(&mm->page_table_lock);
1094 return err; 1091 return err;
1095} 1092}
1096 1093
@@ -1104,17 +1101,17 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1104 unsigned long pfn, pgprot_t prot) 1101 unsigned long pfn, pgprot_t prot)
1105{ 1102{
1106 pte_t *pte; 1103 pte_t *pte;
1104 spinlock_t *ptl;
1107 1105
1108 pte = pte_alloc_map(mm, pmd, addr); 1106 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1109 if (!pte) 1107 if (!pte)
1110 return -ENOMEM; 1108 return -ENOMEM;
1111 do { 1109 do {
1112 BUG_ON(!pte_none(*pte)); 1110 BUG_ON(!pte_none(*pte));
1113 if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) 1111 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1114 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1115 pfn++; 1112 pfn++;
1116 } while (pte++, addr += PAGE_SIZE, addr != end); 1113 } while (pte++, addr += PAGE_SIZE, addr != end);
1117 pte_unmap(pte - 1); 1114 pte_unmap_unlock(pte - 1, ptl);
1118 return 0; 1115 return 0;
1119} 1116}
1120 1117
@@ -1173,8 +1170,8 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1173 * rest of the world about it: 1170 * rest of the world about it:
1174 * VM_IO tells people not to look at these pages 1171 * VM_IO tells people not to look at these pages
1175 * (accesses can have side effects). 1172 * (accesses can have side effects).
1176 * VM_RESERVED tells swapout not to try to touch 1173 * VM_RESERVED tells the core MM not to "manage" these pages
1177 * this region. 1174 * (e.g. refcount, mapcount, try to swap them out).
1178 */ 1175 */
1179 vma->vm_flags |= VM_IO | VM_RESERVED; 1176 vma->vm_flags |= VM_IO | VM_RESERVED;
1180 1177
@@ -1182,7 +1179,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1182 pfn -= addr >> PAGE_SHIFT; 1179 pfn -= addr >> PAGE_SHIFT;
1183 pgd = pgd_offset(mm, addr); 1180 pgd = pgd_offset(mm, addr);
1184 flush_cache_range(vma, addr, end); 1181 flush_cache_range(vma, addr, end);
1185 spin_lock(&mm->page_table_lock);
1186 do { 1182 do {
1187 next = pgd_addr_end(addr, end); 1183 next = pgd_addr_end(addr, end);
1188 err = remap_pud_range(mm, pgd, addr, next, 1184 err = remap_pud_range(mm, pgd, addr, next,
@@ -1190,12 +1186,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1190 if (err) 1186 if (err)
1191 break; 1187 break;
1192 } while (pgd++, addr = next, addr != end); 1188 } while (pgd++, addr = next, addr != end);
1193 spin_unlock(&mm->page_table_lock);
1194 return err; 1189 return err;
1195} 1190}
1196EXPORT_SYMBOL(remap_pfn_range); 1191EXPORT_SYMBOL(remap_pfn_range);
1197 1192
1198/* 1193/*
1194 * handle_pte_fault chooses page fault handler according to an entry
1195 * which was read non-atomically. Before making any commitment, on
1196 * those architectures or configurations (e.g. i386 with PAE) which
1197 * might give a mix of unmatched parts, do_swap_page and do_file_page
1198 * must check under lock before unmapping the pte and proceeding
1199 * (but do_wp_page is only called after already making such a check;
1200 * and do_anonymous_page and do_no_page can safely check later on).
1201 */
1202static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
1203 pte_t *page_table, pte_t orig_pte)
1204{
1205 int same = 1;
1206#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1207 if (sizeof(pte_t) > sizeof(unsigned long)) {
1208 spinlock_t *ptl = pte_lockptr(mm, pmd);
1209 spin_lock(ptl);
1210 same = pte_same(*page_table, orig_pte);
1211 spin_unlock(ptl);
1212 }
1213#endif
1214 pte_unmap(page_table);
1215 return same;
1216}
1217
1218/*
1199 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when 1219 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
1200 * servicing faults for write access. In the normal case, do always want 1220 * servicing faults for write access. In the normal case, do always want
1201 * pte_mkwrite. But get_user_pages can cause write faults for mappings 1221 * pte_mkwrite. But get_user_pages can cause write faults for mappings
@@ -1209,28 +1229,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1209} 1229}
1210 1230
1211/* 1231/*
1212 * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
1213 */
1214static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
1215 pte_t *page_table)
1216{
1217 pte_t entry;
1218
1219 entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
1220 vma);
1221 ptep_establish(vma, address, page_table, entry);
1222 update_mmu_cache(vma, address, entry);
1223 lazy_mmu_prot_update(entry);
1224}
1225
1226/*
1227 * This routine handles present pages, when users try to write 1232 * This routine handles present pages, when users try to write
1228 * to a shared page. It is done by copying the page to a new address 1233 * to a shared page. It is done by copying the page to a new address
1229 * and decrementing the shared-page counter for the old page. 1234 * and decrementing the shared-page counter for the old page.
1230 * 1235 *
1231 * Goto-purists beware: the only reason for goto's here is that it results
1232 * in better assembly code.. The "default" path will see no jumps at all.
1233 *
1234 * Note that this routine assumes that the protection checks have been 1236 * Note that this routine assumes that the protection checks have been
1235 * done by the caller (the low-level page fault routine in most cases). 1237 * done by the caller (the low-level page fault routine in most cases).
1236 * Thus we can safely just mark it writable once we've done any necessary 1238 * Thus we can safely just mark it writable once we've done any necessary
@@ -1240,28 +1242,28 @@ static inline void break_cow(struct vm_area_struct * vma, struct page * new_page
1240 * change only once the write actually happens. This avoids a few races, 1242 * change only once the write actually happens. This avoids a few races,
1241 * and potentially makes it more efficient. 1243 * and potentially makes it more efficient.
1242 * 1244 *
1243 * We hold the mm semaphore and the page_table_lock on entry and exit 1245 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1244 * with the page_table_lock released. 1246 * but allow concurrent faults), with pte both mapped and locked.
1247 * We return with mmap_sem still held, but pte unmapped and unlocked.
1245 */ 1248 */
1246static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, 1249static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1247 unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) 1250 unsigned long address, pte_t *page_table, pmd_t *pmd,
1251 spinlock_t *ptl, pte_t orig_pte)
1248{ 1252{
1249 struct page *old_page, *new_page; 1253 struct page *old_page, *new_page;
1250 unsigned long pfn = pte_pfn(pte); 1254 unsigned long pfn = pte_pfn(orig_pte);
1251 pte_t entry; 1255 pte_t entry;
1252 int ret; 1256 int ret = VM_FAULT_MINOR;
1257
1258 BUG_ON(vma->vm_flags & VM_RESERVED);
1253 1259
1254 if (unlikely(!pfn_valid(pfn))) { 1260 if (unlikely(!pfn_valid(pfn))) {
1255 /* 1261 /*
1256 * This should really halt the system so it can be debugged or 1262 * Page table corrupted: show pte and kill process.
1257 * at least the kernel stops what it's doing before it corrupts
1258 * data, but for the moment just pretend this is OOM.
1259 */ 1263 */
1260 pte_unmap(page_table); 1264 print_bad_pte(vma, orig_pte, address);
1261 printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", 1265 ret = VM_FAULT_OOM;
1262 address); 1266 goto unlock;
1263 spin_unlock(&mm->page_table_lock);
1264 return VM_FAULT_OOM;
1265 } 1267 }
1266 old_page = pfn_to_page(pfn); 1268 old_page = pfn_to_page(pfn);
1267 1269
@@ -1270,52 +1272,51 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1270 unlock_page(old_page); 1272 unlock_page(old_page);
1271 if (reuse) { 1273 if (reuse) {
1272 flush_cache_page(vma, address, pfn); 1274 flush_cache_page(vma, address, pfn);
1273 entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), 1275 entry = pte_mkyoung(orig_pte);
1274 vma); 1276 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1275 ptep_set_access_flags(vma, address, page_table, entry, 1); 1277 ptep_set_access_flags(vma, address, page_table, entry, 1);
1276 update_mmu_cache(vma, address, entry); 1278 update_mmu_cache(vma, address, entry);
1277 lazy_mmu_prot_update(entry); 1279 lazy_mmu_prot_update(entry);
1278 pte_unmap(page_table); 1280 ret |= VM_FAULT_WRITE;
1279 spin_unlock(&mm->page_table_lock); 1281 goto unlock;
1280 return VM_FAULT_MINOR|VM_FAULT_WRITE;
1281 } 1282 }
1282 } 1283 }
1283 pte_unmap(page_table);
1284 1284
1285 /* 1285 /*
1286 * Ok, we need to copy. Oh, well.. 1286 * Ok, we need to copy. Oh, well..
1287 */ 1287 */
1288 if (!PageReserved(old_page)) 1288 page_cache_get(old_page);
1289 page_cache_get(old_page); 1289 pte_unmap_unlock(page_table, ptl);
1290 spin_unlock(&mm->page_table_lock);
1291 1290
1292 if (unlikely(anon_vma_prepare(vma))) 1291 if (unlikely(anon_vma_prepare(vma)))
1293 goto no_new_page; 1292 goto oom;
1294 if (old_page == ZERO_PAGE(address)) { 1293 if (old_page == ZERO_PAGE(address)) {
1295 new_page = alloc_zeroed_user_highpage(vma, address); 1294 new_page = alloc_zeroed_user_highpage(vma, address);
1296 if (!new_page) 1295 if (!new_page)
1297 goto no_new_page; 1296 goto oom;
1298 } else { 1297 } else {
1299 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); 1298 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
1300 if (!new_page) 1299 if (!new_page)
1301 goto no_new_page; 1300 goto oom;
1302 copy_user_highpage(new_page, old_page, address); 1301 copy_user_highpage(new_page, old_page, address);
1303 } 1302 }
1303
1304 /* 1304 /*
1305 * Re-check the pte - we dropped the lock 1305 * Re-check the pte - we dropped the lock
1306 */ 1306 */
1307 ret = VM_FAULT_MINOR; 1307 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1308 spin_lock(&mm->page_table_lock); 1308 if (likely(pte_same(*page_table, orig_pte))) {
1309 page_table = pte_offset_map(pmd, address); 1309 page_remove_rmap(old_page);
1310 if (likely(pte_same(*page_table, pte))) { 1310 if (!PageAnon(old_page)) {
1311 if (PageAnon(old_page)) 1311 inc_mm_counter(mm, anon_rss);
1312 dec_mm_counter(mm, anon_rss); 1312 dec_mm_counter(mm, file_rss);
1313 if (PageReserved(old_page)) 1313 }
1314 inc_mm_counter(mm, rss);
1315 else
1316 page_remove_rmap(old_page);
1317 flush_cache_page(vma, address, pfn); 1314 flush_cache_page(vma, address, pfn);
1318 break_cow(vma, new_page, address, page_table); 1315 entry = mk_pte(new_page, vma->vm_page_prot);
1316 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1317 ptep_establish(vma, address, page_table, entry);
1318 update_mmu_cache(vma, address, entry);
1319 lazy_mmu_prot_update(entry);
1319 lru_cache_add_active(new_page); 1320 lru_cache_add_active(new_page);
1320 page_add_anon_rmap(new_page, vma, address); 1321 page_add_anon_rmap(new_page, vma, address);
1321 1322
@@ -1323,13 +1324,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1323 new_page = old_page; 1324 new_page = old_page;
1324 ret |= VM_FAULT_WRITE; 1325 ret |= VM_FAULT_WRITE;
1325 } 1326 }
1326 pte_unmap(page_table);
1327 page_cache_release(new_page); 1327 page_cache_release(new_page);
1328 page_cache_release(old_page); 1328 page_cache_release(old_page);
1329 spin_unlock(&mm->page_table_lock); 1329unlock:
1330 pte_unmap_unlock(page_table, ptl);
1330 return ret; 1331 return ret;
1331 1332oom:
1332no_new_page:
1333 page_cache_release(old_page); 1333 page_cache_release(old_page);
1334 return VM_FAULT_OOM; 1334 return VM_FAULT_OOM;
1335} 1335}
@@ -1399,13 +1399,6 @@ again:
1399 1399
1400 restart_addr = zap_page_range(vma, start_addr, 1400 restart_addr = zap_page_range(vma, start_addr,
1401 end_addr - start_addr, details); 1401 end_addr - start_addr, details);
1402
1403 /*
1404 * We cannot rely on the break test in unmap_vmas:
1405 * on the one hand, we don't want to restart our loop
1406 * just because that broke out for the page_table_lock;
1407 * on the other hand, it does no test when vma is small.
1408 */
1409 need_break = need_resched() || 1402 need_break = need_resched() ||
1410 need_lockbreak(details->i_mmap_lock); 1403 need_lockbreak(details->i_mmap_lock);
1411 1404
@@ -1654,38 +1647,37 @@ void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struc
1654} 1647}
1655 1648
1656/* 1649/*
1657 * We hold the mm semaphore and the page_table_lock on entry and 1650 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1658 * should release the pagetable lock on exit.. 1651 * but allow concurrent faults), and pte mapped but not yet locked.
1652 * We return with mmap_sem still held, but pte unmapped and unlocked.
1659 */ 1653 */
1660static int do_swap_page(struct mm_struct * mm, 1654static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
1661 struct vm_area_struct * vma, unsigned long address, 1655 unsigned long address, pte_t *page_table, pmd_t *pmd,
1662 pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) 1656 int write_access, pte_t orig_pte)
1663{ 1657{
1658 spinlock_t *ptl;
1664 struct page *page; 1659 struct page *page;
1665 swp_entry_t entry = pte_to_swp_entry(orig_pte); 1660 swp_entry_t entry;
1666 pte_t pte; 1661 pte_t pte;
1667 int ret = VM_FAULT_MINOR; 1662 int ret = VM_FAULT_MINOR;
1668 1663
1669 pte_unmap(page_table); 1664 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
1670 spin_unlock(&mm->page_table_lock); 1665 goto out;
1666
1667 entry = pte_to_swp_entry(orig_pte);
1671 page = lookup_swap_cache(entry); 1668 page = lookup_swap_cache(entry);
1672 if (!page) { 1669 if (!page) {
1673 swapin_readahead(entry, address, vma); 1670 swapin_readahead(entry, address, vma);
1674 page = read_swap_cache_async(entry, vma, address); 1671 page = read_swap_cache_async(entry, vma, address);
1675 if (!page) { 1672 if (!page) {
1676 /* 1673 /*
1677 * Back out if somebody else faulted in this pte while 1674 * Back out if somebody else faulted in this pte
1678 * we released the page table lock. 1675 * while we released the pte lock.
1679 */ 1676 */
1680 spin_lock(&mm->page_table_lock); 1677 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1681 page_table = pte_offset_map(pmd, address);
1682 if (likely(pte_same(*page_table, orig_pte))) 1678 if (likely(pte_same(*page_table, orig_pte)))
1683 ret = VM_FAULT_OOM; 1679 ret = VM_FAULT_OOM;
1684 else 1680 goto unlock;
1685 ret = VM_FAULT_MINOR;
1686 pte_unmap(page_table);
1687 spin_unlock(&mm->page_table_lock);
1688 goto out;
1689 } 1681 }
1690 1682
1691 /* Had to read the page from swap area: Major fault */ 1683 /* Had to read the page from swap area: Major fault */
@@ -1698,15 +1690,11 @@ static int do_swap_page(struct mm_struct * mm,
1698 lock_page(page); 1690 lock_page(page);
1699 1691
1700 /* 1692 /*
1701 * Back out if somebody else faulted in this pte while we 1693 * Back out if somebody else already faulted in this pte.
1702 * released the page table lock.
1703 */ 1694 */
1704 spin_lock(&mm->page_table_lock); 1695 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1705 page_table = pte_offset_map(pmd, address); 1696 if (unlikely(!pte_same(*page_table, orig_pte)))
1706 if (unlikely(!pte_same(*page_table, orig_pte))) {
1707 ret = VM_FAULT_MINOR;
1708 goto out_nomap; 1697 goto out_nomap;
1709 }
1710 1698
1711 if (unlikely(!PageUptodate(page))) { 1699 if (unlikely(!PageUptodate(page))) {
1712 ret = VM_FAULT_SIGBUS; 1700 ret = VM_FAULT_SIGBUS;
@@ -1715,7 +1703,7 @@ static int do_swap_page(struct mm_struct * mm,
1715 1703
1716 /* The page isn't present yet, go ahead with the fault. */ 1704 /* The page isn't present yet, go ahead with the fault. */
1717 1705
1718 inc_mm_counter(mm, rss); 1706 inc_mm_counter(mm, anon_rss);
1719 pte = mk_pte(page, vma->vm_page_prot); 1707 pte = mk_pte(page, vma->vm_page_prot);
1720 if (write_access && can_share_swap_page(page)) { 1708 if (write_access && can_share_swap_page(page)) {
1721 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 1709 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -1733,7 +1721,7 @@ static int do_swap_page(struct mm_struct * mm,
1733 1721
1734 if (write_access) { 1722 if (write_access) {
1735 if (do_wp_page(mm, vma, address, 1723 if (do_wp_page(mm, vma, address,
1736 page_table, pmd, pte) == VM_FAULT_OOM) 1724 page_table, pmd, ptl, pte) == VM_FAULT_OOM)
1737 ret = VM_FAULT_OOM; 1725 ret = VM_FAULT_OOM;
1738 goto out; 1726 goto out;
1739 } 1727 }
@@ -1741,74 +1729,76 @@ static int do_swap_page(struct mm_struct * mm,
1741 /* No need to invalidate - it was non-present before */ 1729 /* No need to invalidate - it was non-present before */
1742 update_mmu_cache(vma, address, pte); 1730 update_mmu_cache(vma, address, pte);
1743 lazy_mmu_prot_update(pte); 1731 lazy_mmu_prot_update(pte);
1744 pte_unmap(page_table); 1732unlock:
1745 spin_unlock(&mm->page_table_lock); 1733 pte_unmap_unlock(page_table, ptl);
1746out: 1734out:
1747 return ret; 1735 return ret;
1748out_nomap: 1736out_nomap:
1749 pte_unmap(page_table); 1737 pte_unmap_unlock(page_table, ptl);
1750 spin_unlock(&mm->page_table_lock);
1751 unlock_page(page); 1738 unlock_page(page);
1752 page_cache_release(page); 1739 page_cache_release(page);
1753 goto out; 1740 return ret;
1754} 1741}
1755 1742
1756/* 1743/*
1757 * We are called with the MM semaphore and page_table_lock 1744 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1758 * spinlock held to protect against concurrent faults in 1745 * but allow concurrent faults), and pte mapped but not yet locked.
1759 * multithreaded programs. 1746 * We return with mmap_sem still held, but pte unmapped and unlocked.
1760 */ 1747 */
1761static int 1748static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1762do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 1749 unsigned long address, pte_t *page_table, pmd_t *pmd,
1763 pte_t *page_table, pmd_t *pmd, int write_access, 1750 int write_access)
1764 unsigned long addr)
1765{ 1751{
1752 struct page *page;
1753 spinlock_t *ptl;
1766 pte_t entry; 1754 pte_t entry;
1767 struct page * page = ZERO_PAGE(addr);
1768
1769 /* Read-only mapping of ZERO_PAGE. */
1770 entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
1771 1755
1772 /* ..except if it's a write access */
1773 if (write_access) { 1756 if (write_access) {
1774 /* Allocate our own private page. */ 1757 /* Allocate our own private page. */
1775 pte_unmap(page_table); 1758 pte_unmap(page_table);
1776 spin_unlock(&mm->page_table_lock);
1777 1759
1778 if (unlikely(anon_vma_prepare(vma))) 1760 if (unlikely(anon_vma_prepare(vma)))
1779 goto no_mem; 1761 goto oom;
1780 page = alloc_zeroed_user_highpage(vma, addr); 1762 page = alloc_zeroed_user_highpage(vma, address);
1781 if (!page) 1763 if (!page)
1782 goto no_mem; 1764 goto oom;
1783 1765
1784 spin_lock(&mm->page_table_lock); 1766 entry = mk_pte(page, vma->vm_page_prot);
1785 page_table = pte_offset_map(pmd, addr); 1767 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1786 1768
1787 if (!pte_none(*page_table)) { 1769 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1788 pte_unmap(page_table); 1770 if (!pte_none(*page_table))
1789 page_cache_release(page); 1771 goto release;
1790 spin_unlock(&mm->page_table_lock); 1772 inc_mm_counter(mm, anon_rss);
1791 goto out;
1792 }
1793 inc_mm_counter(mm, rss);
1794 entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
1795 vma->vm_page_prot)),
1796 vma);
1797 lru_cache_add_active(page); 1773 lru_cache_add_active(page);
1798 SetPageReferenced(page); 1774 SetPageReferenced(page);
1799 page_add_anon_rmap(page, vma, addr); 1775 page_add_anon_rmap(page, vma, address);
1776 } else {
1777 /* Map the ZERO_PAGE - vm_page_prot is readonly */
1778 page = ZERO_PAGE(address);
1779 page_cache_get(page);
1780 entry = mk_pte(page, vma->vm_page_prot);
1781
1782 ptl = pte_lockptr(mm, pmd);
1783 spin_lock(ptl);
1784 if (!pte_none(*page_table))
1785 goto release;
1786 inc_mm_counter(mm, file_rss);
1787 page_add_file_rmap(page);
1800 } 1788 }
1801 1789
1802 set_pte_at(mm, addr, page_table, entry); 1790 set_pte_at(mm, address, page_table, entry);
1803 pte_unmap(page_table);
1804 1791
1805 /* No need to invalidate - it was non-present before */ 1792 /* No need to invalidate - it was non-present before */
1806 update_mmu_cache(vma, addr, entry); 1793 update_mmu_cache(vma, address, entry);
1807 lazy_mmu_prot_update(entry); 1794 lazy_mmu_prot_update(entry);
1808 spin_unlock(&mm->page_table_lock); 1795unlock:
1809out: 1796 pte_unmap_unlock(page_table, ptl);
1810 return VM_FAULT_MINOR; 1797 return VM_FAULT_MINOR;
1811no_mem: 1798release:
1799 page_cache_release(page);
1800 goto unlock;
1801oom:
1812 return VM_FAULT_OOM; 1802 return VM_FAULT_OOM;
1813} 1803}
1814 1804
@@ -1821,25 +1811,23 @@ no_mem:
1821 * As this is called only for pages that do not currently exist, we 1811 * As this is called only for pages that do not currently exist, we
1822 * do not need to flush old virtual caches or the TLB. 1812 * do not need to flush old virtual caches or the TLB.
1823 * 1813 *
1824 * This is called with the MM semaphore held and the page table 1814 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1825 * spinlock held. Exit with the spinlock released. 1815 * but allow concurrent faults), and pte mapped but not yet locked.
1816 * We return with mmap_sem still held, but pte unmapped and unlocked.
1826 */ 1817 */
1827static int 1818static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1828do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 1819 unsigned long address, pte_t *page_table, pmd_t *pmd,
1829 unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) 1820 int write_access)
1830{ 1821{
1831 struct page * new_page; 1822 spinlock_t *ptl;
1823 struct page *new_page;
1832 struct address_space *mapping = NULL; 1824 struct address_space *mapping = NULL;
1833 pte_t entry; 1825 pte_t entry;
1834 unsigned int sequence = 0; 1826 unsigned int sequence = 0;
1835 int ret = VM_FAULT_MINOR; 1827 int ret = VM_FAULT_MINOR;
1836 int anon = 0; 1828 int anon = 0;
1837 1829
1838 if (!vma->vm_ops || !vma->vm_ops->nopage)
1839 return do_anonymous_page(mm, vma, page_table,
1840 pmd, write_access, address);
1841 pte_unmap(page_table); 1830 pte_unmap(page_table);
1842 spin_unlock(&mm->page_table_lock);
1843 1831
1844 if (vma->vm_file) { 1832 if (vma->vm_file) {
1845 mapping = vma->vm_file->f_mapping; 1833 mapping = vma->vm_file->f_mapping;
@@ -1847,7 +1835,6 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1847 smp_rmb(); /* serializes i_size against truncate_count */ 1835 smp_rmb(); /* serializes i_size against truncate_count */
1848 } 1836 }
1849retry: 1837retry:
1850 cond_resched();
1851 new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); 1838 new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
1852 /* 1839 /*
1853 * No smp_rmb is needed here as long as there's a full 1840 * No smp_rmb is needed here as long as there's a full
@@ -1880,19 +1867,20 @@ retry:
1880 anon = 1; 1867 anon = 1;
1881 } 1868 }
1882 1869
1883 spin_lock(&mm->page_table_lock); 1870 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1884 /* 1871 /*
1885 * For a file-backed vma, someone could have truncated or otherwise 1872 * For a file-backed vma, someone could have truncated or otherwise
1886 * invalidated this page. If unmap_mapping_range got called, 1873 * invalidated this page. If unmap_mapping_range got called,
1887 * retry getting the page. 1874 * retry getting the page.
1888 */ 1875 */
1889 if (mapping && unlikely(sequence != mapping->truncate_count)) { 1876 if (mapping && unlikely(sequence != mapping->truncate_count)) {
1890 sequence = mapping->truncate_count; 1877 pte_unmap_unlock(page_table, ptl);
1891 spin_unlock(&mm->page_table_lock);
1892 page_cache_release(new_page); 1878 page_cache_release(new_page);
1879 cond_resched();
1880 sequence = mapping->truncate_count;
1881 smp_rmb();
1893 goto retry; 1882 goto retry;
1894 } 1883 }
1895 page_table = pte_offset_map(pmd, address);
1896 1884
1897 /* 1885 /*
1898 * This silly early PAGE_DIRTY setting removes a race 1886 * This silly early PAGE_DIRTY setting removes a race
@@ -1906,68 +1894,67 @@ retry:
1906 */ 1894 */
1907 /* Only go through if we didn't race with anybody else... */ 1895 /* Only go through if we didn't race with anybody else... */
1908 if (pte_none(*page_table)) { 1896 if (pte_none(*page_table)) {
1909 if (!PageReserved(new_page))
1910 inc_mm_counter(mm, rss);
1911
1912 flush_icache_page(vma, new_page); 1897 flush_icache_page(vma, new_page);
1913 entry = mk_pte(new_page, vma->vm_page_prot); 1898 entry = mk_pte(new_page, vma->vm_page_prot);
1914 if (write_access) 1899 if (write_access)
1915 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1900 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1916 set_pte_at(mm, address, page_table, entry); 1901 set_pte_at(mm, address, page_table, entry);
1917 if (anon) { 1902 if (anon) {
1903 inc_mm_counter(mm, anon_rss);
1918 lru_cache_add_active(new_page); 1904 lru_cache_add_active(new_page);
1919 page_add_anon_rmap(new_page, vma, address); 1905 page_add_anon_rmap(new_page, vma, address);
1920 } else 1906 } else if (!(vma->vm_flags & VM_RESERVED)) {
1907 inc_mm_counter(mm, file_rss);
1921 page_add_file_rmap(new_page); 1908 page_add_file_rmap(new_page);
1922 pte_unmap(page_table); 1909 }
1923 } else { 1910 } else {
1924 /* One of our sibling threads was faster, back out. */ 1911 /* One of our sibling threads was faster, back out. */
1925 pte_unmap(page_table);
1926 page_cache_release(new_page); 1912 page_cache_release(new_page);
1927 spin_unlock(&mm->page_table_lock); 1913 goto unlock;
1928 goto out;
1929 } 1914 }
1930 1915
1931 /* no need to invalidate: a not-present page shouldn't be cached */ 1916 /* no need to invalidate: a not-present page shouldn't be cached */
1932 update_mmu_cache(vma, address, entry); 1917 update_mmu_cache(vma, address, entry);
1933 lazy_mmu_prot_update(entry); 1918 lazy_mmu_prot_update(entry);
1934 spin_unlock(&mm->page_table_lock); 1919unlock:
1935out: 1920 pte_unmap_unlock(page_table, ptl);
1936 return ret; 1921 return ret;
1937oom: 1922oom:
1938 page_cache_release(new_page); 1923 page_cache_release(new_page);
1939 ret = VM_FAULT_OOM; 1924 return VM_FAULT_OOM;
1940 goto out;
1941} 1925}
1942 1926
1943/* 1927/*
1944 * Fault of a previously existing named mapping. Repopulate the pte 1928 * Fault of a previously existing named mapping. Repopulate the pte
1945 * from the encoded file_pte if possible. This enables swappable 1929 * from the encoded file_pte if possible. This enables swappable
1946 * nonlinear vmas. 1930 * nonlinear vmas.
1931 *
1932 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1933 * but allow concurrent faults), and pte mapped but not yet locked.
1934 * We return with mmap_sem still held, but pte unmapped and unlocked.
1947 */ 1935 */
1948static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, 1936static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
1949 unsigned long address, int write_access, pte_t *pte, pmd_t *pmd) 1937 unsigned long address, pte_t *page_table, pmd_t *pmd,
1938 int write_access, pte_t orig_pte)
1950{ 1939{
1951 unsigned long pgoff; 1940 pgoff_t pgoff;
1952 int err; 1941 int err;
1953 1942
1954 BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage); 1943 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
1955 /* 1944 return VM_FAULT_MINOR;
1956 * Fall back to the linear mapping if the fs does not support
1957 * ->populate:
1958 */
1959 if (!vma->vm_ops->populate ||
1960 (write_access && !(vma->vm_flags & VM_SHARED))) {
1961 pte_clear(mm, address, pte);
1962 return do_no_page(mm, vma, address, write_access, pte, pmd);
1963 }
1964
1965 pgoff = pte_to_pgoff(*pte);
1966 1945
1967 pte_unmap(pte); 1946 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
1968 spin_unlock(&mm->page_table_lock); 1947 /*
1948 * Page table corrupted: show pte and kill process.
1949 */
1950 print_bad_pte(vma, orig_pte, address);
1951 return VM_FAULT_OOM;
1952 }
1953 /* We can then assume vm->vm_ops && vma->vm_ops->populate */
1969 1954
1970 err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); 1955 pgoff = pte_to_pgoff(orig_pte);
1956 err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
1957 vma->vm_page_prot, pgoff, 0);
1971 if (err == -ENOMEM) 1958 if (err == -ENOMEM)
1972 return VM_FAULT_OOM; 1959 return VM_FAULT_OOM;
1973 if (err) 1960 if (err)
@@ -1984,56 +1971,68 @@ static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
1984 * with external mmu caches can use to update those (ie the Sparc or 1971 * with external mmu caches can use to update those (ie the Sparc or
1985 * PowerPC hashed page tables that act as extended TLBs). 1972 * PowerPC hashed page tables that act as extended TLBs).
1986 * 1973 *
1987 * Note the "page_table_lock". It is to protect against kswapd removing 1974 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1988 * pages from under us. Note that kswapd only ever _removes_ pages, never 1975 * but allow concurrent faults), and pte mapped but not yet locked.
1989 * adds them. As such, once we have noticed that the page is not present, 1976 * We return with mmap_sem still held, but pte unmapped and unlocked.
1990 * we can drop the lock early.
1991 *
1992 * The adding of pages is protected by the MM semaphore (which we hold),
1993 * so we don't need to worry about a page being suddenly been added into
1994 * our VM.
1995 *
1996 * We enter with the pagetable spinlock held, we are supposed to
1997 * release it when done.
1998 */ 1977 */
1999static inline int handle_pte_fault(struct mm_struct *mm, 1978static inline int handle_pte_fault(struct mm_struct *mm,
2000 struct vm_area_struct * vma, unsigned long address, 1979 struct vm_area_struct *vma, unsigned long address,
2001 int write_access, pte_t *pte, pmd_t *pmd) 1980 pte_t *pte, pmd_t *pmd, int write_access)
2002{ 1981{
2003 pte_t entry; 1982 pte_t entry;
1983 pte_t old_entry;
1984 spinlock_t *ptl;
2004 1985
2005 entry = *pte; 1986 old_entry = entry = *pte;
2006 if (!pte_present(entry)) { 1987 if (!pte_present(entry)) {
2007 /* 1988 if (pte_none(entry)) {
2008 * If it truly wasn't present, we know that kswapd 1989 if (!vma->vm_ops || !vma->vm_ops->nopage)
2009 * and the PTE updates will not touch it later. So 1990 return do_anonymous_page(mm, vma, address,
2010 * drop the lock. 1991 pte, pmd, write_access);
2011 */ 1992 return do_no_page(mm, vma, address,
2012 if (pte_none(entry)) 1993 pte, pmd, write_access);
2013 return do_no_page(mm, vma, address, write_access, pte, pmd); 1994 }
2014 if (pte_file(entry)) 1995 if (pte_file(entry))
2015 return do_file_page(mm, vma, address, write_access, pte, pmd); 1996 return do_file_page(mm, vma, address,
2016 return do_swap_page(mm, vma, address, pte, pmd, entry, write_access); 1997 pte, pmd, write_access, entry);
1998 return do_swap_page(mm, vma, address,
1999 pte, pmd, write_access, entry);
2017 } 2000 }
2018 2001
2002 ptl = pte_lockptr(mm, pmd);
2003 spin_lock(ptl);
2004 if (unlikely(!pte_same(*pte, entry)))
2005 goto unlock;
2019 if (write_access) { 2006 if (write_access) {
2020 if (!pte_write(entry)) 2007 if (!pte_write(entry))
2021 return do_wp_page(mm, vma, address, pte, pmd, entry); 2008 return do_wp_page(mm, vma, address,
2009 pte, pmd, ptl, entry);
2022 entry = pte_mkdirty(entry); 2010 entry = pte_mkdirty(entry);
2023 } 2011 }
2024 entry = pte_mkyoung(entry); 2012 entry = pte_mkyoung(entry);
2025 ptep_set_access_flags(vma, address, pte, entry, write_access); 2013 if (!pte_same(old_entry, entry)) {
2026 update_mmu_cache(vma, address, entry); 2014 ptep_set_access_flags(vma, address, pte, entry, write_access);
2027 lazy_mmu_prot_update(entry); 2015 update_mmu_cache(vma, address, entry);
2028 pte_unmap(pte); 2016 lazy_mmu_prot_update(entry);
2029 spin_unlock(&mm->page_table_lock); 2017 } else {
2018 /*
2019 * This is needed only for protection faults but the arch code
2020 * is not yet telling us if this is a protection fault or not.
2021 * This still avoids useless tlb flushes for .text page faults
2022 * with threads.
2023 */
2024 if (write_access)
2025 flush_tlb_page(vma, address);
2026 }
2027unlock:
2028 pte_unmap_unlock(pte, ptl);
2030 return VM_FAULT_MINOR; 2029 return VM_FAULT_MINOR;
2031} 2030}
2032 2031
2033/* 2032/*
2034 * By the time we get here, we already hold the mm semaphore 2033 * By the time we get here, we already hold the mm semaphore
2035 */ 2034 */
2036int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, 2035int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2037 unsigned long address, int write_access) 2036 unsigned long address, int write_access)
2038{ 2037{
2039 pgd_t *pgd; 2038 pgd_t *pgd;
@@ -2048,100 +2047,66 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
2048 if (unlikely(is_vm_hugetlb_page(vma))) 2047 if (unlikely(is_vm_hugetlb_page(vma)))
2049 return hugetlb_fault(mm, vma, address, write_access); 2048 return hugetlb_fault(mm, vma, address, write_access);
2050 2049
2051 /*
2052 * We need the page table lock to synchronize with kswapd
2053 * and the SMP-safe atomic PTE updates.
2054 */
2055 pgd = pgd_offset(mm, address); 2050 pgd = pgd_offset(mm, address);
2056 spin_lock(&mm->page_table_lock);
2057
2058 pud = pud_alloc(mm, pgd, address); 2051 pud = pud_alloc(mm, pgd, address);
2059 if (!pud) 2052 if (!pud)
2060 goto oom; 2053 return VM_FAULT_OOM;
2061
2062 pmd = pmd_alloc(mm, pud, address); 2054 pmd = pmd_alloc(mm, pud, address);
2063 if (!pmd) 2055 if (!pmd)
2064 goto oom; 2056 return VM_FAULT_OOM;
2065
2066 pte = pte_alloc_map(mm, pmd, address); 2057 pte = pte_alloc_map(mm, pmd, address);
2067 if (!pte) 2058 if (!pte)
2068 goto oom; 2059 return VM_FAULT_OOM;
2069
2070 return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
2071 2060
2072 oom: 2061 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
2073 spin_unlock(&mm->page_table_lock);
2074 return VM_FAULT_OOM;
2075} 2062}
2076 2063
2077#ifndef __PAGETABLE_PUD_FOLDED 2064#ifndef __PAGETABLE_PUD_FOLDED
2078/* 2065/*
2079 * Allocate page upper directory. 2066 * Allocate page upper directory.
2080 * 2067 * We've already handled the fast-path in-line.
2081 * We've already handled the fast-path in-line, and we own the
2082 * page table lock.
2083 */ 2068 */
2084pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) 2069int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2085{ 2070{
2086 pud_t *new; 2071 pud_t *new = pud_alloc_one(mm, address);
2087
2088 spin_unlock(&mm->page_table_lock);
2089 new = pud_alloc_one(mm, address);
2090 spin_lock(&mm->page_table_lock);
2091 if (!new) 2072 if (!new)
2092 return NULL; 2073 return -ENOMEM;
2093 2074
2094 /* 2075 spin_lock(&mm->page_table_lock);
2095 * Because we dropped the lock, we should re-check the 2076 if (pgd_present(*pgd)) /* Another has populated it */
2096 * entry, as somebody else could have populated it..
2097 */
2098 if (pgd_present(*pgd)) {
2099 pud_free(new); 2077 pud_free(new);
2100 goto out; 2078 else
2101 } 2079 pgd_populate(mm, pgd, new);
2102 pgd_populate(mm, pgd, new); 2080 spin_unlock(&mm->page_table_lock);
2103 out: 2081 return 0;
2104 return pud_offset(pgd, address);
2105} 2082}
2106#endif /* __PAGETABLE_PUD_FOLDED */ 2083#endif /* __PAGETABLE_PUD_FOLDED */
2107 2084
2108#ifndef __PAGETABLE_PMD_FOLDED 2085#ifndef __PAGETABLE_PMD_FOLDED
2109/* 2086/*
2110 * Allocate page middle directory. 2087 * Allocate page middle directory.
2111 * 2088 * We've already handled the fast-path in-line.
2112 * We've already handled the fast-path in-line, and we own the
2113 * page table lock.
2114 */ 2089 */
2115pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 2090int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2116{ 2091{
2117 pmd_t *new; 2092 pmd_t *new = pmd_alloc_one(mm, address);
2118
2119 spin_unlock(&mm->page_table_lock);
2120 new = pmd_alloc_one(mm, address);
2121 spin_lock(&mm->page_table_lock);
2122 if (!new) 2093 if (!new)
2123 return NULL; 2094 return -ENOMEM;
2124 2095
2125 /* 2096 spin_lock(&mm->page_table_lock);
2126 * Because we dropped the lock, we should re-check the
2127 * entry, as somebody else could have populated it..
2128 */
2129#ifndef __ARCH_HAS_4LEVEL_HACK 2097#ifndef __ARCH_HAS_4LEVEL_HACK
2130 if (pud_present(*pud)) { 2098 if (pud_present(*pud)) /* Another has populated it */
2131 pmd_free(new); 2099 pmd_free(new);
2132 goto out; 2100 else
2133 } 2101 pud_populate(mm, pud, new);
2134 pud_populate(mm, pud, new);
2135#else 2102#else
2136 if (pgd_present(*pud)) { 2103 if (pgd_present(*pud)) /* Another has populated it */
2137 pmd_free(new); 2104 pmd_free(new);
2138 goto out; 2105 else
2139 } 2106 pgd_populate(mm, pud, new);
2140 pgd_populate(mm, pud, new);
2141#endif /* __ARCH_HAS_4LEVEL_HACK */ 2107#endif /* __ARCH_HAS_4LEVEL_HACK */
2142 2108 spin_unlock(&mm->page_table_lock);
2143 out: 2109 return 0;
2144 return pmd_offset(pud, address);
2145} 2110}
2146#endif /* __PAGETABLE_PMD_FOLDED */ 2111#endif /* __PAGETABLE_PMD_FOLDED */
2147 2112
@@ -2206,22 +2171,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr)
2206 2171
2207EXPORT_SYMBOL(vmalloc_to_pfn); 2172EXPORT_SYMBOL(vmalloc_to_pfn);
2208 2173
2209/*
2210 * update_mem_hiwater
2211 * - update per process rss and vm high water data
2212 */
2213void update_mem_hiwater(struct task_struct *tsk)
2214{
2215 if (tsk->mm) {
2216 unsigned long rss = get_mm_counter(tsk->mm, rss);
2217
2218 if (tsk->mm->hiwater_rss < rss)
2219 tsk->mm->hiwater_rss = rss;
2220 if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
2221 tsk->mm->hiwater_vm = tsk->mm->total_vm;
2222 }
2223}
2224
2225#if !defined(__HAVE_ARCH_GATE_AREA) 2174#if !defined(__HAVE_ARCH_GATE_AREA)
2226 2175
2227#if defined(AT_SYSINFO_EHDR) 2176#if defined(AT_SYSINFO_EHDR)
@@ -2233,7 +2182,7 @@ static int __init gate_vma_init(void)
2233 gate_vma.vm_start = FIXADDR_USER_START; 2182 gate_vma.vm_start = FIXADDR_USER_START;
2234 gate_vma.vm_end = FIXADDR_USER_END; 2183 gate_vma.vm_end = FIXADDR_USER_END;
2235 gate_vma.vm_page_prot = PAGE_READONLY; 2184 gate_vma.vm_page_prot = PAGE_READONLY;
2236 gate_vma.vm_flags = 0; 2185 gate_vma.vm_flags = VM_RESERVED;
2237 return 0; 2186 return 0;
2238} 2187}
2239__initcall(gate_vma_init); 2188__initcall(gate_vma_init);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
new file mode 100644
index 000000000000..431a64f021c0
--- /dev/null
+++ b/mm/memory_hotplug.c
@@ -0,0 +1,138 @@
1/*
2 * linux/mm/memory_hotplug.c
3 *
4 * Copyright (C)
5 */
6
7#include <linux/config.h>
8#include <linux/stddef.h>
9#include <linux/mm.h>
10#include <linux/swap.h>
11#include <linux/interrupt.h>
12#include <linux/pagemap.h>
13#include <linux/bootmem.h>
14#include <linux/compiler.h>
15#include <linux/module.h>
16#include <linux/pagevec.h>
17#include <linux/slab.h>
18#include <linux/sysctl.h>
19#include <linux/cpu.h>
20#include <linux/memory.h>
21#include <linux/memory_hotplug.h>
22#include <linux/highmem.h>
23#include <linux/vmalloc.h>
24
25#include <asm/tlbflush.h>
26
27extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
28 unsigned long size);
29static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
30{
31 struct pglist_data *pgdat = zone->zone_pgdat;
32 int nr_pages = PAGES_PER_SECTION;
33 int nid = pgdat->node_id;
34 int zone_type;
35
36 zone_type = zone - pgdat->node_zones;
37 memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
38 zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
39}
40
41extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
42 int nr_pages);
43static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
44{
45 struct pglist_data *pgdat = zone->zone_pgdat;
46 int nr_pages = PAGES_PER_SECTION;
47 int ret;
48
49 ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
50
51 if (ret < 0)
52 return ret;
53
54 __add_zone(zone, phys_start_pfn);
55 return register_new_memory(__pfn_to_section(phys_start_pfn));
56}
57
58/*
59 * Reasonably generic function for adding memory. It is
60 * expected that archs that support memory hotplug will
61 * call this function after deciding the zone to which to
62 * add the new pages.
63 */
64int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
65 unsigned long nr_pages)
66{
67 unsigned long i;
68 int err = 0;
69
70 for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) {
71 err = __add_section(zone, phys_start_pfn + i);
72
73 if (err)
74 break;
75 }
76
77 return err;
78}
79
80static void grow_zone_span(struct zone *zone,
81 unsigned long start_pfn, unsigned long end_pfn)
82{
83 unsigned long old_zone_end_pfn;
84
85 zone_span_writelock(zone);
86
87 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
88 if (start_pfn < zone->zone_start_pfn)
89 zone->zone_start_pfn = start_pfn;
90
91 if (end_pfn > old_zone_end_pfn)
92 zone->spanned_pages = end_pfn - zone->zone_start_pfn;
93
94 zone_span_writeunlock(zone);
95}
96
97static void grow_pgdat_span(struct pglist_data *pgdat,
98 unsigned long start_pfn, unsigned long end_pfn)
99{
100 unsigned long old_pgdat_end_pfn =
101 pgdat->node_start_pfn + pgdat->node_spanned_pages;
102
103 if (start_pfn < pgdat->node_start_pfn)
104 pgdat->node_start_pfn = start_pfn;
105
106 if (end_pfn > old_pgdat_end_pfn)
107 pgdat->node_spanned_pages = end_pfn - pgdat->node_spanned_pages;
108}
109
110int online_pages(unsigned long pfn, unsigned long nr_pages)
111{
112 unsigned long i;
113 unsigned long flags;
114 unsigned long onlined_pages = 0;
115 struct zone *zone;
116
117 /*
118 * This doesn't need a lock to do pfn_to_page().
119 * The section can't be removed here because of the
120 * memory_block->state_sem.
121 */
122 zone = page_zone(pfn_to_page(pfn));
123 pgdat_resize_lock(zone->zone_pgdat, &flags);
124 grow_zone_span(zone, pfn, pfn + nr_pages);
125 grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
126 pgdat_resize_unlock(zone->zone_pgdat, &flags);
127
128 for (i = 0; i < nr_pages; i++) {
129 struct page *page = pfn_to_page(pfn + i);
130 online_page(page);
131 onlined_pages++;
132 }
133 zone->present_pages += onlined_pages;
134
135 setup_per_zone_pages_min();
136
137 return 0;
138}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1d5c64df1653..2076b1542b8a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2,6 +2,7 @@
2 * Simple NUMA memory policy for the Linux kernel. 2 * Simple NUMA memory policy for the Linux kernel.
3 * 3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs. 4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
5 * Subject to the GNU Public License, version 2. 6 * Subject to the GNU Public License, version 2.
6 * 7 *
7 * NUMA policy allows the user to give hints in which node(s) memory should 8 * NUMA policy allows the user to give hints in which node(s) memory should
@@ -17,13 +18,19 @@
17 * offset into the backing object or offset into the mapping 18 * offset into the backing object or offset into the mapping
18 * for anonymous memory. For process policy an process counter 19 * for anonymous memory. For process policy an process counter
19 * is used. 20 * is used.
21 *
20 * bind Only allocate memory on a specific set of nodes, 22 * bind Only allocate memory on a specific set of nodes,
21 * no fallback. 23 * no fallback.
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead
27 *
22 * preferred Try a specific node first before normal fallback. 28 * preferred Try a specific node first before normal fallback.
23 * As a special case node -1 here means do the allocation 29 * As a special case node -1 here means do the allocation
24 * on the local CPU. This is normally identical to default, 30 * on the local CPU. This is normally identical to default,
25 * but useful to set in a VMA when you have a non default 31 * but useful to set in a VMA when you have a non default
26 * process policy. 32 * process policy.
33 *
27 * default Allocate on the local node first, or when on a VMA 34 * default Allocate on the local node first, or when on a VMA
28 * use the process policy. This is what Linux always did 35 * use the process policy. This is what Linux always did
29 * in a NUMA aware kernel and still does by, ahem, default. 36 * in a NUMA aware kernel and still does by, ahem, default.
@@ -93,23 +100,10 @@ struct mempolicy default_policy = {
93 .policy = MPOL_DEFAULT, 100 .policy = MPOL_DEFAULT,
94}; 101};
95 102
96/* Check if all specified nodes are online */
97static int nodes_online(unsigned long *nodes)
98{
99 DECLARE_BITMAP(online2, MAX_NUMNODES);
100
101 bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
102 if (bitmap_empty(online2, MAX_NUMNODES))
103 set_bit(0, online2);
104 if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
105 return -EINVAL;
106 return 0;
107}
108
109/* Do sanity checking on a policy */ 103/* Do sanity checking on a policy */
110static int mpol_check_policy(int mode, unsigned long *nodes) 104static int mpol_check_policy(int mode, nodemask_t *nodes)
111{ 105{
112 int empty = bitmap_empty(nodes, MAX_NUMNODES); 106 int empty = nodes_empty(*nodes);
113 107
114 switch (mode) { 108 switch (mode) {
115 case MPOL_DEFAULT: 109 case MPOL_DEFAULT:
@@ -124,71 +118,20 @@ static int mpol_check_policy(int mode, unsigned long *nodes)
124 return -EINVAL; 118 return -EINVAL;
125 break; 119 break;
126 } 120 }
127 return nodes_online(nodes); 121 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
128}
129
130/* Copy a node mask from user space. */
131static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
132 unsigned long maxnode, int mode)
133{
134 unsigned long k;
135 unsigned long nlongs;
136 unsigned long endmask;
137
138 --maxnode;
139 bitmap_zero(nodes, MAX_NUMNODES);
140 if (maxnode == 0 || !nmask)
141 return 0;
142
143 nlongs = BITS_TO_LONGS(maxnode);
144 if ((maxnode % BITS_PER_LONG) == 0)
145 endmask = ~0UL;
146 else
147 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
148
149 /* When the user specified more nodes than supported just check
150 if the non supported part is all zero. */
151 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
152 if (nlongs > PAGE_SIZE/sizeof(long))
153 return -EINVAL;
154 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
155 unsigned long t;
156 if (get_user(t, nmask + k))
157 return -EFAULT;
158 if (k == nlongs - 1) {
159 if (t & endmask)
160 return -EINVAL;
161 } else if (t)
162 return -EINVAL;
163 }
164 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
165 endmask = ~0UL;
166 }
167
168 if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
169 return -EFAULT;
170 nodes[nlongs-1] &= endmask;
171 /* Update current mems_allowed */
172 cpuset_update_current_mems_allowed();
173 /* Ignore nodes not set in current->mems_allowed */
174 cpuset_restrict_to_mems_allowed(nodes);
175 return mpol_check_policy(mode, nodes);
176} 122}
177
178/* Generate a custom zonelist for the BIND policy. */ 123/* Generate a custom zonelist for the BIND policy. */
179static struct zonelist *bind_zonelist(unsigned long *nodes) 124static struct zonelist *bind_zonelist(nodemask_t *nodes)
180{ 125{
181 struct zonelist *zl; 126 struct zonelist *zl;
182 int num, max, nd; 127 int num, max, nd;
183 128
184 max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES); 129 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
185 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); 130 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
186 if (!zl) 131 if (!zl)
187 return NULL; 132 return NULL;
188 num = 0; 133 num = 0;
189 for (nd = find_first_bit(nodes, MAX_NUMNODES); 134 for_each_node_mask(nd, *nodes) {
190 nd < MAX_NUMNODES;
191 nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
192 int k; 135 int k;
193 for (k = MAX_NR_ZONES-1; k >= 0; k--) { 136 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
194 struct zone *z = &NODE_DATA(nd)->node_zones[k]; 137 struct zone *z = &NODE_DATA(nd)->node_zones[k];
@@ -199,17 +142,16 @@ static struct zonelist *bind_zonelist(unsigned long *nodes)
199 policy_zone = k; 142 policy_zone = k;
200 } 143 }
201 } 144 }
202 BUG_ON(num >= max);
203 zl->zones[num] = NULL; 145 zl->zones[num] = NULL;
204 return zl; 146 return zl;
205} 147}
206 148
207/* Create a new policy */ 149/* Create a new policy */
208static struct mempolicy *mpol_new(int mode, unsigned long *nodes) 150static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
209{ 151{
210 struct mempolicy *policy; 152 struct mempolicy *policy;
211 153
212 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]); 154 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
213 if (mode == MPOL_DEFAULT) 155 if (mode == MPOL_DEFAULT)
214 return NULL; 156 return NULL;
215 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 157 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -218,10 +160,10 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
218 atomic_set(&policy->refcnt, 1); 160 atomic_set(&policy->refcnt, 1);
219 switch (mode) { 161 switch (mode) {
220 case MPOL_INTERLEAVE: 162 case MPOL_INTERLEAVE:
221 bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES); 163 policy->v.nodes = *nodes;
222 break; 164 break;
223 case MPOL_PREFERRED: 165 case MPOL_PREFERRED:
224 policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES); 166 policy->v.preferred_node = first_node(*nodes);
225 if (policy->v.preferred_node >= MAX_NUMNODES) 167 if (policy->v.preferred_node >= MAX_NUMNODES)
226 policy->v.preferred_node = -1; 168 policy->v.preferred_node = -1;
227 break; 169 break;
@@ -238,14 +180,14 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
238} 180}
239 181
240/* Ensure all existing pages follow the policy. */ 182/* Ensure all existing pages follow the policy. */
241static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, 183static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
242 unsigned long addr, unsigned long end, unsigned long *nodes) 184 unsigned long addr, unsigned long end, nodemask_t *nodes)
243{ 185{
244 pte_t *orig_pte; 186 pte_t *orig_pte;
245 pte_t *pte; 187 pte_t *pte;
188 spinlock_t *ptl;
246 189
247 spin_lock(&mm->page_table_lock); 190 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
248 orig_pte = pte = pte_offset_map(pmd, addr);
249 do { 191 do {
250 unsigned long pfn; 192 unsigned long pfn;
251 unsigned int nid; 193 unsigned int nid;
@@ -253,19 +195,20 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
253 if (!pte_present(*pte)) 195 if (!pte_present(*pte))
254 continue; 196 continue;
255 pfn = pte_pfn(*pte); 197 pfn = pte_pfn(*pte);
256 if (!pfn_valid(pfn)) 198 if (!pfn_valid(pfn)) {
199 print_bad_pte(vma, *pte, addr);
257 continue; 200 continue;
201 }
258 nid = pfn_to_nid(pfn); 202 nid = pfn_to_nid(pfn);
259 if (!test_bit(nid, nodes)) 203 if (!node_isset(nid, *nodes))
260 break; 204 break;
261 } while (pte++, addr += PAGE_SIZE, addr != end); 205 } while (pte++, addr += PAGE_SIZE, addr != end);
262 pte_unmap(orig_pte); 206 pte_unmap_unlock(orig_pte, ptl);
263 spin_unlock(&mm->page_table_lock);
264 return addr != end; 207 return addr != end;
265} 208}
266 209
267static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, 210static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
268 unsigned long addr, unsigned long end, unsigned long *nodes) 211 unsigned long addr, unsigned long end, nodemask_t *nodes)
269{ 212{
270 pmd_t *pmd; 213 pmd_t *pmd;
271 unsigned long next; 214 unsigned long next;
@@ -275,14 +218,14 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
275 next = pmd_addr_end(addr, end); 218 next = pmd_addr_end(addr, end);
276 if (pmd_none_or_clear_bad(pmd)) 219 if (pmd_none_or_clear_bad(pmd))
277 continue; 220 continue;
278 if (check_pte_range(mm, pmd, addr, next, nodes)) 221 if (check_pte_range(vma, pmd, addr, next, nodes))
279 return -EIO; 222 return -EIO;
280 } while (pmd++, addr = next, addr != end); 223 } while (pmd++, addr = next, addr != end);
281 return 0; 224 return 0;
282} 225}
283 226
284static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, 227static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
285 unsigned long addr, unsigned long end, unsigned long *nodes) 228 unsigned long addr, unsigned long end, nodemask_t *nodes)
286{ 229{
287 pud_t *pud; 230 pud_t *pud;
288 unsigned long next; 231 unsigned long next;
@@ -292,24 +235,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
292 next = pud_addr_end(addr, end); 235 next = pud_addr_end(addr, end);
293 if (pud_none_or_clear_bad(pud)) 236 if (pud_none_or_clear_bad(pud))
294 continue; 237 continue;
295 if (check_pmd_range(mm, pud, addr, next, nodes)) 238 if (check_pmd_range(vma, pud, addr, next, nodes))
296 return -EIO; 239 return -EIO;
297 } while (pud++, addr = next, addr != end); 240 } while (pud++, addr = next, addr != end);
298 return 0; 241 return 0;
299} 242}
300 243
301static inline int check_pgd_range(struct mm_struct *mm, 244static inline int check_pgd_range(struct vm_area_struct *vma,
302 unsigned long addr, unsigned long end, unsigned long *nodes) 245 unsigned long addr, unsigned long end, nodemask_t *nodes)
303{ 246{
304 pgd_t *pgd; 247 pgd_t *pgd;
305 unsigned long next; 248 unsigned long next;
306 249
307 pgd = pgd_offset(mm, addr); 250 pgd = pgd_offset(vma->vm_mm, addr);
308 do { 251 do {
309 next = pgd_addr_end(addr, end); 252 next = pgd_addr_end(addr, end);
310 if (pgd_none_or_clear_bad(pgd)) 253 if (pgd_none_or_clear_bad(pgd))
311 continue; 254 continue;
312 if (check_pud_range(mm, pgd, addr, next, nodes)) 255 if (check_pud_range(vma, pgd, addr, next, nodes))
313 return -EIO; 256 return -EIO;
314 } while (pgd++, addr = next, addr != end); 257 } while (pgd++, addr = next, addr != end);
315 return 0; 258 return 0;
@@ -318,7 +261,7 @@ static inline int check_pgd_range(struct mm_struct *mm,
318/* Step 1: check the range */ 261/* Step 1: check the range */
319static struct vm_area_struct * 262static struct vm_area_struct *
320check_range(struct mm_struct *mm, unsigned long start, unsigned long end, 263check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
321 unsigned long *nodes, unsigned long flags) 264 nodemask_t *nodes, unsigned long flags)
322{ 265{
323 int err; 266 int err;
324 struct vm_area_struct *first, *vma, *prev; 267 struct vm_area_struct *first, *vma, *prev;
@@ -326,6 +269,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
326 first = find_vma(mm, start); 269 first = find_vma(mm, start);
327 if (!first) 270 if (!first)
328 return ERR_PTR(-EFAULT); 271 return ERR_PTR(-EFAULT);
272 if (first->vm_flags & VM_RESERVED)
273 return ERR_PTR(-EACCES);
329 prev = NULL; 274 prev = NULL;
330 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 275 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
331 if (!vma->vm_next && vma->vm_end < end) 276 if (!vma->vm_next && vma->vm_end < end)
@@ -338,8 +283,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
338 endvma = end; 283 endvma = end;
339 if (vma->vm_start > start) 284 if (vma->vm_start > start)
340 start = vma->vm_start; 285 start = vma->vm_start;
341 err = check_pgd_range(vma->vm_mm, 286 err = check_pgd_range(vma, start, endvma, nodes);
342 start, endvma, nodes);
343 if (err) { 287 if (err) {
344 first = ERR_PTR(err); 288 first = ERR_PTR(err);
345 break; 289 break;
@@ -393,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start,
393 return err; 337 return err;
394} 338}
395 339
396/* Change policy for a memory range */ 340static int contextualize_policy(int mode, nodemask_t *nodes)
397asmlinkage long sys_mbind(unsigned long start, unsigned long len, 341{
398 unsigned long mode, 342 if (!nodes)
399 unsigned long __user *nmask, unsigned long maxnode, 343 return 0;
400 unsigned flags) 344
345 /* Update current mems_allowed */
346 cpuset_update_current_mems_allowed();
347 /* Ignore nodes not set in current->mems_allowed */
348 cpuset_restrict_to_mems_allowed(nodes->bits);
349 return mpol_check_policy(mode, nodes);
350}
351
352long do_mbind(unsigned long start, unsigned long len,
353 unsigned long mode, nodemask_t *nmask, unsigned long flags)
401{ 354{
402 struct vm_area_struct *vma; 355 struct vm_area_struct *vma;
403 struct mm_struct *mm = current->mm; 356 struct mm_struct *mm = current->mm;
404 struct mempolicy *new; 357 struct mempolicy *new;
405 unsigned long end; 358 unsigned long end;
406 DECLARE_BITMAP(nodes, MAX_NUMNODES);
407 int err; 359 int err;
408 360
409 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) 361 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
@@ -418,20 +370,17 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
418 return -EINVAL; 370 return -EINVAL;
419 if (end == start) 371 if (end == start)
420 return 0; 372 return 0;
421 373 if (mpol_check_policy(mode, nmask))
422 err = get_nodes(nodes, nmask, maxnode, mode); 374 return -EINVAL;
423 if (err) 375 new = mpol_new(mode, nmask);
424 return err;
425
426 new = mpol_new(mode, nodes);
427 if (IS_ERR(new)) 376 if (IS_ERR(new))
428 return PTR_ERR(new); 377 return PTR_ERR(new);
429 378
430 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, 379 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
431 mode,nodes[0]); 380 mode,nodes_addr(nodes)[0]);
432 381
433 down_write(&mm->mmap_sem); 382 down_write(&mm->mmap_sem);
434 vma = check_range(mm, start, end, nodes, flags); 383 vma = check_range(mm, start, end, nmask, flags);
435 err = PTR_ERR(vma); 384 err = PTR_ERR(vma);
436 if (!IS_ERR(vma)) 385 if (!IS_ERR(vma))
437 err = mbind_range(vma, start, end, new); 386 err = mbind_range(vma, start, end, new);
@@ -441,50 +390,45 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
441} 390}
442 391
443/* Set the process memory policy */ 392/* Set the process memory policy */
444asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, 393long do_set_mempolicy(int mode, nodemask_t *nodes)
445 unsigned long maxnode)
446{ 394{
447 int err;
448 struct mempolicy *new; 395 struct mempolicy *new;
449 DECLARE_BITMAP(nodes, MAX_NUMNODES);
450 396
451 if (mode < 0 || mode > MPOL_MAX) 397 if (contextualize_policy(mode, nodes))
452 return -EINVAL; 398 return -EINVAL;
453 err = get_nodes(nodes, nmask, maxnode, mode);
454 if (err)
455 return err;
456 new = mpol_new(mode, nodes); 399 new = mpol_new(mode, nodes);
457 if (IS_ERR(new)) 400 if (IS_ERR(new))
458 return PTR_ERR(new); 401 return PTR_ERR(new);
459 mpol_free(current->mempolicy); 402 mpol_free(current->mempolicy);
460 current->mempolicy = new; 403 current->mempolicy = new;
461 if (new && new->policy == MPOL_INTERLEAVE) 404 if (new && new->policy == MPOL_INTERLEAVE)
462 current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES); 405 current->il_next = first_node(new->v.nodes);
463 return 0; 406 return 0;
464} 407}
465 408
466/* Fill a zone bitmap for a policy */ 409/* Fill a zone bitmap for a policy */
467static void get_zonemask(struct mempolicy *p, unsigned long *nodes) 410static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
468{ 411{
469 int i; 412 int i;
470 413
471 bitmap_zero(nodes, MAX_NUMNODES); 414 nodes_clear(*nodes);
472 switch (p->policy) { 415 switch (p->policy) {
473 case MPOL_BIND: 416 case MPOL_BIND:
474 for (i = 0; p->v.zonelist->zones[i]; i++) 417 for (i = 0; p->v.zonelist->zones[i]; i++)
475 __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes); 418 node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
419 *nodes);
476 break; 420 break;
477 case MPOL_DEFAULT: 421 case MPOL_DEFAULT:
478 break; 422 break;
479 case MPOL_INTERLEAVE: 423 case MPOL_INTERLEAVE:
480 bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES); 424 *nodes = p->v.nodes;
481 break; 425 break;
482 case MPOL_PREFERRED: 426 case MPOL_PREFERRED:
483 /* or use current node instead of online map? */ 427 /* or use current node instead of online map? */
484 if (p->v.preferred_node < 0) 428 if (p->v.preferred_node < 0)
485 bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES); 429 *nodes = node_online_map;
486 else 430 else
487 __set_bit(p->v.preferred_node, nodes); 431 node_set(p->v.preferred_node, *nodes);
488 break; 432 break;
489 default: 433 default:
490 BUG(); 434 BUG();
@@ -504,37 +448,17 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
504 return err; 448 return err;
505} 449}
506 450
507/* Copy a kernel node mask to user space */
508static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
509 void *nodes, unsigned nbytes)
510{
511 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
512
513 if (copy > nbytes) {
514 if (copy > PAGE_SIZE)
515 return -EINVAL;
516 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
517 return -EFAULT;
518 copy = nbytes;
519 }
520 return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
521}
522
523/* Retrieve NUMA policy */ 451/* Retrieve NUMA policy */
524asmlinkage long sys_get_mempolicy(int __user *policy, 452long do_get_mempolicy(int *policy, nodemask_t *nmask,
525 unsigned long __user *nmask, 453 unsigned long addr, unsigned long flags)
526 unsigned long maxnode,
527 unsigned long addr, unsigned long flags)
528{ 454{
529 int err, pval; 455 int err;
530 struct mm_struct *mm = current->mm; 456 struct mm_struct *mm = current->mm;
531 struct vm_area_struct *vma = NULL; 457 struct vm_area_struct *vma = NULL;
532 struct mempolicy *pol = current->mempolicy; 458 struct mempolicy *pol = current->mempolicy;
533 459
534 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) 460 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
535 return -EINVAL; 461 return -EINVAL;
536 if (nmask != NULL && maxnode < MAX_NUMNODES)
537 return -EINVAL;
538 if (flags & MPOL_F_ADDR) { 462 if (flags & MPOL_F_ADDR) {
539 down_read(&mm->mmap_sem); 463 down_read(&mm->mmap_sem);
540 vma = find_vma_intersection(mm, addr, addr+1); 464 vma = find_vma_intersection(mm, addr, addr+1);
@@ -557,31 +481,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
557 err = lookup_node(mm, addr); 481 err = lookup_node(mm, addr);
558 if (err < 0) 482 if (err < 0)
559 goto out; 483 goto out;
560 pval = err; 484 *policy = err;
561 } else if (pol == current->mempolicy && 485 } else if (pol == current->mempolicy &&
562 pol->policy == MPOL_INTERLEAVE) { 486 pol->policy == MPOL_INTERLEAVE) {
563 pval = current->il_next; 487 *policy = current->il_next;
564 } else { 488 } else {
565 err = -EINVAL; 489 err = -EINVAL;
566 goto out; 490 goto out;
567 } 491 }
568 } else 492 } else
569 pval = pol->policy; 493 *policy = pol->policy;
570 494
571 if (vma) { 495 if (vma) {
572 up_read(&current->mm->mmap_sem); 496 up_read(&current->mm->mmap_sem);
573 vma = NULL; 497 vma = NULL;
574 } 498 }
575 499
576 if (policy && put_user(pval, policy))
577 return -EFAULT;
578
579 err = 0; 500 err = 0;
580 if (nmask) { 501 if (nmask)
581 DECLARE_BITMAP(nodes, MAX_NUMNODES); 502 get_zonemask(pol, nmask);
582 get_zonemask(pol, nodes);
583 err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
584 }
585 503
586 out: 504 out:
587 if (vma) 505 if (vma)
@@ -589,6 +507,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
589 return err; 507 return err;
590} 508}
591 509
510/*
511 * User space interface with variable sized bitmaps for nodelists.
512 */
513
514/* Copy a node mask from user space. */
515static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
516 unsigned long maxnode)
517{
518 unsigned long k;
519 unsigned long nlongs;
520 unsigned long endmask;
521
522 --maxnode;
523 nodes_clear(*nodes);
524 if (maxnode == 0 || !nmask)
525 return 0;
526
527 nlongs = BITS_TO_LONGS(maxnode);
528 if ((maxnode % BITS_PER_LONG) == 0)
529 endmask = ~0UL;
530 else
531 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
532
533 /* When the user specified more nodes than supported just check
534 if the non supported part is all zero. */
535 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
536 if (nlongs > PAGE_SIZE/sizeof(long))
537 return -EINVAL;
538 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
539 unsigned long t;
540 if (get_user(t, nmask + k))
541 return -EFAULT;
542 if (k == nlongs - 1) {
543 if (t & endmask)
544 return -EINVAL;
545 } else if (t)
546 return -EINVAL;
547 }
548 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
549 endmask = ~0UL;
550 }
551
552 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
553 return -EFAULT;
554 nodes_addr(*nodes)[nlongs-1] &= endmask;
555 return 0;
556}
557
558/* Copy a kernel node mask to user space */
559static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
560 nodemask_t *nodes)
561{
562 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
563 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
564
565 if (copy > nbytes) {
566 if (copy > PAGE_SIZE)
567 return -EINVAL;
568 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
569 return -EFAULT;
570 copy = nbytes;
571 }
572 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
573}
574
575asmlinkage long sys_mbind(unsigned long start, unsigned long len,
576 unsigned long mode,
577 unsigned long __user *nmask, unsigned long maxnode,
578 unsigned flags)
579{
580 nodemask_t nodes;
581 int err;
582
583 err = get_nodes(&nodes, nmask, maxnode);
584 if (err)
585 return err;
586 return do_mbind(start, len, mode, &nodes, flags);
587}
588
589/* Set the process memory policy */
590asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
591 unsigned long maxnode)
592{
593 int err;
594 nodemask_t nodes;
595
596 if (mode < 0 || mode > MPOL_MAX)
597 return -EINVAL;
598 err = get_nodes(&nodes, nmask, maxnode);
599 if (err)
600 return err;
601 return do_set_mempolicy(mode, &nodes);
602}
603
604/* Retrieve NUMA policy */
605asmlinkage long sys_get_mempolicy(int __user *policy,
606 unsigned long __user *nmask,
607 unsigned long maxnode,
608 unsigned long addr, unsigned long flags)
609{
610 int err, pval;
611 nodemask_t nodes;
612
613 if (nmask != NULL && maxnode < MAX_NUMNODES)
614 return -EINVAL;
615
616 err = do_get_mempolicy(&pval, &nodes, addr, flags);
617
618 if (err)
619 return err;
620
621 if (policy && put_user(pval, policy))
622 return -EFAULT;
623
624 if (nmask)
625 err = copy_nodes_to_user(nmask, maxnode, &nodes);
626
627 return err;
628}
629
592#ifdef CONFIG_COMPAT 630#ifdef CONFIG_COMPAT
593 631
594asmlinkage long compat_sys_get_mempolicy(int __user *policy, 632asmlinkage long compat_sys_get_mempolicy(int __user *policy,
@@ -649,15 +687,15 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
649 long err = 0; 687 long err = 0;
650 unsigned long __user *nm = NULL; 688 unsigned long __user *nm = NULL;
651 unsigned long nr_bits, alloc_size; 689 unsigned long nr_bits, alloc_size;
652 DECLARE_BITMAP(bm, MAX_NUMNODES); 690 nodemask_t bm;
653 691
654 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 692 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
655 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 693 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
656 694
657 if (nmask) { 695 if (nmask) {
658 err = compat_get_bitmap(bm, nmask, nr_bits); 696 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
659 nm = compat_alloc_user_space(alloc_size); 697 nm = compat_alloc_user_space(alloc_size);
660 err |= copy_to_user(nm, bm, alloc_size); 698 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
661 } 699 }
662 700
663 if (err) 701 if (err)
@@ -676,7 +714,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo
676 714
677 if (vma) { 715 if (vma) {
678 if (vma->vm_ops && vma->vm_ops->get_policy) 716 if (vma->vm_ops && vma->vm_ops->get_policy)
679 pol = vma->vm_ops->get_policy(vma, addr); 717 pol = vma->vm_ops->get_policy(vma, addr);
680 else if (vma->vm_policy && 718 else if (vma->vm_policy &&
681 vma->vm_policy->policy != MPOL_DEFAULT) 719 vma->vm_policy->policy != MPOL_DEFAULT)
682 pol = vma->vm_policy; 720 pol = vma->vm_policy;
@@ -722,10 +760,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
722 struct task_struct *me = current; 760 struct task_struct *me = current;
723 761
724 nid = me->il_next; 762 nid = me->il_next;
725 BUG_ON(nid >= MAX_NUMNODES); 763 next = next_node(nid, policy->v.nodes);
726 next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
727 if (next >= MAX_NUMNODES) 764 if (next >= MAX_NUMNODES)
728 next = find_first_bit(policy->v.nodes, MAX_NUMNODES); 765 next = first_node(policy->v.nodes);
729 me->il_next = next; 766 me->il_next = next;
730 return nid; 767 return nid;
731} 768}
@@ -734,29 +771,27 @@ static unsigned interleave_nodes(struct mempolicy *policy)
734static unsigned offset_il_node(struct mempolicy *pol, 771static unsigned offset_il_node(struct mempolicy *pol,
735 struct vm_area_struct *vma, unsigned long off) 772 struct vm_area_struct *vma, unsigned long off)
736{ 773{
737 unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES); 774 unsigned nnodes = nodes_weight(pol->v.nodes);
738 unsigned target = (unsigned)off % nnodes; 775 unsigned target = (unsigned)off % nnodes;
739 int c; 776 int c;
740 int nid = -1; 777 int nid = -1;
741 778
742 c = 0; 779 c = 0;
743 do { 780 do {
744 nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1); 781 nid = next_node(nid, pol->v.nodes);
745 c++; 782 c++;
746 } while (c <= target); 783 } while (c <= target);
747 BUG_ON(nid >= MAX_NUMNODES);
748 BUG_ON(!test_bit(nid, pol->v.nodes));
749 return nid; 784 return nid;
750} 785}
751 786
752/* Allocate a page in interleaved policy. 787/* Allocate a page in interleaved policy.
753 Own path because it needs to do special accounting. */ 788 Own path because it needs to do special accounting. */
754static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid) 789static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
790 unsigned nid)
755{ 791{
756 struct zonelist *zl; 792 struct zonelist *zl;
757 struct page *page; 793 struct page *page;
758 794
759 BUG_ON(!node_online(nid));
760 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); 795 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
761 page = __alloc_pages(gfp, order, zl); 796 page = __alloc_pages(gfp, order, zl);
762 if (page && page_zone(page) == zl->zones[0]) { 797 if (page && page_zone(page) == zl->zones[0]) {
@@ -799,8 +834,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
799 unsigned nid; 834 unsigned nid;
800 if (vma) { 835 if (vma) {
801 unsigned long off; 836 unsigned long off;
802 BUG_ON(addr >= vma->vm_end);
803 BUG_ON(addr < vma->vm_start);
804 off = vma->vm_pgoff; 837 off = vma->vm_pgoff;
805 off += (addr - vma->vm_start) >> PAGE_SHIFT; 838 off += (addr - vma->vm_start) >> PAGE_SHIFT;
806 nid = offset_il_node(pol, vma, off); 839 nid = offset_il_node(pol, vma, off);
@@ -878,7 +911,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
878 case MPOL_DEFAULT: 911 case MPOL_DEFAULT:
879 return 1; 912 return 1;
880 case MPOL_INTERLEAVE: 913 case MPOL_INTERLEAVE:
881 return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES); 914 return nodes_equal(a->v.nodes, b->v.nodes);
882 case MPOL_PREFERRED: 915 case MPOL_PREFERRED:
883 return a->v.preferred_node == b->v.preferred_node; 916 return a->v.preferred_node == b->v.preferred_node;
884 case MPOL_BIND: { 917 case MPOL_BIND: {
@@ -1117,7 +1150,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
1117 PDprintk("set_shared_policy %lx sz %lu %d %lx\n", 1150 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1118 vma->vm_pgoff, 1151 vma->vm_pgoff,
1119 sz, npol? npol->policy : -1, 1152 sz, npol? npol->policy : -1,
1120 npol ? npol->v.nodes[0] : -1); 1153 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1121 1154
1122 if (npol) { 1155 if (npol) {
1123 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); 1156 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -1164,14 +1197,12 @@ void __init numa_policy_init(void)
1164 /* Set interleaving policy for system init. This way not all 1197 /* Set interleaving policy for system init. This way not all
1165 the data structures allocated at system boot end up in node zero. */ 1198 the data structures allocated at system boot end up in node zero. */
1166 1199
1167 if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map), 1200 if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1168 MAX_NUMNODES) < 0)
1169 printk("numa_policy_init: interleaving failed\n"); 1201 printk("numa_policy_init: interleaving failed\n");
1170} 1202}
1171 1203
1172/* Reset policy of current process to default. 1204/* Reset policy of current process to default */
1173 * Assumes fs == KERNEL_DS */
1174void numa_default_policy(void) 1205void numa_default_policy(void)
1175{ 1206{
1176 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0); 1207 do_set_mempolicy(MPOL_DEFAULT, NULL);
1177} 1208}
diff --git a/mm/mmap.c b/mm/mmap.c
index fa11d91242e8..5ecc2cf3e1d7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -181,26 +181,36 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
181} 181}
182 182
183/* 183/*
184 * Remove one vm structure and free it. 184 * Unlink a file-based vm structure from its prio_tree, to hide
185 * vma from rmap and vmtruncate before freeing its page tables.
185 */ 186 */
186static void remove_vm_struct(struct vm_area_struct *vma) 187void unlink_file_vma(struct vm_area_struct *vma)
187{ 188{
188 struct file *file = vma->vm_file; 189 struct file *file = vma->vm_file;
189 190
190 might_sleep();
191 if (file) { 191 if (file) {
192 struct address_space *mapping = file->f_mapping; 192 struct address_space *mapping = file->f_mapping;
193 spin_lock(&mapping->i_mmap_lock); 193 spin_lock(&mapping->i_mmap_lock);
194 __remove_shared_vm_struct(vma, file, mapping); 194 __remove_shared_vm_struct(vma, file, mapping);
195 spin_unlock(&mapping->i_mmap_lock); 195 spin_unlock(&mapping->i_mmap_lock);
196 } 196 }
197}
198
199/*
200 * Close a vm structure and free it, returning the next.
201 */
202static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
203{
204 struct vm_area_struct *next = vma->vm_next;
205
206 might_sleep();
197 if (vma->vm_ops && vma->vm_ops->close) 207 if (vma->vm_ops && vma->vm_ops->close)
198 vma->vm_ops->close(vma); 208 vma->vm_ops->close(vma);
199 if (file) 209 if (vma->vm_file)
200 fput(file); 210 fput(vma->vm_file);
201 anon_vma_unlink(vma);
202 mpol_free(vma_policy(vma)); 211 mpol_free(vma_policy(vma));
203 kmem_cache_free(vm_area_cachep, vma); 212 kmem_cache_free(vm_area_cachep, vma);
213 return next;
204} 214}
205 215
206asmlinkage unsigned long sys_brk(unsigned long brk) 216asmlinkage unsigned long sys_brk(unsigned long brk)
@@ -832,7 +842,7 @@ none:
832} 842}
833 843
834#ifdef CONFIG_PROC_FS 844#ifdef CONFIG_PROC_FS
835void __vm_stat_account(struct mm_struct *mm, unsigned long flags, 845void vm_stat_account(struct mm_struct *mm, unsigned long flags,
836 struct file *file, long pages) 846 struct file *file, long pages)
837{ 847{
838 const unsigned long stack_flags 848 const unsigned long stack_flags
@@ -1070,6 +1080,17 @@ munmap_back:
1070 error = file->f_op->mmap(file, vma); 1080 error = file->f_op->mmap(file, vma);
1071 if (error) 1081 if (error)
1072 goto unmap_and_free_vma; 1082 goto unmap_and_free_vma;
1083 if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
1084 == (VM_WRITE | VM_RESERVED)) {
1085 printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
1086 "PROT_WRITE mmap of VM_RESERVED memory, which "
1087 "is deprecated. Please report this to "
1088 "linux-kernel@vger.kernel.org\n",current->comm);
1089 if (vma->vm_ops && vma->vm_ops->close)
1090 vma->vm_ops->close(vma);
1091 error = -EACCES;
1092 goto unmap_and_free_vma;
1093 }
1073 } else if (vm_flags & VM_SHARED) { 1094 } else if (vm_flags & VM_SHARED) {
1074 error = shmem_zero_setup(vma); 1095 error = shmem_zero_setup(vma);
1075 if (error) 1096 if (error)
@@ -1110,7 +1131,7 @@ munmap_back:
1110 } 1131 }
1111out: 1132out:
1112 mm->total_vm += len >> PAGE_SHIFT; 1133 mm->total_vm += len >> PAGE_SHIFT;
1113 __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1134 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1114 if (vm_flags & VM_LOCKED) { 1135 if (vm_flags & VM_LOCKED) {
1115 mm->locked_vm += len >> PAGE_SHIFT; 1136 mm->locked_vm += len >> PAGE_SHIFT;
1116 make_pages_present(addr, addr + len); 1137 make_pages_present(addr, addr + len);
@@ -1475,15 +1496,19 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
1475 mm->total_vm += grow; 1496 mm->total_vm += grow;
1476 if (vma->vm_flags & VM_LOCKED) 1497 if (vma->vm_flags & VM_LOCKED)
1477 mm->locked_vm += grow; 1498 mm->locked_vm += grow;
1478 __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); 1499 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
1479 return 0; 1500 return 0;
1480} 1501}
1481 1502
1482#ifdef CONFIG_STACK_GROWSUP 1503#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
1483/* 1504/*
1484 * vma is the first one with address > vma->vm_end. Have to extend vma. 1505 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
1506 * vma is the last one with address > vma->vm_end. Have to extend vma.
1485 */ 1507 */
1486int expand_stack(struct vm_area_struct * vma, unsigned long address) 1508#ifdef CONFIG_STACK_GROWSUP
1509static inline
1510#endif
1511int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1487{ 1512{
1488 int error; 1513 int error;
1489 1514
@@ -1521,6 +1546,13 @@ int expand_stack(struct vm_area_struct * vma, unsigned long address)
1521 anon_vma_unlock(vma); 1546 anon_vma_unlock(vma);
1522 return error; 1547 return error;
1523} 1548}
1549#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
1550
1551#ifdef CONFIG_STACK_GROWSUP
1552int expand_stack(struct vm_area_struct *vma, unsigned long address)
1553{
1554 return expand_upwards(vma, address);
1555}
1524 1556
1525struct vm_area_struct * 1557struct vm_area_struct *
1526find_extend_vma(struct mm_struct *mm, unsigned long addr) 1558find_extend_vma(struct mm_struct *mm, unsigned long addr)
@@ -1603,36 +1635,24 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
1603} 1635}
1604#endif 1636#endif
1605 1637
1606/* Normal function to fix up a mapping
1607 * This function is the default for when an area has no specific
1608 * function. This may be used as part of a more specific routine.
1609 *
1610 * By the time this function is called, the area struct has been
1611 * removed from the process mapping list.
1612 */
1613static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
1614{
1615 size_t len = area->vm_end - area->vm_start;
1616
1617 area->vm_mm->total_vm -= len >> PAGE_SHIFT;
1618 if (area->vm_flags & VM_LOCKED)
1619 area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
1620 vm_stat_unaccount(area);
1621 remove_vm_struct(area);
1622}
1623
1624/* 1638/*
1625 * Update the VMA and inode share lists. 1639 * Ok - we have the memory areas we should free on the vma list,
1626 *
1627 * Ok - we have the memory areas we should free on the 'free' list,
1628 * so release them, and do the vma updates. 1640 * so release them, and do the vma updates.
1641 *
1642 * Called with the mm semaphore held.
1629 */ 1643 */
1630static void unmap_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) 1644static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1631{ 1645{
1646 /* Update high watermark before we lower total_vm */
1647 update_hiwater_vm(mm);
1632 do { 1648 do {
1633 struct vm_area_struct *next = vma->vm_next; 1649 long nrpages = vma_pages(vma);
1634 unmap_vma(mm, vma); 1650
1635 vma = next; 1651 mm->total_vm -= nrpages;
1652 if (vma->vm_flags & VM_LOCKED)
1653 mm->locked_vm -= nrpages;
1654 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1655 vma = remove_vma(vma);
1636 } while (vma); 1656 } while (vma);
1637 validate_mm(mm); 1657 validate_mm(mm);
1638} 1658}
@@ -1651,14 +1671,13 @@ static void unmap_region(struct mm_struct *mm,
1651 unsigned long nr_accounted = 0; 1671 unsigned long nr_accounted = 0;
1652 1672
1653 lru_add_drain(); 1673 lru_add_drain();
1654 spin_lock(&mm->page_table_lock);
1655 tlb = tlb_gather_mmu(mm, 0); 1674 tlb = tlb_gather_mmu(mm, 0);
1656 unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); 1675 update_hiwater_rss(mm);
1676 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1657 vm_unacct_memory(nr_accounted); 1677 vm_unacct_memory(nr_accounted);
1658 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, 1678 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
1659 next? next->vm_start: 0); 1679 next? next->vm_start: 0);
1660 tlb_finish_mmu(tlb, start, end); 1680 tlb_finish_mmu(tlb, start, end);
1661 spin_unlock(&mm->page_table_lock);
1662} 1681}
1663 1682
1664/* 1683/*
@@ -1799,7 +1818,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1799 unmap_region(mm, vma, prev, start, end); 1818 unmap_region(mm, vma, prev, start, end);
1800 1819
1801 /* Fix up all other VM information */ 1820 /* Fix up all other VM information */
1802 unmap_vma_list(mm, vma); 1821 remove_vma_list(mm, vma);
1803 1822
1804 return 0; 1823 return 0;
1805} 1824}
@@ -1933,34 +1952,21 @@ void exit_mmap(struct mm_struct *mm)
1933 unsigned long end; 1952 unsigned long end;
1934 1953
1935 lru_add_drain(); 1954 lru_add_drain();
1936
1937 spin_lock(&mm->page_table_lock);
1938
1939 flush_cache_mm(mm); 1955 flush_cache_mm(mm);
1940 tlb = tlb_gather_mmu(mm, 1); 1956 tlb = tlb_gather_mmu(mm, 1);
1957 /* Don't update_hiwater_rss(mm) here, do_exit already did */
1941 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 1958 /* Use -1 here to ensure all VMAs in the mm are unmapped */
1942 end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); 1959 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
1943 vm_unacct_memory(nr_accounted); 1960 vm_unacct_memory(nr_accounted);
1944 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 1961 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
1945 tlb_finish_mmu(tlb, 0, end); 1962 tlb_finish_mmu(tlb, 0, end);
1946 1963
1947 mm->mmap = mm->mmap_cache = NULL;
1948 mm->mm_rb = RB_ROOT;
1949 set_mm_counter(mm, rss, 0);
1950 mm->total_vm = 0;
1951 mm->locked_vm = 0;
1952
1953 spin_unlock(&mm->page_table_lock);
1954
1955 /* 1964 /*
1956 * Walk the list again, actually closing and freeing it 1965 * Walk the list again, actually closing and freeing it,
1957 * without holding any MM locks. 1966 * with preemption enabled, without holding any MM locks.
1958 */ 1967 */
1959 while (vma) { 1968 while (vma)
1960 struct vm_area_struct *next = vma->vm_next; 1969 vma = remove_vma(vma);
1961 remove_vm_struct(vma);
1962 vma = next;
1963 }
1964 1970
1965 BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); 1971 BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
1966} 1972}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 57577f63b305..17a2b52b753b 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -29,8 +29,9 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
29 unsigned long addr, unsigned long end, pgprot_t newprot) 29 unsigned long addr, unsigned long end, pgprot_t newprot)
30{ 30{
31 pte_t *pte; 31 pte_t *pte;
32 spinlock_t *ptl;
32 33
33 pte = pte_offset_map(pmd, addr); 34 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
34 do { 35 do {
35 if (pte_present(*pte)) { 36 if (pte_present(*pte)) {
36 pte_t ptent; 37 pte_t ptent;
@@ -44,7 +45,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
44 lazy_mmu_prot_update(ptent); 45 lazy_mmu_prot_update(ptent);
45 } 46 }
46 } while (pte++, addr += PAGE_SIZE, addr != end); 47 } while (pte++, addr += PAGE_SIZE, addr != end);
47 pte_unmap(pte - 1); 48 pte_unmap_unlock(pte - 1, ptl);
48} 49}
49 50
50static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, 51static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -88,7 +89,6 @@ static void change_protection(struct vm_area_struct *vma,
88 BUG_ON(addr >= end); 89 BUG_ON(addr >= end);
89 pgd = pgd_offset(mm, addr); 90 pgd = pgd_offset(mm, addr);
90 flush_cache_range(vma, addr, end); 91 flush_cache_range(vma, addr, end);
91 spin_lock(&mm->page_table_lock);
92 do { 92 do {
93 next = pgd_addr_end(addr, end); 93 next = pgd_addr_end(addr, end);
94 if (pgd_none_or_clear_bad(pgd)) 94 if (pgd_none_or_clear_bad(pgd))
@@ -96,7 +96,6 @@ static void change_protection(struct vm_area_struct *vma,
96 change_pud_range(mm, pgd, addr, next, newprot); 96 change_pud_range(mm, pgd, addr, next, newprot);
97 } while (pgd++, addr = next, addr != end); 97 } while (pgd++, addr = next, addr != end);
98 flush_tlb_range(vma, start, end); 98 flush_tlb_range(vma, start, end);
99 spin_unlock(&mm->page_table_lock);
100} 99}
101 100
102static int 101static int
@@ -125,6 +124,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
125 * a MAP_NORESERVE private mapping to writable will now reserve. 124 * a MAP_NORESERVE private mapping to writable will now reserve.
126 */ 125 */
127 if (newflags & VM_WRITE) { 126 if (newflags & VM_WRITE) {
127 if (oldflags & VM_RESERVED) {
128 BUG_ON(oldflags & VM_WRITE);
129 printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
130 "PROT_WRITE mprotect of VM_RESERVED memory, "
131 "which is deprecated. Please report this to "
132 "linux-kernel@vger.kernel.org\n",current->comm);
133 return -EACCES;
134 }
128 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { 135 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
129 charged = nrpages; 136 charged = nrpages;
130 if (security_vm_enough_memory(charged)) 137 if (security_vm_enough_memory(charged))
@@ -168,8 +175,8 @@ success:
168 vma->vm_flags = newflags; 175 vma->vm_flags = newflags;
169 vma->vm_page_prot = newprot; 176 vma->vm_page_prot = newprot;
170 change_protection(vma, start, end, newprot); 177 change_protection(vma, start, end, newprot);
171 __vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 178 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
172 __vm_stat_account(mm, newflags, vma->vm_file, nrpages); 179 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
173 return 0; 180 return 0;
174 181
175fail: 182fail:
diff --git a/mm/mremap.c b/mm/mremap.c
index f343fc73a8bd..b535438c363c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,35 +22,7 @@
22#include <asm/cacheflush.h> 22#include <asm/cacheflush.h>
23#include <asm/tlbflush.h> 23#include <asm/tlbflush.h>
24 24
25static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr) 25static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
26{
27 pgd_t *pgd;
28 pud_t *pud;
29 pmd_t *pmd;
30 pte_t *pte = NULL;
31
32 pgd = pgd_offset(mm, addr);
33 if (pgd_none_or_clear_bad(pgd))
34 goto end;
35
36 pud = pud_offset(pgd, addr);
37 if (pud_none_or_clear_bad(pud))
38 goto end;
39
40 pmd = pmd_offset(pud, addr);
41 if (pmd_none_or_clear_bad(pmd))
42 goto end;
43
44 pte = pte_offset_map_nested(pmd, addr);
45 if (pte_none(*pte)) {
46 pte_unmap_nested(pte);
47 pte = NULL;
48 }
49end:
50 return pte;
51}
52
53static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
54{ 26{
55 pgd_t *pgd; 27 pgd_t *pgd;
56 pud_t *pud; 28 pud_t *pud;
@@ -68,35 +40,39 @@ static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
68 if (pmd_none_or_clear_bad(pmd)) 40 if (pmd_none_or_clear_bad(pmd))
69 return NULL; 41 return NULL;
70 42
71 return pte_offset_map(pmd, addr); 43 return pmd;
72} 44}
73 45
74static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr) 46static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
75{ 47{
76 pgd_t *pgd; 48 pgd_t *pgd;
77 pud_t *pud; 49 pud_t *pud;
78 pmd_t *pmd; 50 pmd_t *pmd;
79 pte_t *pte = NULL;
80 51
81 pgd = pgd_offset(mm, addr); 52 pgd = pgd_offset(mm, addr);
82
83 pud = pud_alloc(mm, pgd, addr); 53 pud = pud_alloc(mm, pgd, addr);
84 if (!pud) 54 if (!pud)
85 return NULL; 55 return NULL;
56
86 pmd = pmd_alloc(mm, pud, addr); 57 pmd = pmd_alloc(mm, pud, addr);
87 if (pmd) 58 if (!pmd)
88 pte = pte_alloc_map(mm, pmd, addr); 59 return NULL;
89 return pte; 60
61 if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
62 return NULL;
63
64 return pmd;
90} 65}
91 66
92static int 67static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
93move_one_page(struct vm_area_struct *vma, unsigned long old_addr, 68 unsigned long old_addr, unsigned long old_end,
94 struct vm_area_struct *new_vma, unsigned long new_addr) 69 struct vm_area_struct *new_vma, pmd_t *new_pmd,
70 unsigned long new_addr)
95{ 71{
96 struct address_space *mapping = NULL; 72 struct address_space *mapping = NULL;
97 struct mm_struct *mm = vma->vm_mm; 73 struct mm_struct *mm = vma->vm_mm;
98 int error = 0; 74 pte_t *old_pte, *new_pte, pte;
99 pte_t *src, *dst; 75 spinlock_t *old_ptl, *new_ptl;
100 76
101 if (vma->vm_file) { 77 if (vma->vm_file) {
102 /* 78 /*
@@ -111,74 +87,69 @@ move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
111 new_vma->vm_truncate_count != vma->vm_truncate_count) 87 new_vma->vm_truncate_count != vma->vm_truncate_count)
112 new_vma->vm_truncate_count = 0; 88 new_vma->vm_truncate_count = 0;
113 } 89 }
114 spin_lock(&mm->page_table_lock);
115 90
116 src = get_one_pte_map_nested(mm, old_addr); 91 /*
117 if (src) { 92 * We don't have to worry about the ordering of src and dst
118 /* 93 * pte locks because exclusive mmap_sem prevents deadlock.
119 * Look to see whether alloc_one_pte_map needs to perform a 94 */
120 * memory allocation. If it does then we need to drop the 95 old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
121 * atomic kmap 96 new_pte = pte_offset_map_nested(new_pmd, new_addr);
122 */ 97 new_ptl = pte_lockptr(mm, new_pmd);
123 dst = get_one_pte_map(mm, new_addr); 98 if (new_ptl != old_ptl)
124 if (unlikely(!dst)) { 99 spin_lock(new_ptl);
125 pte_unmap_nested(src); 100
126 if (mapping) 101 for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
127 spin_unlock(&mapping->i_mmap_lock); 102 new_pte++, new_addr += PAGE_SIZE) {
128 dst = alloc_one_pte_map(mm, new_addr); 103 if (pte_none(*old_pte))
129 if (mapping && !spin_trylock(&mapping->i_mmap_lock)) { 104 continue;
130 spin_unlock(&mm->page_table_lock); 105 pte = ptep_clear_flush(vma, old_addr, old_pte);
131 spin_lock(&mapping->i_mmap_lock); 106 /* ZERO_PAGE can be dependant on virtual addr */
132 spin_lock(&mm->page_table_lock); 107 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
133 } 108 set_pte_at(mm, new_addr, new_pte, pte);
134 src = get_one_pte_map_nested(mm, old_addr);
135 }
136 /*
137 * Since alloc_one_pte_map can drop and re-acquire
138 * page_table_lock, we should re-check the src entry...
139 */
140 if (src) {
141 if (dst) {
142 pte_t pte;
143 pte = ptep_clear_flush(vma, old_addr, src);
144
145 /* ZERO_PAGE can be dependant on virtual addr */
146 pte = move_pte(pte, new_vma->vm_page_prot,
147 old_addr, new_addr);
148 set_pte_at(mm, new_addr, dst, pte);
149 } else
150 error = -ENOMEM;
151 pte_unmap_nested(src);
152 }
153 if (dst)
154 pte_unmap(dst);
155 } 109 }
156 spin_unlock(&mm->page_table_lock); 110
111 if (new_ptl != old_ptl)
112 spin_unlock(new_ptl);
113 pte_unmap_nested(new_pte - 1);
114 pte_unmap_unlock(old_pte - 1, old_ptl);
157 if (mapping) 115 if (mapping)
158 spin_unlock(&mapping->i_mmap_lock); 116 spin_unlock(&mapping->i_mmap_lock);
159 return error;
160} 117}
161 118
119#define LATENCY_LIMIT (64 * PAGE_SIZE)
120
162static unsigned long move_page_tables(struct vm_area_struct *vma, 121static unsigned long move_page_tables(struct vm_area_struct *vma,
163 unsigned long old_addr, struct vm_area_struct *new_vma, 122 unsigned long old_addr, struct vm_area_struct *new_vma,
164 unsigned long new_addr, unsigned long len) 123 unsigned long new_addr, unsigned long len)
165{ 124{
166 unsigned long offset; 125 unsigned long extent, next, old_end;
126 pmd_t *old_pmd, *new_pmd;
167 127
168 flush_cache_range(vma, old_addr, old_addr + len); 128 old_end = old_addr + len;
129 flush_cache_range(vma, old_addr, old_end);
169 130
170 /* 131 for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
171 * This is not the clever way to do this, but we're taking the
172 * easy way out on the assumption that most remappings will be
173 * only a few pages.. This also makes error recovery easier.
174 */
175 for (offset = 0; offset < len; offset += PAGE_SIZE) {
176 if (move_one_page(vma, old_addr + offset,
177 new_vma, new_addr + offset) < 0)
178 break;
179 cond_resched(); 132 cond_resched();
133 next = (old_addr + PMD_SIZE) & PMD_MASK;
134 if (next - 1 > old_end)
135 next = old_end;
136 extent = next - old_addr;
137 old_pmd = get_old_pmd(vma->vm_mm, old_addr);
138 if (!old_pmd)
139 continue;
140 new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
141 if (!new_pmd)
142 break;
143 next = (new_addr + PMD_SIZE) & PMD_MASK;
144 if (extent > next - new_addr)
145 extent = next - new_addr;
146 if (extent > LATENCY_LIMIT)
147 extent = LATENCY_LIMIT;
148 move_ptes(vma, old_pmd, old_addr, old_addr + extent,
149 new_vma, new_pmd, new_addr);
180 } 150 }
181 return offset; 151
152 return len + old_addr - old_end; /* how much done */
182} 153}
183 154
184static unsigned long move_vma(struct vm_area_struct *vma, 155static unsigned long move_vma(struct vm_area_struct *vma,
@@ -191,6 +162,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
191 unsigned long new_pgoff; 162 unsigned long new_pgoff;
192 unsigned long moved_len; 163 unsigned long moved_len;
193 unsigned long excess = 0; 164 unsigned long excess = 0;
165 unsigned long hiwater_vm;
194 int split = 0; 166 int split = 0;
195 167
196 /* 168 /*
@@ -229,17 +201,24 @@ static unsigned long move_vma(struct vm_area_struct *vma,
229 } 201 }
230 202
231 /* 203 /*
232 * if we failed to move page tables we still do total_vm increment 204 * If we failed to move page tables we still do total_vm increment
233 * since do_munmap() will decrement it by old_len == new_len 205 * since do_munmap() will decrement it by old_len == new_len.
206 *
207 * Since total_vm is about to be raised artificially high for a
208 * moment, we need to restore high watermark afterwards: if stats
209 * are taken meanwhile, total_vm and hiwater_vm appear too high.
210 * If this were a serious issue, we'd add a flag to do_munmap().
234 */ 211 */
212 hiwater_vm = mm->hiwater_vm;
235 mm->total_vm += new_len >> PAGE_SHIFT; 213 mm->total_vm += new_len >> PAGE_SHIFT;
236 __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); 214 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
237 215
238 if (do_munmap(mm, old_addr, old_len) < 0) { 216 if (do_munmap(mm, old_addr, old_len) < 0) {
239 /* OOM: unable to split vma, just get accounts right */ 217 /* OOM: unable to split vma, just get accounts right */
240 vm_unacct_memory(excess >> PAGE_SHIFT); 218 vm_unacct_memory(excess >> PAGE_SHIFT);
241 excess = 0; 219 excess = 0;
242 } 220 }
221 mm->hiwater_vm = hiwater_vm;
243 222
244 /* Restore VM_ACCOUNT if one or two pieces of vma left */ 223 /* Restore VM_ACCOUNT if one or two pieces of vma left */
245 if (excess) { 224 if (excess) {
@@ -269,6 +248,7 @@ unsigned long do_mremap(unsigned long addr,
269 unsigned long old_len, unsigned long new_len, 248 unsigned long old_len, unsigned long new_len,
270 unsigned long flags, unsigned long new_addr) 249 unsigned long flags, unsigned long new_addr)
271{ 250{
251 struct mm_struct *mm = current->mm;
272 struct vm_area_struct *vma; 252 struct vm_area_struct *vma;
273 unsigned long ret = -EINVAL; 253 unsigned long ret = -EINVAL;
274 unsigned long charged = 0; 254 unsigned long charged = 0;
@@ -309,7 +289,7 @@ unsigned long do_mremap(unsigned long addr,
309 if ((addr <= new_addr) && (addr+old_len) > new_addr) 289 if ((addr <= new_addr) && (addr+old_len) > new_addr)
310 goto out; 290 goto out;
311 291
312 ret = do_munmap(current->mm, new_addr, new_len); 292 ret = do_munmap(mm, new_addr, new_len);
313 if (ret) 293 if (ret)
314 goto out; 294 goto out;
315 } 295 }
@@ -320,7 +300,7 @@ unsigned long do_mremap(unsigned long addr,
320 * do_munmap does all the needed commit accounting 300 * do_munmap does all the needed commit accounting
321 */ 301 */
322 if (old_len >= new_len) { 302 if (old_len >= new_len) {
323 ret = do_munmap(current->mm, addr+new_len, old_len - new_len); 303 ret = do_munmap(mm, addr+new_len, old_len - new_len);
324 if (ret && old_len != new_len) 304 if (ret && old_len != new_len)
325 goto out; 305 goto out;
326 ret = addr; 306 ret = addr;
@@ -333,7 +313,7 @@ unsigned long do_mremap(unsigned long addr,
333 * Ok, we need to grow.. or relocate. 313 * Ok, we need to grow.. or relocate.
334 */ 314 */
335 ret = -EFAULT; 315 ret = -EFAULT;
336 vma = find_vma(current->mm, addr); 316 vma = find_vma(mm, addr);
337 if (!vma || vma->vm_start > addr) 317 if (!vma || vma->vm_start > addr)
338 goto out; 318 goto out;
339 if (is_vm_hugetlb_page(vma)) { 319 if (is_vm_hugetlb_page(vma)) {
@@ -349,14 +329,14 @@ unsigned long do_mremap(unsigned long addr,
349 } 329 }
350 if (vma->vm_flags & VM_LOCKED) { 330 if (vma->vm_flags & VM_LOCKED) {
351 unsigned long locked, lock_limit; 331 unsigned long locked, lock_limit;
352 locked = current->mm->locked_vm << PAGE_SHIFT; 332 locked = mm->locked_vm << PAGE_SHIFT;
353 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 333 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
354 locked += new_len - old_len; 334 locked += new_len - old_len;
355 ret = -EAGAIN; 335 ret = -EAGAIN;
356 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 336 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
357 goto out; 337 goto out;
358 } 338 }
359 if (!may_expand_vm(current->mm, (new_len - old_len) >> PAGE_SHIFT)) { 339 if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
360 ret = -ENOMEM; 340 ret = -ENOMEM;
361 goto out; 341 goto out;
362 } 342 }
@@ -383,11 +363,10 @@ unsigned long do_mremap(unsigned long addr,
383 vma_adjust(vma, vma->vm_start, 363 vma_adjust(vma, vma->vm_start,
384 addr + new_len, vma->vm_pgoff, NULL); 364 addr + new_len, vma->vm_pgoff, NULL);
385 365
386 current->mm->total_vm += pages; 366 mm->total_vm += pages;
387 __vm_stat_account(vma->vm_mm, vma->vm_flags, 367 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
388 vma->vm_file, pages);
389 if (vma->vm_flags & VM_LOCKED) { 368 if (vma->vm_flags & VM_LOCKED) {
390 current->mm->locked_vm += pages; 369 mm->locked_vm += pages;
391 make_pages_present(addr + old_len, 370 make_pages_present(addr + old_len,
392 addr + new_len); 371 addr + new_len);
393 } 372 }
diff --git a/mm/msync.c b/mm/msync.c
index d0f5a1bce7cb..0e040e9c39d8 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -17,40 +17,48 @@
17#include <asm/pgtable.h> 17#include <asm/pgtable.h>
18#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
19 19
20/* 20static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
21 * Called with mm->page_table_lock held to protect against other
22 * threads/the swapper from ripping pte's out from under us.
23 */
24
25static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
26 unsigned long addr, unsigned long end) 21 unsigned long addr, unsigned long end)
27{ 22{
28 pte_t *pte; 23 pte_t *pte;
24 spinlock_t *ptl;
25 int progress = 0;
29 26
30 pte = pte_offset_map(pmd, addr); 27again:
28 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
31 do { 29 do {
32 unsigned long pfn; 30 unsigned long pfn;
33 struct page *page; 31 struct page *page;
34 32
33 if (progress >= 64) {
34 progress = 0;
35 if (need_resched() || need_lockbreak(ptl))
36 break;
37 }
38 progress++;
35 if (!pte_present(*pte)) 39 if (!pte_present(*pte))
36 continue; 40 continue;
37 if (!pte_maybe_dirty(*pte)) 41 if (!pte_maybe_dirty(*pte))
38 continue; 42 continue;
39 pfn = pte_pfn(*pte); 43 pfn = pte_pfn(*pte);
40 if (!pfn_valid(pfn)) 44 if (unlikely(!pfn_valid(pfn))) {
45 print_bad_pte(vma, *pte, addr);
41 continue; 46 continue;
47 }
42 page = pfn_to_page(pfn); 48 page = pfn_to_page(pfn);
43 if (PageReserved(page))
44 continue;
45 49
46 if (ptep_clear_flush_dirty(vma, addr, pte) || 50 if (ptep_clear_flush_dirty(vma, addr, pte) ||
47 page_test_and_clear_dirty(page)) 51 page_test_and_clear_dirty(page))
48 set_page_dirty(page); 52 set_page_dirty(page);
53 progress += 3;
49 } while (pte++, addr += PAGE_SIZE, addr != end); 54 } while (pte++, addr += PAGE_SIZE, addr != end);
50 pte_unmap(pte - 1); 55 pte_unmap_unlock(pte - 1, ptl);
56 cond_resched();
57 if (addr != end)
58 goto again;
51} 59}
52 60
53static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud, 61static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
54 unsigned long addr, unsigned long end) 62 unsigned long addr, unsigned long end)
55{ 63{
56 pmd_t *pmd; 64 pmd_t *pmd;
@@ -61,11 +69,11 @@ static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
61 next = pmd_addr_end(addr, end); 69 next = pmd_addr_end(addr, end);
62 if (pmd_none_or_clear_bad(pmd)) 70 if (pmd_none_or_clear_bad(pmd))
63 continue; 71 continue;
64 sync_pte_range(vma, pmd, addr, next); 72 msync_pte_range(vma, pmd, addr, next);
65 } while (pmd++, addr = next, addr != end); 73 } while (pmd++, addr = next, addr != end);
66} 74}
67 75
68static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 76static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
69 unsigned long addr, unsigned long end) 77 unsigned long addr, unsigned long end)
70{ 78{
71 pud_t *pud; 79 pud_t *pud;
@@ -76,58 +84,34 @@ static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
76 next = pud_addr_end(addr, end); 84 next = pud_addr_end(addr, end);
77 if (pud_none_or_clear_bad(pud)) 85 if (pud_none_or_clear_bad(pud))
78 continue; 86 continue;
79 sync_pmd_range(vma, pud, addr, next); 87 msync_pmd_range(vma, pud, addr, next);
80 } while (pud++, addr = next, addr != end); 88 } while (pud++, addr = next, addr != end);
81} 89}
82 90
83static void sync_page_range(struct vm_area_struct *vma, 91static void msync_page_range(struct vm_area_struct *vma,
84 unsigned long addr, unsigned long end) 92 unsigned long addr, unsigned long end)
85{ 93{
86 struct mm_struct *mm = vma->vm_mm;
87 pgd_t *pgd; 94 pgd_t *pgd;
88 unsigned long next; 95 unsigned long next;
89 96
90 /* For hugepages we can't go walking the page table normally, 97 /* For hugepages we can't go walking the page table normally,
91 * but that's ok, hugetlbfs is memory based, so we don't need 98 * but that's ok, hugetlbfs is memory based, so we don't need
92 * to do anything more on an msync() */ 99 * to do anything more on an msync().
93 if (is_vm_hugetlb_page(vma)) 100 * Can't do anything with VM_RESERVED regions either.
101 */
102 if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED))
94 return; 103 return;
95 104
96 BUG_ON(addr >= end); 105 BUG_ON(addr >= end);
97 pgd = pgd_offset(mm, addr); 106 pgd = pgd_offset(vma->vm_mm, addr);
98 flush_cache_range(vma, addr, end); 107 flush_cache_range(vma, addr, end);
99 spin_lock(&mm->page_table_lock);
100 do { 108 do {
101 next = pgd_addr_end(addr, end); 109 next = pgd_addr_end(addr, end);
102 if (pgd_none_or_clear_bad(pgd)) 110 if (pgd_none_or_clear_bad(pgd))
103 continue; 111 continue;
104 sync_pud_range(vma, pgd, addr, next); 112 msync_pud_range(vma, pgd, addr, next);
105 } while (pgd++, addr = next, addr != end); 113 } while (pgd++, addr = next, addr != end);
106 spin_unlock(&mm->page_table_lock);
107}
108
109#ifdef CONFIG_PREEMPT
110static inline void filemap_sync(struct vm_area_struct *vma,
111 unsigned long addr, unsigned long end)
112{
113 const size_t chunk = 64 * 1024; /* bytes */
114 unsigned long next;
115
116 do {
117 next = addr + chunk;
118 if (next > end || next < addr)
119 next = end;
120 sync_page_range(vma, addr, next);
121 cond_resched();
122 } while (addr = next, addr != end);
123}
124#else
125static inline void filemap_sync(struct vm_area_struct *vma,
126 unsigned long addr, unsigned long end)
127{
128 sync_page_range(vma, addr, end);
129} 114}
130#endif
131 115
132/* 116/*
133 * MS_SYNC syncs the entire file - including mappings. 117 * MS_SYNC syncs the entire file - including mappings.
@@ -150,7 +134,7 @@ static int msync_interval(struct vm_area_struct *vma,
150 return -EBUSY; 134 return -EBUSY;
151 135
152 if (file && (vma->vm_flags & VM_SHARED)) { 136 if (file && (vma->vm_flags & VM_SHARED)) {
153 filemap_sync(vma, addr, end); 137 msync_page_range(vma, addr, end);
154 138
155 if (flags & MS_SYNC) { 139 if (flags & MS_SYNC) {
156 struct address_space *mapping = file->f_mapping; 140 struct address_space *mapping = file->f_mapping;
diff --git a/mm/nommu.c b/mm/nommu.c
index 0ef241ae3763..d1e076a487cb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -931,6 +931,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
931 realalloc -= kobjsize(vml); 931 realalloc -= kobjsize(vml);
932 askedalloc -= sizeof(*vml); 932 askedalloc -= sizeof(*vml);
933 kfree(vml); 933 kfree(vml);
934
935 update_hiwater_vm(mm);
934 mm->total_vm -= len >> PAGE_SHIFT; 936 mm->total_vm -= len >> PAGE_SHIFT;
935 937
936#ifdef DEBUG 938#ifdef DEBUG
@@ -1047,7 +1049,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1047 1049
1048EXPORT_SYMBOL(find_vma); 1050EXPORT_SYMBOL(find_vma);
1049 1051
1050struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write) 1052struct page *follow_page(struct mm_struct *mm, unsigned long address,
1053 unsigned int foll_flags)
1051{ 1054{
1052 return NULL; 1055 return NULL;
1053} 1056}
@@ -1078,19 +1081,6 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1078{ 1081{
1079} 1082}
1080 1083
1081void update_mem_hiwater(struct task_struct *tsk)
1082{
1083 unsigned long rss;
1084
1085 if (likely(tsk->mm)) {
1086 rss = get_mm_counter(tsk->mm, rss);
1087 if (tsk->mm->hiwater_rss < rss)
1088 tsk->mm->hiwater_rss = rss;
1089 if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
1090 tsk->mm->hiwater_vm = tsk->mm->total_vm;
1091 }
1092}
1093
1094void unmap_mapping_range(struct address_space *mapping, 1084void unmap_mapping_range(struct address_space *mapping,
1095 loff_t const holebegin, loff_t const holelen, 1085 loff_t const holebegin, loff_t const holelen,
1096 int even_cows) 1086 int even_cows)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94c864eac9c4..2dbdd98426fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -33,6 +33,7 @@
33#include <linux/sysctl.h> 33#include <linux/sysctl.h>
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/cpuset.h> 35#include <linux/cpuset.h>
36#include <linux/memory_hotplug.h>
36#include <linux/nodemask.h> 37#include <linux/nodemask.h>
37#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
38 39
@@ -78,21 +79,44 @@ int min_free_kbytes = 1024;
78unsigned long __initdata nr_kernel_pages; 79unsigned long __initdata nr_kernel_pages;
79unsigned long __initdata nr_all_pages; 80unsigned long __initdata nr_all_pages;
80 81
82static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
83{
84 int ret = 0;
85 unsigned seq;
86 unsigned long pfn = page_to_pfn(page);
87
88 do {
89 seq = zone_span_seqbegin(zone);
90 if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
91 ret = 1;
92 else if (pfn < zone->zone_start_pfn)
93 ret = 1;
94 } while (zone_span_seqretry(zone, seq));
95
96 return ret;
97}
98
99static int page_is_consistent(struct zone *zone, struct page *page)
100{
101#ifdef CONFIG_HOLES_IN_ZONE
102 if (!pfn_valid(page_to_pfn(page)))
103 return 0;
104#endif
105 if (zone != page_zone(page))
106 return 0;
107
108 return 1;
109}
81/* 110/*
82 * Temporary debugging check for pages not lying within a given zone. 111 * Temporary debugging check for pages not lying within a given zone.
83 */ 112 */
84static int bad_range(struct zone *zone, struct page *page) 113static int bad_range(struct zone *zone, struct page *page)
85{ 114{
86 if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) 115 if (page_outside_zone_boundaries(zone, page))
87 return 1; 116 return 1;
88 if (page_to_pfn(page) < zone->zone_start_pfn) 117 if (!page_is_consistent(zone, page))
89 return 1;
90#ifdef CONFIG_HOLES_IN_ZONE
91 if (!pfn_valid(page_to_pfn(page)))
92 return 1;
93#endif
94 if (zone != page_zone(page))
95 return 1; 118 return 1;
119
96 return 0; 120 return 0;
97} 121}
98 122
@@ -114,7 +138,8 @@ static void bad_page(const char *function, struct page *page)
114 1 << PG_reclaim | 138 1 << PG_reclaim |
115 1 << PG_slab | 139 1 << PG_slab |
116 1 << PG_swapcache | 140 1 << PG_swapcache |
117 1 << PG_writeback); 141 1 << PG_writeback |
142 1 << PG_reserved );
118 set_page_count(page, 0); 143 set_page_count(page, 0);
119 reset_page_mapcount(page); 144 reset_page_mapcount(page);
120 page->mapping = NULL; 145 page->mapping = NULL;
@@ -153,7 +178,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
153 struct page *p = page + i; 178 struct page *p = page + i;
154 179
155 SetPageCompound(p); 180 SetPageCompound(p);
156 p->private = (unsigned long)page; 181 set_page_private(p, (unsigned long)page);
157 } 182 }
158} 183}
159 184
@@ -173,7 +198,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
173 198
174 if (!PageCompound(p)) 199 if (!PageCompound(p))
175 bad_page(__FUNCTION__, page); 200 bad_page(__FUNCTION__, page);
176 if (p->private != (unsigned long)page) 201 if (page_private(p) != (unsigned long)page)
177 bad_page(__FUNCTION__, page); 202 bad_page(__FUNCTION__, page);
178 ClearPageCompound(p); 203 ClearPageCompound(p);
179 } 204 }
@@ -186,18 +211,18 @@ static void destroy_compound_page(struct page *page, unsigned long order)
186 * So, we don't need atomic page->flags operations here. 211 * So, we don't need atomic page->flags operations here.
187 */ 212 */
188static inline unsigned long page_order(struct page *page) { 213static inline unsigned long page_order(struct page *page) {
189 return page->private; 214 return page_private(page);
190} 215}
191 216
192static inline void set_page_order(struct page *page, int order) { 217static inline void set_page_order(struct page *page, int order) {
193 page->private = order; 218 set_page_private(page, order);
194 __SetPagePrivate(page); 219 __SetPagePrivate(page);
195} 220}
196 221
197static inline void rmv_page_order(struct page *page) 222static inline void rmv_page_order(struct page *page)
198{ 223{
199 __ClearPagePrivate(page); 224 __ClearPagePrivate(page);
200 page->private = 0; 225 set_page_private(page, 0);
201} 226}
202 227
203/* 228/*
@@ -237,14 +262,13 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
237 * (a) the buddy is free && 262 * (a) the buddy is free &&
238 * (b) the buddy is on the buddy system && 263 * (b) the buddy is on the buddy system &&
239 * (c) a page and its buddy have the same order. 264 * (c) a page and its buddy have the same order.
240 * for recording page's order, we use page->private and PG_private. 265 * for recording page's order, we use page_private(page) and PG_private.
241 * 266 *
242 */ 267 */
243static inline int page_is_buddy(struct page *page, int order) 268static inline int page_is_buddy(struct page *page, int order)
244{ 269{
245 if (PagePrivate(page) && 270 if (PagePrivate(page) &&
246 (page_order(page) == order) && 271 (page_order(page) == order) &&
247 !PageReserved(page) &&
248 page_count(page) == 0) 272 page_count(page) == 0)
249 return 1; 273 return 1;
250 return 0; 274 return 0;
@@ -264,7 +288,7 @@ static inline int page_is_buddy(struct page *page, int order)
264 * parts of the VM system. 288 * parts of the VM system.
265 * At each level, we keep a list of pages, which are heads of continuous 289 * At each level, we keep a list of pages, which are heads of continuous
266 * free pages of length of (1 << order) and marked with PG_Private.Page's 290 * free pages of length of (1 << order) and marked with PG_Private.Page's
267 * order is recorded in page->private field. 291 * order is recorded in page_private(page) field.
268 * So when we are allocating or freeing one, we can derive the state of the 292 * So when we are allocating or freeing one, we can derive the state of the
269 * other. That is, if we allocate a small block, and both were 293 * other. That is, if we allocate a small block, and both were
270 * free, the remainder of the region must be split into blocks. 294 * free, the remainder of the region must be split into blocks.
@@ -327,7 +351,8 @@ static inline void free_pages_check(const char *function, struct page *page)
327 1 << PG_reclaim | 351 1 << PG_reclaim |
328 1 << PG_slab | 352 1 << PG_slab |
329 1 << PG_swapcache | 353 1 << PG_swapcache |
330 1 << PG_writeback ))) 354 1 << PG_writeback |
355 1 << PG_reserved )))
331 bad_page(function, page); 356 bad_page(function, page);
332 if (PageDirty(page)) 357 if (PageDirty(page))
333 __ClearPageDirty(page); 358 __ClearPageDirty(page);
@@ -455,13 +480,14 @@ static void prep_new_page(struct page *page, int order)
455 1 << PG_reclaim | 480 1 << PG_reclaim |
456 1 << PG_slab | 481 1 << PG_slab |
457 1 << PG_swapcache | 482 1 << PG_swapcache |
458 1 << PG_writeback ))) 483 1 << PG_writeback |
484 1 << PG_reserved )))
459 bad_page(__FUNCTION__, page); 485 bad_page(__FUNCTION__, page);
460 486
461 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 487 page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
462 1 << PG_referenced | 1 << PG_arch_1 | 488 1 << PG_referenced | 1 << PG_arch_1 |
463 1 << PG_checked | 1 << PG_mappedtodisk); 489 1 << PG_checked | 1 << PG_mappedtodisk);
464 page->private = 0; 490 set_page_private(page, 0);
465 set_page_refs(page, order); 491 set_page_refs(page, order);
466 kernel_map_pages(page, 1 << order, 1); 492 kernel_map_pages(page, 1 << order, 1);
467} 493}
@@ -1016,7 +1042,7 @@ void __pagevec_free(struct pagevec *pvec)
1016 1042
1017fastcall void __free_pages(struct page *page, unsigned int order) 1043fastcall void __free_pages(struct page *page, unsigned int order)
1018{ 1044{
1019 if (!PageReserved(page) && put_page_testzero(page)) { 1045 if (put_page_testzero(page)) {
1020 if (order == 0) 1046 if (order == 0)
1021 free_hot_page(page); 1047 free_hot_page(page);
1022 else 1048 else
@@ -1305,12 +1331,9 @@ void show_free_areas(void)
1305 } else 1331 } else
1306 printk("\n"); 1332 printk("\n");
1307 1333
1308 for (cpu = 0; cpu < NR_CPUS; ++cpu) { 1334 for_each_cpu(cpu) {
1309 struct per_cpu_pageset *pageset; 1335 struct per_cpu_pageset *pageset;
1310 1336
1311 if (!cpu_possible(cpu))
1312 continue;
1313
1314 pageset = zone_pcp(zone, cpu); 1337 pageset = zone_pcp(zone, cpu);
1315 1338
1316 for (temperature = 0; temperature < 2; temperature++) 1339 for (temperature = 0; temperature < 2; temperature++)
@@ -1660,7 +1683,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1660 * up by free_all_bootmem() once the early boot process is 1683 * up by free_all_bootmem() once the early boot process is
1661 * done. Non-atomic initialization, single-pass. 1684 * done. Non-atomic initialization, single-pass.
1662 */ 1685 */
1663void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1686void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1664 unsigned long start_pfn) 1687 unsigned long start_pfn)
1665{ 1688{
1666 struct page *page; 1689 struct page *page;
@@ -1674,7 +1697,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1674 continue; 1697 continue;
1675 page = pfn_to_page(pfn); 1698 page = pfn_to_page(pfn);
1676 set_page_links(page, zone, nid, pfn); 1699 set_page_links(page, zone, nid, pfn);
1677 set_page_count(page, 0); 1700 set_page_count(page, 1);
1678 reset_page_mapcount(page); 1701 reset_page_mapcount(page);
1679 SetPageReserved(page); 1702 SetPageReserved(page);
1680 INIT_LIST_HEAD(&page->lru); 1703 INIT_LIST_HEAD(&page->lru);
@@ -1721,29 +1744,29 @@ static int __devinit zone_batchsize(struct zone *zone)
1721 1744
1722 /* 1745 /*
1723 * The per-cpu-pages pools are set to around 1000th of the 1746 * The per-cpu-pages pools are set to around 1000th of the
1724 * size of the zone. But no more than 1/4 of a meg - there's 1747 * size of the zone. But no more than 1/2 of a meg.
1725 * no point in going beyond the size of L2 cache.
1726 * 1748 *
1727 * OK, so we don't know how big the cache is. So guess. 1749 * OK, so we don't know how big the cache is. So guess.
1728 */ 1750 */
1729 batch = zone->present_pages / 1024; 1751 batch = zone->present_pages / 1024;
1730 if (batch * PAGE_SIZE > 256 * 1024) 1752 if (batch * PAGE_SIZE > 512 * 1024)
1731 batch = (256 * 1024) / PAGE_SIZE; 1753 batch = (512 * 1024) / PAGE_SIZE;
1732 batch /= 4; /* We effectively *= 4 below */ 1754 batch /= 4; /* We effectively *= 4 below */
1733 if (batch < 1) 1755 if (batch < 1)
1734 batch = 1; 1756 batch = 1;
1735 1757
1736 /* 1758 /*
1737 * Clamp the batch to a 2^n - 1 value. Having a power 1759 * We will be trying to allocate bigger chunks of contiguous
1738 * of 2 value was found to be more likely to have 1760 * memory of the order of fls(batch). This should result in
1739 * suboptimal cache aliasing properties in some cases. 1761 * better cache coloring.
1740 * 1762 *
1741 * For example if 2 tasks are alternately allocating 1763 * A sanity check also to ensure that batch is still in limits.
1742 * batches of pages, one task can end up with a lot
1743 * of pages of one half of the possible page colors
1744 * and the other with pages of the other colors.
1745 */ 1764 */
1746 batch = (1 << fls(batch + batch/2)) - 1; 1765 batch = (1 << fls(batch + batch/2));
1766
1767 if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
1768 batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
1769
1747 return batch; 1770 return batch;
1748} 1771}
1749 1772
@@ -1755,7 +1778,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1755 1778
1756 pcp = &p->pcp[0]; /* hot */ 1779 pcp = &p->pcp[0]; /* hot */
1757 pcp->count = 0; 1780 pcp->count = 0;
1758 pcp->low = 2 * batch; 1781 pcp->low = 0;
1759 pcp->high = 6 * batch; 1782 pcp->high = 6 * batch;
1760 pcp->batch = max(1UL, 1 * batch); 1783 pcp->batch = max(1UL, 1 * batch);
1761 INIT_LIST_HEAD(&pcp->list); 1784 INIT_LIST_HEAD(&pcp->list);
@@ -1764,7 +1787,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1764 pcp->count = 0; 1787 pcp->count = 0;
1765 pcp->low = 0; 1788 pcp->low = 0;
1766 pcp->high = 2 * batch; 1789 pcp->high = 2 * batch;
1767 pcp->batch = max(1UL, 1 * batch); 1790 pcp->batch = max(1UL, batch/2);
1768 INIT_LIST_HEAD(&pcp->list); 1791 INIT_LIST_HEAD(&pcp->list);
1769} 1792}
1770 1793
@@ -1873,6 +1896,60 @@ void __init setup_per_cpu_pageset()
1873 1896
1874#endif 1897#endif
1875 1898
1899static __devinit
1900void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
1901{
1902 int i;
1903 struct pglist_data *pgdat = zone->zone_pgdat;
1904
1905 /*
1906 * The per-page waitqueue mechanism uses hashed waitqueues
1907 * per zone.
1908 */
1909 zone->wait_table_size = wait_table_size(zone_size_pages);
1910 zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
1911 zone->wait_table = (wait_queue_head_t *)
1912 alloc_bootmem_node(pgdat, zone->wait_table_size
1913 * sizeof(wait_queue_head_t));
1914
1915 for(i = 0; i < zone->wait_table_size; ++i)
1916 init_waitqueue_head(zone->wait_table + i);
1917}
1918
1919static __devinit void zone_pcp_init(struct zone *zone)
1920{
1921 int cpu;
1922 unsigned long batch = zone_batchsize(zone);
1923
1924 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1925#ifdef CONFIG_NUMA
1926 /* Early boot. Slab allocator not functional yet */
1927 zone->pageset[cpu] = &boot_pageset[cpu];
1928 setup_pageset(&boot_pageset[cpu],0);
1929#else
1930 setup_pageset(zone_pcp(zone,cpu), batch);
1931#endif
1932 }
1933 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
1934 zone->name, zone->present_pages, batch);
1935}
1936
1937static __devinit void init_currently_empty_zone(struct zone *zone,
1938 unsigned long zone_start_pfn, unsigned long size)
1939{
1940 struct pglist_data *pgdat = zone->zone_pgdat;
1941
1942 zone_wait_table_init(zone, size);
1943 pgdat->nr_zones = zone_idx(zone) + 1;
1944
1945 zone->zone_mem_map = pfn_to_page(zone_start_pfn);
1946 zone->zone_start_pfn = zone_start_pfn;
1947
1948 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
1949
1950 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
1951}
1952
1876/* 1953/*
1877 * Set up the zone data structures: 1954 * Set up the zone data structures:
1878 * - mark all pages reserved 1955 * - mark all pages reserved
@@ -1882,10 +1959,11 @@ void __init setup_per_cpu_pageset()
1882static void __init free_area_init_core(struct pglist_data *pgdat, 1959static void __init free_area_init_core(struct pglist_data *pgdat,
1883 unsigned long *zones_size, unsigned long *zholes_size) 1960 unsigned long *zones_size, unsigned long *zholes_size)
1884{ 1961{
1885 unsigned long i, j; 1962 unsigned long j;
1886 int cpu, nid = pgdat->node_id; 1963 int nid = pgdat->node_id;
1887 unsigned long zone_start_pfn = pgdat->node_start_pfn; 1964 unsigned long zone_start_pfn = pgdat->node_start_pfn;
1888 1965
1966 pgdat_resize_init(pgdat);
1889 pgdat->nr_zones = 0; 1967 pgdat->nr_zones = 0;
1890 init_waitqueue_head(&pgdat->kswapd_wait); 1968 init_waitqueue_head(&pgdat->kswapd_wait);
1891 pgdat->kswapd_max_order = 0; 1969 pgdat->kswapd_max_order = 0;
@@ -1893,7 +1971,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1893 for (j = 0; j < MAX_NR_ZONES; j++) { 1971 for (j = 0; j < MAX_NR_ZONES; j++) {
1894 struct zone *zone = pgdat->node_zones + j; 1972 struct zone *zone = pgdat->node_zones + j;
1895 unsigned long size, realsize; 1973 unsigned long size, realsize;
1896 unsigned long batch;
1897 1974
1898 realsize = size = zones_size[j]; 1975 realsize = size = zones_size[j];
1899 if (zholes_size) 1976 if (zholes_size)
@@ -1908,24 +1985,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1908 zone->name = zone_names[j]; 1985 zone->name = zone_names[j];
1909 spin_lock_init(&zone->lock); 1986 spin_lock_init(&zone->lock);
1910 spin_lock_init(&zone->lru_lock); 1987 spin_lock_init(&zone->lru_lock);
1988 zone_seqlock_init(zone);
1911 zone->zone_pgdat = pgdat; 1989 zone->zone_pgdat = pgdat;
1912 zone->free_pages = 0; 1990 zone->free_pages = 0;
1913 1991
1914 zone->temp_priority = zone->prev_priority = DEF_PRIORITY; 1992 zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
1915 1993
1916 batch = zone_batchsize(zone); 1994 zone_pcp_init(zone);
1917
1918 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1919#ifdef CONFIG_NUMA
1920 /* Early boot. Slab allocator not functional yet */
1921 zone->pageset[cpu] = &boot_pageset[cpu];
1922 setup_pageset(&boot_pageset[cpu],0);
1923#else
1924 setup_pageset(zone_pcp(zone,cpu), batch);
1925#endif
1926 }
1927 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
1928 zone_names[j], realsize, batch);
1929 INIT_LIST_HEAD(&zone->active_list); 1995 INIT_LIST_HEAD(&zone->active_list);
1930 INIT_LIST_HEAD(&zone->inactive_list); 1996 INIT_LIST_HEAD(&zone->inactive_list);
1931 zone->nr_scan_active = 0; 1997 zone->nr_scan_active = 0;
@@ -1936,32 +2002,9 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1936 if (!size) 2002 if (!size)
1937 continue; 2003 continue;
1938 2004
1939 /*
1940 * The per-page waitqueue mechanism uses hashed waitqueues
1941 * per zone.
1942 */
1943 zone->wait_table_size = wait_table_size(size);
1944 zone->wait_table_bits =
1945 wait_table_bits(zone->wait_table_size);
1946 zone->wait_table = (wait_queue_head_t *)
1947 alloc_bootmem_node(pgdat, zone->wait_table_size
1948 * sizeof(wait_queue_head_t));
1949
1950 for(i = 0; i < zone->wait_table_size; ++i)
1951 init_waitqueue_head(zone->wait_table + i);
1952
1953 pgdat->nr_zones = j+1;
1954
1955 zone->zone_mem_map = pfn_to_page(zone_start_pfn);
1956 zone->zone_start_pfn = zone_start_pfn;
1957
1958 memmap_init(size, nid, j, zone_start_pfn);
1959
1960 zonetable_add(zone, nid, j, zone_start_pfn, size); 2005 zonetable_add(zone, nid, j, zone_start_pfn, size);
1961 2006 init_currently_empty_zone(zone, zone_start_pfn, size);
1962 zone_start_pfn += size; 2007 zone_start_pfn += size;
1963
1964 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
1965 } 2008 }
1966} 2009}
1967 2010
@@ -2361,7 +2404,7 @@ static void setup_per_zone_lowmem_reserve(void)
2361 * that the pages_{min,low,high} values for each zone are set correctly 2404 * that the pages_{min,low,high} values for each zone are set correctly
2362 * with respect to min_free_kbytes. 2405 * with respect to min_free_kbytes.
2363 */ 2406 */
2364static void setup_per_zone_pages_min(void) 2407void setup_per_zone_pages_min(void)
2365{ 2408{
2366 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 2409 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
2367 unsigned long lowmem_pages = 0; 2410 unsigned long lowmem_pages = 0;
diff --git a/mm/page_io.c b/mm/page_io.c
index 330e00d6db00..bb2b0d53889c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -91,7 +91,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
91 unlock_page(page); 91 unlock_page(page);
92 goto out; 92 goto out;
93 } 93 }
94 bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write); 94 bio = get_swap_bio(GFP_NOIO, page_private(page), page,
95 end_swap_bio_write);
95 if (bio == NULL) { 96 if (bio == NULL) {
96 set_page_dirty(page); 97 set_page_dirty(page);
97 unlock_page(page); 98 unlock_page(page);
@@ -115,7 +116,8 @@ int swap_readpage(struct file *file, struct page *page)
115 116
116 BUG_ON(!PageLocked(page)); 117 BUG_ON(!PageLocked(page));
117 ClearPageUptodate(page); 118 ClearPageUptodate(page);
118 bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read); 119 bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
120 end_swap_bio_read);
119 if (bio == NULL) { 121 if (bio == NULL) {
120 unlock_page(page); 122 unlock_page(page);
121 ret = -ENOMEM; 123 ret = -ENOMEM;
diff --git a/mm/rmap.c b/mm/rmap.c
index 450f5241b5a5..914d04b98bee 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -32,7 +32,7 @@
32 * page->flags PG_locked (lock_page) 32 * page->flags PG_locked (lock_page)
33 * mapping->i_mmap_lock 33 * mapping->i_mmap_lock
34 * anon_vma->lock 34 * anon_vma->lock
35 * mm->page_table_lock 35 * mm->page_table_lock or pte_lock
36 * zone->lru_lock (in mark_page_accessed) 36 * zone->lru_lock (in mark_page_accessed)
37 * swap_lock (in swap_duplicate, swap_info_get) 37 * swap_lock (in swap_duplicate, swap_info_get)
38 * mmlist_lock (in mmput, drain_mmlist and others) 38 * mmlist_lock (in mmput, drain_mmlist and others)
@@ -244,37 +244,44 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
244/* 244/*
245 * Check that @page is mapped at @address into @mm. 245 * Check that @page is mapped at @address into @mm.
246 * 246 *
247 * On success returns with mapped pte and locked mm->page_table_lock. 247 * On success returns with pte mapped and locked.
248 */ 248 */
249pte_t *page_check_address(struct page *page, struct mm_struct *mm, 249pte_t *page_check_address(struct page *page, struct mm_struct *mm,
250 unsigned long address) 250 unsigned long address, spinlock_t **ptlp)
251{ 251{
252 pgd_t *pgd; 252 pgd_t *pgd;
253 pud_t *pud; 253 pud_t *pud;
254 pmd_t *pmd; 254 pmd_t *pmd;
255 pte_t *pte; 255 pte_t *pte;
256 spinlock_t *ptl;
256 257
257 /*
258 * We need the page_table_lock to protect us from page faults,
259 * munmap, fork, etc...
260 */
261 spin_lock(&mm->page_table_lock);
262 pgd = pgd_offset(mm, address); 258 pgd = pgd_offset(mm, address);
263 if (likely(pgd_present(*pgd))) { 259 if (!pgd_present(*pgd))
264 pud = pud_offset(pgd, address); 260 return NULL;
265 if (likely(pud_present(*pud))) { 261
266 pmd = pmd_offset(pud, address); 262 pud = pud_offset(pgd, address);
267 if (likely(pmd_present(*pmd))) { 263 if (!pud_present(*pud))
268 pte = pte_offset_map(pmd, address); 264 return NULL;
269 if (likely(pte_present(*pte) && 265
270 page_to_pfn(page) == pte_pfn(*pte))) 266 pmd = pmd_offset(pud, address);
271 return pte; 267 if (!pmd_present(*pmd))
272 pte_unmap(pte); 268 return NULL;
273 } 269
274 } 270 pte = pte_offset_map(pmd, address);
271 /* Make a quick check before getting the lock */
272 if (!pte_present(*pte)) {
273 pte_unmap(pte);
274 return NULL;
275 }
276
277 ptl = pte_lockptr(mm, pmd);
278 spin_lock(ptl);
279 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
280 *ptlp = ptl;
281 return pte;
275 } 282 }
276 spin_unlock(&mm->page_table_lock); 283 pte_unmap_unlock(pte, ptl);
277 return ERR_PTR(-ENOENT); 284 return NULL;
278} 285}
279 286
280/* 287/*
@@ -287,24 +294,28 @@ static int page_referenced_one(struct page *page,
287 struct mm_struct *mm = vma->vm_mm; 294 struct mm_struct *mm = vma->vm_mm;
288 unsigned long address; 295 unsigned long address;
289 pte_t *pte; 296 pte_t *pte;
297 spinlock_t *ptl;
290 int referenced = 0; 298 int referenced = 0;
291 299
292 address = vma_address(page, vma); 300 address = vma_address(page, vma);
293 if (address == -EFAULT) 301 if (address == -EFAULT)
294 goto out; 302 goto out;
295 303
296 pte = page_check_address(page, mm, address); 304 pte = page_check_address(page, mm, address, &ptl);
297 if (!IS_ERR(pte)) { 305 if (!pte)
298 if (ptep_clear_flush_young(vma, address, pte)) 306 goto out;
299 referenced++;
300 307
301 if (mm != current->mm && !ignore_token && has_swap_token(mm)) 308 if (ptep_clear_flush_young(vma, address, pte))
302 referenced++; 309 referenced++;
303 310
304 (*mapcount)--; 311 /* Pretend the page is referenced if the task has the
305 pte_unmap(pte); 312 swap token and is in the middle of a page fault. */
306 spin_unlock(&mm->page_table_lock); 313 if (mm != current->mm && !ignore_token && has_swap_token(mm) &&
307 } 314 rwsem_is_locked(&mm->mmap_sem))
315 referenced++;
316
317 (*mapcount)--;
318 pte_unmap_unlock(pte, ptl);
308out: 319out:
309 return referenced; 320 return referenced;
310} 321}
@@ -434,15 +445,11 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
434 * @vma: the vm area in which the mapping is added 445 * @vma: the vm area in which the mapping is added
435 * @address: the user virtual address mapped 446 * @address: the user virtual address mapped
436 * 447 *
437 * The caller needs to hold the mm->page_table_lock. 448 * The caller needs to hold the pte lock.
438 */ 449 */
439void page_add_anon_rmap(struct page *page, 450void page_add_anon_rmap(struct page *page,
440 struct vm_area_struct *vma, unsigned long address) 451 struct vm_area_struct *vma, unsigned long address)
441{ 452{
442 BUG_ON(PageReserved(page));
443
444 inc_mm_counter(vma->vm_mm, anon_rss);
445
446 if (atomic_inc_and_test(&page->_mapcount)) { 453 if (atomic_inc_and_test(&page->_mapcount)) {
447 struct anon_vma *anon_vma = vma->anon_vma; 454 struct anon_vma *anon_vma = vma->anon_vma;
448 455
@@ -461,13 +468,12 @@ void page_add_anon_rmap(struct page *page,
461 * page_add_file_rmap - add pte mapping to a file page 468 * page_add_file_rmap - add pte mapping to a file page
462 * @page: the page to add the mapping to 469 * @page: the page to add the mapping to
463 * 470 *
464 * The caller needs to hold the mm->page_table_lock. 471 * The caller needs to hold the pte lock.
465 */ 472 */
466void page_add_file_rmap(struct page *page) 473void page_add_file_rmap(struct page *page)
467{ 474{
468 BUG_ON(PageAnon(page)); 475 BUG_ON(PageAnon(page));
469 if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) 476 BUG_ON(!pfn_valid(page_to_pfn(page)));
470 return;
471 477
472 if (atomic_inc_and_test(&page->_mapcount)) 478 if (atomic_inc_and_test(&page->_mapcount))
473 inc_page_state(nr_mapped); 479 inc_page_state(nr_mapped);
@@ -477,12 +483,10 @@ void page_add_file_rmap(struct page *page)
477 * page_remove_rmap - take down pte mapping from a page 483 * page_remove_rmap - take down pte mapping from a page
478 * @page: page to remove mapping from 484 * @page: page to remove mapping from
479 * 485 *
480 * Caller needs to hold the mm->page_table_lock. 486 * The caller needs to hold the pte lock.
481 */ 487 */
482void page_remove_rmap(struct page *page) 488void page_remove_rmap(struct page *page)
483{ 489{
484 BUG_ON(PageReserved(page));
485
486 if (atomic_add_negative(-1, &page->_mapcount)) { 490 if (atomic_add_negative(-1, &page->_mapcount)) {
487 BUG_ON(page_mapcount(page) < 0); 491 BUG_ON(page_mapcount(page) < 0);
488 /* 492 /*
@@ -510,14 +514,15 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
510 unsigned long address; 514 unsigned long address;
511 pte_t *pte; 515 pte_t *pte;
512 pte_t pteval; 516 pte_t pteval;
517 spinlock_t *ptl;
513 int ret = SWAP_AGAIN; 518 int ret = SWAP_AGAIN;
514 519
515 address = vma_address(page, vma); 520 address = vma_address(page, vma);
516 if (address == -EFAULT) 521 if (address == -EFAULT)
517 goto out; 522 goto out;
518 523
519 pte = page_check_address(page, mm, address); 524 pte = page_check_address(page, mm, address, &ptl);
520 if (IS_ERR(pte)) 525 if (!pte)
521 goto out; 526 goto out;
522 527
523 /* 528 /*
@@ -541,8 +546,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
541 if (pte_dirty(pteval)) 546 if (pte_dirty(pteval))
542 set_page_dirty(page); 547 set_page_dirty(page);
543 548
549 /* Update high watermark before we lower rss */
550 update_hiwater_rss(mm);
551
544 if (PageAnon(page)) { 552 if (PageAnon(page)) {
545 swp_entry_t entry = { .val = page->private }; 553 swp_entry_t entry = { .val = page_private(page) };
546 /* 554 /*
547 * Store the swap location in the pte. 555 * Store the swap location in the pte.
548 * See handle_pte_fault() ... 556 * See handle_pte_fault() ...
@@ -551,21 +559,21 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
551 swap_duplicate(entry); 559 swap_duplicate(entry);
552 if (list_empty(&mm->mmlist)) { 560 if (list_empty(&mm->mmlist)) {
553 spin_lock(&mmlist_lock); 561 spin_lock(&mmlist_lock);
554 list_add(&mm->mmlist, &init_mm.mmlist); 562 if (list_empty(&mm->mmlist))
563 list_add(&mm->mmlist, &init_mm.mmlist);
555 spin_unlock(&mmlist_lock); 564 spin_unlock(&mmlist_lock);
556 } 565 }
557 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 566 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
558 BUG_ON(pte_file(*pte)); 567 BUG_ON(pte_file(*pte));
559 dec_mm_counter(mm, anon_rss); 568 dec_mm_counter(mm, anon_rss);
560 } 569 } else
570 dec_mm_counter(mm, file_rss);
561 571
562 dec_mm_counter(mm, rss);
563 page_remove_rmap(page); 572 page_remove_rmap(page);
564 page_cache_release(page); 573 page_cache_release(page);
565 574
566out_unmap: 575out_unmap:
567 pte_unmap(pte); 576 pte_unmap_unlock(pte, ptl);
568 spin_unlock(&mm->page_table_lock);
569out: 577out:
570 return ret; 578 return ret;
571} 579}
@@ -599,19 +607,14 @@ static void try_to_unmap_cluster(unsigned long cursor,
599 pgd_t *pgd; 607 pgd_t *pgd;
600 pud_t *pud; 608 pud_t *pud;
601 pmd_t *pmd; 609 pmd_t *pmd;
602 pte_t *pte, *original_pte; 610 pte_t *pte;
603 pte_t pteval; 611 pte_t pteval;
612 spinlock_t *ptl;
604 struct page *page; 613 struct page *page;
605 unsigned long address; 614 unsigned long address;
606 unsigned long end; 615 unsigned long end;
607 unsigned long pfn; 616 unsigned long pfn;
608 617
609 /*
610 * We need the page_table_lock to protect us from page faults,
611 * munmap, fork, etc...
612 */
613 spin_lock(&mm->page_table_lock);
614
615 address = (vma->vm_start + cursor) & CLUSTER_MASK; 618 address = (vma->vm_start + cursor) & CLUSTER_MASK;
616 end = address + CLUSTER_SIZE; 619 end = address + CLUSTER_SIZE;
617 if (address < vma->vm_start) 620 if (address < vma->vm_start)
@@ -621,30 +624,33 @@ static void try_to_unmap_cluster(unsigned long cursor,
621 624
622 pgd = pgd_offset(mm, address); 625 pgd = pgd_offset(mm, address);
623 if (!pgd_present(*pgd)) 626 if (!pgd_present(*pgd))
624 goto out_unlock; 627 return;
625 628
626 pud = pud_offset(pgd, address); 629 pud = pud_offset(pgd, address);
627 if (!pud_present(*pud)) 630 if (!pud_present(*pud))
628 goto out_unlock; 631 return;
629 632
630 pmd = pmd_offset(pud, address); 633 pmd = pmd_offset(pud, address);
631 if (!pmd_present(*pmd)) 634 if (!pmd_present(*pmd))
632 goto out_unlock; 635 return;
636
637 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
633 638
634 for (original_pte = pte = pte_offset_map(pmd, address); 639 /* Update high watermark before we lower rss */
635 address < end; pte++, address += PAGE_SIZE) { 640 update_hiwater_rss(mm);
636 641
642 for (; address < end; pte++, address += PAGE_SIZE) {
637 if (!pte_present(*pte)) 643 if (!pte_present(*pte))
638 continue; 644 continue;
639 645
640 pfn = pte_pfn(*pte); 646 pfn = pte_pfn(*pte);
641 if (!pfn_valid(pfn)) 647 if (unlikely(!pfn_valid(pfn))) {
648 print_bad_pte(vma, *pte, address);
642 continue; 649 continue;
650 }
643 651
644 page = pfn_to_page(pfn); 652 page = pfn_to_page(pfn);
645 BUG_ON(PageAnon(page)); 653 BUG_ON(PageAnon(page));
646 if (PageReserved(page))
647 continue;
648 654
649 if (ptep_clear_flush_young(vma, address, pte)) 655 if (ptep_clear_flush_young(vma, address, pte))
650 continue; 656 continue;
@@ -663,13 +669,10 @@ static void try_to_unmap_cluster(unsigned long cursor,
663 669
664 page_remove_rmap(page); 670 page_remove_rmap(page);
665 page_cache_release(page); 671 page_cache_release(page);
666 dec_mm_counter(mm, rss); 672 dec_mm_counter(mm, file_rss);
667 (*mapcount)--; 673 (*mapcount)--;
668 } 674 }
669 675 pte_unmap_unlock(pte - 1, ptl);
670 pte_unmap(original_pte);
671out_unlock:
672 spin_unlock(&mm->page_table_lock);
673} 676}
674 677
675static int try_to_unmap_anon(struct page *page) 678static int try_to_unmap_anon(struct page *page)
@@ -806,7 +809,6 @@ int try_to_unmap(struct page *page)
806{ 809{
807 int ret; 810 int ret;
808 811
809 BUG_ON(PageReserved(page));
810 BUG_ON(!PageLocked(page)); 812 BUG_ON(!PageLocked(page));
811 813
812 if (PageAnon(page)) 814 if (PageAnon(page))
diff --git a/mm/shmem.c b/mm/shmem.c
index 55e04a0734c1..dc25565a61e9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -71,9 +71,6 @@
71/* Pretend that each entry is of this size in directory's i_size */ 71/* Pretend that each entry is of this size in directory's i_size */
72#define BOGO_DIRENT_SIZE 20 72#define BOGO_DIRENT_SIZE 20
73 73
74/* Keep swapped page count in private field of indirect struct page */
75#define nr_swapped private
76
77/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ 74/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
78enum sgp_type { 75enum sgp_type {
79 SGP_QUICK, /* don't try more than file page cache lookup */ 76 SGP_QUICK, /* don't try more than file page cache lookup */
@@ -324,8 +321,10 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
324 321
325 entry->val = value; 322 entry->val = value;
326 info->swapped += incdec; 323 info->swapped += incdec;
327 if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) 324 if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
328 kmap_atomic_to_page(entry)->nr_swapped += incdec; 325 struct page *page = kmap_atomic_to_page(entry);
326 set_page_private(page, page_private(page) + incdec);
327 }
329} 328}
330 329
331/* 330/*
@@ -368,9 +367,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
368 367
369 spin_unlock(&info->lock); 368 spin_unlock(&info->lock);
370 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO); 369 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
371 if (page) { 370 if (page)
372 page->nr_swapped = 0; 371 set_page_private(page, 0);
373 }
374 spin_lock(&info->lock); 372 spin_lock(&info->lock);
375 373
376 if (!page) { 374 if (!page) {
@@ -561,7 +559,7 @@ static void shmem_truncate(struct inode *inode)
561 diroff = 0; 559 diroff = 0;
562 } 560 }
563 subdir = dir[diroff]; 561 subdir = dir[diroff];
564 if (subdir && subdir->nr_swapped) { 562 if (subdir && page_private(subdir)) {
565 size = limit - idx; 563 size = limit - idx;
566 if (size > ENTRIES_PER_PAGE) 564 if (size > ENTRIES_PER_PAGE)
567 size = ENTRIES_PER_PAGE; 565 size = ENTRIES_PER_PAGE;
@@ -572,10 +570,10 @@ static void shmem_truncate(struct inode *inode)
572 nr_swaps_freed += freed; 570 nr_swaps_freed += freed;
573 if (offset) 571 if (offset)
574 spin_lock(&info->lock); 572 spin_lock(&info->lock);
575 subdir->nr_swapped -= freed; 573 set_page_private(subdir, page_private(subdir) - freed);
576 if (offset) 574 if (offset)
577 spin_unlock(&info->lock); 575 spin_unlock(&info->lock);
578 BUG_ON(subdir->nr_swapped > offset); 576 BUG_ON(page_private(subdir) > offset);
579 } 577 }
580 if (offset) 578 if (offset)
581 offset = 0; 579 offset = 0;
@@ -743,7 +741,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
743 dir = shmem_dir_map(subdir); 741 dir = shmem_dir_map(subdir);
744 } 742 }
745 subdir = *dir; 743 subdir = *dir;
746 if (subdir && subdir->nr_swapped) { 744 if (subdir && page_private(subdir)) {
747 ptr = shmem_swp_map(subdir); 745 ptr = shmem_swp_map(subdir);
748 size = limit - idx; 746 size = limit - idx;
749 if (size > ENTRIES_PER_PAGE) 747 if (size > ENTRIES_PER_PAGE)
@@ -1201,7 +1199,7 @@ static int shmem_populate(struct vm_area_struct *vma,
1201 page_cache_release(page); 1199 page_cache_release(page);
1202 return err; 1200 return err;
1203 } 1201 }
1204 } else { 1202 } else if (vma->vm_flags & VM_NONLINEAR) {
1205 /* No page was found just because we can't read it in 1203 /* No page was found just because we can't read it in
1206 * now (being here implies nonblock != 0), but the page 1204 * now (being here implies nonblock != 0), but the page
1207 * may exist, so set the PTE to fault it in later. */ 1205 * may exist, so set the PTE to fault it in later. */
@@ -1506,8 +1504,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1506 */ 1504 */
1507 if (!offset) 1505 if (!offset)
1508 mark_page_accessed(page); 1506 mark_page_accessed(page);
1509 } else 1507 } else {
1510 page = ZERO_PAGE(0); 1508 page = ZERO_PAGE(0);
1509 page_cache_get(page);
1510 }
1511 1511
1512 /* 1512 /*
1513 * Ok, we have the page, and it's up-to-date, so 1513 * Ok, we have the page, and it's up-to-date, so
diff --git a/mm/slab.c b/mm/slab.c
index d30423f167a2..22bfb0b2ac8b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2419,6 +2419,7 @@ retry:
2419 next = slab_bufctl(slabp)[slabp->free]; 2419 next = slab_bufctl(slabp)[slabp->free];
2420#if DEBUG 2420#if DEBUG
2421 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2421 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2422 WARN_ON(numa_node_id() != slabp->nodeid);
2422#endif 2423#endif
2423 slabp->free = next; 2424 slabp->free = next;
2424 } 2425 }
@@ -2633,8 +2634,10 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n
2633 check_spinlock_acquired_node(cachep, node); 2634 check_spinlock_acquired_node(cachep, node);
2634 check_slabp(cachep, slabp); 2635 check_slabp(cachep, slabp);
2635 2636
2636
2637#if DEBUG 2637#if DEBUG
2638 /* Verify that the slab belongs to the intended node */
2639 WARN_ON(slabp->nodeid != node);
2640
2638 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2641 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2639 printk(KERN_ERR "slab: double free detected in cache " 2642 printk(KERN_ERR "slab: double free detected in cache "
2640 "'%s', objp %p\n", cachep->name, objp); 2643 "'%s', objp %p\n", cachep->name, objp);
diff --git a/mm/sparse.c b/mm/sparse.c
index 347249a4917a..72079b538e2d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -5,8 +5,10 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/mmzone.h> 6#include <linux/mmzone.h>
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/highmem.h>
8#include <linux/module.h> 9#include <linux/module.h>
9#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/vmalloc.h>
10#include <asm/dma.h> 12#include <asm/dma.h>
11 13
12/* 14/*
@@ -72,6 +74,31 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
72} 74}
73#endif 75#endif
74 76
77/*
78 * Although written for the SPARSEMEM_EXTREME case, this happens
79 * to also work for the flat array case because
80 * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
81 */
82int __section_nr(struct mem_section* ms)
83{
84 unsigned long root_nr;
85 struct mem_section* root;
86
87 for (root_nr = 0;
88 root_nr < NR_MEM_SECTIONS;
89 root_nr += SECTIONS_PER_ROOT) {
90 root = __nr_to_section(root_nr);
91
92 if (!root)
93 continue;
94
95 if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
96 break;
97 }
98
99 return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
100}
101
75/* Record a memory area against a node. */ 102/* Record a memory area against a node. */
76void memory_present(int nid, unsigned long start, unsigned long end) 103void memory_present(int nid, unsigned long start, unsigned long end)
77{ 104{
@@ -162,6 +189,45 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
162 return NULL; 189 return NULL;
163} 190}
164 191
192static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
193{
194 struct page *page, *ret;
195 unsigned long memmap_size = sizeof(struct page) * nr_pages;
196
197 page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
198 if (page)
199 goto got_map_page;
200
201 ret = vmalloc(memmap_size);
202 if (ret)
203 goto got_map_ptr;
204
205 return NULL;
206got_map_page:
207 ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
208got_map_ptr:
209 memset(ret, 0, memmap_size);
210
211 return ret;
212}
213
214static int vaddr_in_vmalloc_area(void *addr)
215{
216 if (addr >= (void *)VMALLOC_START &&
217 addr < (void *)VMALLOC_END)
218 return 1;
219 return 0;
220}
221
222static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
223{
224 if (vaddr_in_vmalloc_area(memmap))
225 vfree(memmap);
226 else
227 free_pages((unsigned long)memmap,
228 get_order(sizeof(struct page) * nr_pages));
229}
230
165/* 231/*
166 * Allocate the accumulated non-linear sections, allocate a mem_map 232 * Allocate the accumulated non-linear sections, allocate a mem_map
167 * for each and record the physical to section mapping. 233 * for each and record the physical to section mapping.
@@ -187,14 +253,37 @@ void sparse_init(void)
187 * set. If this is <=0, then that means that the passed-in 253 * set. If this is <=0, then that means that the passed-in
188 * map was not consumed and must be freed. 254 * map was not consumed and must be freed.
189 */ 255 */
190int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map) 256int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
257 int nr_pages)
191{ 258{
192 struct mem_section *ms = __pfn_to_section(start_pfn); 259 unsigned long section_nr = pfn_to_section_nr(start_pfn);
260 struct pglist_data *pgdat = zone->zone_pgdat;
261 struct mem_section *ms;
262 struct page *memmap;
263 unsigned long flags;
264 int ret;
193 265
194 if (ms->section_mem_map & SECTION_MARKED_PRESENT) 266 /*
195 return -EEXIST; 267 * no locking for this, because it does its own
268 * plus, it does a kmalloc
269 */
270 sparse_index_init(section_nr, pgdat->node_id);
271 memmap = __kmalloc_section_memmap(nr_pages);
272
273 pgdat_resize_lock(pgdat, &flags);
196 274
275 ms = __pfn_to_section(start_pfn);
276 if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
277 ret = -EEXIST;
278 goto out;
279 }
197 ms->section_mem_map |= SECTION_MARKED_PRESENT; 280 ms->section_mem_map |= SECTION_MARKED_PRESENT;
198 281
199 return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map); 282 ret = sparse_init_one_section(ms, section_nr, memmap);
283
284 if (ret <= 0)
285 __kfree_section_memmap(memmap, nr_pages);
286out:
287 pgdat_resize_unlock(pgdat, &flags);
288 return ret;
200} 289}
diff --git a/mm/swap.c b/mm/swap.c
index 7771d2803f62..b89512877ec2 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -39,7 +39,7 @@ int page_cluster;
39void put_page(struct page *page) 39void put_page(struct page *page)
40{ 40{
41 if (unlikely(PageCompound(page))) { 41 if (unlikely(PageCompound(page))) {
42 page = (struct page *)page->private; 42 page = (struct page *)page_private(page);
43 if (put_page_testzero(page)) { 43 if (put_page_testzero(page)) {
44 void (*dtor)(struct page *page); 44 void (*dtor)(struct page *page);
45 45
@@ -48,7 +48,7 @@ void put_page(struct page *page)
48 } 48 }
49 return; 49 return;
50 } 50 }
51 if (!PageReserved(page) && put_page_testzero(page)) 51 if (put_page_testzero(page))
52 __page_cache_release(page); 52 __page_cache_release(page);
53} 53}
54EXPORT_SYMBOL(put_page); 54EXPORT_SYMBOL(put_page);
@@ -215,7 +215,7 @@ void release_pages(struct page **pages, int nr, int cold)
215 struct page *page = pages[i]; 215 struct page *page = pages[i];
216 struct zone *pagezone; 216 struct zone *pagezone;
217 217
218 if (PageReserved(page) || !put_page_testzero(page)) 218 if (!put_page_testzero(page))
219 continue; 219 continue;
220 220
221 pagezone = page_zone(page); 221 pagezone = page_zone(page);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 132164f7d0a7..dfd9a46755b8 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -83,7 +83,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
83 page_cache_get(page); 83 page_cache_get(page);
84 SetPageLocked(page); 84 SetPageLocked(page);
85 SetPageSwapCache(page); 85 SetPageSwapCache(page);
86 page->private = entry.val; 86 set_page_private(page, entry.val);
87 total_swapcache_pages++; 87 total_swapcache_pages++;
88 pagecache_acct(1); 88 pagecache_acct(1);
89 } 89 }
@@ -126,8 +126,8 @@ void __delete_from_swap_cache(struct page *page)
126 BUG_ON(PageWriteback(page)); 126 BUG_ON(PageWriteback(page));
127 BUG_ON(PagePrivate(page)); 127 BUG_ON(PagePrivate(page));
128 128
129 radix_tree_delete(&swapper_space.page_tree, page->private); 129 radix_tree_delete(&swapper_space.page_tree, page_private(page));
130 page->private = 0; 130 set_page_private(page, 0);
131 ClearPageSwapCache(page); 131 ClearPageSwapCache(page);
132 total_swapcache_pages--; 132 total_swapcache_pages--;
133 pagecache_acct(-1); 133 pagecache_acct(-1);
@@ -197,7 +197,7 @@ void delete_from_swap_cache(struct page *page)
197{ 197{
198 swp_entry_t entry; 198 swp_entry_t entry;
199 199
200 entry.val = page->private; 200 entry.val = page_private(page);
201 201
202 write_lock_irq(&swapper_space.tree_lock); 202 write_lock_irq(&swapper_space.tree_lock);
203 __delete_from_swap_cache(page); 203 __delete_from_swap_cache(page);
@@ -259,8 +259,7 @@ static inline void free_swap_cache(struct page *page)
259 259
260/* 260/*
261 * Perform a free_page(), also freeing any swap cache associated with 261 * Perform a free_page(), also freeing any swap cache associated with
262 * this page if it is the last user of the page. Can not do a lock_page, 262 * this page if it is the last user of the page.
263 * as we are holding the page_table_lock spinlock.
264 */ 263 */
265void free_page_and_swap_cache(struct page *page) 264void free_page_and_swap_cache(struct page *page)
266{ 265{
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1dcaeda039f4..8970c0b74194 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -61,7 +61,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
61 swp_entry_t entry; 61 swp_entry_t entry;
62 62
63 down_read(&swap_unplug_sem); 63 down_read(&swap_unplug_sem);
64 entry.val = page->private; 64 entry.val = page_private(page);
65 if (PageSwapCache(page)) { 65 if (PageSwapCache(page)) {
66 struct block_device *bdev = swap_info[swp_type(entry)].bdev; 66 struct block_device *bdev = swap_info[swp_type(entry)].bdev;
67 struct backing_dev_info *bdi; 67 struct backing_dev_info *bdi;
@@ -69,8 +69,8 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
69 /* 69 /*
70 * If the page is removed from swapcache from under us (with a 70 * If the page is removed from swapcache from under us (with a
71 * racy try_to_unuse/swapoff) we need an additional reference 71 * racy try_to_unuse/swapoff) we need an additional reference
72 * count to avoid reading garbage from page->private above. If 72 * count to avoid reading garbage from page_private(page) above.
73 * the WARN_ON triggers during a swapoff it maybe the race 73 * If the WARN_ON triggers during a swapoff it maybe the race
74 * condition and it's harmless. However if it triggers without 74 * condition and it's harmless. However if it triggers without
75 * swapoff it signals a problem. 75 * swapoff it signals a problem.
76 */ 76 */
@@ -294,7 +294,7 @@ static inline int page_swapcount(struct page *page)
294 struct swap_info_struct *p; 294 struct swap_info_struct *p;
295 swp_entry_t entry; 295 swp_entry_t entry;
296 296
297 entry.val = page->private; 297 entry.val = page_private(page);
298 p = swap_info_get(entry); 298 p = swap_info_get(entry);
299 if (p) { 299 if (p) {
300 /* Subtract the 1 for the swap cache itself */ 300 /* Subtract the 1 for the swap cache itself */
@@ -339,7 +339,7 @@ int remove_exclusive_swap_page(struct page *page)
339 if (page_count(page) != 2) /* 2: us + cache */ 339 if (page_count(page) != 2) /* 2: us + cache */
340 return 0; 340 return 0;
341 341
342 entry.val = page->private; 342 entry.val = page_private(page);
343 p = swap_info_get(entry); 343 p = swap_info_get(entry);
344 if (!p) 344 if (!p)
345 return 0; 345 return 0;
@@ -398,17 +398,14 @@ void free_swap_and_cache(swp_entry_t entry)
398} 398}
399 399
400/* 400/*
401 * Always set the resulting pte to be nowrite (the same as COW pages 401 * No need to decide whether this PTE shares the swap entry with others,
402 * after one process has exited). We don't know just how many PTEs will 402 * just let do_wp_page work it out if a write is requested later - to
403 * share this swap entry, so be cautious and let do_wp_page work out 403 * force COW, vm_page_prot omits write permission from any private vma.
404 * what to do if a write is requested later.
405 *
406 * vma->vm_mm->page_table_lock is held.
407 */ 404 */
408static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, 405static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
409 unsigned long addr, swp_entry_t entry, struct page *page) 406 unsigned long addr, swp_entry_t entry, struct page *page)
410{ 407{
411 inc_mm_counter(vma->vm_mm, rss); 408 inc_mm_counter(vma->vm_mm, anon_rss);
412 get_page(page); 409 get_page(page);
413 set_pte_at(vma->vm_mm, addr, pte, 410 set_pte_at(vma->vm_mm, addr, pte,
414 pte_mkold(mk_pte(page, vma->vm_page_prot))); 411 pte_mkold(mk_pte(page, vma->vm_page_prot)));
@@ -425,23 +422,25 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
425 unsigned long addr, unsigned long end, 422 unsigned long addr, unsigned long end,
426 swp_entry_t entry, struct page *page) 423 swp_entry_t entry, struct page *page)
427{ 424{
428 pte_t *pte;
429 pte_t swp_pte = swp_entry_to_pte(entry); 425 pte_t swp_pte = swp_entry_to_pte(entry);
426 pte_t *pte;
427 spinlock_t *ptl;
428 int found = 0;
430 429
431 pte = pte_offset_map(pmd, addr); 430 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
432 do { 431 do {
433 /* 432 /*
434 * swapoff spends a _lot_ of time in this loop! 433 * swapoff spends a _lot_ of time in this loop!
435 * Test inline before going to call unuse_pte. 434 * Test inline before going to call unuse_pte.
436 */ 435 */
437 if (unlikely(pte_same(*pte, swp_pte))) { 436 if (unlikely(pte_same(*pte, swp_pte))) {
438 unuse_pte(vma, pte, addr, entry, page); 437 unuse_pte(vma, pte++, addr, entry, page);
439 pte_unmap(pte); 438 found = 1;
440 return 1; 439 break;
441 } 440 }
442 } while (pte++, addr += PAGE_SIZE, addr != end); 441 } while (pte++, addr += PAGE_SIZE, addr != end);
443 pte_unmap(pte - 1); 442 pte_unmap_unlock(pte - 1, ptl);
444 return 0; 443 return found;
445} 444}
446 445
447static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 446static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -523,12 +522,10 @@ static int unuse_mm(struct mm_struct *mm,
523 down_read(&mm->mmap_sem); 522 down_read(&mm->mmap_sem);
524 lock_page(page); 523 lock_page(page);
525 } 524 }
526 spin_lock(&mm->page_table_lock);
527 for (vma = mm->mmap; vma; vma = vma->vm_next) { 525 for (vma = mm->mmap; vma; vma = vma->vm_next) {
528 if (vma->anon_vma && unuse_vma(vma, entry, page)) 526 if (vma->anon_vma && unuse_vma(vma, entry, page))
529 break; 527 break;
530 } 528 }
531 spin_unlock(&mm->page_table_lock);
532 up_read(&mm->mmap_sem); 529 up_read(&mm->mmap_sem);
533 /* 530 /*
534 * Currently unuse_mm cannot fail, but leave error handling 531 * Currently unuse_mm cannot fail, but leave error handling
@@ -1045,7 +1042,7 @@ int page_queue_congested(struct page *page)
1045 BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ 1042 BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
1046 1043
1047 if (PageSwapCache(page)) { 1044 if (PageSwapCache(page)) {
1048 swp_entry_t entry = { .val = page->private }; 1045 swp_entry_t entry = { .val = page_private(page) };
1049 struct swap_info_struct *sis; 1046 struct swap_info_struct *sis;
1050 1047
1051 sis = get_swap_info_struct(swp_type(entry)); 1048 sis = get_swap_info_struct(swp_type(entry));
diff --git a/mm/thrash.c b/mm/thrash.c
index 11461f7ad830..eff3c18c33a1 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -19,7 +19,7 @@ static unsigned long swap_token_check;
19struct mm_struct * swap_token_mm = &init_mm; 19struct mm_struct * swap_token_mm = &init_mm;
20 20
21#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2) 21#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
22#define SWAP_TOKEN_TIMEOUT 0 22#define SWAP_TOKEN_TIMEOUT (300 * HZ)
23/* 23/*
24 * Currently disabled; Needs further code to work at HZ * 300. 24 * Currently disabled; Needs further code to work at HZ * 300.
25 */ 25 */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1150229b6366..54a90e83cb31 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -5,6 +5,7 @@
5 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 5 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
6 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 6 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
7 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 7 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
8 * Numa awareness, Christoph Lameter, SGI, June 2005
8 */ 9 */
9 10
10#include <linux/mm.h> 11#include <linux/mm.h>
@@ -88,7 +89,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
88{ 89{
89 pte_t *pte; 90 pte_t *pte;
90 91
91 pte = pte_alloc_kernel(&init_mm, pmd, addr); 92 pte = pte_alloc_kernel(pmd, addr);
92 if (!pte) 93 if (!pte)
93 return -ENOMEM; 94 return -ENOMEM;
94 do { 95 do {
@@ -146,20 +147,18 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
146 147
147 BUG_ON(addr >= end); 148 BUG_ON(addr >= end);
148 pgd = pgd_offset_k(addr); 149 pgd = pgd_offset_k(addr);
149 spin_lock(&init_mm.page_table_lock);
150 do { 150 do {
151 next = pgd_addr_end(addr, end); 151 next = pgd_addr_end(addr, end);
152 err = vmap_pud_range(pgd, addr, next, prot, pages); 152 err = vmap_pud_range(pgd, addr, next, prot, pages);
153 if (err) 153 if (err)
154 break; 154 break;
155 } while (pgd++, addr = next, addr != end); 155 } while (pgd++, addr = next, addr != end);
156 spin_unlock(&init_mm.page_table_lock);
157 flush_cache_vmap((unsigned long) area->addr, end); 156 flush_cache_vmap((unsigned long) area->addr, end);
158 return err; 157 return err;
159} 158}
160 159
161struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 160struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
162 unsigned long start, unsigned long end) 161 unsigned long start, unsigned long end, int node)
163{ 162{
164 struct vm_struct **p, *tmp, *area; 163 struct vm_struct **p, *tmp, *area;
165 unsigned long align = 1; 164 unsigned long align = 1;
@@ -178,7 +177,7 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
178 addr = ALIGN(start, align); 177 addr = ALIGN(start, align);
179 size = PAGE_ALIGN(size); 178 size = PAGE_ALIGN(size);
180 179
181 area = kmalloc(sizeof(*area), GFP_KERNEL); 180 area = kmalloc_node(sizeof(*area), GFP_KERNEL, node);
182 if (unlikely(!area)) 181 if (unlikely(!area))
183 return NULL; 182 return NULL;
184 183
@@ -231,6 +230,12 @@ out:
231 return NULL; 230 return NULL;
232} 231}
233 232
233struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
234 unsigned long start, unsigned long end)
235{
236 return __get_vm_area_node(size, flags, start, end, -1);
237}
238
234/** 239/**
235 * get_vm_area - reserve a contiguous kernel virtual area 240 * get_vm_area - reserve a contiguous kernel virtual area
236 * 241 *
@@ -246,6 +251,11 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
246 return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); 251 return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
247} 252}
248 253
254struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node)
255{
256 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node);
257}
258
249/* Caller must hold vmlist_lock */ 259/* Caller must hold vmlist_lock */
250struct vm_struct *__remove_vm_area(void *addr) 260struct vm_struct *__remove_vm_area(void *addr)
251{ 261{
@@ -342,7 +352,6 @@ void vfree(void *addr)
342 BUG_ON(in_interrupt()); 352 BUG_ON(in_interrupt());
343 __vunmap(addr, 1); 353 __vunmap(addr, 1);
344} 354}
345
346EXPORT_SYMBOL(vfree); 355EXPORT_SYMBOL(vfree);
347 356
348/** 357/**
@@ -360,7 +369,6 @@ void vunmap(void *addr)
360 BUG_ON(in_interrupt()); 369 BUG_ON(in_interrupt());
361 __vunmap(addr, 0); 370 __vunmap(addr, 0);
362} 371}
363
364EXPORT_SYMBOL(vunmap); 372EXPORT_SYMBOL(vunmap);
365 373
366/** 374/**
@@ -392,10 +400,10 @@ void *vmap(struct page **pages, unsigned int count,
392 400
393 return area->addr; 401 return area->addr;
394} 402}
395
396EXPORT_SYMBOL(vmap); 403EXPORT_SYMBOL(vmap);
397 404
398void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) 405void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
406 pgprot_t prot, int node)
399{ 407{
400 struct page **pages; 408 struct page **pages;
401 unsigned int nr_pages, array_size, i; 409 unsigned int nr_pages, array_size, i;
@@ -406,9 +414,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
406 area->nr_pages = nr_pages; 414 area->nr_pages = nr_pages;
407 /* Please note that the recursion is strictly bounded. */ 415 /* Please note that the recursion is strictly bounded. */
408 if (array_size > PAGE_SIZE) 416 if (array_size > PAGE_SIZE)
409 pages = __vmalloc(array_size, gfp_mask, PAGE_KERNEL); 417 pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
410 else 418 else
411 pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM)); 419 pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node);
412 area->pages = pages; 420 area->pages = pages;
413 if (!area->pages) { 421 if (!area->pages) {
414 remove_vm_area(area->addr); 422 remove_vm_area(area->addr);
@@ -418,7 +426,10 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
418 memset(area->pages, 0, array_size); 426 memset(area->pages, 0, array_size);
419 427
420 for (i = 0; i < area->nr_pages; i++) { 428 for (i = 0; i < area->nr_pages; i++) {
421 area->pages[i] = alloc_page(gfp_mask); 429 if (node < 0)
430 area->pages[i] = alloc_page(gfp_mask);
431 else
432 area->pages[i] = alloc_pages_node(node, gfp_mask, 0);
422 if (unlikely(!area->pages[i])) { 433 if (unlikely(!area->pages[i])) {
423 /* Successfully allocated i pages, free them in __vunmap() */ 434 /* Successfully allocated i pages, free them in __vunmap() */
424 area->nr_pages = i; 435 area->nr_pages = i;
@@ -435,18 +446,25 @@ fail:
435 return NULL; 446 return NULL;
436} 447}
437 448
449void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
450{
451 return __vmalloc_area_node(area, gfp_mask, prot, -1);
452}
453
438/** 454/**
439 * __vmalloc - allocate virtually contiguous memory 455 * __vmalloc_node - allocate virtually contiguous memory
440 * 456 *
441 * @size: allocation size 457 * @size: allocation size
442 * @gfp_mask: flags for the page level allocator 458 * @gfp_mask: flags for the page level allocator
443 * @prot: protection mask for the allocated pages 459 * @prot: protection mask for the allocated pages
460 * @node: node to use for allocation or -1
444 * 461 *
445 * Allocate enough pages to cover @size from the page level 462 * Allocate enough pages to cover @size from the page level
446 * allocator with @gfp_mask flags. Map them into contiguous 463 * allocator with @gfp_mask flags. Map them into contiguous
447 * kernel virtual space, using a pagetable protection of @prot. 464 * kernel virtual space, using a pagetable protection of @prot.
448 */ 465 */
449void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 466void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
467 int node)
450{ 468{
451 struct vm_struct *area; 469 struct vm_struct *area;
452 470
@@ -454,13 +472,18 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
454 if (!size || (size >> PAGE_SHIFT) > num_physpages) 472 if (!size || (size >> PAGE_SHIFT) > num_physpages)
455 return NULL; 473 return NULL;
456 474
457 area = get_vm_area(size, VM_ALLOC); 475 area = get_vm_area_node(size, VM_ALLOC, node);
458 if (!area) 476 if (!area)
459 return NULL; 477 return NULL;
460 478
461 return __vmalloc_area(area, gfp_mask, prot); 479 return __vmalloc_area_node(area, gfp_mask, prot, node);
462} 480}
481EXPORT_SYMBOL(__vmalloc_node);
463 482
483void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
484{
485 return __vmalloc_node(size, gfp_mask, prot, -1);
486}
464EXPORT_SYMBOL(__vmalloc); 487EXPORT_SYMBOL(__vmalloc);
465 488
466/** 489/**
@@ -478,9 +501,26 @@ void *vmalloc(unsigned long size)
478{ 501{
479 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); 502 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
480} 503}
481
482EXPORT_SYMBOL(vmalloc); 504EXPORT_SYMBOL(vmalloc);
483 505
506/**
507 * vmalloc_node - allocate memory on a specific node
508 *
509 * @size: allocation size
510 * @node: numa node
511 *
512 * Allocate enough pages to cover @size from the page level
513 * allocator and map them into contiguous kernel virtual space.
514 *
515 * For tight control over page level allocator and protection flags
516 * use __vmalloc() instead.
517 */
518void *vmalloc_node(unsigned long size, int node)
519{
520 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
521}
522EXPORT_SYMBOL(vmalloc_node);
523
484#ifndef PAGE_KERNEL_EXEC 524#ifndef PAGE_KERNEL_EXEC
485# define PAGE_KERNEL_EXEC PAGE_KERNEL 525# define PAGE_KERNEL_EXEC PAGE_KERNEL
486#endif 526#endif
@@ -515,7 +555,6 @@ void *vmalloc_32(unsigned long size)
515{ 555{
516 return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); 556 return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
517} 557}
518
519EXPORT_SYMBOL(vmalloc_32); 558EXPORT_SYMBOL(vmalloc_32);
520 559
521long vread(char *buf, char *addr, unsigned long count) 560long vread(char *buf, char *addr, unsigned long count)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 843c87d1e61f..135bf8ca96ee 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -417,7 +417,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
417 * Anonymous process memory has backing store? 417 * Anonymous process memory has backing store?
418 * Try to allocate it some swap space here. 418 * Try to allocate it some swap space here.
419 */ 419 */
420 if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) { 420 if (PageAnon(page) && !PageSwapCache(page)) {
421 if (!sc->may_swap)
422 goto keep_locked;
421 if (!add_to_swap(page)) 423 if (!add_to_swap(page))
422 goto activate_locked; 424 goto activate_locked;
423 } 425 }
@@ -519,7 +521,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
519 521
520#ifdef CONFIG_SWAP 522#ifdef CONFIG_SWAP
521 if (PageSwapCache(page)) { 523 if (PageSwapCache(page)) {
522 swp_entry_t swap = { .val = page->private }; 524 swp_entry_t swap = { .val = page_private(page) };
523 __delete_from_swap_cache(page); 525 __delete_from_swap_cache(page);
524 write_unlock_irq(&mapping->tree_lock); 526 write_unlock_irq(&mapping->tree_lock);
525 swap_free(swap); 527 swap_free(swap);
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 67abebabf83e..e97b2d162cc7 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -2949,8 +2949,7 @@ static struct page * snd_pcm_mmap_status_nopage(struct vm_area_struct *area, uns
2949 return NOPAGE_OOM; 2949 return NOPAGE_OOM;
2950 runtime = substream->runtime; 2950 runtime = substream->runtime;
2951 page = virt_to_page(runtime->status); 2951 page = virt_to_page(runtime->status);
2952 if (!PageReserved(page)) 2952 get_page(page);
2953 get_page(page);
2954 if (type) 2953 if (type)
2955 *type = VM_FAULT_MINOR; 2954 *type = VM_FAULT_MINOR;
2956 return page; 2955 return page;
@@ -2992,8 +2991,7 @@ static struct page * snd_pcm_mmap_control_nopage(struct vm_area_struct *area, un
2992 return NOPAGE_OOM; 2991 return NOPAGE_OOM;
2993 runtime = substream->runtime; 2992 runtime = substream->runtime;
2994 page = virt_to_page(runtime->control); 2993 page = virt_to_page(runtime->control);
2995 if (!PageReserved(page)) 2994 get_page(page);
2996 get_page(page);
2997 if (type) 2995 if (type)
2998 *type = VM_FAULT_MINOR; 2996 *type = VM_FAULT_MINOR;
2999 return page; 2997 return page;
@@ -3066,8 +3064,7 @@ static struct page *snd_pcm_mmap_data_nopage(struct vm_area_struct *area, unsign
3066 vaddr = runtime->dma_area + offset; 3064 vaddr = runtime->dma_area + offset;
3067 page = virt_to_page(vaddr); 3065 page = virt_to_page(vaddr);
3068 } 3066 }
3069 if (!PageReserved(page)) 3067 get_page(page);
3070 get_page(page);
3071 if (type) 3068 if (type)
3072 *type = VM_FAULT_MINOR; 3069 *type = VM_FAULT_MINOR;
3073 return page; 3070 return page;