diff options
159 files changed, 3217 insertions, 3338 deletions
diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index e132fb1163b0..7eb715e07eda 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt | |||
@@ -49,9 +49,6 @@ changes occur: | |||
49 | page table operations such as what happens during | 49 | page table operations such as what happens during |
50 | fork, and exec. | 50 | fork, and exec. |
51 | 51 | ||
52 | Platform developers note that generic code will always | ||
53 | invoke this interface without mm->page_table_lock held. | ||
54 | |||
55 | 3) void flush_tlb_range(struct vm_area_struct *vma, | 52 | 3) void flush_tlb_range(struct vm_area_struct *vma, |
56 | unsigned long start, unsigned long end) | 53 | unsigned long start, unsigned long end) |
57 | 54 | ||
@@ -72,9 +69,6 @@ changes occur: | |||
72 | call flush_tlb_page (see below) for each entry which may be | 69 | call flush_tlb_page (see below) for each entry which may be |
73 | modified. | 70 | modified. |
74 | 71 | ||
75 | Platform developers note that generic code will always | ||
76 | invoke this interface with mm->page_table_lock held. | ||
77 | |||
78 | 4) void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) | 72 | 4) void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) |
79 | 73 | ||
80 | This time we need to remove the PAGE_SIZE sized translation | 74 | This time we need to remove the PAGE_SIZE sized translation |
@@ -93,9 +87,6 @@ changes occur: | |||
93 | 87 | ||
94 | This is used primarily during fault processing. | 88 | This is used primarily during fault processing. |
95 | 89 | ||
96 | Platform developers note that generic code will always | ||
97 | invoke this interface with mm->page_table_lock held. | ||
98 | |||
99 | 5) void flush_tlb_pgtables(struct mm_struct *mm, | 90 | 5) void flush_tlb_pgtables(struct mm_struct *mm, |
100 | unsigned long start, unsigned long end) | 91 | unsigned long start, unsigned long end) |
101 | 92 | ||
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 90766b75d1b7..5dffcfefc3c7 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -1460,8 +1460,6 @@ running once the system is up. | |||
1460 | stifb= [HW] | 1460 | stifb= [HW] |
1461 | Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]] | 1461 | Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]] |
1462 | 1462 | ||
1463 | stram_swap= [HW,M68k] | ||
1464 | |||
1465 | swiotlb= [IA-64] Number of I/O TLB slabs | 1463 | swiotlb= [IA-64] Number of I/O TLB slabs |
1466 | 1464 | ||
1467 | switches= [HW,M68k] | 1465 | switches= [HW,M68k] |
diff --git a/Documentation/m68k/kernel-options.txt b/Documentation/m68k/kernel-options.txt index e191baad8308..d5d3f064f552 100644 --- a/Documentation/m68k/kernel-options.txt +++ b/Documentation/m68k/kernel-options.txt | |||
@@ -626,7 +626,7 @@ ignored (others aren't affected). | |||
626 | can be performed in optimal order. Not all SCSI devices support | 626 | can be performed in optimal order. Not all SCSI devices support |
627 | tagged queuing (:-(). | 627 | tagged queuing (:-(). |
628 | 628 | ||
629 | 4.6 switches= | 629 | 4.5 switches= |
630 | ------------- | 630 | ------------- |
631 | 631 | ||
632 | Syntax: switches=<list of switches> | 632 | Syntax: switches=<list of switches> |
@@ -661,28 +661,6 @@ correctly. | |||
661 | earlier initialization ("ov_"-less) takes precedence. But the | 661 | earlier initialization ("ov_"-less) takes precedence. But the |
662 | switching-off on reset still happens in this case. | 662 | switching-off on reset still happens in this case. |
663 | 663 | ||
664 | 4.5) stram_swap= | ||
665 | ---------------- | ||
666 | |||
667 | Syntax: stram_swap=<do_swap>[,<max_swap>] | ||
668 | |||
669 | This option is available only if the kernel has been compiled with | ||
670 | CONFIG_STRAM_SWAP enabled. Normally, the kernel then determines | ||
671 | dynamically whether to actually use ST-RAM as swap space. (Currently, | ||
672 | the fraction of ST-RAM must be less or equal 1/3 of total memory to | ||
673 | enable this swapping.) You can override the kernel's decision by | ||
674 | specifying this option. 1 for <do_swap> means always enable the swap, | ||
675 | even if you have less alternate RAM. 0 stands for never swap to | ||
676 | ST-RAM, even if it's small enough compared to the rest of memory. | ||
677 | |||
678 | If ST-RAM swapping is enabled, the kernel usually uses all free | ||
679 | ST-RAM as swap "device". If the kernel resides in ST-RAM, the region | ||
680 | allocated by it is obviously never used for swapping :-) You can also | ||
681 | limit this amount by specifying the second parameter, <max_swap>, if | ||
682 | you want to use parts of ST-RAM as normal system memory. <max_swap> is | ||
683 | in kBytes and the number should be a multiple of 4 (otherwise: rounded | ||
684 | down). | ||
685 | |||
686 | 5) Options for Amiga Only: | 664 | 5) Options for Amiga Only: |
687 | ========================== | 665 | ========================== |
688 | 666 | ||
diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index c7481d59b6df..6d5251254f68 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c | |||
@@ -371,6 +371,8 @@ show_mem(void) | |||
371 | show_free_areas(); | 371 | show_free_areas(); |
372 | printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); | 372 | printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); |
373 | for_each_online_node(nid) { | 373 | for_each_online_node(nid) { |
374 | unsigned long flags; | ||
375 | pgdat_resize_lock(NODE_DATA(nid), &flags); | ||
374 | i = node_spanned_pages(nid); | 376 | i = node_spanned_pages(nid); |
375 | while (i-- > 0) { | 377 | while (i-- > 0) { |
376 | struct page *page = nid_page_nr(nid, i); | 378 | struct page *page = nid_page_nr(nid, i); |
@@ -384,6 +386,7 @@ show_mem(void) | |||
384 | else | 386 | else |
385 | shared += page_count(page) - 1; | 387 | shared += page_count(page) - 1; |
386 | } | 388 | } |
389 | pgdat_resize_unlock(NODE_DATA(nid), &flags); | ||
387 | } | 390 | } |
388 | printk("%ld pages of RAM\n",total); | 391 | printk("%ld pages of RAM\n",total); |
389 | printk("%ld free pages\n",free); | 392 | printk("%ld free pages\n",free); |
diff --git a/arch/alpha/mm/remap.c b/arch/alpha/mm/remap.c index 19817ad3d89b..a78356c3ead5 100644 --- a/arch/alpha/mm/remap.c +++ b/arch/alpha/mm/remap.c | |||
@@ -2,7 +2,6 @@ | |||
2 | #include <asm/pgalloc.h> | 2 | #include <asm/pgalloc.h> |
3 | #include <asm/cacheflush.h> | 3 | #include <asm/cacheflush.h> |
4 | 4 | ||
5 | /* called with the page_table_lock held */ | ||
6 | static inline void | 5 | static inline void |
7 | remap_area_pte(pte_t * pte, unsigned long address, unsigned long size, | 6 | remap_area_pte(pte_t * pte, unsigned long address, unsigned long size, |
8 | unsigned long phys_addr, unsigned long flags) | 7 | unsigned long phys_addr, unsigned long flags) |
@@ -31,7 +30,6 @@ remap_area_pte(pte_t * pte, unsigned long address, unsigned long size, | |||
31 | } while (address && (address < end)); | 30 | } while (address && (address < end)); |
32 | } | 31 | } |
33 | 32 | ||
34 | /* called with the page_table_lock held */ | ||
35 | static inline int | 33 | static inline int |
36 | remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, | 34 | remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, |
37 | unsigned long phys_addr, unsigned long flags) | 35 | unsigned long phys_addr, unsigned long flags) |
@@ -46,7 +44,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, | |||
46 | if (address >= end) | 44 | if (address >= end) |
47 | BUG(); | 45 | BUG(); |
48 | do { | 46 | do { |
49 | pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); | 47 | pte_t * pte = pte_alloc_kernel(pmd, address); |
50 | if (!pte) | 48 | if (!pte) |
51 | return -ENOMEM; | 49 | return -ENOMEM; |
52 | remap_area_pte(pte, address, end - address, | 50 | remap_area_pte(pte, address, end - address, |
@@ -70,7 +68,6 @@ __alpha_remap_area_pages(unsigned long address, unsigned long phys_addr, | |||
70 | flush_cache_all(); | 68 | flush_cache_all(); |
71 | if (address >= end) | 69 | if (address >= end) |
72 | BUG(); | 70 | BUG(); |
73 | spin_lock(&init_mm.page_table_lock); | ||
74 | do { | 71 | do { |
75 | pmd_t *pmd; | 72 | pmd_t *pmd; |
76 | pmd = pmd_alloc(&init_mm, dir, address); | 73 | pmd = pmd_alloc(&init_mm, dir, address); |
@@ -84,7 +81,6 @@ __alpha_remap_area_pages(unsigned long address, unsigned long phys_addr, | |||
84 | address = (address + PGDIR_SIZE) & PGDIR_MASK; | 81 | address = (address + PGDIR_SIZE) & PGDIR_MASK; |
85 | dir++; | 82 | dir++; |
86 | } while (address && (address < end)); | 83 | } while (address && (address < end)); |
87 | spin_unlock(&init_mm.page_table_lock); | ||
88 | return error; | 84 | return error; |
89 | } | 85 | } |
90 | 86 | ||
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index a94d75fef598..a917e3dd3666 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c | |||
@@ -139,93 +139,33 @@ struct iwmmxt_sigframe { | |||
139 | unsigned long storage[0x98/4]; | 139 | unsigned long storage[0x98/4]; |
140 | }; | 140 | }; |
141 | 141 | ||
142 | static int page_present(struct mm_struct *mm, void __user *uptr, int wr) | ||
143 | { | ||
144 | unsigned long addr = (unsigned long)uptr; | ||
145 | pgd_t *pgd = pgd_offset(mm, addr); | ||
146 | if (pgd_present(*pgd)) { | ||
147 | pmd_t *pmd = pmd_offset(pgd, addr); | ||
148 | if (pmd_present(*pmd)) { | ||
149 | pte_t *pte = pte_offset_map(pmd, addr); | ||
150 | return (pte_present(*pte) && (!wr || pte_write(*pte))); | ||
151 | } | ||
152 | } | ||
153 | return 0; | ||
154 | } | ||
155 | |||
156 | static int copy_locked(void __user *uptr, void *kptr, size_t size, int write, | ||
157 | void (*copyfn)(void *, void __user *)) | ||
158 | { | ||
159 | unsigned char v, __user *userptr = uptr; | ||
160 | int err = 0; | ||
161 | |||
162 | do { | ||
163 | struct mm_struct *mm; | ||
164 | |||
165 | if (write) { | ||
166 | __put_user_error(0, userptr, err); | ||
167 | __put_user_error(0, userptr + size - 1, err); | ||
168 | } else { | ||
169 | __get_user_error(v, userptr, err); | ||
170 | __get_user_error(v, userptr + size - 1, err); | ||
171 | } | ||
172 | |||
173 | if (err) | ||
174 | break; | ||
175 | |||
176 | mm = current->mm; | ||
177 | spin_lock(&mm->page_table_lock); | ||
178 | if (page_present(mm, userptr, write) && | ||
179 | page_present(mm, userptr + size - 1, write)) { | ||
180 | copyfn(kptr, uptr); | ||
181 | } else | ||
182 | err = 1; | ||
183 | spin_unlock(&mm->page_table_lock); | ||
184 | } while (err); | ||
185 | |||
186 | return err; | ||
187 | } | ||
188 | |||
189 | static int preserve_iwmmxt_context(struct iwmmxt_sigframe *frame) | 142 | static int preserve_iwmmxt_context(struct iwmmxt_sigframe *frame) |
190 | { | 143 | { |
191 | int err = 0; | 144 | char kbuf[sizeof(*frame) + 8]; |
145 | struct iwmmxt_sigframe *kframe; | ||
192 | 146 | ||
193 | /* the iWMMXt context must be 64 bit aligned */ | 147 | /* the iWMMXt context must be 64 bit aligned */ |
194 | WARN_ON((unsigned long)frame & 7); | 148 | kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7); |
195 | 149 | kframe->magic0 = IWMMXT_MAGIC0; | |
196 | __put_user_error(IWMMXT_MAGIC0, &frame->magic0, err); | 150 | kframe->magic1 = IWMMXT_MAGIC1; |
197 | __put_user_error(IWMMXT_MAGIC1, &frame->magic1, err); | 151 | iwmmxt_task_copy(current_thread_info(), &kframe->storage); |
198 | 152 | return __copy_to_user(frame, kframe, sizeof(*frame)); | |
199 | /* | ||
200 | * iwmmxt_task_copy() doesn't check user permissions. | ||
201 | * Let's do a dummy write on the upper boundary to ensure | ||
202 | * access to user mem is OK all way up. | ||
203 | */ | ||
204 | err |= copy_locked(&frame->storage, current_thread_info(), | ||
205 | sizeof(frame->storage), 1, iwmmxt_task_copy); | ||
206 | return err; | ||
207 | } | 153 | } |
208 | 154 | ||
209 | static int restore_iwmmxt_context(struct iwmmxt_sigframe *frame) | 155 | static int restore_iwmmxt_context(struct iwmmxt_sigframe *frame) |
210 | { | 156 | { |
211 | unsigned long magic0, magic1; | 157 | char kbuf[sizeof(*frame) + 8]; |
212 | int err = 0; | 158 | struct iwmmxt_sigframe *kframe; |
213 | 159 | ||
214 | /* the iWMMXt context is 64 bit aligned */ | 160 | /* the iWMMXt context must be 64 bit aligned */ |
215 | WARN_ON((unsigned long)frame & 7); | 161 | kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7); |
216 | 162 | if (__copy_from_user(kframe, frame, sizeof(*frame))) | |
217 | /* | 163 | return -1; |
218 | * Validate iWMMXt context signature. | 164 | if (kframe->magic0 != IWMMXT_MAGIC0 || |
219 | * Also, iwmmxt_task_restore() doesn't check user permissions. | 165 | kframe->magic1 != IWMMXT_MAGIC1) |
220 | * Let's do a dummy write on the upper boundary to ensure | 166 | return -1; |
221 | * access to user mem is OK all way up. | 167 | iwmmxt_task_restore(current_thread_info(), &kframe->storage); |
222 | */ | 168 | return 0; |
223 | __get_user_error(magic0, &frame->magic0, err); | ||
224 | __get_user_error(magic1, &frame->magic1, err); | ||
225 | if (!err && magic0 == IWMMXT_MAGIC0 && magic1 == IWMMXT_MAGIC1) | ||
226 | err = copy_locked(&frame->storage, current_thread_info(), | ||
227 | sizeof(frame->storage), 0, iwmmxt_task_restore); | ||
228 | return err; | ||
229 | } | 169 | } |
230 | 170 | ||
231 | #endif | 171 | #endif |
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index baa09601a64e..66e5a0516f23 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c | |||
@@ -483,29 +483,33 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs) | |||
483 | unsigned long addr = regs->ARM_r2; | 483 | unsigned long addr = regs->ARM_r2; |
484 | struct mm_struct *mm = current->mm; | 484 | struct mm_struct *mm = current->mm; |
485 | pgd_t *pgd; pmd_t *pmd; pte_t *pte; | 485 | pgd_t *pgd; pmd_t *pmd; pte_t *pte; |
486 | spinlock_t *ptl; | ||
486 | 487 | ||
487 | regs->ARM_cpsr &= ~PSR_C_BIT; | 488 | regs->ARM_cpsr &= ~PSR_C_BIT; |
488 | spin_lock(&mm->page_table_lock); | 489 | down_read(&mm->mmap_sem); |
489 | pgd = pgd_offset(mm, addr); | 490 | pgd = pgd_offset(mm, addr); |
490 | if (!pgd_present(*pgd)) | 491 | if (!pgd_present(*pgd)) |
491 | goto bad_access; | 492 | goto bad_access; |
492 | pmd = pmd_offset(pgd, addr); | 493 | pmd = pmd_offset(pgd, addr); |
493 | if (!pmd_present(*pmd)) | 494 | if (!pmd_present(*pmd)) |
494 | goto bad_access; | 495 | goto bad_access; |
495 | pte = pte_offset_map(pmd, addr); | 496 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
496 | if (!pte_present(*pte) || !pte_write(*pte)) | 497 | if (!pte_present(*pte) || !pte_write(*pte)) { |
498 | pte_unmap_unlock(pte, ptl); | ||
497 | goto bad_access; | 499 | goto bad_access; |
500 | } | ||
498 | val = *(unsigned long *)addr; | 501 | val = *(unsigned long *)addr; |
499 | val -= regs->ARM_r0; | 502 | val -= regs->ARM_r0; |
500 | if (val == 0) { | 503 | if (val == 0) { |
501 | *(unsigned long *)addr = regs->ARM_r1; | 504 | *(unsigned long *)addr = regs->ARM_r1; |
502 | regs->ARM_cpsr |= PSR_C_BIT; | 505 | regs->ARM_cpsr |= PSR_C_BIT; |
503 | } | 506 | } |
504 | spin_unlock(&mm->page_table_lock); | 507 | pte_unmap_unlock(pte, ptl); |
508 | up_read(&mm->mmap_sem); | ||
505 | return val; | 509 | return val; |
506 | 510 | ||
507 | bad_access: | 511 | bad_access: |
508 | spin_unlock(&mm->page_table_lock); | 512 | up_read(&mm->mmap_sem); |
509 | /* simulate a write access fault */ | 513 | /* simulate a write access fault */ |
510 | do_DataAbort(addr, 15 + (1 << 11), regs); | 514 | do_DataAbort(addr, 15 + (1 << 11), regs); |
511 | return -1; | 515 | return -1; |
diff --git a/arch/arm/mm/consistent.c b/arch/arm/mm/consistent.c index 82f4d5e27c54..47b0b767f080 100644 --- a/arch/arm/mm/consistent.c +++ b/arch/arm/mm/consistent.c | |||
@@ -397,8 +397,6 @@ static int __init consistent_init(void) | |||
397 | pte_t *pte; | 397 | pte_t *pte; |
398 | int ret = 0; | 398 | int ret = 0; |
399 | 399 | ||
400 | spin_lock(&init_mm.page_table_lock); | ||
401 | |||
402 | do { | 400 | do { |
403 | pgd = pgd_offset(&init_mm, CONSISTENT_BASE); | 401 | pgd = pgd_offset(&init_mm, CONSISTENT_BASE); |
404 | pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE); | 402 | pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE); |
@@ -409,7 +407,7 @@ static int __init consistent_init(void) | |||
409 | } | 407 | } |
410 | WARN_ON(!pmd_none(*pmd)); | 408 | WARN_ON(!pmd_none(*pmd)); |
411 | 409 | ||
412 | pte = pte_alloc_kernel(&init_mm, pmd, CONSISTENT_BASE); | 410 | pte = pte_alloc_kernel(pmd, CONSISTENT_BASE); |
413 | if (!pte) { | 411 | if (!pte) { |
414 | printk(KERN_ERR "%s: no pte tables\n", __func__); | 412 | printk(KERN_ERR "%s: no pte tables\n", __func__); |
415 | ret = -ENOMEM; | 413 | ret = -ENOMEM; |
@@ -419,8 +417,6 @@ static int __init consistent_init(void) | |||
419 | consistent_pte = pte; | 417 | consistent_pte = pte; |
420 | } while (0); | 418 | } while (0); |
421 | 419 | ||
422 | spin_unlock(&init_mm.page_table_lock); | ||
423 | |||
424 | return ret; | 420 | return ret; |
425 | } | 421 | } |
426 | 422 | ||
diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index be4ab3d73c91..7fc1b35a6746 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c | |||
@@ -26,6 +26,11 @@ static unsigned long shared_pte_mask = L_PTE_CACHEABLE; | |||
26 | /* | 26 | /* |
27 | * We take the easy way out of this problem - we make the | 27 | * We take the easy way out of this problem - we make the |
28 | * PTE uncacheable. However, we leave the write buffer on. | 28 | * PTE uncacheable. However, we leave the write buffer on. |
29 | * | ||
30 | * Note that the pte lock held when calling update_mmu_cache must also | ||
31 | * guard the pte (somewhere else in the same mm) that we modify here. | ||
32 | * Therefore those configurations which might call adjust_pte (those | ||
33 | * without CONFIG_CPU_CACHE_VIPT) cannot support split page_table_lock. | ||
29 | */ | 34 | */ |
30 | static int adjust_pte(struct vm_area_struct *vma, unsigned long address) | 35 | static int adjust_pte(struct vm_area_struct *vma, unsigned long address) |
31 | { | 36 | { |
@@ -127,7 +132,7 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page); | |||
127 | * 2. If we have multiple shared mappings of the same space in | 132 | * 2. If we have multiple shared mappings of the same space in |
128 | * an object, we need to deal with the cache aliasing issues. | 133 | * an object, we need to deal with the cache aliasing issues. |
129 | * | 134 | * |
130 | * Note that the page_table_lock will be held. | 135 | * Note that the pte lock will be held. |
131 | */ | 136 | */ |
132 | void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t pte) | 137 | void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t pte) |
133 | { | 138 | { |
diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index 6fb1258df1b5..0f128c28fee4 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c | |||
@@ -75,7 +75,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, | |||
75 | 75 | ||
76 | pgprot = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY | L_PTE_WRITE | flags); | 76 | pgprot = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY | L_PTE_WRITE | flags); |
77 | do { | 77 | do { |
78 | pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); | 78 | pte_t * pte = pte_alloc_kernel(pmd, address); |
79 | if (!pte) | 79 | if (!pte) |
80 | return -ENOMEM; | 80 | return -ENOMEM; |
81 | remap_area_pte(pte, address, end - address, address + phys_addr, pgprot); | 81 | remap_area_pte(pte, address, end - address, address + phys_addr, pgprot); |
@@ -97,7 +97,6 @@ remap_area_pages(unsigned long start, unsigned long phys_addr, | |||
97 | phys_addr -= address; | 97 | phys_addr -= address; |
98 | dir = pgd_offset(&init_mm, address); | 98 | dir = pgd_offset(&init_mm, address); |
99 | BUG_ON(address >= end); | 99 | BUG_ON(address >= end); |
100 | spin_lock(&init_mm.page_table_lock); | ||
101 | do { | 100 | do { |
102 | pmd_t *pmd = pmd_alloc(&init_mm, dir, address); | 101 | pmd_t *pmd = pmd_alloc(&init_mm, dir, address); |
103 | if (!pmd) { | 102 | if (!pmd) { |
@@ -114,7 +113,6 @@ remap_area_pages(unsigned long start, unsigned long phys_addr, | |||
114 | dir++; | 113 | dir++; |
115 | } while (address && (address < end)); | 114 | } while (address && (address < end)); |
116 | 115 | ||
117 | spin_unlock(&init_mm.page_table_lock); | ||
118 | flush_cache_vmap(start, end); | 116 | flush_cache_vmap(start, end); |
119 | return err; | 117 | return err; |
120 | } | 118 | } |
diff --git a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c index 61bc2fa0511e..1221fdde1769 100644 --- a/arch/arm/mm/mm-armv.c +++ b/arch/arm/mm/mm-armv.c | |||
@@ -180,11 +180,6 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) | |||
180 | 180 | ||
181 | if (!vectors_high()) { | 181 | if (!vectors_high()) { |
182 | /* | 182 | /* |
183 | * This lock is here just to satisfy pmd_alloc and pte_lock | ||
184 | */ | ||
185 | spin_lock(&mm->page_table_lock); | ||
186 | |||
187 | /* | ||
188 | * On ARM, first page must always be allocated since it | 183 | * On ARM, first page must always be allocated since it |
189 | * contains the machine vectors. | 184 | * contains the machine vectors. |
190 | */ | 185 | */ |
@@ -201,23 +196,14 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) | |||
201 | set_pte(new_pte, *init_pte); | 196 | set_pte(new_pte, *init_pte); |
202 | pte_unmap_nested(init_pte); | 197 | pte_unmap_nested(init_pte); |
203 | pte_unmap(new_pte); | 198 | pte_unmap(new_pte); |
204 | |||
205 | spin_unlock(&mm->page_table_lock); | ||
206 | } | 199 | } |
207 | 200 | ||
208 | return new_pgd; | 201 | return new_pgd; |
209 | 202 | ||
210 | no_pte: | 203 | no_pte: |
211 | spin_unlock(&mm->page_table_lock); | ||
212 | pmd_free(new_pmd); | 204 | pmd_free(new_pmd); |
213 | free_pages((unsigned long)new_pgd, 2); | ||
214 | return NULL; | ||
215 | |||
216 | no_pmd: | 205 | no_pmd: |
217 | spin_unlock(&mm->page_table_lock); | ||
218 | free_pages((unsigned long)new_pgd, 2); | 206 | free_pages((unsigned long)new_pgd, 2); |
219 | return NULL; | ||
220 | |||
221 | no_pgd: | 207 | no_pgd: |
222 | return NULL; | 208 | return NULL; |
223 | } | 209 | } |
@@ -243,6 +229,7 @@ void free_pgd_slow(pgd_t *pgd) | |||
243 | pte = pmd_page(*pmd); | 229 | pte = pmd_page(*pmd); |
244 | pmd_clear(pmd); | 230 | pmd_clear(pmd); |
245 | dec_page_state(nr_page_table_pages); | 231 | dec_page_state(nr_page_table_pages); |
232 | pte_lock_deinit(pte); | ||
246 | pte_free(pte); | 233 | pte_free(pte); |
247 | pmd_free(pmd); | 234 | pmd_free(pmd); |
248 | free: | 235 | free: |
diff --git a/arch/arm/oprofile/backtrace.c b/arch/arm/oprofile/backtrace.c index df35c452a8bf..7c22c12618cc 100644 --- a/arch/arm/oprofile/backtrace.c +++ b/arch/arm/oprofile/backtrace.c | |||
@@ -49,42 +49,22 @@ static struct frame_tail* kernel_backtrace(struct frame_tail *tail) | |||
49 | 49 | ||
50 | static struct frame_tail* user_backtrace(struct frame_tail *tail) | 50 | static struct frame_tail* user_backtrace(struct frame_tail *tail) |
51 | { | 51 | { |
52 | struct frame_tail buftail; | 52 | struct frame_tail buftail[2]; |
53 | 53 | ||
54 | /* hardware pte might not be valid due to dirty/accessed bit emulation | 54 | /* Also check accessibility of one struct frame_tail beyond */ |
55 | * so we use copy_from_user and benefit from exception fixups */ | 55 | if (!access_ok(VERIFY_READ, tail, sizeof(buftail))) |
56 | if (copy_from_user(&buftail, tail, sizeof(struct frame_tail))) | 56 | return NULL; |
57 | if (__copy_from_user_inatomic(buftail, tail, sizeof(buftail))) | ||
57 | return NULL; | 58 | return NULL; |
58 | 59 | ||
59 | oprofile_add_trace(buftail.lr); | 60 | oprofile_add_trace(buftail[0].lr); |
60 | 61 | ||
61 | /* frame pointers should strictly progress back up the stack | 62 | /* frame pointers should strictly progress back up the stack |
62 | * (towards higher addresses) */ | 63 | * (towards higher addresses) */ |
63 | if (tail >= buftail.fp) | 64 | if (tail >= buftail[0].fp) |
64 | return NULL; | 65 | return NULL; |
65 | 66 | ||
66 | return buftail.fp-1; | 67 | return buftail[0].fp-1; |
67 | } | ||
68 | |||
69 | /* Compare two addresses and see if they're on the same page */ | ||
70 | #define CMP_ADDR_EQUAL(x,y,offset) ((((unsigned long) x) >> PAGE_SHIFT) \ | ||
71 | == ((((unsigned long) y) + offset) >> PAGE_SHIFT)) | ||
72 | |||
73 | /* check that the page(s) containing the frame tail are present */ | ||
74 | static int pages_present(struct frame_tail *tail) | ||
75 | { | ||
76 | struct mm_struct * mm = current->mm; | ||
77 | |||
78 | if (!check_user_page_readable(mm, (unsigned long)tail)) | ||
79 | return 0; | ||
80 | |||
81 | if (CMP_ADDR_EQUAL(tail, tail, 8)) | ||
82 | return 1; | ||
83 | |||
84 | if (!check_user_page_readable(mm, ((unsigned long)tail) + 8)) | ||
85 | return 0; | ||
86 | |||
87 | return 1; | ||
88 | } | 68 | } |
89 | 69 | ||
90 | /* | 70 | /* |
@@ -118,7 +98,6 @@ static int valid_kernel_stack(struct frame_tail *tail, struct pt_regs *regs) | |||
118 | void arm_backtrace(struct pt_regs * const regs, unsigned int depth) | 98 | void arm_backtrace(struct pt_regs * const regs, unsigned int depth) |
119 | { | 99 | { |
120 | struct frame_tail *tail; | 100 | struct frame_tail *tail; |
121 | unsigned long last_address = 0; | ||
122 | 101 | ||
123 | tail = ((struct frame_tail *) regs->ARM_fp) - 1; | 102 | tail = ((struct frame_tail *) regs->ARM_fp) - 1; |
124 | 103 | ||
@@ -132,13 +111,6 @@ void arm_backtrace(struct pt_regs * const regs, unsigned int depth) | |||
132 | return; | 111 | return; |
133 | } | 112 | } |
134 | 113 | ||
135 | while (depth-- && tail && !((unsigned long) tail & 3)) { | 114 | while (depth-- && tail && !((unsigned long) tail & 3)) |
136 | if ((!CMP_ADDR_EQUAL(last_address, tail, 0) | ||
137 | || !CMP_ADDR_EQUAL(last_address, tail, 8)) | ||
138 | && !pages_present(tail)) | ||
139 | return; | ||
140 | last_address = (unsigned long) tail; | ||
141 | tail = user_backtrace(tail); | 115 | tail = user_backtrace(tail); |
142 | } | ||
143 | } | 116 | } |
144 | |||
diff --git a/arch/arm26/mm/memc.c b/arch/arm26/mm/memc.c index 8e8a2bb2487d..34def6397c3c 100644 --- a/arch/arm26/mm/memc.c +++ b/arch/arm26/mm/memc.c | |||
@@ -79,12 +79,6 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) | |||
79 | goto no_pgd; | 79 | goto no_pgd; |
80 | 80 | ||
81 | /* | 81 | /* |
82 | * This lock is here just to satisfy pmd_alloc and pte_lock | ||
83 | * FIXME: I bet we could avoid taking it pretty much altogether | ||
84 | */ | ||
85 | spin_lock(&mm->page_table_lock); | ||
86 | |||
87 | /* | ||
88 | * On ARM, first page must always be allocated since it contains | 82 | * On ARM, first page must always be allocated since it contains |
89 | * the machine vectors. | 83 | * the machine vectors. |
90 | */ | 84 | */ |
@@ -92,7 +86,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) | |||
92 | if (!new_pmd) | 86 | if (!new_pmd) |
93 | goto no_pmd; | 87 | goto no_pmd; |
94 | 88 | ||
95 | new_pte = pte_alloc_kernel(mm, new_pmd, 0); | 89 | new_pte = pte_alloc_map(mm, new_pmd, 0); |
96 | if (!new_pte) | 90 | if (!new_pte) |
97 | goto no_pte; | 91 | goto no_pte; |
98 | 92 | ||
@@ -101,6 +95,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) | |||
101 | init_pte = pte_offset(init_pmd, 0); | 95 | init_pte = pte_offset(init_pmd, 0); |
102 | 96 | ||
103 | set_pte(new_pte, *init_pte); | 97 | set_pte(new_pte, *init_pte); |
98 | pte_unmap(new_pte); | ||
104 | 99 | ||
105 | /* | 100 | /* |
106 | * the page table entries are zeroed | 101 | * the page table entries are zeroed |
@@ -112,23 +107,14 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) | |||
112 | memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR, | 107 | memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR, |
113 | (PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t)); | 108 | (PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t)); |
114 | 109 | ||
115 | spin_unlock(&mm->page_table_lock); | ||
116 | |||
117 | /* update MEMC tables */ | 110 | /* update MEMC tables */ |
118 | cpu_memc_update_all(new_pgd); | 111 | cpu_memc_update_all(new_pgd); |
119 | return new_pgd; | 112 | return new_pgd; |
120 | 113 | ||
121 | no_pte: | 114 | no_pte: |
122 | spin_unlock(&mm->page_table_lock); | ||
123 | pmd_free(new_pmd); | 115 | pmd_free(new_pmd); |
124 | free_pgd_slow(new_pgd); | ||
125 | return NULL; | ||
126 | |||
127 | no_pmd: | 116 | no_pmd: |
128 | spin_unlock(&mm->page_table_lock); | ||
129 | free_pgd_slow(new_pgd); | 117 | free_pgd_slow(new_pgd); |
130 | return NULL; | ||
131 | |||
132 | no_pgd: | 118 | no_pgd: |
133 | return NULL; | 119 | return NULL; |
134 | } | 120 | } |
diff --git a/arch/cris/arch-v32/mm/tlb.c b/arch/cris/arch-v32/mm/tlb.c index 8233406798d3..b08a28bb58ab 100644 --- a/arch/cris/arch-v32/mm/tlb.c +++ b/arch/cris/arch-v32/mm/tlb.c | |||
@@ -175,6 +175,8 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm) | |||
175 | return 0; | 175 | return 0; |
176 | } | 176 | } |
177 | 177 | ||
178 | static DEFINE_SPINLOCK(mmu_context_lock); | ||
179 | |||
178 | /* Called in schedule() just before actually doing the switch_to. */ | 180 | /* Called in schedule() just before actually doing the switch_to. */ |
179 | void | 181 | void |
180 | switch_mm(struct mm_struct *prev, struct mm_struct *next, | 182 | switch_mm(struct mm_struct *prev, struct mm_struct *next, |
@@ -183,10 +185,10 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, | |||
183 | int cpu = smp_processor_id(); | 185 | int cpu = smp_processor_id(); |
184 | 186 | ||
185 | /* Make sure there is a MMU context. */ | 187 | /* Make sure there is a MMU context. */ |
186 | spin_lock(&next->page_table_lock); | 188 | spin_lock(&mmu_context_lock); |
187 | get_mmu_context(next); | 189 | get_mmu_context(next); |
188 | cpu_set(cpu, next->cpu_vm_mask); | 190 | cpu_set(cpu, next->cpu_vm_mask); |
189 | spin_unlock(&next->page_table_lock); | 191 | spin_unlock(&mmu_context_lock); |
190 | 192 | ||
191 | /* | 193 | /* |
192 | * Remember the pgd for the fault handlers. Keep a seperate copy of it | 194 | * Remember the pgd for the fault handlers. Keep a seperate copy of it |
diff --git a/arch/cris/mm/ioremap.c b/arch/cris/mm/ioremap.c index ebba11e270fa..a92ac9877582 100644 --- a/arch/cris/mm/ioremap.c +++ b/arch/cris/mm/ioremap.c | |||
@@ -52,7 +52,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo | |||
52 | if (address >= end) | 52 | if (address >= end) |
53 | BUG(); | 53 | BUG(); |
54 | do { | 54 | do { |
55 | pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); | 55 | pte_t * pte = pte_alloc_kernel(pmd, address); |
56 | if (!pte) | 56 | if (!pte) |
57 | return -ENOMEM; | 57 | return -ENOMEM; |
58 | remap_area_pte(pte, address, end - address, address + phys_addr, prot); | 58 | remap_area_pte(pte, address, end - address, address + phys_addr, prot); |
@@ -74,7 +74,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, | |||
74 | flush_cache_all(); | 74 | flush_cache_all(); |
75 | if (address >= end) | 75 | if (address >= end) |
76 | BUG(); | 76 | BUG(); |
77 | spin_lock(&init_mm.page_table_lock); | ||
78 | do { | 77 | do { |
79 | pud_t *pud; | 78 | pud_t *pud; |
80 | pmd_t *pmd; | 79 | pmd_t *pmd; |
@@ -94,7 +93,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, | |||
94 | address = (address + PGDIR_SIZE) & PGDIR_MASK; | 93 | address = (address + PGDIR_SIZE) & PGDIR_MASK; |
95 | dir++; | 94 | dir++; |
96 | } while (address && (address < end)); | 95 | } while (address && (address < end)); |
97 | spin_unlock(&init_mm.page_table_lock); | ||
98 | flush_tlb_all(); | 96 | flush_tlb_all(); |
99 | return error; | 97 | return error; |
100 | } | 98 | } |
diff --git a/arch/frv/mm/dma-alloc.c b/arch/frv/mm/dma-alloc.c index cfc4f97490c6..342823aad758 100644 --- a/arch/frv/mm/dma-alloc.c +++ b/arch/frv/mm/dma-alloc.c | |||
@@ -55,21 +55,18 @@ static int map_page(unsigned long va, unsigned long pa, pgprot_t prot) | |||
55 | pte_t *pte; | 55 | pte_t *pte; |
56 | int err = -ENOMEM; | 56 | int err = -ENOMEM; |
57 | 57 | ||
58 | spin_lock(&init_mm.page_table_lock); | ||
59 | |||
60 | /* Use upper 10 bits of VA to index the first level map */ | 58 | /* Use upper 10 bits of VA to index the first level map */ |
61 | pge = pgd_offset_k(va); | 59 | pge = pgd_offset_k(va); |
62 | pue = pud_offset(pge, va); | 60 | pue = pud_offset(pge, va); |
63 | pme = pmd_offset(pue, va); | 61 | pme = pmd_offset(pue, va); |
64 | 62 | ||
65 | /* Use middle 10 bits of VA to index the second-level map */ | 63 | /* Use middle 10 bits of VA to index the second-level map */ |
66 | pte = pte_alloc_kernel(&init_mm, pme, va); | 64 | pte = pte_alloc_kernel(pme, va); |
67 | if (pte != 0) { | 65 | if (pte != 0) { |
68 | err = 0; | 66 | err = 0; |
69 | set_pte(pte, mk_pte_phys(pa & PAGE_MASK, prot)); | 67 | set_pte(pte, mk_pte_phys(pa & PAGE_MASK, prot)); |
70 | } | 68 | } |
71 | 69 | ||
72 | spin_unlock(&init_mm.page_table_lock); | ||
73 | return err; | 70 | return err; |
74 | } | 71 | } |
75 | 72 | ||
diff --git a/arch/frv/mm/pgalloc.c b/arch/frv/mm/pgalloc.c index 4eaec0f3525b..2c67dfe5a6b3 100644 --- a/arch/frv/mm/pgalloc.c +++ b/arch/frv/mm/pgalloc.c | |||
@@ -87,14 +87,14 @@ static inline void pgd_list_add(pgd_t *pgd) | |||
87 | if (pgd_list) | 87 | if (pgd_list) |
88 | pgd_list->private = (unsigned long) &page->index; | 88 | pgd_list->private = (unsigned long) &page->index; |
89 | pgd_list = page; | 89 | pgd_list = page; |
90 | page->private = (unsigned long) &pgd_list; | 90 | set_page_private(page, (unsigned long)&pgd_list); |
91 | } | 91 | } |
92 | 92 | ||
93 | static inline void pgd_list_del(pgd_t *pgd) | 93 | static inline void pgd_list_del(pgd_t *pgd) |
94 | { | 94 | { |
95 | struct page *next, **pprev, *page = virt_to_page(pgd); | 95 | struct page *next, **pprev, *page = virt_to_page(pgd); |
96 | next = (struct page *) page->index; | 96 | next = (struct page *) page->index; |
97 | pprev = (struct page **) page->private; | 97 | pprev = (struct page **)page_private(page); |
98 | *pprev = next; | 98 | *pprev = next; |
99 | if (next) | 99 | if (next) |
100 | next->private = (unsigned long) pprev; | 100 | next->private = (unsigned long) pprev; |
diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c index 16b485009622..fc1993564f98 100644 --- a/arch/i386/kernel/vm86.c +++ b/arch/i386/kernel/vm86.c | |||
@@ -134,17 +134,16 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) | |||
134 | return ret; | 134 | return ret; |
135 | } | 135 | } |
136 | 136 | ||
137 | static void mark_screen_rdonly(struct task_struct * tsk) | 137 | static void mark_screen_rdonly(struct mm_struct *mm) |
138 | { | 138 | { |
139 | pgd_t *pgd; | 139 | pgd_t *pgd; |
140 | pud_t *pud; | 140 | pud_t *pud; |
141 | pmd_t *pmd; | 141 | pmd_t *pmd; |
142 | pte_t *pte, *mapped; | 142 | pte_t *pte; |
143 | spinlock_t *ptl; | ||
143 | int i; | 144 | int i; |
144 | 145 | ||
145 | preempt_disable(); | 146 | pgd = pgd_offset(mm, 0xA0000); |
146 | spin_lock(&tsk->mm->page_table_lock); | ||
147 | pgd = pgd_offset(tsk->mm, 0xA0000); | ||
148 | if (pgd_none_or_clear_bad(pgd)) | 147 | if (pgd_none_or_clear_bad(pgd)) |
149 | goto out; | 148 | goto out; |
150 | pud = pud_offset(pgd, 0xA0000); | 149 | pud = pud_offset(pgd, 0xA0000); |
@@ -153,16 +152,14 @@ static void mark_screen_rdonly(struct task_struct * tsk) | |||
153 | pmd = pmd_offset(pud, 0xA0000); | 152 | pmd = pmd_offset(pud, 0xA0000); |
154 | if (pmd_none_or_clear_bad(pmd)) | 153 | if (pmd_none_or_clear_bad(pmd)) |
155 | goto out; | 154 | goto out; |
156 | pte = mapped = pte_offset_map(pmd, 0xA0000); | 155 | pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); |
157 | for (i = 0; i < 32; i++) { | 156 | for (i = 0; i < 32; i++) { |
158 | if (pte_present(*pte)) | 157 | if (pte_present(*pte)) |
159 | set_pte(pte, pte_wrprotect(*pte)); | 158 | set_pte(pte, pte_wrprotect(*pte)); |
160 | pte++; | 159 | pte++; |
161 | } | 160 | } |
162 | pte_unmap(mapped); | 161 | pte_unmap_unlock(pte, ptl); |
163 | out: | 162 | out: |
164 | spin_unlock(&tsk->mm->page_table_lock); | ||
165 | preempt_enable(); | ||
166 | flush_tlb(); | 163 | flush_tlb(); |
167 | } | 164 | } |
168 | 165 | ||
@@ -306,7 +303,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk | |||
306 | 303 | ||
307 | tsk->thread.screen_bitmap = info->screen_bitmap; | 304 | tsk->thread.screen_bitmap = info->screen_bitmap; |
308 | if (info->flags & VM86_SCREEN_BITMAP) | 305 | if (info->flags & VM86_SCREEN_BITMAP) |
309 | mark_screen_rdonly(tsk); | 306 | mark_screen_rdonly(tsk->mm); |
310 | __asm__ __volatile__( | 307 | __asm__ __volatile__( |
311 | "xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t" | 308 | "xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t" |
312 | "movl %0,%%esp\n\t" | 309 | "movl %0,%%esp\n\t" |
diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index 244d8ec66be2..c4af9638dbfa 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c | |||
@@ -98,7 +98,7 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, | |||
98 | 98 | ||
99 | extern unsigned long find_max_low_pfn(void); | 99 | extern unsigned long find_max_low_pfn(void); |
100 | extern void find_max_pfn(void); | 100 | extern void find_max_pfn(void); |
101 | extern void one_highpage_init(struct page *, int, int); | 101 | extern void add_one_highpage_init(struct page *, int, int); |
102 | 102 | ||
103 | extern struct e820map e820; | 103 | extern struct e820map e820; |
104 | extern unsigned long init_pg_tables_end; | 104 | extern unsigned long init_pg_tables_end; |
@@ -427,7 +427,7 @@ void __init set_highmem_pages_init(int bad_ppro) | |||
427 | if (!pfn_valid(node_pfn)) | 427 | if (!pfn_valid(node_pfn)) |
428 | continue; | 428 | continue; |
429 | page = pfn_to_page(node_pfn); | 429 | page = pfn_to_page(node_pfn); |
430 | one_highpage_init(page, node_pfn, bad_ppro); | 430 | add_one_highpage_init(page, node_pfn, bad_ppro); |
431 | } | 431 | } |
432 | } | 432 | } |
433 | totalram_pages += totalhigh_pages; | 433 | totalram_pages += totalhigh_pages; |
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 2ebaf75f732e..542d9298da5e 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
28 | #include <linux/proc_fs.h> | 28 | #include <linux/proc_fs.h> |
29 | #include <linux/efi.h> | 29 | #include <linux/efi.h> |
30 | #include <linux/memory_hotplug.h> | ||
30 | 31 | ||
31 | #include <asm/processor.h> | 32 | #include <asm/processor.h> |
32 | #include <asm/system.h> | 33 | #include <asm/system.h> |
@@ -266,17 +267,46 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) | |||
266 | pkmap_page_table = pte; | 267 | pkmap_page_table = pte; |
267 | } | 268 | } |
268 | 269 | ||
269 | void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) | 270 | void __devinit free_new_highpage(struct page *page) |
271 | { | ||
272 | set_page_count(page, 1); | ||
273 | __free_page(page); | ||
274 | totalhigh_pages++; | ||
275 | } | ||
276 | |||
277 | void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) | ||
270 | { | 278 | { |
271 | if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { | 279 | if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { |
272 | ClearPageReserved(page); | 280 | ClearPageReserved(page); |
273 | set_page_count(page, 1); | 281 | free_new_highpage(page); |
274 | __free_page(page); | ||
275 | totalhigh_pages++; | ||
276 | } else | 282 | } else |
277 | SetPageReserved(page); | 283 | SetPageReserved(page); |
278 | } | 284 | } |
279 | 285 | ||
286 | static int add_one_highpage_hotplug(struct page *page, unsigned long pfn) | ||
287 | { | ||
288 | free_new_highpage(page); | ||
289 | totalram_pages++; | ||
290 | #ifdef CONFIG_FLATMEM | ||
291 | max_mapnr = max(pfn, max_mapnr); | ||
292 | #endif | ||
293 | num_physpages++; | ||
294 | return 0; | ||
295 | } | ||
296 | |||
297 | /* | ||
298 | * Not currently handling the NUMA case. | ||
299 | * Assuming single node and all memory that | ||
300 | * has been added dynamically that would be | ||
301 | * onlined here is in HIGHMEM | ||
302 | */ | ||
303 | void online_page(struct page *page) | ||
304 | { | ||
305 | ClearPageReserved(page); | ||
306 | add_one_highpage_hotplug(page, page_to_pfn(page)); | ||
307 | } | ||
308 | |||
309 | |||
280 | #ifdef CONFIG_NUMA | 310 | #ifdef CONFIG_NUMA |
281 | extern void set_highmem_pages_init(int); | 311 | extern void set_highmem_pages_init(int); |
282 | #else | 312 | #else |
@@ -284,7 +314,7 @@ static void __init set_highmem_pages_init(int bad_ppro) | |||
284 | { | 314 | { |
285 | int pfn; | 315 | int pfn; |
286 | for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) | 316 | for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) |
287 | one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); | 317 | add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); |
288 | totalram_pages += totalhigh_pages; | 318 | totalram_pages += totalhigh_pages; |
289 | } | 319 | } |
290 | #endif /* CONFIG_FLATMEM */ | 320 | #endif /* CONFIG_FLATMEM */ |
@@ -615,6 +645,28 @@ void __init mem_init(void) | |||
615 | #endif | 645 | #endif |
616 | } | 646 | } |
617 | 647 | ||
648 | /* | ||
649 | * this is for the non-NUMA, single node SMP system case. | ||
650 | * Specifically, in the case of x86, we will always add | ||
651 | * memory to the highmem for now. | ||
652 | */ | ||
653 | #ifndef CONFIG_NEED_MULTIPLE_NODES | ||
654 | int add_memory(u64 start, u64 size) | ||
655 | { | ||
656 | struct pglist_data *pgdata = &contig_page_data; | ||
657 | struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; | ||
658 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
659 | unsigned long nr_pages = size >> PAGE_SHIFT; | ||
660 | |||
661 | return __add_pages(zone, start_pfn, nr_pages); | ||
662 | } | ||
663 | |||
664 | int remove_memory(u64 start, u64 size) | ||
665 | { | ||
666 | return -EINVAL; | ||
667 | } | ||
668 | #endif | ||
669 | |||
618 | kmem_cache_t *pgd_cache; | 670 | kmem_cache_t *pgd_cache; |
619 | kmem_cache_t *pmd_cache; | 671 | kmem_cache_t *pmd_cache; |
620 | 672 | ||
diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c index f379b8d67558..5d09de8d1c6b 100644 --- a/arch/i386/mm/ioremap.c +++ b/arch/i386/mm/ioremap.c | |||
@@ -28,7 +28,7 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, | |||
28 | unsigned long pfn; | 28 | unsigned long pfn; |
29 | 29 | ||
30 | pfn = phys_addr >> PAGE_SHIFT; | 30 | pfn = phys_addr >> PAGE_SHIFT; |
31 | pte = pte_alloc_kernel(&init_mm, pmd, addr); | 31 | pte = pte_alloc_kernel(pmd, addr); |
32 | if (!pte) | 32 | if (!pte) |
33 | return -ENOMEM; | 33 | return -ENOMEM; |
34 | do { | 34 | do { |
@@ -87,14 +87,12 @@ static int ioremap_page_range(unsigned long addr, | |||
87 | flush_cache_all(); | 87 | flush_cache_all(); |
88 | phys_addr -= addr; | 88 | phys_addr -= addr; |
89 | pgd = pgd_offset_k(addr); | 89 | pgd = pgd_offset_k(addr); |
90 | spin_lock(&init_mm.page_table_lock); | ||
91 | do { | 90 | do { |
92 | next = pgd_addr_end(addr, end); | 91 | next = pgd_addr_end(addr, end); |
93 | err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, flags); | 92 | err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, flags); |
94 | if (err) | 93 | if (err) |
95 | break; | 94 | break; |
96 | } while (pgd++, addr = next, addr != end); | 95 | } while (pgd++, addr = next, addr != end); |
97 | spin_unlock(&init_mm.page_table_lock); | ||
98 | flush_tlb_all(); | 96 | flush_tlb_all(); |
99 | return err; | 97 | return err; |
100 | } | 98 | } |
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index dcdce2c6c532..9db3242103be 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c | |||
@@ -31,11 +31,13 @@ void show_mem(void) | |||
31 | pg_data_t *pgdat; | 31 | pg_data_t *pgdat; |
32 | unsigned long i; | 32 | unsigned long i; |
33 | struct page_state ps; | 33 | struct page_state ps; |
34 | unsigned long flags; | ||
34 | 35 | ||
35 | printk(KERN_INFO "Mem-info:\n"); | 36 | printk(KERN_INFO "Mem-info:\n"); |
36 | show_free_areas(); | 37 | show_free_areas(); |
37 | printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); | 38 | printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); |
38 | for_each_pgdat(pgdat) { | 39 | for_each_pgdat(pgdat) { |
40 | pgdat_resize_lock(pgdat, &flags); | ||
39 | for (i = 0; i < pgdat->node_spanned_pages; ++i) { | 41 | for (i = 0; i < pgdat->node_spanned_pages; ++i) { |
40 | page = pgdat_page_nr(pgdat, i); | 42 | page = pgdat_page_nr(pgdat, i); |
41 | total++; | 43 | total++; |
@@ -48,6 +50,7 @@ void show_mem(void) | |||
48 | else if (page_count(page)) | 50 | else if (page_count(page)) |
49 | shared += page_count(page) - 1; | 51 | shared += page_count(page) - 1; |
50 | } | 52 | } |
53 | pgdat_resize_unlock(pgdat, &flags); | ||
51 | } | 54 | } |
52 | printk(KERN_INFO "%d pages of RAM\n", total); | 55 | printk(KERN_INFO "%d pages of RAM\n", total); |
53 | printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); | 56 | printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); |
@@ -188,19 +191,19 @@ static inline void pgd_list_add(pgd_t *pgd) | |||
188 | struct page *page = virt_to_page(pgd); | 191 | struct page *page = virt_to_page(pgd); |
189 | page->index = (unsigned long)pgd_list; | 192 | page->index = (unsigned long)pgd_list; |
190 | if (pgd_list) | 193 | if (pgd_list) |
191 | pgd_list->private = (unsigned long)&page->index; | 194 | set_page_private(pgd_list, (unsigned long)&page->index); |
192 | pgd_list = page; | 195 | pgd_list = page; |
193 | page->private = (unsigned long)&pgd_list; | 196 | set_page_private(page, (unsigned long)&pgd_list); |
194 | } | 197 | } |
195 | 198 | ||
196 | static inline void pgd_list_del(pgd_t *pgd) | 199 | static inline void pgd_list_del(pgd_t *pgd) |
197 | { | 200 | { |
198 | struct page *next, **pprev, *page = virt_to_page(pgd); | 201 | struct page *next, **pprev, *page = virt_to_page(pgd); |
199 | next = (struct page *)page->index; | 202 | next = (struct page *)page->index; |
200 | pprev = (struct page **)page->private; | 203 | pprev = (struct page **)page_private(page); |
201 | *pprev = next; | 204 | *pprev = next; |
202 | if (next) | 205 | if (next) |
203 | next->private = (unsigned long)pprev; | 206 | set_page_private(next, (unsigned long)pprev); |
204 | } | 207 | } |
205 | 208 | ||
206 | void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) | 209 | void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) |
diff --git a/arch/i386/oprofile/backtrace.c b/arch/i386/oprofile/backtrace.c index 65dfd2edb671..21654be3f73f 100644 --- a/arch/i386/oprofile/backtrace.c +++ b/arch/i386/oprofile/backtrace.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/sched.h> | 12 | #include <linux/sched.h> |
13 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
14 | #include <asm/ptrace.h> | 14 | #include <asm/ptrace.h> |
15 | #include <asm/uaccess.h> | ||
15 | 16 | ||
16 | struct frame_head { | 17 | struct frame_head { |
17 | struct frame_head * ebp; | 18 | struct frame_head * ebp; |
@@ -21,26 +22,22 @@ struct frame_head { | |||
21 | static struct frame_head * | 22 | static struct frame_head * |
22 | dump_backtrace(struct frame_head * head) | 23 | dump_backtrace(struct frame_head * head) |
23 | { | 24 | { |
24 | oprofile_add_trace(head->ret); | 25 | struct frame_head bufhead[2]; |
25 | 26 | ||
26 | /* frame pointers should strictly progress back up the stack | 27 | /* Also check accessibility of one struct frame_head beyond */ |
27 | * (towards higher addresses) */ | 28 | if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) |
28 | if (head >= head->ebp) | 29 | return NULL; |
30 | if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) | ||
29 | return NULL; | 31 | return NULL; |
30 | 32 | ||
31 | return head->ebp; | 33 | oprofile_add_trace(bufhead[0].ret); |
32 | } | ||
33 | |||
34 | /* check that the page(s) containing the frame head are present */ | ||
35 | static int pages_present(struct frame_head * head) | ||
36 | { | ||
37 | struct mm_struct * mm = current->mm; | ||
38 | 34 | ||
39 | /* FIXME: only necessary once per page */ | 35 | /* frame pointers should strictly progress back up the stack |
40 | if (!check_user_page_readable(mm, (unsigned long)head)) | 36 | * (towards higher addresses) */ |
41 | return 0; | 37 | if (head >= bufhead[0].ebp) |
38 | return NULL; | ||
42 | 39 | ||
43 | return check_user_page_readable(mm, (unsigned long)(head + 1)); | 40 | return bufhead[0].ebp; |
44 | } | 41 | } |
45 | 42 | ||
46 | /* | 43 | /* |
@@ -97,15 +94,6 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) | |||
97 | return; | 94 | return; |
98 | } | 95 | } |
99 | 96 | ||
100 | #ifdef CONFIG_SMP | 97 | while (depth-- && head) |
101 | if (!spin_trylock(&current->mm->page_table_lock)) | ||
102 | return; | ||
103 | #endif | ||
104 | |||
105 | while (depth-- && head && pages_present(head)) | ||
106 | head = dump_backtrace(head); | 98 | head = dump_backtrace(head); |
107 | |||
108 | #ifdef CONFIG_SMP | ||
109 | spin_unlock(&current->mm->page_table_lock); | ||
110 | #endif | ||
111 | } | 99 | } |
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index d71731ee5b61..f7dfc107cb7b 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c | |||
@@ -2352,7 +2352,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon | |||
2352 | insert_vm_struct(mm, vma); | 2352 | insert_vm_struct(mm, vma); |
2353 | 2353 | ||
2354 | mm->total_vm += size >> PAGE_SHIFT; | 2354 | mm->total_vm += size >> PAGE_SHIFT; |
2355 | vm_stat_account(vma); | 2355 | vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, |
2356 | vma_pages(vma)); | ||
2356 | up_write(&task->mm->mmap_sem); | 2357 | up_write(&task->mm->mmap_sem); |
2357 | 2358 | ||
2358 | /* | 2359 | /* |
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index a3788fb84809..a88cdb7232f8 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c | |||
@@ -555,9 +555,13 @@ void show_mem(void) | |||
555 | show_free_areas(); | 555 | show_free_areas(); |
556 | printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); | 556 | printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); |
557 | for_each_pgdat(pgdat) { | 557 | for_each_pgdat(pgdat) { |
558 | unsigned long present = pgdat->node_present_pages; | 558 | unsigned long present; |
559 | unsigned long flags; | ||
559 | int shared = 0, cached = 0, reserved = 0; | 560 | int shared = 0, cached = 0, reserved = 0; |
561 | |||
560 | printk("Node ID: %d\n", pgdat->node_id); | 562 | printk("Node ID: %d\n", pgdat->node_id); |
563 | pgdat_resize_lock(pgdat, &flags); | ||
564 | present = pgdat->node_present_pages; | ||
561 | for(i = 0; i < pgdat->node_spanned_pages; i++) { | 565 | for(i = 0; i < pgdat->node_spanned_pages; i++) { |
562 | struct page *page; | 566 | struct page *page; |
563 | if (pfn_valid(pgdat->node_start_pfn + i)) | 567 | if (pfn_valid(pgdat->node_start_pfn + i)) |
@@ -571,6 +575,7 @@ void show_mem(void) | |||
571 | else if (page_count(page)) | 575 | else if (page_count(page)) |
572 | shared += page_count(page)-1; | 576 | shared += page_count(page)-1; |
573 | } | 577 | } |
578 | pgdat_resize_unlock(pgdat, &flags); | ||
574 | total_present += present; | 579 | total_present += present; |
575 | total_reserved += reserved; | 580 | total_reserved += reserved; |
576 | total_cached += cached; | 581 | total_cached += cached; |
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 3c32af910d60..af7eb087dca7 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c | |||
@@ -20,32 +20,6 @@ | |||
20 | extern void die (char *, struct pt_regs *, long); | 20 | extern void die (char *, struct pt_regs *, long); |
21 | 21 | ||
22 | /* | 22 | /* |
23 | * This routine is analogous to expand_stack() but instead grows the | ||
24 | * register backing store (which grows towards higher addresses). | ||
25 | * Since the register backing store is access sequentially, we | ||
26 | * disallow growing the RBS by more than a page at a time. Note that | ||
27 | * the VM_GROWSUP flag can be set on any VM area but that's fine | ||
28 | * because the total process size is still limited by RLIMIT_STACK and | ||
29 | * RLIMIT_AS. | ||
30 | */ | ||
31 | static inline long | ||
32 | expand_backing_store (struct vm_area_struct *vma, unsigned long address) | ||
33 | { | ||
34 | unsigned long grow; | ||
35 | |||
36 | grow = PAGE_SIZE >> PAGE_SHIFT; | ||
37 | if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur | ||
38 | || (((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->signal->rlim[RLIMIT_AS].rlim_cur)) | ||
39 | return -ENOMEM; | ||
40 | vma->vm_end += PAGE_SIZE; | ||
41 | vma->vm_mm->total_vm += grow; | ||
42 | if (vma->vm_flags & VM_LOCKED) | ||
43 | vma->vm_mm->locked_vm += grow; | ||
44 | __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow); | ||
45 | return 0; | ||
46 | } | ||
47 | |||
48 | /* | ||
49 | * Return TRUE if ADDRESS points at a page in the kernel's mapped segment | 23 | * Return TRUE if ADDRESS points at a page in the kernel's mapped segment |
50 | * (inside region 5, on ia64) and that page is present. | 24 | * (inside region 5, on ia64) and that page is present. |
51 | */ | 25 | */ |
@@ -185,7 +159,13 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re | |||
185 | if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) | 159 | if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) |
186 | || REGION_OFFSET(address) >= RGN_MAP_LIMIT) | 160 | || REGION_OFFSET(address) >= RGN_MAP_LIMIT) |
187 | goto bad_area; | 161 | goto bad_area; |
188 | if (expand_backing_store(vma, address)) | 162 | /* |
163 | * Since the register backing store is accessed sequentially, | ||
164 | * we disallow growing it by more than a page at a time. | ||
165 | */ | ||
166 | if (address > vma->vm_end + PAGE_SIZE - sizeof(long)) | ||
167 | goto bad_area; | ||
168 | if (expand_upwards(vma, address)) | ||
189 | goto bad_area; | 169 | goto bad_area; |
190 | } | 170 | } |
191 | goto good_area; | 171 | goto good_area; |
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 98246acd4991..e3215ba64ffd 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c | |||
@@ -158,7 +158,7 @@ ia64_init_addr_space (void) | |||
158 | vma->vm_start = current->thread.rbs_bot & PAGE_MASK; | 158 | vma->vm_start = current->thread.rbs_bot & PAGE_MASK; |
159 | vma->vm_end = vma->vm_start + PAGE_SIZE; | 159 | vma->vm_end = vma->vm_start + PAGE_SIZE; |
160 | vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7]; | 160 | vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7]; |
161 | vma->vm_flags = VM_DATA_DEFAULT_FLAGS | VM_GROWSUP; | 161 | vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT; |
162 | down_write(&current->mm->mmap_sem); | 162 | down_write(&current->mm->mmap_sem); |
163 | if (insert_vm_struct(current->mm, vma)) { | 163 | if (insert_vm_struct(current->mm, vma)) { |
164 | up_write(&current->mm->mmap_sem); | 164 | up_write(&current->mm->mmap_sem); |
@@ -275,26 +275,21 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot) | |||
275 | 275 | ||
276 | pgd = pgd_offset_k(address); /* note: this is NOT pgd_offset()! */ | 276 | pgd = pgd_offset_k(address); /* note: this is NOT pgd_offset()! */ |
277 | 277 | ||
278 | spin_lock(&init_mm.page_table_lock); | ||
279 | { | 278 | { |
280 | pud = pud_alloc(&init_mm, pgd, address); | 279 | pud = pud_alloc(&init_mm, pgd, address); |
281 | if (!pud) | 280 | if (!pud) |
282 | goto out; | 281 | goto out; |
283 | |||
284 | pmd = pmd_alloc(&init_mm, pud, address); | 282 | pmd = pmd_alloc(&init_mm, pud, address); |
285 | if (!pmd) | 283 | if (!pmd) |
286 | goto out; | 284 | goto out; |
287 | pte = pte_alloc_map(&init_mm, pmd, address); | 285 | pte = pte_alloc_kernel(pmd, address); |
288 | if (!pte) | 286 | if (!pte) |
289 | goto out; | 287 | goto out; |
290 | if (!pte_none(*pte)) { | 288 | if (!pte_none(*pte)) |
291 | pte_unmap(pte); | ||
292 | goto out; | 289 | goto out; |
293 | } | ||
294 | set_pte(pte, mk_pte(page, pgprot)); | 290 | set_pte(pte, mk_pte(page, pgprot)); |
295 | pte_unmap(pte); | ||
296 | } | 291 | } |
297 | out: spin_unlock(&init_mm.page_table_lock); | 292 | out: |
298 | /* no need for flush_tlb */ | 293 | /* no need for flush_tlb */ |
299 | return page; | 294 | return page; |
300 | } | 295 | } |
diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c index c93e0f2b5fea..c79a9b96d02b 100644 --- a/arch/ia64/mm/tlb.c +++ b/arch/ia64/mm/tlb.c | |||
@@ -158,10 +158,12 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long | |||
158 | # ifdef CONFIG_SMP | 158 | # ifdef CONFIG_SMP |
159 | platform_global_tlb_purge(mm, start, end, nbits); | 159 | platform_global_tlb_purge(mm, start, end, nbits); |
160 | # else | 160 | # else |
161 | preempt_disable(); | ||
161 | do { | 162 | do { |
162 | ia64_ptcl(start, (nbits<<2)); | 163 | ia64_ptcl(start, (nbits<<2)); |
163 | start += (1UL << nbits); | 164 | start += (1UL << nbits); |
164 | } while (start < end); | 165 | } while (start < end); |
166 | preempt_enable(); | ||
165 | # endif | 167 | # endif |
166 | 168 | ||
167 | ia64_srlz_i(); /* srlz.i implies srlz.d */ | 169 | ia64_srlz_i(); /* srlz.i implies srlz.d */ |
diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index d9a40b1fe8ba..6facf15b04f3 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c | |||
@@ -48,6 +48,8 @@ void show_mem(void) | |||
48 | show_free_areas(); | 48 | show_free_areas(); |
49 | printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); | 49 | printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); |
50 | for_each_pgdat(pgdat) { | 50 | for_each_pgdat(pgdat) { |
51 | unsigned long flags; | ||
52 | pgdat_resize_lock(pgdat, &flags); | ||
51 | for (i = 0; i < pgdat->node_spanned_pages; ++i) { | 53 | for (i = 0; i < pgdat->node_spanned_pages; ++i) { |
52 | page = pgdat_page_nr(pgdat, i); | 54 | page = pgdat_page_nr(pgdat, i); |
53 | total++; | 55 | total++; |
@@ -60,6 +62,7 @@ void show_mem(void) | |||
60 | else if (page_count(page)) | 62 | else if (page_count(page)) |
61 | shared += page_count(page) - 1; | 63 | shared += page_count(page) - 1; |
62 | } | 64 | } |
65 | pgdat_resize_unlock(pgdat, &flags); | ||
63 | } | 66 | } |
64 | printk("%d pages of RAM\n", total); | 67 | printk("%d pages of RAM\n", total); |
65 | printk("%d pages of HIGHMEM\n",highmem); | 68 | printk("%d pages of HIGHMEM\n",highmem); |
@@ -150,10 +153,14 @@ int __init reservedpages_count(void) | |||
150 | int reservedpages, nid, i; | 153 | int reservedpages, nid, i; |
151 | 154 | ||
152 | reservedpages = 0; | 155 | reservedpages = 0; |
153 | for_each_online_node(nid) | 156 | for_each_online_node(nid) { |
157 | unsigned long flags; | ||
158 | pgdat_resize_lock(NODE_DATA(nid), &flags); | ||
154 | for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++) | 159 | for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++) |
155 | if (PageReserved(nid_page_nr(nid, i))) | 160 | if (PageReserved(nid_page_nr(nid, i))) |
156 | reservedpages++; | 161 | reservedpages++; |
162 | pgdat_resize_unlock(NODE_DATA(nid), &flags); | ||
163 | } | ||
157 | 164 | ||
158 | return reservedpages; | 165 | return reservedpages; |
159 | } | 166 | } |
diff --git a/arch/m32r/mm/ioremap.c b/arch/m32r/mm/ioremap.c index 70c59055c19c..a151849a605e 100644 --- a/arch/m32r/mm/ioremap.c +++ b/arch/m32r/mm/ioremap.c | |||
@@ -67,7 +67,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, | |||
67 | if (address >= end) | 67 | if (address >= end) |
68 | BUG(); | 68 | BUG(); |
69 | do { | 69 | do { |
70 | pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); | 70 | pte_t * pte = pte_alloc_kernel(pmd, address); |
71 | if (!pte) | 71 | if (!pte) |
72 | return -ENOMEM; | 72 | return -ENOMEM; |
73 | remap_area_pte(pte, address, end - address, address + phys_addr, flags); | 73 | remap_area_pte(pte, address, end - address, address + phys_addr, flags); |
@@ -90,7 +90,6 @@ remap_area_pages(unsigned long address, unsigned long phys_addr, | |||
90 | flush_cache_all(); | 90 | flush_cache_all(); |
91 | if (address >= end) | 91 | if (address >= end) |
92 | BUG(); | 92 | BUG(); |
93 | spin_lock(&init_mm.page_table_lock); | ||
94 | do { | 93 | do { |
95 | pmd_t *pmd; | 94 | pmd_t *pmd; |
96 | pmd = pmd_alloc(&init_mm, dir, address); | 95 | pmd = pmd_alloc(&init_mm, dir, address); |
@@ -104,7 +103,6 @@ remap_area_pages(unsigned long address, unsigned long phys_addr, | |||
104 | address = (address + PGDIR_SIZE) & PGDIR_MASK; | 103 | address = (address + PGDIR_SIZE) & PGDIR_MASK; |
105 | dir++; | 104 | dir++; |
106 | } while (address && (address < end)); | 105 | } while (address && (address < end)); |
107 | spin_unlock(&init_mm.page_table_lock); | ||
108 | flush_tlb_all(); | 106 | flush_tlb_all(); |
109 | return error; | 107 | return error; |
110 | } | 108 | } |
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index ba960bbc8e6d..1dd5d18b2201 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig | |||
@@ -388,33 +388,11 @@ config AMIGA_PCMCIA | |||
388 | Include support in the kernel for pcmcia on Amiga 1200 and Amiga | 388 | Include support in the kernel for pcmcia on Amiga 1200 and Amiga |
389 | 600. If you intend to use pcmcia cards say Y; otherwise say N. | 389 | 600. If you intend to use pcmcia cards say Y; otherwise say N. |
390 | 390 | ||
391 | config STRAM_SWAP | ||
392 | bool "Support for ST-RAM as swap space" | ||
393 | depends on ATARI && BROKEN | ||
394 | ---help--- | ||
395 | Some Atari 68k machines (including the 520STF and 1020STE) divide | ||
396 | their addressable memory into ST and TT sections. The TT section | ||
397 | (up to 512MB) is the main memory; the ST section (up to 4MB) is | ||
398 | accessible to the built-in graphics board, runs slower, and is | ||
399 | present mainly for backward compatibility with older machines. | ||
400 | |||
401 | This enables support for using (parts of) ST-RAM as swap space, | ||
402 | instead of as normal system memory. This can first enhance system | ||
403 | performance if you have lots of alternate RAM (compared to the size | ||
404 | of ST-RAM), because executable code always will reside in faster | ||
405 | memory. ST-RAM will remain as ultra-fast swap space. On the other | ||
406 | hand, it allows much improved dynamic allocations of ST-RAM buffers | ||
407 | for device driver modules (e.g. floppy, ACSI, SLM printer, DMA | ||
408 | sound). The probability that such allocations at module load time | ||
409 | fail is drastically reduced. | ||
410 | |||
411 | config STRAM_PROC | 391 | config STRAM_PROC |
412 | bool "ST-RAM statistics in /proc" | 392 | bool "ST-RAM statistics in /proc" |
413 | depends on ATARI | 393 | depends on ATARI |
414 | help | 394 | help |
415 | Say Y here to report ST-RAM usage statistics in /proc/stram. See | 395 | Say Y here to report ST-RAM usage statistics in /proc/stram. |
416 | the help for CONFIG_STRAM_SWAP for discussion of ST-RAM and its | ||
417 | uses. | ||
418 | 396 | ||
419 | config HEARTBEAT | 397 | config HEARTBEAT |
420 | bool "Use power LED as a heartbeat" if AMIGA || APOLLO || ATARI || MAC ||Q40 | 398 | bool "Use power LED as a heartbeat" if AMIGA || APOLLO || ATARI || MAC ||Q40 |
diff --git a/arch/m68k/atari/stram.c b/arch/m68k/atari/stram.c index 5a3c106b40c8..22e0481a5f7b 100644 --- a/arch/m68k/atari/stram.c +++ b/arch/m68k/atari/stram.c | |||
@@ -15,11 +15,9 @@ | |||
15 | #include <linux/kdev_t.h> | 15 | #include <linux/kdev_t.h> |
16 | #include <linux/major.h> | 16 | #include <linux/major.h> |
17 | #include <linux/init.h> | 17 | #include <linux/init.h> |
18 | #include <linux/swap.h> | ||
19 | #include <linux/slab.h> | 18 | #include <linux/slab.h> |
20 | #include <linux/vmalloc.h> | 19 | #include <linux/vmalloc.h> |
21 | #include <linux/pagemap.h> | 20 | #include <linux/pagemap.h> |
22 | #include <linux/shm.h> | ||
23 | #include <linux/bootmem.h> | 21 | #include <linux/bootmem.h> |
24 | #include <linux/mount.h> | 22 | #include <linux/mount.h> |
25 | #include <linux/blkdev.h> | 23 | #include <linux/blkdev.h> |
@@ -33,8 +31,6 @@ | |||
33 | #include <asm/io.h> | 31 | #include <asm/io.h> |
34 | #include <asm/semaphore.h> | 32 | #include <asm/semaphore.h> |
35 | 33 | ||
36 | #include <linux/swapops.h> | ||
37 | |||
38 | #undef DEBUG | 34 | #undef DEBUG |
39 | 35 | ||
40 | #ifdef DEBUG | 36 | #ifdef DEBUG |
@@ -49,8 +45,7 @@ | |||
49 | #include <linux/proc_fs.h> | 45 | #include <linux/proc_fs.h> |
50 | #endif | 46 | #endif |
51 | 47 | ||
52 | /* Pre-swapping comments: | 48 | /* |
53 | * | ||
54 | * ++roman: | 49 | * ++roman: |
55 | * | 50 | * |
56 | * New version of ST-Ram buffer allocation. Instead of using the | 51 | * New version of ST-Ram buffer allocation. Instead of using the |
@@ -75,76 +70,6 @@ | |||
75 | * | 70 | * |
76 | */ | 71 | */ |
77 | 72 | ||
78 | /* | ||
79 | * New Nov 1997: Use ST-RAM as swap space! | ||
80 | * | ||
81 | * In the past, there were often problems with modules that require ST-RAM | ||
82 | * buffers. Such drivers have to use __get_dma_pages(), which unfortunately | ||
83 | * often isn't very successful in allocating more than 1 page :-( [1] The net | ||
84 | * result was that most of the time you couldn't insmod such modules (ataflop, | ||
85 | * ACSI, SCSI on Falcon, Atari internal framebuffer, not to speak of acsi_slm, | ||
86 | * which needs a 1 MB buffer... :-). | ||
87 | * | ||
88 | * To overcome this limitation, ST-RAM can now be turned into a very | ||
89 | * high-speed swap space. If a request for an ST-RAM buffer comes, the kernel | ||
90 | * now tries to unswap some pages on that swap device to make some free (and | ||
91 | * contiguous) space. This works much better in comparison to | ||
92 | * __get_dma_pages(), since used swap pages can be selectively freed by either | ||
93 | * moving them to somewhere else in swap space, or by reading them back into | ||
94 | * system memory. Ok, there operation of unswapping isn't really cheap (for | ||
95 | * each page, one has to go through the page tables of all processes), but it | ||
96 | * doesn't happen that often (only when allocation ST-RAM, i.e. when loading a | ||
97 | * module that needs ST-RAM). But it at least makes it possible to load such | ||
98 | * modules! | ||
99 | * | ||
100 | * It could also be that overall system performance increases a bit due to | ||
101 | * ST-RAM swapping, since slow ST-RAM isn't used anymore for holding data or | ||
102 | * executing code in. It's then just a (very fast, compared to disk) back | ||
103 | * storage for not-so-often needed data. (But this effect must be compared | ||
104 | * with the loss of total memory...) Don't know if the effect is already | ||
105 | * visible on a TT, where the speed difference between ST- and TT-RAM isn't | ||
106 | * that dramatic, but it should on machines where TT-RAM is really much faster | ||
107 | * (e.g. Afterburner). | ||
108 | * | ||
109 | * [1]: __get_free_pages() does a fine job if you only want one page, but if | ||
110 | * you want more (contiguous) pages, it can give you such a block only if | ||
111 | * there's already a free one. The algorithm can't try to free buffers or swap | ||
112 | * out something in order to make more free space, since all that page-freeing | ||
113 | * mechanisms work "target-less", i.e. they just free something, but not in a | ||
114 | * specific place. I.e., __get_free_pages() can't do anything to free | ||
115 | * *adjacent* pages :-( This situation becomes even worse for DMA memory, | ||
116 | * since the freeing algorithms are also blind to DMA capability of pages. | ||
117 | */ | ||
118 | |||
119 | /* 1998-10-20: ++andreas | ||
120 | unswap_by_move disabled because it does not handle swapped shm pages. | ||
121 | */ | ||
122 | |||
123 | /* 2000-05-01: ++andreas | ||
124 | Integrated with bootmem. Remove all traces of unswap_by_move. | ||
125 | */ | ||
126 | |||
127 | #ifdef CONFIG_STRAM_SWAP | ||
128 | #define ALIGN_IF_SWAP(x) PAGE_ALIGN(x) | ||
129 | #else | ||
130 | #define ALIGN_IF_SWAP(x) (x) | ||
131 | #endif | ||
132 | |||
133 | /* get index of swap page at address 'addr' */ | ||
134 | #define SWAP_NR(addr) (((addr) - swap_start) >> PAGE_SHIFT) | ||
135 | |||
136 | /* get address of swap page #'nr' */ | ||
137 | #define SWAP_ADDR(nr) (swap_start + ((nr) << PAGE_SHIFT)) | ||
138 | |||
139 | /* get number of pages for 'n' bytes (already page-aligned) */ | ||
140 | #define N_PAGES(n) ((n) >> PAGE_SHIFT) | ||
141 | |||
142 | /* The following two numbers define the maximum fraction of ST-RAM in total | ||
143 | * memory, below that the kernel would automatically use ST-RAM as swap | ||
144 | * space. This decision can be overridden with stram_swap= */ | ||
145 | #define MAX_STRAM_FRACTION_NOM 1 | ||
146 | #define MAX_STRAM_FRACTION_DENOM 3 | ||
147 | |||
148 | /* Start and end (virtual) of ST-RAM */ | 73 | /* Start and end (virtual) of ST-RAM */ |
149 | static void *stram_start, *stram_end; | 74 | static void *stram_start, *stram_end; |
150 | 75 | ||
@@ -164,10 +89,9 @@ typedef struct stram_block { | |||
164 | } BLOCK; | 89 | } BLOCK; |
165 | 90 | ||
166 | /* values for flags field */ | 91 | /* values for flags field */ |
167 | #define BLOCK_FREE 0x01 /* free structure in the BLOCKs pool */ | 92 | #define BLOCK_FREE 0x01 /* free structure in the BLOCKs pool */ |
168 | #define BLOCK_KMALLOCED 0x02 /* structure allocated by kmalloc() */ | 93 | #define BLOCK_KMALLOCED 0x02 /* structure allocated by kmalloc() */ |
169 | #define BLOCK_GFP 0x08 /* block allocated with __get_dma_pages() */ | 94 | #define BLOCK_GFP 0x08 /* block allocated with __get_dma_pages() */ |
170 | #define BLOCK_INSWAP 0x10 /* block allocated in swap space */ | ||
171 | 95 | ||
172 | /* list of allocated blocks */ | 96 | /* list of allocated blocks */ |
173 | static BLOCK *alloc_list; | 97 | static BLOCK *alloc_list; |
@@ -179,60 +103,8 @@ static BLOCK *alloc_list; | |||
179 | #define N_STATIC_BLOCKS 20 | 103 | #define N_STATIC_BLOCKS 20 |
180 | static BLOCK static_blocks[N_STATIC_BLOCKS]; | 104 | static BLOCK static_blocks[N_STATIC_BLOCKS]; |
181 | 105 | ||
182 | #ifdef CONFIG_STRAM_SWAP | ||
183 | /* max. number of bytes to use for swapping | ||
184 | * 0 = no ST-RAM swapping | ||
185 | * -1 = do swapping (to whole ST-RAM) if it's less than MAX_STRAM_FRACTION of | ||
186 | * total memory | ||
187 | */ | ||
188 | static int max_swap_size = -1; | ||
189 | |||
190 | /* start and end of swapping area */ | ||
191 | static void *swap_start, *swap_end; | ||
192 | |||
193 | /* The ST-RAM's swap info structure */ | ||
194 | static struct swap_info_struct *stram_swap_info; | ||
195 | |||
196 | /* The ST-RAM's swap type */ | ||
197 | static int stram_swap_type; | ||
198 | |||
199 | /* Semaphore for get_stram_region. */ | ||
200 | static DECLARE_MUTEX(stram_swap_sem); | ||
201 | |||
202 | /* major and minor device number of the ST-RAM device; for the major, we use | ||
203 | * the same as Amiga z2ram, which is really similar and impossible on Atari, | ||
204 | * and for the minor a relatively odd number to avoid the user creating and | ||
205 | * using that device. */ | ||
206 | #define STRAM_MAJOR Z2RAM_MAJOR | ||
207 | #define STRAM_MINOR 13 | ||
208 | |||
209 | /* Some impossible pointer value */ | ||
210 | #define MAGIC_FILE_P (struct file *)0xffffdead | ||
211 | |||
212 | #ifdef DO_PROC | ||
213 | static unsigned stat_swap_read; | ||
214 | static unsigned stat_swap_write; | ||
215 | static unsigned stat_swap_force; | ||
216 | #endif /* DO_PROC */ | ||
217 | |||
218 | #endif /* CONFIG_STRAM_SWAP */ | ||
219 | |||
220 | /***************************** Prototypes *****************************/ | 106 | /***************************** Prototypes *****************************/ |
221 | 107 | ||
222 | #ifdef CONFIG_STRAM_SWAP | ||
223 | static int swap_init(void *start_mem, void *swap_data); | ||
224 | static void *get_stram_region( unsigned long n_pages ); | ||
225 | static void free_stram_region( unsigned long offset, unsigned long n_pages | ||
226 | ); | ||
227 | static int in_some_region(void *addr); | ||
228 | static unsigned long find_free_region( unsigned long n_pages, unsigned long | ||
229 | *total_free, unsigned long | ||
230 | *region_free ); | ||
231 | static void do_stram_request(request_queue_t *); | ||
232 | static int stram_open( struct inode *inode, struct file *filp ); | ||
233 | static int stram_release( struct inode *inode, struct file *filp ); | ||
234 | static void reserve_region(void *start, void *end); | ||
235 | #endif | ||
236 | static BLOCK *add_region( void *addr, unsigned long size ); | 108 | static BLOCK *add_region( void *addr, unsigned long size ); |
237 | static BLOCK *find_region( void *addr ); | 109 | static BLOCK *find_region( void *addr ); |
238 | static int remove_region( BLOCK *block ); | 110 | static int remove_region( BLOCK *block ); |
@@ -279,84 +151,11 @@ void __init atari_stram_init(void) | |||
279 | */ | 151 | */ |
280 | void __init atari_stram_reserve_pages(void *start_mem) | 152 | void __init atari_stram_reserve_pages(void *start_mem) |
281 | { | 153 | { |
282 | #ifdef CONFIG_STRAM_SWAP | ||
283 | /* if max_swap_size is negative (i.e. no stram_swap= option given), | ||
284 | * determine at run time whether to use ST-RAM swapping */ | ||
285 | if (max_swap_size < 0) | ||
286 | /* Use swapping if ST-RAM doesn't make up more than MAX_STRAM_FRACTION | ||
287 | * of total memory. In that case, the max. size is set to 16 MB, | ||
288 | * because ST-RAM can never be bigger than that. | ||
289 | * Also, never use swapping on a Hades, there's no separate ST-RAM in | ||
290 | * that machine. */ | ||
291 | max_swap_size = | ||
292 | (!MACH_IS_HADES && | ||
293 | (N_PAGES(stram_end-stram_start)*MAX_STRAM_FRACTION_DENOM <= | ||
294 | ((unsigned long)high_memory>>PAGE_SHIFT)*MAX_STRAM_FRACTION_NOM)) ? 16*1024*1024 : 0; | ||
295 | DPRINTK( "atari_stram_reserve_pages: max_swap_size = %d\n", max_swap_size ); | ||
296 | #endif | ||
297 | |||
298 | /* always reserve first page of ST-RAM, the first 2 kB are | 154 | /* always reserve first page of ST-RAM, the first 2 kB are |
299 | * supervisor-only! */ | 155 | * supervisor-only! */ |
300 | if (!kernel_in_stram) | 156 | if (!kernel_in_stram) |
301 | reserve_bootmem (0, PAGE_SIZE); | 157 | reserve_bootmem (0, PAGE_SIZE); |
302 | 158 | ||
303 | #ifdef CONFIG_STRAM_SWAP | ||
304 | { | ||
305 | void *swap_data; | ||
306 | |||
307 | start_mem = (void *) PAGE_ALIGN ((unsigned long) start_mem); | ||
308 | /* determine first page to use as swap: if the kernel is | ||
309 | in TT-RAM, this is the first page of (usable) ST-RAM; | ||
310 | otherwise just use the end of kernel data (= start_mem) */ | ||
311 | swap_start = !kernel_in_stram ? stram_start + PAGE_SIZE : start_mem; | ||
312 | /* decrement by one page, rest of kernel assumes that first swap page | ||
313 | * is always reserved and maybe doesn't handle swp_entry == 0 | ||
314 | * correctly */ | ||
315 | swap_start -= PAGE_SIZE; | ||
316 | swap_end = stram_end; | ||
317 | if (swap_end-swap_start > max_swap_size) | ||
318 | swap_end = swap_start + max_swap_size; | ||
319 | DPRINTK( "atari_stram_reserve_pages: swapping enabled; " | ||
320 | "swap=%p-%p\n", swap_start, swap_end); | ||
321 | |||
322 | /* reserve some amount of memory for maintainance of | ||
323 | * swapping itself: one page for each 2048 (PAGE_SIZE/2) | ||
324 | * swap pages. (2 bytes for each page) */ | ||
325 | swap_data = start_mem; | ||
326 | start_mem += ((SWAP_NR(swap_end) + PAGE_SIZE/2 - 1) | ||
327 | >> (PAGE_SHIFT-1)) << PAGE_SHIFT; | ||
328 | /* correct swap_start if necessary */ | ||
329 | if (swap_start + PAGE_SIZE == swap_data) | ||
330 | swap_start = start_mem - PAGE_SIZE; | ||
331 | |||
332 | if (!swap_init( start_mem, swap_data )) { | ||
333 | printk( KERN_ERR "ST-RAM swap space initialization failed\n" ); | ||
334 | max_swap_size = 0; | ||
335 | return; | ||
336 | } | ||
337 | /* reserve region for swapping meta-data */ | ||
338 | reserve_region(swap_data, start_mem); | ||
339 | /* reserve swapping area itself */ | ||
340 | reserve_region(swap_start + PAGE_SIZE, swap_end); | ||
341 | |||
342 | /* | ||
343 | * If the whole ST-RAM is used for swapping, there are no allocatable | ||
344 | * dma pages left. But unfortunately, some shared parts of the kernel | ||
345 | * (particularly the SCSI mid-level) call __get_dma_pages() | ||
346 | * unconditionally :-( These calls then fail, and scsi.c even doesn't | ||
347 | * check for NULL return values and just crashes. The quick fix for | ||
348 | * this (instead of doing much clean up work in the SCSI code) is to | ||
349 | * pretend all pages are DMA-able by setting mach_max_dma_address to | ||
350 | * ULONG_MAX. This doesn't change any functionality so far, since | ||
351 | * get_dma_pages() shouldn't be used on Atari anyway anymore (better | ||
352 | * use atari_stram_alloc()), and the Atari SCSI drivers don't need DMA | ||
353 | * memory. But unfortunately there's now no kind of warning (even not | ||
354 | * a NULL return value) if you use get_dma_pages() nevertheless :-( | ||
355 | * You just will get non-DMA-able memory... | ||
356 | */ | ||
357 | mach_max_dma_address = 0xffffffff; | ||
358 | } | ||
359 | #endif | ||
360 | } | 159 | } |
361 | 160 | ||
362 | void atari_stram_mem_init_hook (void) | 161 | void atari_stram_mem_init_hook (void) |
@@ -367,7 +166,6 @@ void atari_stram_mem_init_hook (void) | |||
367 | 166 | ||
368 | /* | 167 | /* |
369 | * This is main public interface: somehow allocate a ST-RAM block | 168 | * This is main public interface: somehow allocate a ST-RAM block |
370 | * There are three strategies: | ||
371 | * | 169 | * |
372 | * - If we're before mem_init(), we have to make a static allocation. The | 170 | * - If we're before mem_init(), we have to make a static allocation. The |
373 | * region is taken in the kernel data area (if the kernel is in ST-RAM) or | 171 | * region is taken in the kernel data area (if the kernel is in ST-RAM) or |
@@ -375,14 +173,9 @@ void atari_stram_mem_init_hook (void) | |||
375 | * rsvd_stram_* region. The ST-RAM is somewhere in the middle of kernel | 173 | * rsvd_stram_* region. The ST-RAM is somewhere in the middle of kernel |
376 | * address space in the latter case. | 174 | * address space in the latter case. |
377 | * | 175 | * |
378 | * - If mem_init() already has been called and ST-RAM swapping is enabled, | 176 | * - If mem_init() already has been called, try with __get_dma_pages(). |
379 | * try to get the memory from the (pseudo) swap-space, either free already | 177 | * This has the disadvantage that it's very hard to get more than 1 page, |
380 | * or by moving some other pages out of the swap. | 178 | * and it is likely to fail :-( |
381 | * | ||
382 | * - If mem_init() already has been called, and ST-RAM swapping is not | ||
383 | * enabled, the only possibility is to try with __get_dma_pages(). This has | ||
384 | * the disadvantage that it's very hard to get more than 1 page, and it is | ||
385 | * likely to fail :-( | ||
386 | * | 179 | * |
387 | */ | 180 | */ |
388 | void *atari_stram_alloc(long size, const char *owner) | 181 | void *atari_stram_alloc(long size, const char *owner) |
@@ -393,27 +186,13 @@ void *atari_stram_alloc(long size, const char *owner) | |||
393 | 186 | ||
394 | DPRINTK("atari_stram_alloc(size=%08lx,owner=%s)\n", size, owner); | 187 | DPRINTK("atari_stram_alloc(size=%08lx,owner=%s)\n", size, owner); |
395 | 188 | ||
396 | size = ALIGN_IF_SWAP(size); | ||
397 | DPRINTK( "atari_stram_alloc: rounded size = %08lx\n", size ); | ||
398 | #ifdef CONFIG_STRAM_SWAP | ||
399 | if (max_swap_size) { | ||
400 | /* If swapping is active: make some free space in the swap | ||
401 | "device". */ | ||
402 | DPRINTK( "atari_stram_alloc: after mem_init, swapping ok, " | ||
403 | "calling get_region\n" ); | ||
404 | addr = get_stram_region( N_PAGES(size) ); | ||
405 | flags = BLOCK_INSWAP; | ||
406 | } | ||
407 | else | ||
408 | #endif | ||
409 | if (!mem_init_done) | 189 | if (!mem_init_done) |
410 | return alloc_bootmem_low(size); | 190 | return alloc_bootmem_low(size); |
411 | else { | 191 | else { |
412 | /* After mem_init() and no swapping: can only resort to | 192 | /* After mem_init(): can only resort to __get_dma_pages() */ |
413 | * __get_dma_pages() */ | ||
414 | addr = (void *)__get_dma_pages(GFP_KERNEL, get_order(size)); | 193 | addr = (void *)__get_dma_pages(GFP_KERNEL, get_order(size)); |
415 | flags = BLOCK_GFP; | 194 | flags = BLOCK_GFP; |
416 | DPRINTK( "atari_stram_alloc: after mem_init, swapping off, " | 195 | DPRINTK( "atari_stram_alloc: after mem_init, " |
417 | "get_pages=%p\n", addr ); | 196 | "get_pages=%p\n", addr ); |
418 | } | 197 | } |
419 | 198 | ||
@@ -422,12 +201,7 @@ void *atari_stram_alloc(long size, const char *owner) | |||
422 | /* out of memory for BLOCK structure :-( */ | 201 | /* out of memory for BLOCK structure :-( */ |
423 | DPRINTK( "atari_stram_alloc: out of mem for BLOCK -- " | 202 | DPRINTK( "atari_stram_alloc: out of mem for BLOCK -- " |
424 | "freeing again\n" ); | 203 | "freeing again\n" ); |
425 | #ifdef CONFIG_STRAM_SWAP | 204 | free_pages((unsigned long)addr, get_order(size)); |
426 | if (flags == BLOCK_INSWAP) | ||
427 | free_stram_region( SWAP_NR(addr), N_PAGES(size) ); | ||
428 | else | ||
429 | #endif | ||
430 | free_pages((unsigned long)addr, get_order(size)); | ||
431 | return( NULL ); | 205 | return( NULL ); |
432 | } | 206 | } |
433 | block->owner = owner; | 207 | block->owner = owner; |
@@ -451,25 +225,12 @@ void atari_stram_free( void *addr ) | |||
451 | DPRINTK( "atari_stram_free: found block (%p): size=%08lx, owner=%s, " | 225 | DPRINTK( "atari_stram_free: found block (%p): size=%08lx, owner=%s, " |
452 | "flags=%02x\n", block, block->size, block->owner, block->flags ); | 226 | "flags=%02x\n", block, block->size, block->owner, block->flags ); |
453 | 227 | ||
454 | #ifdef CONFIG_STRAM_SWAP | 228 | if (!(block->flags & BLOCK_GFP)) |
455 | if (!max_swap_size) { | ||
456 | #endif | ||
457 | if (block->flags & BLOCK_GFP) { | ||
458 | DPRINTK("atari_stram_free: is kmalloced, order_size=%d\n", | ||
459 | get_order(block->size)); | ||
460 | free_pages((unsigned long)addr, get_order(block->size)); | ||
461 | } | ||
462 | else | ||
463 | goto fail; | ||
464 | #ifdef CONFIG_STRAM_SWAP | ||
465 | } | ||
466 | else if (block->flags & BLOCK_INSWAP) { | ||
467 | DPRINTK( "atari_stram_free: is swap-alloced\n" ); | ||
468 | free_stram_region( SWAP_NR(block->start), N_PAGES(block->size) ); | ||
469 | } | ||
470 | else | ||
471 | goto fail; | 229 | goto fail; |
472 | #endif | 230 | |
231 | DPRINTK("atari_stram_free: is kmalloced, order_size=%d\n", | ||
232 | get_order(block->size)); | ||
233 | free_pages((unsigned long)addr, get_order(block->size)); | ||
473 | remove_region( block ); | 234 | remove_region( block ); |
474 | return; | 235 | return; |
475 | 236 | ||
@@ -478,612 +239,6 @@ void atari_stram_free( void *addr ) | |||
478 | "(called from %p)\n", addr, __builtin_return_address(0) ); | 239 | "(called from %p)\n", addr, __builtin_return_address(0) ); |
479 | } | 240 | } |
480 | 241 | ||
481 | |||
482 | #ifdef CONFIG_STRAM_SWAP | ||
483 | |||
484 | |||
485 | /* ------------------------------------------------------------------------ */ | ||
486 | /* Main Swapping Functions */ | ||
487 | /* ------------------------------------------------------------------------ */ | ||
488 | |||
489 | |||
490 | /* | ||
491 | * Initialize ST-RAM swap device | ||
492 | * (lots copied and modified from sys_swapon() in mm/swapfile.c) | ||
493 | */ | ||
494 | static int __init swap_init(void *start_mem, void *swap_data) | ||
495 | { | ||
496 | static struct dentry fake_dentry; | ||
497 | static struct vfsmount fake_vfsmnt; | ||
498 | struct swap_info_struct *p; | ||
499 | struct inode swap_inode; | ||
500 | unsigned int type; | ||
501 | void *addr; | ||
502 | int i, j, k, prev; | ||
503 | |||
504 | DPRINTK("swap_init(start_mem=%p, swap_data=%p)\n", | ||
505 | start_mem, swap_data); | ||
506 | |||
507 | /* need at least one page for swapping to (and this also isn't very | ||
508 | * much... :-) */ | ||
509 | if (swap_end - swap_start < 2*PAGE_SIZE) { | ||
510 | printk( KERN_WARNING "stram_swap_init: swap space too small\n" ); | ||
511 | return( 0 ); | ||
512 | } | ||
513 | |||
514 | /* find free slot in swap_info */ | ||
515 | for( p = swap_info, type = 0; type < nr_swapfiles; type++, p++ ) | ||
516 | if (!(p->flags & SWP_USED)) | ||
517 | break; | ||
518 | if (type >= MAX_SWAPFILES) { | ||
519 | printk( KERN_WARNING "stram_swap_init: max. number of " | ||
520 | "swap devices exhausted\n" ); | ||
521 | return( 0 ); | ||
522 | } | ||
523 | if (type >= nr_swapfiles) | ||
524 | nr_swapfiles = type+1; | ||
525 | |||
526 | stram_swap_info = p; | ||
527 | stram_swap_type = type; | ||
528 | |||
529 | /* fake some dir cache entries to give us some name in /dev/swaps */ | ||
530 | fake_dentry.d_parent = &fake_dentry; | ||
531 | fake_dentry.d_name.name = "stram (internal)"; | ||
532 | fake_dentry.d_name.len = 16; | ||
533 | fake_vfsmnt.mnt_parent = &fake_vfsmnt; | ||
534 | |||
535 | p->flags = SWP_USED; | ||
536 | p->swap_file = &fake_dentry; | ||
537 | p->swap_vfsmnt = &fake_vfsmnt; | ||
538 | p->swap_map = swap_data; | ||
539 | p->cluster_nr = 0; | ||
540 | p->next = -1; | ||
541 | p->prio = 0x7ff0; /* a rather high priority, but not the higest | ||
542 | * to give the user a chance to override */ | ||
543 | |||
544 | /* call stram_open() directly, avoids at least the overhead in | ||
545 | * constructing a dummy file structure... */ | ||
546 | swap_inode.i_rdev = MKDEV( STRAM_MAJOR, STRAM_MINOR ); | ||
547 | stram_open( &swap_inode, MAGIC_FILE_P ); | ||
548 | p->max = SWAP_NR(swap_end); | ||
549 | |||
550 | /* initialize swap_map: set regions that are already allocated or belong | ||
551 | * to kernel data space to SWAP_MAP_BAD, otherwise to free */ | ||
552 | j = 0; /* # of free pages */ | ||
553 | k = 0; /* # of already allocated pages (from pre-mem_init stram_alloc()) */ | ||
554 | p->lowest_bit = 0; | ||
555 | p->highest_bit = 0; | ||
556 | for( i = 1, addr = SWAP_ADDR(1); i < p->max; | ||
557 | i++, addr += PAGE_SIZE ) { | ||
558 | if (in_some_region( addr )) { | ||
559 | p->swap_map[i] = SWAP_MAP_BAD; | ||
560 | ++k; | ||
561 | } | ||
562 | else if (kernel_in_stram && addr < start_mem ) { | ||
563 | p->swap_map[i] = SWAP_MAP_BAD; | ||
564 | } | ||
565 | else { | ||
566 | p->swap_map[i] = 0; | ||
567 | ++j; | ||
568 | if (!p->lowest_bit) p->lowest_bit = i; | ||
569 | p->highest_bit = i; | ||
570 | } | ||
571 | } | ||
572 | /* first page always reserved (and doesn't really belong to swap space) */ | ||
573 | p->swap_map[0] = SWAP_MAP_BAD; | ||
574 | |||
575 | /* now swapping to this device ok */ | ||
576 | p->pages = j + k; | ||
577 | swap_list_lock(); | ||
578 | nr_swap_pages += j; | ||
579 | p->flags = SWP_WRITEOK; | ||
580 | |||
581 | /* insert swap space into swap_list */ | ||
582 | prev = -1; | ||
583 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { | ||
584 | if (p->prio >= swap_info[i].prio) { | ||
585 | break; | ||
586 | } | ||
587 | prev = i; | ||
588 | } | ||
589 | p->next = i; | ||
590 | if (prev < 0) { | ||
591 | swap_list.head = swap_list.next = p - swap_info; | ||
592 | } else { | ||
593 | swap_info[prev].next = p - swap_info; | ||
594 | } | ||
595 | swap_list_unlock(); | ||
596 | |||
597 | printk( KERN_INFO "Using %dk (%d pages) of ST-RAM as swap space.\n", | ||
598 | p->pages << 2, p->pages ); | ||
599 | return( 1 ); | ||
600 | } | ||
601 | |||
602 | |||
603 | /* | ||
604 | * The swap entry has been read in advance, and we return 1 to indicate | ||
605 | * that the page has been used or is no longer needed. | ||
606 | * | ||
607 | * Always set the resulting pte to be nowrite (the same as COW pages | ||
608 | * after one process has exited). We don't know just how many PTEs will | ||
609 | * share this swap entry, so be cautious and let do_wp_page work out | ||
610 | * what to do if a write is requested later. | ||
611 | */ | ||
612 | static inline void unswap_pte(struct vm_area_struct * vma, unsigned long | ||
613 | address, pte_t *dir, swp_entry_t entry, | ||
614 | struct page *page) | ||
615 | { | ||
616 | pte_t pte = *dir; | ||
617 | |||
618 | if (pte_none(pte)) | ||
619 | return; | ||
620 | if (pte_present(pte)) { | ||
621 | /* If this entry is swap-cached, then page must already | ||
622 | hold the right address for any copies in physical | ||
623 | memory */ | ||
624 | if (pte_page(pte) != page) | ||
625 | return; | ||
626 | /* We will be removing the swap cache in a moment, so... */ | ||
627 | set_pte(dir, pte_mkdirty(pte)); | ||
628 | return; | ||
629 | } | ||
630 | if (pte_val(pte) != entry.val) | ||
631 | return; | ||
632 | |||
633 | DPRINTK("unswap_pte: replacing entry %08lx by new page %p", | ||
634 | entry.val, page); | ||
635 | set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot))); | ||
636 | swap_free(entry); | ||
637 | get_page(page); | ||
638 | inc_mm_counter(vma->vm_mm, rss); | ||
639 | } | ||
640 | |||
641 | static inline void unswap_pmd(struct vm_area_struct * vma, pmd_t *dir, | ||
642 | unsigned long address, unsigned long size, | ||
643 | unsigned long offset, swp_entry_t entry, | ||
644 | struct page *page) | ||
645 | { | ||
646 | pte_t * pte; | ||
647 | unsigned long end; | ||
648 | |||
649 | if (pmd_none(*dir)) | ||
650 | return; | ||
651 | if (pmd_bad(*dir)) { | ||
652 | pmd_ERROR(*dir); | ||
653 | pmd_clear(dir); | ||
654 | return; | ||
655 | } | ||
656 | pte = pte_offset_kernel(dir, address); | ||
657 | offset += address & PMD_MASK; | ||
658 | address &= ~PMD_MASK; | ||
659 | end = address + size; | ||
660 | if (end > PMD_SIZE) | ||
661 | end = PMD_SIZE; | ||
662 | do { | ||
663 | unswap_pte(vma, offset+address-vma->vm_start, pte, entry, page); | ||
664 | address += PAGE_SIZE; | ||
665 | pte++; | ||
666 | } while (address < end); | ||
667 | } | ||
668 | |||
669 | static inline void unswap_pgd(struct vm_area_struct * vma, pgd_t *dir, | ||
670 | unsigned long address, unsigned long size, | ||
671 | swp_entry_t entry, struct page *page) | ||
672 | { | ||
673 | pmd_t * pmd; | ||
674 | unsigned long offset, end; | ||
675 | |||
676 | if (pgd_none(*dir)) | ||
677 | return; | ||
678 | if (pgd_bad(*dir)) { | ||
679 | pgd_ERROR(*dir); | ||
680 | pgd_clear(dir); | ||
681 | return; | ||
682 | } | ||
683 | pmd = pmd_offset(dir, address); | ||
684 | offset = address & PGDIR_MASK; | ||
685 | address &= ~PGDIR_MASK; | ||
686 | end = address + size; | ||
687 | if (end > PGDIR_SIZE) | ||
688 | end = PGDIR_SIZE; | ||
689 | do { | ||
690 | unswap_pmd(vma, pmd, address, end - address, offset, entry, | ||
691 | page); | ||
692 | address = (address + PMD_SIZE) & PMD_MASK; | ||
693 | pmd++; | ||
694 | } while (address < end); | ||
695 | } | ||
696 | |||
697 | static void unswap_vma(struct vm_area_struct * vma, pgd_t *pgdir, | ||
698 | swp_entry_t entry, struct page *page) | ||
699 | { | ||
700 | unsigned long start = vma->vm_start, end = vma->vm_end; | ||
701 | |||
702 | do { | ||
703 | unswap_pgd(vma, pgdir, start, end - start, entry, page); | ||
704 | start = (start + PGDIR_SIZE) & PGDIR_MASK; | ||
705 | pgdir++; | ||
706 | } while (start < end); | ||
707 | } | ||
708 | |||
709 | static void unswap_process(struct mm_struct * mm, swp_entry_t entry, | ||
710 | struct page *page) | ||
711 | { | ||
712 | struct vm_area_struct* vma; | ||
713 | |||
714 | /* | ||
715 | * Go through process' page directory. | ||
716 | */ | ||
717 | if (!mm) | ||
718 | return; | ||
719 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
720 | pgd_t * pgd = pgd_offset(mm, vma->vm_start); | ||
721 | unswap_vma(vma, pgd, entry, page); | ||
722 | } | ||
723 | } | ||
724 | |||
725 | |||
726 | static int unswap_by_read(unsigned short *map, unsigned long max, | ||
727 | unsigned long start, unsigned long n_pages) | ||
728 | { | ||
729 | struct task_struct *p; | ||
730 | struct page *page; | ||
731 | swp_entry_t entry; | ||
732 | unsigned long i; | ||
733 | |||
734 | DPRINTK( "unswapping %lu..%lu by reading in\n", | ||
735 | start, start+n_pages-1 ); | ||
736 | |||
737 | for( i = start; i < start+n_pages; ++i ) { | ||
738 | if (map[i] == SWAP_MAP_BAD) { | ||
739 | printk( KERN_ERR "get_stram_region: page %lu already " | ||
740 | "reserved??\n", i ); | ||
741 | continue; | ||
742 | } | ||
743 | |||
744 | if (map[i]) { | ||
745 | entry = swp_entry(stram_swap_type, i); | ||
746 | DPRINTK("unswap: map[i=%lu]=%u nr_swap=%ld\n", | ||
747 | i, map[i], nr_swap_pages); | ||
748 | |||
749 | swap_device_lock(stram_swap_info); | ||
750 | map[i]++; | ||
751 | swap_device_unlock(stram_swap_info); | ||
752 | /* Get a page for the entry, using the existing | ||
753 | swap cache page if there is one. Otherwise, | ||
754 | get a clean page and read the swap into it. */ | ||
755 | page = read_swap_cache_async(entry, NULL, 0); | ||
756 | if (!page) { | ||
757 | swap_free(entry); | ||
758 | return -ENOMEM; | ||
759 | } | ||
760 | read_lock(&tasklist_lock); | ||
761 | for_each_process(p) | ||
762 | unswap_process(p->mm, entry, page); | ||
763 | read_unlock(&tasklist_lock); | ||
764 | shmem_unuse(entry, page); | ||
765 | /* Now get rid of the extra reference to the | ||
766 | temporary page we've been using. */ | ||
767 | if (PageSwapCache(page)) | ||
768 | delete_from_swap_cache(page); | ||
769 | __free_page(page); | ||
770 | #ifdef DO_PROC | ||
771 | stat_swap_force++; | ||
772 | #endif | ||
773 | } | ||
774 | |||
775 | DPRINTK( "unswap: map[i=%lu]=%u nr_swap=%ld\n", | ||
776 | i, map[i], nr_swap_pages ); | ||
777 | swap_list_lock(); | ||
778 | swap_device_lock(stram_swap_info); | ||
779 | map[i] = SWAP_MAP_BAD; | ||
780 | if (stram_swap_info->lowest_bit == i) | ||
781 | stram_swap_info->lowest_bit++; | ||
782 | if (stram_swap_info->highest_bit == i) | ||
783 | stram_swap_info->highest_bit--; | ||
784 | --nr_swap_pages; | ||
785 | swap_device_unlock(stram_swap_info); | ||
786 | swap_list_unlock(); | ||
787 | } | ||
788 | |||
789 | return 0; | ||
790 | } | ||
791 | |||
792 | /* | ||
793 | * reserve a region in ST-RAM swap space for an allocation | ||
794 | */ | ||
795 | static void *get_stram_region( unsigned long n_pages ) | ||
796 | { | ||
797 | unsigned short *map = stram_swap_info->swap_map; | ||
798 | unsigned long max = stram_swap_info->max; | ||
799 | unsigned long start, total_free, region_free; | ||
800 | int err; | ||
801 | void *ret = NULL; | ||
802 | |||
803 | DPRINTK( "get_stram_region(n_pages=%lu)\n", n_pages ); | ||
804 | |||
805 | down(&stram_swap_sem); | ||
806 | |||
807 | /* disallow writing to the swap device now */ | ||
808 | stram_swap_info->flags = SWP_USED; | ||
809 | |||
810 | /* find a region of n_pages pages in the swap space including as much free | ||
811 | * pages as possible (and excluding any already-reserved pages). */ | ||
812 | if (!(start = find_free_region( n_pages, &total_free, ®ion_free ))) | ||
813 | goto end; | ||
814 | DPRINTK( "get_stram_region: region starts at %lu, has %lu free pages\n", | ||
815 | start, region_free ); | ||
816 | |||
817 | err = unswap_by_read(map, max, start, n_pages); | ||
818 | if (err) | ||
819 | goto end; | ||
820 | |||
821 | ret = SWAP_ADDR(start); | ||
822 | end: | ||
823 | /* allow using swap device again */ | ||
824 | stram_swap_info->flags = SWP_WRITEOK; | ||
825 | up(&stram_swap_sem); | ||
826 | DPRINTK( "get_stram_region: returning %p\n", ret ); | ||
827 | return( ret ); | ||
828 | } | ||
829 | |||
830 | |||
831 | /* | ||
832 | * free a reserved region in ST-RAM swap space | ||
833 | */ | ||
834 | static void free_stram_region( unsigned long offset, unsigned long n_pages ) | ||
835 | { | ||
836 | unsigned short *map = stram_swap_info->swap_map; | ||
837 | |||
838 | DPRINTK( "free_stram_region(offset=%lu,n_pages=%lu)\n", offset, n_pages ); | ||
839 | |||
840 | if (offset < 1 || offset + n_pages > stram_swap_info->max) { | ||
841 | printk( KERN_ERR "free_stram_region: Trying to free non-ST-RAM\n" ); | ||
842 | return; | ||
843 | } | ||
844 | |||
845 | swap_list_lock(); | ||
846 | swap_device_lock(stram_swap_info); | ||
847 | /* un-reserve the freed pages */ | ||
848 | for( ; n_pages > 0; ++offset, --n_pages ) { | ||
849 | if (map[offset] != SWAP_MAP_BAD) | ||
850 | printk( KERN_ERR "free_stram_region: Swap page %lu was not " | ||
851 | "reserved\n", offset ); | ||
852 | map[offset] = 0; | ||
853 | } | ||
854 | |||
855 | /* update swapping meta-data */ | ||
856 | if (offset < stram_swap_info->lowest_bit) | ||
857 | stram_swap_info->lowest_bit = offset; | ||
858 | if (offset+n_pages-1 > stram_swap_info->highest_bit) | ||
859 | stram_swap_info->highest_bit = offset+n_pages-1; | ||
860 | if (stram_swap_info->prio > swap_info[swap_list.next].prio) | ||
861 | swap_list.next = swap_list.head; | ||
862 | nr_swap_pages += n_pages; | ||
863 | swap_device_unlock(stram_swap_info); | ||
864 | swap_list_unlock(); | ||
865 | } | ||
866 | |||
867 | |||
868 | /* ------------------------------------------------------------------------ */ | ||
869 | /* Utility Functions for Swapping */ | ||
870 | /* ------------------------------------------------------------------------ */ | ||
871 | |||
872 | |||
873 | /* is addr in some of the allocated regions? */ | ||
874 | static int in_some_region(void *addr) | ||
875 | { | ||
876 | BLOCK *p; | ||
877 | |||
878 | for( p = alloc_list; p; p = p->next ) { | ||
879 | if (p->start <= addr && addr < p->start + p->size) | ||
880 | return( 1 ); | ||
881 | } | ||
882 | return( 0 ); | ||
883 | } | ||
884 | |||
885 | |||
886 | static unsigned long find_free_region(unsigned long n_pages, | ||
887 | unsigned long *total_free, | ||
888 | unsigned long *region_free) | ||
889 | { | ||
890 | unsigned short *map = stram_swap_info->swap_map; | ||
891 | unsigned long max = stram_swap_info->max; | ||
892 | unsigned long head, tail, max_start; | ||
893 | long nfree, max_free; | ||
894 | |||
895 | /* first scan the swap space for a suitable place for the allocation */ | ||
896 | head = 1; | ||
897 | max_start = 0; | ||
898 | max_free = -1; | ||
899 | *total_free = 0; | ||
900 | |||
901 | start_over: | ||
902 | /* increment tail until final window size reached, and count free pages */ | ||
903 | nfree = 0; | ||
904 | for( tail = head; tail-head < n_pages && tail < max; ++tail ) { | ||
905 | if (map[tail] == SWAP_MAP_BAD) { | ||
906 | head = tail+1; | ||
907 | goto start_over; | ||
908 | } | ||
909 | if (!map[tail]) { | ||
910 | ++nfree; | ||
911 | ++*total_free; | ||
912 | } | ||
913 | } | ||
914 | if (tail-head < n_pages) | ||
915 | goto out; | ||
916 | if (nfree > max_free) { | ||
917 | max_start = head; | ||
918 | max_free = nfree; | ||
919 | if (max_free >= n_pages) | ||
920 | /* don't need more free pages... :-) */ | ||
921 | goto out; | ||
922 | } | ||
923 | |||
924 | /* now shift the window and look for the area where as much pages as | ||
925 | * possible are free */ | ||
926 | while( tail < max ) { | ||
927 | nfree -= (map[head++] == 0); | ||
928 | if (map[tail] == SWAP_MAP_BAD) { | ||
929 | head = tail+1; | ||
930 | goto start_over; | ||
931 | } | ||
932 | if (!map[tail]) { | ||
933 | ++nfree; | ||
934 | ++*total_free; | ||
935 | } | ||
936 | ++tail; | ||
937 | if (nfree > max_free) { | ||
938 | max_start = head; | ||
939 | max_free = nfree; | ||
940 | if (max_free >= n_pages) | ||
941 | /* don't need more free pages... :-) */ | ||
942 | goto out; | ||
943 | } | ||
944 | } | ||
945 | |||
946 | out: | ||
947 | if (max_free < 0) { | ||
948 | printk( KERN_NOTICE "get_stram_region: ST-RAM too full or fragmented " | ||
949 | "-- can't allocate %lu pages\n", n_pages ); | ||
950 | return( 0 ); | ||
951 | } | ||
952 | |||
953 | *region_free = max_free; | ||
954 | return( max_start ); | ||
955 | } | ||
956 | |||
957 | |||
958 | /* setup parameters from command line */ | ||
959 | void __init stram_swap_setup(char *str, int *ints) | ||
960 | { | ||
961 | if (ints[0] >= 1) | ||
962 | max_swap_size = ((ints[1] < 0 ? 0 : ints[1]) * 1024) & PAGE_MASK; | ||
963 | } | ||
964 | |||
965 | |||
966 | /* ------------------------------------------------------------------------ */ | ||
967 | /* ST-RAM device */ | ||
968 | /* ------------------------------------------------------------------------ */ | ||
969 | |||
970 | static int refcnt; | ||
971 | |||
972 | static void do_stram_request(request_queue_t *q) | ||
973 | { | ||
974 | struct request *req; | ||
975 | |||
976 | while ((req = elv_next_request(q)) != NULL) { | ||
977 | void *start = swap_start + (req->sector << 9); | ||
978 | unsigned long len = req->current_nr_sectors << 9; | ||
979 | if ((start + len) > swap_end) { | ||
980 | printk( KERN_ERR "stram: bad access beyond end of device: " | ||
981 | "block=%ld, count=%d\n", | ||
982 | req->sector, | ||
983 | req->current_nr_sectors ); | ||
984 | end_request(req, 0); | ||
985 | continue; | ||
986 | } | ||
987 | |||
988 | if (req->cmd == READ) { | ||
989 | memcpy(req->buffer, start, len); | ||
990 | #ifdef DO_PROC | ||
991 | stat_swap_read += N_PAGES(len); | ||
992 | #endif | ||
993 | } | ||
994 | else { | ||
995 | memcpy(start, req->buffer, len); | ||
996 | #ifdef DO_PROC | ||
997 | stat_swap_write += N_PAGES(len); | ||
998 | #endif | ||
999 | } | ||
1000 | end_request(req, 1); | ||
1001 | } | ||
1002 | } | ||
1003 | |||
1004 | |||
1005 | static int stram_open( struct inode *inode, struct file *filp ) | ||
1006 | { | ||
1007 | if (filp != MAGIC_FILE_P) { | ||
1008 | printk( KERN_NOTICE "Only kernel can open ST-RAM device\n" ); | ||
1009 | return( -EPERM ); | ||
1010 | } | ||
1011 | if (refcnt) | ||
1012 | return( -EBUSY ); | ||
1013 | ++refcnt; | ||
1014 | return( 0 ); | ||
1015 | } | ||
1016 | |||
1017 | static int stram_release( struct inode *inode, struct file *filp ) | ||
1018 | { | ||
1019 | if (filp != MAGIC_FILE_P) { | ||
1020 | printk( KERN_NOTICE "Only kernel can close ST-RAM device\n" ); | ||
1021 | return( -EPERM ); | ||
1022 | } | ||
1023 | if (refcnt > 0) | ||
1024 | --refcnt; | ||
1025 | return( 0 ); | ||
1026 | } | ||
1027 | |||
1028 | |||
1029 | static struct block_device_operations stram_fops = { | ||
1030 | .open = stram_open, | ||
1031 | .release = stram_release, | ||
1032 | }; | ||
1033 | |||
1034 | static struct gendisk *stram_disk; | ||
1035 | static struct request_queue *stram_queue; | ||
1036 | static DEFINE_SPINLOCK(stram_lock); | ||
1037 | |||
1038 | int __init stram_device_init(void) | ||
1039 | { | ||
1040 | if (!MACH_IS_ATARI) | ||
1041 | /* no point in initializing this, I hope */ | ||
1042 | return -ENXIO; | ||
1043 | |||
1044 | if (!max_swap_size) | ||
1045 | /* swapping not enabled */ | ||
1046 | return -ENXIO; | ||
1047 | stram_disk = alloc_disk(1); | ||
1048 | if (!stram_disk) | ||
1049 | return -ENOMEM; | ||
1050 | |||
1051 | if (register_blkdev(STRAM_MAJOR, "stram")) { | ||
1052 | put_disk(stram_disk); | ||
1053 | return -ENXIO; | ||
1054 | } | ||
1055 | |||
1056 | stram_queue = blk_init_queue(do_stram_request, &stram_lock); | ||
1057 | if (!stram_queue) { | ||
1058 | unregister_blkdev(STRAM_MAJOR, "stram"); | ||
1059 | put_disk(stram_disk); | ||
1060 | return -ENOMEM; | ||
1061 | } | ||
1062 | |||
1063 | stram_disk->major = STRAM_MAJOR; | ||
1064 | stram_disk->first_minor = STRAM_MINOR; | ||
1065 | stram_disk->fops = &stram_fops; | ||
1066 | stram_disk->queue = stram_queue; | ||
1067 | sprintf(stram_disk->disk_name, "stram"); | ||
1068 | set_capacity(stram_disk, (swap_end - swap_start)/512); | ||
1069 | add_disk(stram_disk); | ||
1070 | return 0; | ||
1071 | } | ||
1072 | |||
1073 | |||
1074 | |||
1075 | /* ------------------------------------------------------------------------ */ | ||
1076 | /* Misc Utility Functions */ | ||
1077 | /* ------------------------------------------------------------------------ */ | ||
1078 | |||
1079 | /* reserve a range of pages */ | ||
1080 | static void reserve_region(void *start, void *end) | ||
1081 | { | ||
1082 | reserve_bootmem (virt_to_phys(start), end - start); | ||
1083 | } | ||
1084 | |||
1085 | #endif /* CONFIG_STRAM_SWAP */ | ||
1086 | |||
1087 | 242 | ||
1088 | /* ------------------------------------------------------------------------ */ | 243 | /* ------------------------------------------------------------------------ */ |
1089 | /* Region Management */ | 244 | /* Region Management */ |
@@ -1173,50 +328,9 @@ int get_stram_list( char *buf ) | |||
1173 | { | 328 | { |
1174 | int len = 0; | 329 | int len = 0; |
1175 | BLOCK *p; | 330 | BLOCK *p; |
1176 | #ifdef CONFIG_STRAM_SWAP | ||
1177 | int i; | ||
1178 | unsigned short *map = stram_swap_info->swap_map; | ||
1179 | unsigned long max = stram_swap_info->max; | ||
1180 | unsigned free = 0, used = 0, rsvd = 0; | ||
1181 | #endif | ||
1182 | 331 | ||
1183 | #ifdef CONFIG_STRAM_SWAP | 332 | PRINT_PROC("Total ST-RAM: %8u kB\n", |
1184 | if (max_swap_size) { | ||
1185 | for( i = 1; i < max; ++i ) { | ||
1186 | if (!map[i]) | ||
1187 | ++free; | ||
1188 | else if (map[i] == SWAP_MAP_BAD) | ||
1189 | ++rsvd; | ||
1190 | else | ||
1191 | ++used; | ||
1192 | } | ||
1193 | PRINT_PROC( | ||
1194 | "Total ST-RAM: %8u kB\n" | ||
1195 | "Total ST-RAM swap: %8lu kB\n" | ||
1196 | "Free swap: %8u kB\n" | ||
1197 | "Used swap: %8u kB\n" | ||
1198 | "Allocated swap: %8u kB\n" | ||
1199 | "Swap Reads: %8u\n" | ||
1200 | "Swap Writes: %8u\n" | ||
1201 | "Swap Forced Reads: %8u\n", | ||
1202 | (stram_end - stram_start) >> 10, | ||
1203 | (max-1) << (PAGE_SHIFT-10), | ||
1204 | free << (PAGE_SHIFT-10), | ||
1205 | used << (PAGE_SHIFT-10), | ||
1206 | rsvd << (PAGE_SHIFT-10), | ||
1207 | stat_swap_read, | ||
1208 | stat_swap_write, | ||
1209 | stat_swap_force ); | ||
1210 | } | ||
1211 | else { | ||
1212 | #endif | ||
1213 | PRINT_PROC( "ST-RAM swapping disabled\n" ); | ||
1214 | PRINT_PROC("Total ST-RAM: %8u kB\n", | ||
1215 | (stram_end - stram_start) >> 10); | 333 | (stram_end - stram_start) >> 10); |
1216 | #ifdef CONFIG_STRAM_SWAP | ||
1217 | } | ||
1218 | #endif | ||
1219 | |||
1220 | PRINT_PROC( "Allocated regions:\n" ); | 334 | PRINT_PROC( "Allocated regions:\n" ); |
1221 | for( p = alloc_list; p; p = p->next ) { | 335 | for( p = alloc_list; p; p = p->next ) { |
1222 | if (len + 50 >= PAGE_SIZE) | 336 | if (len + 50 >= PAGE_SIZE) |
@@ -1227,8 +341,6 @@ int get_stram_list( char *buf ) | |||
1227 | p->owner); | 341 | p->owner); |
1228 | if (p->flags & BLOCK_GFP) | 342 | if (p->flags & BLOCK_GFP) |
1229 | PRINT_PROC( "page-alloced)\n" ); | 343 | PRINT_PROC( "page-alloced)\n" ); |
1230 | else if (p->flags & BLOCK_INSWAP) | ||
1231 | PRINT_PROC( "in swap)\n" ); | ||
1232 | else | 344 | else |
1233 | PRINT_PROC( "??)\n" ); | 345 | PRINT_PROC( "??)\n" ); |
1234 | } | 346 | } |
diff --git a/arch/m68k/mm/kmap.c b/arch/m68k/mm/kmap.c index 5dcb3fa35ea9..fe2383e36b06 100644 --- a/arch/m68k/mm/kmap.c +++ b/arch/m68k/mm/kmap.c | |||
@@ -201,7 +201,7 @@ void *__ioremap(unsigned long physaddr, unsigned long size, int cacheflag) | |||
201 | virtaddr += PTRTREESIZE; | 201 | virtaddr += PTRTREESIZE; |
202 | size -= PTRTREESIZE; | 202 | size -= PTRTREESIZE; |
203 | } else { | 203 | } else { |
204 | pte_dir = pte_alloc_kernel(&init_mm, pmd_dir, virtaddr); | 204 | pte_dir = pte_alloc_kernel(pmd_dir, virtaddr); |
205 | if (!pte_dir) { | 205 | if (!pte_dir) { |
206 | printk("ioremap: no mem for pte_dir\n"); | 206 | printk("ioremap: no mem for pte_dir\n"); |
207 | return NULL; | 207 | return NULL; |
diff --git a/arch/m68k/sun3x/dvma.c b/arch/m68k/sun3x/dvma.c index 32e55adfeb8e..117481e86305 100644 --- a/arch/m68k/sun3x/dvma.c +++ b/arch/m68k/sun3x/dvma.c | |||
@@ -116,7 +116,7 @@ inline int dvma_map_cpu(unsigned long kaddr, | |||
116 | pte_t *pte; | 116 | pte_t *pte; |
117 | unsigned long end3; | 117 | unsigned long end3; |
118 | 118 | ||
119 | if((pte = pte_alloc_kernel(&init_mm, pmd, vaddr)) == NULL) { | 119 | if((pte = pte_alloc_kernel(pmd, vaddr)) == NULL) { |
120 | ret = -ENOMEM; | 120 | ret = -ENOMEM; |
121 | goto out; | 121 | goto out; |
122 | } | 122 | } |
diff --git a/arch/mips/kernel/irixelf.c b/arch/mips/kernel/irixelf.c index 99262fe64560..7ce34d4aa220 100644 --- a/arch/mips/kernel/irixelf.c +++ b/arch/mips/kernel/irixelf.c | |||
@@ -697,7 +697,6 @@ static int load_irix_binary(struct linux_binprm * bprm, struct pt_regs * regs) | |||
697 | /* Do this so that we can load the interpreter, if need be. We will | 697 | /* Do this so that we can load the interpreter, if need be. We will |
698 | * change some of these later. | 698 | * change some of these later. |
699 | */ | 699 | */ |
700 | set_mm_counter(current->mm, rss, 0); | ||
701 | setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); | 700 | setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); |
702 | current->mm->start_stack = bprm->p; | 701 | current->mm->start_stack = bprm->p; |
703 | 702 | ||
diff --git a/arch/mips/mm/ioremap.c b/arch/mips/mm/ioremap.c index 9c44ca70befa..3101d1db5592 100644 --- a/arch/mips/mm/ioremap.c +++ b/arch/mips/mm/ioremap.c | |||
@@ -55,7 +55,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, | |||
55 | if (address >= end) | 55 | if (address >= end) |
56 | BUG(); | 56 | BUG(); |
57 | do { | 57 | do { |
58 | pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); | 58 | pte_t * pte = pte_alloc_kernel(pmd, address); |
59 | if (!pte) | 59 | if (!pte) |
60 | return -ENOMEM; | 60 | return -ENOMEM; |
61 | remap_area_pte(pte, address, end - address, address + phys_addr, flags); | 61 | remap_area_pte(pte, address, end - address, address + phys_addr, flags); |
@@ -77,7 +77,6 @@ static int remap_area_pages(unsigned long address, phys_t phys_addr, | |||
77 | flush_cache_all(); | 77 | flush_cache_all(); |
78 | if (address >= end) | 78 | if (address >= end) |
79 | BUG(); | 79 | BUG(); |
80 | spin_lock(&init_mm.page_table_lock); | ||
81 | do { | 80 | do { |
82 | pud_t *pud; | 81 | pud_t *pud; |
83 | pmd_t *pmd; | 82 | pmd_t *pmd; |
@@ -96,7 +95,6 @@ static int remap_area_pages(unsigned long address, phys_t phys_addr, | |||
96 | address = (address + PGDIR_SIZE) & PGDIR_MASK; | 95 | address = (address + PGDIR_SIZE) & PGDIR_MASK; |
97 | dir++; | 96 | dir++; |
98 | } while (address && (address < end)); | 97 | } while (address && (address < end)); |
99 | spin_unlock(&init_mm.page_table_lock); | ||
100 | flush_tlb_all(); | 98 | flush_tlb_all(); |
101 | return error; | 99 | return error; |
102 | } | 100 | } |
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index e15f09eaed12..a065349aee37 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c | |||
@@ -270,7 +270,6 @@ void flush_dcache_page(struct page *page) | |||
270 | unsigned long offset; | 270 | unsigned long offset; |
271 | unsigned long addr; | 271 | unsigned long addr; |
272 | pgoff_t pgoff; | 272 | pgoff_t pgoff; |
273 | pte_t *pte; | ||
274 | unsigned long pfn = page_to_pfn(page); | 273 | unsigned long pfn = page_to_pfn(page); |
275 | 274 | ||
276 | 275 | ||
@@ -301,21 +300,16 @@ void flush_dcache_page(struct page *page) | |||
301 | * taking a page fault if the pte doesn't exist. | 300 | * taking a page fault if the pte doesn't exist. |
302 | * This is just for speed. If the page translation | 301 | * This is just for speed. If the page translation |
303 | * isn't there, there's no point exciting the | 302 | * isn't there, there's no point exciting the |
304 | * nadtlb handler into a nullification frenzy */ | 303 | * nadtlb handler into a nullification frenzy. |
305 | 304 | * | |
306 | 305 | * Make sure we really have this page: the private | |
307 | if(!(pte = translation_exists(mpnt, addr))) | ||
308 | continue; | ||
309 | |||
310 | /* make sure we really have this page: the private | ||
311 | * mappings may cover this area but have COW'd this | 306 | * mappings may cover this area but have COW'd this |
312 | * particular page */ | 307 | * particular page. |
313 | if(pte_pfn(*pte) != pfn) | 308 | */ |
314 | continue; | 309 | if (translation_exists(mpnt, addr, pfn)) { |
315 | 310 | __flush_cache_page(mpnt, addr); | |
316 | __flush_cache_page(mpnt, addr); | 311 | break; |
317 | 312 | } | |
318 | break; | ||
319 | } | 313 | } |
320 | flush_dcache_mmap_unlock(mapping); | 314 | flush_dcache_mmap_unlock(mapping); |
321 | } | 315 | } |
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index ae6213d71670..f94a02ef3d95 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c | |||
@@ -114,7 +114,7 @@ static inline int map_pmd_uncached(pmd_t * pmd, unsigned long vaddr, | |||
114 | if (end > PGDIR_SIZE) | 114 | if (end > PGDIR_SIZE) |
115 | end = PGDIR_SIZE; | 115 | end = PGDIR_SIZE; |
116 | do { | 116 | do { |
117 | pte_t * pte = pte_alloc_kernel(&init_mm, pmd, vaddr); | 117 | pte_t * pte = pte_alloc_kernel(pmd, vaddr); |
118 | if (!pte) | 118 | if (!pte) |
119 | return -ENOMEM; | 119 | return -ENOMEM; |
120 | if (map_pte_uncached(pte, orig_vaddr, end - vaddr, paddr_ptr)) | 120 | if (map_pte_uncached(pte, orig_vaddr, end - vaddr, paddr_ptr)) |
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 2886ad70db48..29b998e430e6 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c | |||
@@ -505,7 +505,9 @@ void show_mem(void) | |||
505 | 505 | ||
506 | for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { | 506 | for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { |
507 | struct page *p; | 507 | struct page *p; |
508 | unsigned long flags; | ||
508 | 509 | ||
510 | pgdat_resize_lock(NODE_DATA(i), &flags); | ||
509 | p = nid_page_nr(i, j) - node_start_pfn(i); | 511 | p = nid_page_nr(i, j) - node_start_pfn(i); |
510 | 512 | ||
511 | total++; | 513 | total++; |
@@ -517,6 +519,7 @@ void show_mem(void) | |||
517 | free++; | 519 | free++; |
518 | else | 520 | else |
519 | shared += page_count(p) - 1; | 521 | shared += page_count(p) - 1; |
522 | pgdat_resize_unlock(NODE_DATA(i), &flags); | ||
520 | } | 523 | } |
521 | } | 524 | } |
522 | #endif | 525 | #endif |
diff --git a/arch/parisc/mm/ioremap.c b/arch/parisc/mm/ioremap.c index f2df502cdae3..5c7a1b3b9326 100644 --- a/arch/parisc/mm/ioremap.c +++ b/arch/parisc/mm/ioremap.c | |||
@@ -52,7 +52,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo | |||
52 | if (address >= end) | 52 | if (address >= end) |
53 | BUG(); | 53 | BUG(); |
54 | do { | 54 | do { |
55 | pte_t * pte = pte_alloc_kernel(NULL, pmd, address); | 55 | pte_t * pte = pte_alloc_kernel(pmd, address); |
56 | if (!pte) | 56 | if (!pte) |
57 | return -ENOMEM; | 57 | return -ENOMEM; |
58 | remap_area_pte(pte, address, end - address, address + phys_addr, flags); | 58 | remap_area_pte(pte, address, end - address, address + phys_addr, flags); |
@@ -75,10 +75,9 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, | |||
75 | flush_cache_all(); | 75 | flush_cache_all(); |
76 | if (address >= end) | 76 | if (address >= end) |
77 | BUG(); | 77 | BUG(); |
78 | spin_lock(&init_mm.page_table_lock); | ||
79 | do { | 78 | do { |
80 | pmd_t *pmd; | 79 | pmd_t *pmd; |
81 | pmd = pmd_alloc(dir, address); | 80 | pmd = pmd_alloc(&init_mm, dir, address); |
82 | error = -ENOMEM; | 81 | error = -ENOMEM; |
83 | if (!pmd) | 82 | if (!pmd) |
84 | break; | 83 | break; |
@@ -89,7 +88,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, | |||
89 | address = (address + PGDIR_SIZE) & PGDIR_MASK; | 88 | address = (address + PGDIR_SIZE) & PGDIR_MASK; |
90 | dir++; | 89 | dir++; |
91 | } while (address && (address < end)); | 90 | } while (address && (address < end)); |
92 | spin_unlock(&init_mm.page_table_lock); | ||
93 | flush_tlb_all(); | 91 | flush_tlb_all(); |
94 | return error; | 92 | return error; |
95 | } | 93 | } |
diff --git a/arch/ppc/kernel/dma-mapping.c b/arch/ppc/kernel/dma-mapping.c index 0f710d2baec6..685fd0defe23 100644 --- a/arch/ppc/kernel/dma-mapping.c +++ b/arch/ppc/kernel/dma-mapping.c | |||
@@ -335,8 +335,6 @@ static int __init dma_alloc_init(void) | |||
335 | pte_t *pte; | 335 | pte_t *pte; |
336 | int ret = 0; | 336 | int ret = 0; |
337 | 337 | ||
338 | spin_lock(&init_mm.page_table_lock); | ||
339 | |||
340 | do { | 338 | do { |
341 | pgd = pgd_offset(&init_mm, CONSISTENT_BASE); | 339 | pgd = pgd_offset(&init_mm, CONSISTENT_BASE); |
342 | pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE); | 340 | pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE); |
@@ -347,7 +345,7 @@ static int __init dma_alloc_init(void) | |||
347 | } | 345 | } |
348 | WARN_ON(!pmd_none(*pmd)); | 346 | WARN_ON(!pmd_none(*pmd)); |
349 | 347 | ||
350 | pte = pte_alloc_kernel(&init_mm, pmd, CONSISTENT_BASE); | 348 | pte = pte_alloc_kernel(pmd, CONSISTENT_BASE); |
351 | if (!pte) { | 349 | if (!pte) { |
352 | printk(KERN_ERR "%s: no pte tables\n", __func__); | 350 | printk(KERN_ERR "%s: no pte tables\n", __func__); |
353 | ret = -ENOMEM; | 351 | ret = -ENOMEM; |
@@ -357,8 +355,6 @@ static int __init dma_alloc_init(void) | |||
357 | consistent_pte = pte; | 355 | consistent_pte = pte; |
358 | } while (0); | 356 | } while (0); |
359 | 357 | ||
360 | spin_unlock(&init_mm.page_table_lock); | ||
361 | |||
362 | return ret; | 358 | return ret; |
363 | } | 359 | } |
364 | 360 | ||
diff --git a/arch/ppc/mm/4xx_mmu.c b/arch/ppc/mm/4xx_mmu.c index b7bcbc232f39..4d006aa1a0d1 100644 --- a/arch/ppc/mm/4xx_mmu.c +++ b/arch/ppc/mm/4xx_mmu.c | |||
@@ -110,13 +110,11 @@ unsigned long __init mmu_mapin_ram(void) | |||
110 | pmd_t *pmdp; | 110 | pmd_t *pmdp; |
111 | unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE; | 111 | unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE; |
112 | 112 | ||
113 | spin_lock(&init_mm.page_table_lock); | ||
114 | pmdp = pmd_offset(pgd_offset_k(v), v); | 113 | pmdp = pmd_offset(pgd_offset_k(v), v); |
115 | pmd_val(*pmdp++) = val; | 114 | pmd_val(*pmdp++) = val; |
116 | pmd_val(*pmdp++) = val; | 115 | pmd_val(*pmdp++) = val; |
117 | pmd_val(*pmdp++) = val; | 116 | pmd_val(*pmdp++) = val; |
118 | pmd_val(*pmdp++) = val; | 117 | pmd_val(*pmdp++) = val; |
119 | spin_unlock(&init_mm.page_table_lock); | ||
120 | 118 | ||
121 | v += LARGE_PAGE_SIZE_16M; | 119 | v += LARGE_PAGE_SIZE_16M; |
122 | p += LARGE_PAGE_SIZE_16M; | 120 | p += LARGE_PAGE_SIZE_16M; |
@@ -127,10 +125,8 @@ unsigned long __init mmu_mapin_ram(void) | |||
127 | pmd_t *pmdp; | 125 | pmd_t *pmdp; |
128 | unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE; | 126 | unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE; |
129 | 127 | ||
130 | spin_lock(&init_mm.page_table_lock); | ||
131 | pmdp = pmd_offset(pgd_offset_k(v), v); | 128 | pmdp = pmd_offset(pgd_offset_k(v), v); |
132 | pmd_val(*pmdp) = val; | 129 | pmd_val(*pmdp) = val; |
133 | spin_unlock(&init_mm.page_table_lock); | ||
134 | 130 | ||
135 | v += LARGE_PAGE_SIZE_4M; | 131 | v += LARGE_PAGE_SIZE_4M; |
136 | p += LARGE_PAGE_SIZE_4M; | 132 | p += LARGE_PAGE_SIZE_4M; |
diff --git a/arch/ppc/mm/pgtable.c b/arch/ppc/mm/pgtable.c index 43505b1fc5d8..6ea9185fd120 100644 --- a/arch/ppc/mm/pgtable.c +++ b/arch/ppc/mm/pgtable.c | |||
@@ -280,18 +280,16 @@ map_page(unsigned long va, phys_addr_t pa, int flags) | |||
280 | pte_t *pg; | 280 | pte_t *pg; |
281 | int err = -ENOMEM; | 281 | int err = -ENOMEM; |
282 | 282 | ||
283 | spin_lock(&init_mm.page_table_lock); | ||
284 | /* Use upper 10 bits of VA to index the first level map */ | 283 | /* Use upper 10 bits of VA to index the first level map */ |
285 | pd = pmd_offset(pgd_offset_k(va), va); | 284 | pd = pmd_offset(pgd_offset_k(va), va); |
286 | /* Use middle 10 bits of VA to index the second-level map */ | 285 | /* Use middle 10 bits of VA to index the second-level map */ |
287 | pg = pte_alloc_kernel(&init_mm, pd, va); | 286 | pg = pte_alloc_kernel(pd, va); |
288 | if (pg != 0) { | 287 | if (pg != 0) { |
289 | err = 0; | 288 | err = 0; |
290 | set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); | 289 | set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); |
291 | if (mem_init_done) | 290 | if (mem_init_done) |
292 | flush_HPTE(0, va, pmd_val(*pd)); | 291 | flush_HPTE(0, va, pmd_val(*pd)); |
293 | } | 292 | } |
294 | spin_unlock(&init_mm.page_table_lock); | ||
295 | return err; | 293 | return err; |
296 | } | 294 | } |
297 | 295 | ||
diff --git a/arch/ppc64/kernel/vdso.c b/arch/ppc64/kernel/vdso.c index efa985f05aca..4aacf521e3e4 100644 --- a/arch/ppc64/kernel/vdso.c +++ b/arch/ppc64/kernel/vdso.c | |||
@@ -176,13 +176,13 @@ static struct page * vdso_vma_nopage(struct vm_area_struct * vma, | |||
176 | return NOPAGE_SIGBUS; | 176 | return NOPAGE_SIGBUS; |
177 | 177 | ||
178 | /* | 178 | /* |
179 | * Last page is systemcfg, special handling here, no get_page() a | 179 | * Last page is systemcfg. |
180 | * this is a reserved page | ||
181 | */ | 180 | */ |
182 | if ((vma->vm_end - address) <= PAGE_SIZE) | 181 | if ((vma->vm_end - address) <= PAGE_SIZE) |
183 | return virt_to_page(systemcfg); | 182 | pg = virt_to_page(systemcfg); |
183 | else | ||
184 | pg = virt_to_page(vbase + offset); | ||
184 | 185 | ||
185 | pg = virt_to_page(vbase + offset); | ||
186 | get_page(pg); | 186 | get_page(pg); |
187 | DBG(" ->page count: %d\n", page_count(pg)); | 187 | DBG(" ->page count: %d\n", page_count(pg)); |
188 | 188 | ||
@@ -259,7 +259,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int executable_stack) | |||
259 | * gettimeofday will be totally dead. It's fine to use that for setting | 259 | * gettimeofday will be totally dead. It's fine to use that for setting |
260 | * breakpoints in the vDSO code pages though | 260 | * breakpoints in the vDSO code pages though |
261 | */ | 261 | */ |
262 | vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; | 262 | vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_RESERVED; |
263 | vma->vm_flags |= mm->def_flags; | 263 | vma->vm_flags |= mm->def_flags; |
264 | vma->vm_page_prot = protection_map[vma->vm_flags & 0x7]; | 264 | vma->vm_page_prot = protection_map[vma->vm_flags & 0x7]; |
265 | vma->vm_ops = &vdso_vmops; | 265 | vma->vm_ops = &vdso_vmops; |
@@ -603,6 +603,8 @@ void __init vdso_init(void) | |||
603 | ClearPageReserved(pg); | 603 | ClearPageReserved(pg); |
604 | get_page(pg); | 604 | get_page(pg); |
605 | } | 605 | } |
606 | |||
607 | get_page(virt_to_page(systemcfg)); | ||
606 | } | 608 | } |
607 | 609 | ||
608 | int in_gate_area_no_task(unsigned long addr) | 610 | int in_gate_area_no_task(unsigned long addr) |
diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c index c65b87b92756..f4ca29cf5364 100644 --- a/arch/ppc64/mm/imalloc.c +++ b/arch/ppc64/mm/imalloc.c | |||
@@ -300,12 +300,7 @@ void im_free(void * addr) | |||
300 | for (p = &imlist ; (tmp = *p) ; p = &tmp->next) { | 300 | for (p = &imlist ; (tmp = *p) ; p = &tmp->next) { |
301 | if (tmp->addr == addr) { | 301 | if (tmp->addr == addr) { |
302 | *p = tmp->next; | 302 | *p = tmp->next; |
303 | |||
304 | /* XXX: do we need the lock? */ | ||
305 | spin_lock(&init_mm.page_table_lock); | ||
306 | unmap_vm_area(tmp); | 303 | unmap_vm_area(tmp); |
307 | spin_unlock(&init_mm.page_table_lock); | ||
308 | |||
309 | kfree(tmp); | 304 | kfree(tmp); |
310 | up(&imlist_sem); | 305 | up(&imlist_sem); |
311 | return; | 306 | return; |
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index be64b157afce..e2bd7776622f 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c | |||
@@ -104,6 +104,8 @@ void show_mem(void) | |||
104 | show_free_areas(); | 104 | show_free_areas(); |
105 | printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); | 105 | printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); |
106 | for_each_pgdat(pgdat) { | 106 | for_each_pgdat(pgdat) { |
107 | unsigned long flags; | ||
108 | pgdat_resize_lock(pgdat, &flags); | ||
107 | for (i = 0; i < pgdat->node_spanned_pages; i++) { | 109 | for (i = 0; i < pgdat->node_spanned_pages; i++) { |
108 | page = pgdat_page_nr(pgdat, i); | 110 | page = pgdat_page_nr(pgdat, i); |
109 | total++; | 111 | total++; |
@@ -114,6 +116,7 @@ void show_mem(void) | |||
114 | else if (page_count(page)) | 116 | else if (page_count(page)) |
115 | shared += page_count(page) - 1; | 117 | shared += page_count(page) - 1; |
116 | } | 118 | } |
119 | pgdat_resize_unlock(pgdat, &flags); | ||
117 | } | 120 | } |
118 | printk("%ld pages of RAM\n", total); | 121 | printk("%ld pages of RAM\n", total); |
119 | printk("%ld reserved pages\n", reserved); | 122 | printk("%ld reserved pages\n", reserved); |
@@ -155,7 +158,6 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) | |||
155 | unsigned long vsid; | 158 | unsigned long vsid; |
156 | 159 | ||
157 | if (mem_init_done) { | 160 | if (mem_init_done) { |
158 | spin_lock(&init_mm.page_table_lock); | ||
159 | pgdp = pgd_offset_k(ea); | 161 | pgdp = pgd_offset_k(ea); |
160 | pudp = pud_alloc(&init_mm, pgdp, ea); | 162 | pudp = pud_alloc(&init_mm, pgdp, ea); |
161 | if (!pudp) | 163 | if (!pudp) |
@@ -163,12 +165,11 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) | |||
163 | pmdp = pmd_alloc(&init_mm, pudp, ea); | 165 | pmdp = pmd_alloc(&init_mm, pudp, ea); |
164 | if (!pmdp) | 166 | if (!pmdp) |
165 | return -ENOMEM; | 167 | return -ENOMEM; |
166 | ptep = pte_alloc_kernel(&init_mm, pmdp, ea); | 168 | ptep = pte_alloc_kernel(pmdp, ea); |
167 | if (!ptep) | 169 | if (!ptep) |
168 | return -ENOMEM; | 170 | return -ENOMEM; |
169 | set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, | 171 | set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, |
170 | __pgprot(flags))); | 172 | __pgprot(flags))); |
171 | spin_unlock(&init_mm.page_table_lock); | ||
172 | } else { | 173 | } else { |
173 | unsigned long va, vpn, hash, hpteg; | 174 | unsigned long va, vpn, hash, hpteg; |
174 | 175 | ||
@@ -649,11 +650,14 @@ void __init mem_init(void) | |||
649 | #endif | 650 | #endif |
650 | 651 | ||
651 | for_each_pgdat(pgdat) { | 652 | for_each_pgdat(pgdat) { |
653 | unsigned long flags; | ||
654 | pgdat_resize_lock(pgdat, &flags); | ||
652 | for (i = 0; i < pgdat->node_spanned_pages; i++) { | 655 | for (i = 0; i < pgdat->node_spanned_pages; i++) { |
653 | page = pgdat_page_nr(pgdat, i); | 656 | page = pgdat_page_nr(pgdat, i); |
654 | if (PageReserved(page)) | 657 | if (PageReserved(page)) |
655 | reservedpages++; | 658 | reservedpages++; |
656 | } | 659 | } |
660 | pgdat_resize_unlock(pgdat, &flags); | ||
657 | } | 661 | } |
658 | 662 | ||
659 | codesize = (unsigned long)&_etext - (unsigned long)&_stext; | 663 | codesize = (unsigned long)&_etext - (unsigned long)&_stext; |
@@ -867,3 +871,80 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, | |||
867 | return vma_prot; | 871 | return vma_prot; |
868 | } | 872 | } |
869 | EXPORT_SYMBOL(phys_mem_access_prot); | 873 | EXPORT_SYMBOL(phys_mem_access_prot); |
874 | |||
875 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
876 | |||
877 | void online_page(struct page *page) | ||
878 | { | ||
879 | ClearPageReserved(page); | ||
880 | free_cold_page(page); | ||
881 | totalram_pages++; | ||
882 | num_physpages++; | ||
883 | } | ||
884 | |||
885 | /* | ||
886 | * This works only for the non-NUMA case. Later, we'll need a lookup | ||
887 | * to convert from real physical addresses to nid, that doesn't use | ||
888 | * pfn_to_nid(). | ||
889 | */ | ||
890 | int __devinit add_memory(u64 start, u64 size) | ||
891 | { | ||
892 | struct pglist_data *pgdata = NODE_DATA(0); | ||
893 | struct zone *zone; | ||
894 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
895 | unsigned long nr_pages = size >> PAGE_SHIFT; | ||
896 | |||
897 | /* this should work for most non-highmem platforms */ | ||
898 | zone = pgdata->node_zones; | ||
899 | |||
900 | return __add_pages(zone, start_pfn, nr_pages); | ||
901 | |||
902 | return 0; | ||
903 | } | ||
904 | |||
905 | /* | ||
906 | * First pass at this code will check to determine if the remove | ||
907 | * request is within the RMO. Do not allow removal within the RMO. | ||
908 | */ | ||
909 | int __devinit remove_memory(u64 start, u64 size) | ||
910 | { | ||
911 | struct zone *zone; | ||
912 | unsigned long start_pfn, end_pfn, nr_pages; | ||
913 | |||
914 | start_pfn = start >> PAGE_SHIFT; | ||
915 | nr_pages = size >> PAGE_SHIFT; | ||
916 | end_pfn = start_pfn + nr_pages; | ||
917 | |||
918 | printk("%s(): Attempting to remove memoy in range " | ||
919 | "%lx to %lx\n", __func__, start, start+size); | ||
920 | /* | ||
921 | * check for range within RMO | ||
922 | */ | ||
923 | zone = page_zone(pfn_to_page(start_pfn)); | ||
924 | |||
925 | printk("%s(): memory will be removed from " | ||
926 | "the %s zone\n", __func__, zone->name); | ||
927 | |||
928 | /* | ||
929 | * not handling removing memory ranges that | ||
930 | * overlap multiple zones yet | ||
931 | */ | ||
932 | if (end_pfn > (zone->zone_start_pfn + zone->spanned_pages)) | ||
933 | goto overlap; | ||
934 | |||
935 | /* make sure it is NOT in RMO */ | ||
936 | if ((start < lmb.rmo_size) || ((start+size) < lmb.rmo_size)) { | ||
937 | printk("%s(): range to be removed must NOT be in RMO!\n", | ||
938 | __func__); | ||
939 | goto in_rmo; | ||
940 | } | ||
941 | |||
942 | return __remove_pages(zone, start_pfn, nr_pages); | ||
943 | |||
944 | overlap: | ||
945 | printk("%s(): memory range to be removed overlaps " | ||
946 | "multiple zones!!!\n", __func__); | ||
947 | in_rmo: | ||
948 | return -1; | ||
949 | } | ||
950 | #endif /* CONFIG_MEMORY_HOTPLUG */ | ||
diff --git a/arch/s390/mm/ioremap.c b/arch/s390/mm/ioremap.c index c6c39d868bc8..0f6e9ecbefe2 100644 --- a/arch/s390/mm/ioremap.c +++ b/arch/s390/mm/ioremap.c | |||
@@ -58,7 +58,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo | |||
58 | if (address >= end) | 58 | if (address >= end) |
59 | BUG(); | 59 | BUG(); |
60 | do { | 60 | do { |
61 | pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); | 61 | pte_t * pte = pte_alloc_kernel(pmd, address); |
62 | if (!pte) | 62 | if (!pte) |
63 | return -ENOMEM; | 63 | return -ENOMEM; |
64 | remap_area_pte(pte, address, end - address, address + phys_addr, flags); | 64 | remap_area_pte(pte, address, end - address, address + phys_addr, flags); |
@@ -80,7 +80,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, | |||
80 | flush_cache_all(); | 80 | flush_cache_all(); |
81 | if (address >= end) | 81 | if (address >= end) |
82 | BUG(); | 82 | BUG(); |
83 | spin_lock(&init_mm.page_table_lock); | ||
84 | do { | 83 | do { |
85 | pmd_t *pmd; | 84 | pmd_t *pmd; |
86 | pmd = pmd_alloc(&init_mm, dir, address); | 85 | pmd = pmd_alloc(&init_mm, dir, address); |
@@ -94,7 +93,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, | |||
94 | address = (address + PGDIR_SIZE) & PGDIR_MASK; | 93 | address = (address + PGDIR_SIZE) & PGDIR_MASK; |
95 | dir++; | 94 | dir++; |
96 | } while (address && (address < end)); | 95 | } while (address && (address < end)); |
97 | spin_unlock(&init_mm.page_table_lock); | ||
98 | flush_tlb_all(); | 96 | flush_tlb_all(); |
99 | return 0; | 97 | return 0; |
100 | } | 98 | } |
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 7abba2161da6..775f86cd3fe8 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c | |||
@@ -194,10 +194,13 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess, | |||
194 | unsigned long address) | 194 | unsigned long address) |
195 | { | 195 | { |
196 | unsigned long addrmax = P4SEG; | 196 | unsigned long addrmax = P4SEG; |
197 | pgd_t *dir; | 197 | pgd_t *pgd; |
198 | pmd_t *pmd; | 198 | pmd_t *pmd; |
199 | pte_t *pte; | 199 | pte_t *pte; |
200 | pte_t entry; | 200 | pte_t entry; |
201 | struct mm_struct *mm; | ||
202 | spinlock_t *ptl; | ||
203 | int ret = 1; | ||
201 | 204 | ||
202 | #ifdef CONFIG_SH_KGDB | 205 | #ifdef CONFIG_SH_KGDB |
203 | if (kgdb_nofault && kgdb_bus_err_hook) | 206 | if (kgdb_nofault && kgdb_bus_err_hook) |
@@ -208,28 +211,28 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess, | |||
208 | addrmax = P4SEG_STORE_QUE + 0x04000000; | 211 | addrmax = P4SEG_STORE_QUE + 0x04000000; |
209 | #endif | 212 | #endif |
210 | 213 | ||
211 | if (address >= P3SEG && address < addrmax) | 214 | if (address >= P3SEG && address < addrmax) { |
212 | dir = pgd_offset_k(address); | 215 | pgd = pgd_offset_k(address); |
213 | else if (address >= TASK_SIZE) | 216 | mm = NULL; |
217 | } else if (address >= TASK_SIZE) | ||
214 | return 1; | 218 | return 1; |
215 | else if (!current->mm) | 219 | else if (!(mm = current->mm)) |
216 | return 1; | 220 | return 1; |
217 | else | 221 | else |
218 | dir = pgd_offset(current->mm, address); | 222 | pgd = pgd_offset(mm, address); |
219 | 223 | ||
220 | pmd = pmd_offset(dir, address); | 224 | pmd = pmd_offset(pgd, address); |
221 | if (pmd_none(*pmd)) | 225 | if (pmd_none_or_clear_bad(pmd)) |
222 | return 1; | ||
223 | if (pmd_bad(*pmd)) { | ||
224 | pmd_ERROR(*pmd); | ||
225 | pmd_clear(pmd); | ||
226 | return 1; | 226 | return 1; |
227 | } | 227 | if (mm) |
228 | pte = pte_offset_kernel(pmd, address); | 228 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
229 | else | ||
230 | pte = pte_offset_kernel(pmd, address); | ||
231 | |||
229 | entry = *pte; | 232 | entry = *pte; |
230 | if (pte_none(entry) || pte_not_present(entry) | 233 | if (pte_none(entry) || pte_not_present(entry) |
231 | || (writeaccess && !pte_write(entry))) | 234 | || (writeaccess && !pte_write(entry))) |
232 | return 1; | 235 | goto unlock; |
233 | 236 | ||
234 | if (writeaccess) | 237 | if (writeaccess) |
235 | entry = pte_mkdirty(entry); | 238 | entry = pte_mkdirty(entry); |
@@ -251,8 +254,11 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess, | |||
251 | 254 | ||
252 | set_pte(pte, entry); | 255 | set_pte(pte, entry); |
253 | update_mmu_cache(NULL, address, entry); | 256 | update_mmu_cache(NULL, address, entry); |
254 | 257 | ret = 0; | |
255 | return 0; | 258 | unlock: |
259 | if (mm) | ||
260 | pte_unmap_unlock(pte, ptl); | ||
261 | return ret; | ||
256 | } | 262 | } |
257 | 263 | ||
258 | void flush_tlb_page(struct vm_area_struct *vma, unsigned long page) | 264 | void flush_tlb_page(struct vm_area_struct *vma, unsigned long page) |
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 95bb1a6c6060..6b7a7688c98e 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c | |||
@@ -54,8 +54,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | |||
54 | return pte; | 54 | return pte; |
55 | } | 55 | } |
56 | 56 | ||
57 | #define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0) | ||
58 | |||
59 | void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, | 57 | void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, |
60 | pte_t *ptep, pte_t entry) | 58 | pte_t *ptep, pte_t entry) |
61 | { | 59 | { |
diff --git a/arch/sh/mm/ioremap.c b/arch/sh/mm/ioremap.c index 9f490c2742f0..e794e27a72f1 100644 --- a/arch/sh/mm/ioremap.c +++ b/arch/sh/mm/ioremap.c | |||
@@ -57,7 +57,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, | |||
57 | if (address >= end) | 57 | if (address >= end) |
58 | BUG(); | 58 | BUG(); |
59 | do { | 59 | do { |
60 | pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); | 60 | pte_t * pte = pte_alloc_kernel(pmd, address); |
61 | if (!pte) | 61 | if (!pte) |
62 | return -ENOMEM; | 62 | return -ENOMEM; |
63 | remap_area_pte(pte, address, end - address, address + phys_addr, flags); | 63 | remap_area_pte(pte, address, end - address, address + phys_addr, flags); |
@@ -79,7 +79,6 @@ int remap_area_pages(unsigned long address, unsigned long phys_addr, | |||
79 | flush_cache_all(); | 79 | flush_cache_all(); |
80 | if (address >= end) | 80 | if (address >= end) |
81 | BUG(); | 81 | BUG(); |
82 | spin_lock(&init_mm.page_table_lock); | ||
83 | do { | 82 | do { |
84 | pmd_t *pmd; | 83 | pmd_t *pmd; |
85 | pmd = pmd_alloc(&init_mm, dir, address); | 84 | pmd = pmd_alloc(&init_mm, dir, address); |
@@ -93,7 +92,6 @@ int remap_area_pages(unsigned long address, unsigned long phys_addr, | |||
93 | address = (address + PGDIR_SIZE) & PGDIR_MASK; | 92 | address = (address + PGDIR_SIZE) & PGDIR_MASK; |
94 | dir++; | 93 | dir++; |
95 | } while (address && (address < end)); | 94 | } while (address && (address < end)); |
96 | spin_unlock(&init_mm.page_table_lock); | ||
97 | flush_tlb_all(); | 95 | flush_tlb_all(); |
98 | return error; | 96 | return error; |
99 | } | 97 | } |
diff --git a/arch/sh64/mm/cache.c b/arch/sh64/mm/cache.c index 3b87e25ea773..c0c1b21350d8 100644 --- a/arch/sh64/mm/cache.c +++ b/arch/sh64/mm/cache.c | |||
@@ -584,32 +584,36 @@ static void sh64_dcache_purge_phy_page(unsigned long paddr) | |||
584 | } | 584 | } |
585 | } | 585 | } |
586 | 586 | ||
587 | static void sh64_dcache_purge_user_page(struct mm_struct *mm, unsigned long eaddr) | 587 | static void sh64_dcache_purge_user_pages(struct mm_struct *mm, |
588 | unsigned long addr, unsigned long end) | ||
588 | { | 589 | { |
589 | pgd_t *pgd; | 590 | pgd_t *pgd; |
590 | pmd_t *pmd; | 591 | pmd_t *pmd; |
591 | pte_t *pte; | 592 | pte_t *pte; |
592 | pte_t entry; | 593 | pte_t entry; |
594 | spinlock_t *ptl; | ||
593 | unsigned long paddr; | 595 | unsigned long paddr; |
594 | 596 | ||
595 | /* NOTE : all the callers of this have mm->page_table_lock held, so the | 597 | if (!mm) |
596 | following page table traversal is safe even on SMP/pre-emptible. */ | 598 | return; /* No way to find physical address of page */ |
597 | 599 | ||
598 | if (!mm) return; /* No way to find physical address of page */ | 600 | pgd = pgd_offset(mm, addr); |
599 | pgd = pgd_offset(mm, eaddr); | 601 | if (pgd_bad(*pgd)) |
600 | if (pgd_bad(*pgd)) return; | 602 | return; |
601 | 603 | ||
602 | pmd = pmd_offset(pgd, eaddr); | 604 | pmd = pmd_offset(pgd, addr); |
603 | if (pmd_none(*pmd) || pmd_bad(*pmd)) return; | 605 | if (pmd_none(*pmd) || pmd_bad(*pmd)) |
604 | 606 | return; | |
605 | pte = pte_offset_kernel(pmd, eaddr); | 607 | |
606 | entry = *pte; | 608 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
607 | if (pte_none(entry) || !pte_present(entry)) return; | 609 | do { |
608 | 610 | entry = *pte; | |
609 | paddr = pte_val(entry) & PAGE_MASK; | 611 | if (pte_none(entry) || !pte_present(entry)) |
610 | 612 | continue; | |
611 | sh64_dcache_purge_coloured_phy_page(paddr, eaddr); | 613 | paddr = pte_val(entry) & PAGE_MASK; |
612 | 614 | sh64_dcache_purge_coloured_phy_page(paddr, addr); | |
615 | } while (pte++, addr += PAGE_SIZE, addr != end); | ||
616 | pte_unmap_unlock(pte - 1, ptl); | ||
613 | } | 617 | } |
614 | /****************************************************************************/ | 618 | /****************************************************************************/ |
615 | 619 | ||
@@ -668,7 +672,7 @@ static void sh64_dcache_purge_user_range(struct mm_struct *mm, | |||
668 | int n_pages; | 672 | int n_pages; |
669 | 673 | ||
670 | n_pages = ((end - start) >> PAGE_SHIFT); | 674 | n_pages = ((end - start) >> PAGE_SHIFT); |
671 | if (n_pages >= 64) { | 675 | if (n_pages >= 64 || ((start ^ (end - 1)) & PMD_MASK)) { |
672 | #if 1 | 676 | #if 1 |
673 | sh64_dcache_purge_all(); | 677 | sh64_dcache_purge_all(); |
674 | #else | 678 | #else |
@@ -707,20 +711,10 @@ static void sh64_dcache_purge_user_range(struct mm_struct *mm, | |||
707 | } | 711 | } |
708 | #endif | 712 | #endif |
709 | } else { | 713 | } else { |
710 | /* 'Small' range */ | 714 | /* Small range, covered by a single page table page */ |
711 | unsigned long aligned_start; | 715 | start &= PAGE_MASK; /* should already be so */ |
712 | unsigned long eaddr; | 716 | end = PAGE_ALIGN(end); /* should already be so */ |
713 | unsigned long last_page_start; | 717 | sh64_dcache_purge_user_pages(mm, start, end); |
714 | |||
715 | aligned_start = start & PAGE_MASK; | ||
716 | /* 'end' is 1 byte beyond the end of the range */ | ||
717 | last_page_start = (end - 1) & PAGE_MASK; | ||
718 | |||
719 | eaddr = aligned_start; | ||
720 | while (eaddr <= last_page_start) { | ||
721 | sh64_dcache_purge_user_page(mm, eaddr); | ||
722 | eaddr += PAGE_SIZE; | ||
723 | } | ||
724 | } | 718 | } |
725 | return; | 719 | return; |
726 | } | 720 | } |
@@ -880,9 +874,7 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, | |||
880 | addresses from the user address space specified by mm, after writing | 874 | addresses from the user address space specified by mm, after writing |
881 | back any dirty data. | 875 | back any dirty data. |
882 | 876 | ||
883 | Note(1), 'end' is 1 byte beyond the end of the range to flush. | 877 | Note, 'end' is 1 byte beyond the end of the range to flush. */ |
884 | |||
885 | Note(2), this is called with mm->page_table_lock held.*/ | ||
886 | 878 | ||
887 | sh64_dcache_purge_user_range(mm, start, end); | 879 | sh64_dcache_purge_user_range(mm, start, end); |
888 | sh64_icache_inv_user_page_range(mm, start, end); | 880 | sh64_icache_inv_user_page_range(mm, start, end); |
@@ -898,7 +890,7 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long eaddr, unsigned | |||
898 | the I-cache must be searched too in case the page in question is | 890 | the I-cache must be searched too in case the page in question is |
899 | both writable and being executed from (e.g. stack trampolines.) | 891 | both writable and being executed from (e.g. stack trampolines.) |
900 | 892 | ||
901 | Note(1), this is called with mm->page_table_lock held. | 893 | Note, this is called with pte lock held. |
902 | */ | 894 | */ |
903 | 895 | ||
904 | sh64_dcache_purge_phy_page(pfn << PAGE_SHIFT); | 896 | sh64_dcache_purge_phy_page(pfn << PAGE_SHIFT); |
diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c index dcd9c8a8baf8..ed6a505b3ee2 100644 --- a/arch/sh64/mm/hugetlbpage.c +++ b/arch/sh64/mm/hugetlbpage.c | |||
@@ -54,41 +54,31 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | |||
54 | return pte; | 54 | return pte; |
55 | } | 55 | } |
56 | 56 | ||
57 | #define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0) | 57 | void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, |
58 | 58 | pte_t *ptep, pte_t entry) | |
59 | static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, | ||
60 | struct page *page, pte_t * page_table, int write_access) | ||
61 | { | 59 | { |
62 | unsigned long i; | 60 | int i; |
63 | pte_t entry; | ||
64 | |||
65 | add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); | ||
66 | |||
67 | if (write_access) | ||
68 | entry = pte_mkwrite(pte_mkdirty(mk_pte(page, | ||
69 | vma->vm_page_prot))); | ||
70 | else | ||
71 | entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); | ||
72 | entry = pte_mkyoung(entry); | ||
73 | mk_pte_huge(entry); | ||
74 | 61 | ||
75 | for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { | 62 | for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { |
76 | set_pte(page_table, entry); | 63 | set_pte_at(mm, addr, ptep, entry); |
77 | page_table++; | 64 | ptep++; |
78 | 65 | addr += PAGE_SIZE; | |
79 | pte_val(entry) += PAGE_SIZE; | 66 | pte_val(entry) += PAGE_SIZE; |
80 | } | 67 | } |
81 | } | 68 | } |
82 | 69 | ||
83 | pte_t huge_ptep_get_and_clear(pte_t *ptep) | 70 | pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, |
71 | pte_t *ptep) | ||
84 | { | 72 | { |
85 | pte_t entry; | 73 | pte_t entry; |
74 | int i; | ||
86 | 75 | ||
87 | entry = *ptep; | 76 | entry = *ptep; |
88 | 77 | ||
89 | for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { | 78 | for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { |
90 | pte_clear(pte); | 79 | pte_clear(mm, addr, ptep); |
91 | pte++; | 80 | addr += PAGE_SIZE; |
81 | ptep++; | ||
92 | } | 82 | } |
93 | 83 | ||
94 | return entry; | 84 | return entry; |
@@ -106,79 +96,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len) | |||
106 | return 0; | 96 | return 0; |
107 | } | 97 | } |
108 | 98 | ||
109 | int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | ||
110 | struct vm_area_struct *vma) | ||
111 | { | ||
112 | pte_t *src_pte, *dst_pte, entry; | ||
113 | struct page *ptepage; | ||
114 | unsigned long addr = vma->vm_start; | ||
115 | unsigned long end = vma->vm_end; | ||
116 | int i; | ||
117 | |||
118 | while (addr < end) { | ||
119 | dst_pte = huge_pte_alloc(dst, addr); | ||
120 | if (!dst_pte) | ||
121 | goto nomem; | ||
122 | src_pte = huge_pte_offset(src, addr); | ||
123 | BUG_ON(!src_pte || pte_none(*src_pte)); | ||
124 | entry = *src_pte; | ||
125 | ptepage = pte_page(entry); | ||
126 | get_page(ptepage); | ||
127 | for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { | ||
128 | set_pte(dst_pte, entry); | ||
129 | pte_val(entry) += PAGE_SIZE; | ||
130 | dst_pte++; | ||
131 | } | ||
132 | add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); | ||
133 | addr += HPAGE_SIZE; | ||
134 | } | ||
135 | return 0; | ||
136 | |||
137 | nomem: | ||
138 | return -ENOMEM; | ||
139 | } | ||
140 | |||
141 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
142 | struct page **pages, struct vm_area_struct **vmas, | ||
143 | unsigned long *position, int *length, int i) | ||
144 | { | ||
145 | unsigned long vaddr = *position; | ||
146 | int remainder = *length; | ||
147 | |||
148 | WARN_ON(!is_vm_hugetlb_page(vma)); | ||
149 | |||
150 | while (vaddr < vma->vm_end && remainder) { | ||
151 | if (pages) { | ||
152 | pte_t *pte; | ||
153 | struct page *page; | ||
154 | |||
155 | pte = huge_pte_offset(mm, vaddr); | ||
156 | |||
157 | /* hugetlb should be locked, and hence, prefaulted */ | ||
158 | BUG_ON(!pte || pte_none(*pte)); | ||
159 | |||
160 | page = pte_page(*pte); | ||
161 | |||
162 | WARN_ON(!PageCompound(page)); | ||
163 | |||
164 | get_page(page); | ||
165 | pages[i] = page; | ||
166 | } | ||
167 | |||
168 | if (vmas) | ||
169 | vmas[i] = vma; | ||
170 | |||
171 | vaddr += PAGE_SIZE; | ||
172 | --remainder; | ||
173 | ++i; | ||
174 | } | ||
175 | |||
176 | *length = remainder; | ||
177 | *position = vaddr; | ||
178 | |||
179 | return i; | ||
180 | } | ||
181 | |||
182 | struct page *follow_huge_addr(struct mm_struct *mm, | 99 | struct page *follow_huge_addr(struct mm_struct *mm, |
183 | unsigned long address, int write) | 100 | unsigned long address, int write) |
184 | { | 101 | { |
@@ -195,84 +112,3 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, | |||
195 | { | 112 | { |
196 | return NULL; | 113 | return NULL; |
197 | } | 114 | } |
198 | |||
199 | void unmap_hugepage_range(struct vm_area_struct *vma, | ||
200 | unsigned long start, unsigned long end) | ||
201 | { | ||
202 | struct mm_struct *mm = vma->vm_mm; | ||
203 | unsigned long address; | ||
204 | pte_t *pte; | ||
205 | struct page *page; | ||
206 | int i; | ||
207 | |||
208 | BUG_ON(start & (HPAGE_SIZE - 1)); | ||
209 | BUG_ON(end & (HPAGE_SIZE - 1)); | ||
210 | |||
211 | for (address = start; address < end; address += HPAGE_SIZE) { | ||
212 | pte = huge_pte_offset(mm, address); | ||
213 | BUG_ON(!pte); | ||
214 | if (pte_none(*pte)) | ||
215 | continue; | ||
216 | page = pte_page(*pte); | ||
217 | put_page(page); | ||
218 | for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { | ||
219 | pte_clear(mm, address+(i*PAGE_SIZE), pte); | ||
220 | pte++; | ||
221 | } | ||
222 | } | ||
223 | add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT)); | ||
224 | flush_tlb_range(vma, start, end); | ||
225 | } | ||
226 | |||
227 | int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) | ||
228 | { | ||
229 | struct mm_struct *mm = current->mm; | ||
230 | unsigned long addr; | ||
231 | int ret = 0; | ||
232 | |||
233 | BUG_ON(vma->vm_start & ~HPAGE_MASK); | ||
234 | BUG_ON(vma->vm_end & ~HPAGE_MASK); | ||
235 | |||
236 | spin_lock(&mm->page_table_lock); | ||
237 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { | ||
238 | unsigned long idx; | ||
239 | pte_t *pte = huge_pte_alloc(mm, addr); | ||
240 | struct page *page; | ||
241 | |||
242 | if (!pte) { | ||
243 | ret = -ENOMEM; | ||
244 | goto out; | ||
245 | } | ||
246 | if (!pte_none(*pte)) | ||
247 | continue; | ||
248 | |||
249 | idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) | ||
250 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | ||
251 | page = find_get_page(mapping, idx); | ||
252 | if (!page) { | ||
253 | /* charge the fs quota first */ | ||
254 | if (hugetlb_get_quota(mapping)) { | ||
255 | ret = -ENOMEM; | ||
256 | goto out; | ||
257 | } | ||
258 | page = alloc_huge_page(); | ||
259 | if (!page) { | ||
260 | hugetlb_put_quota(mapping); | ||
261 | ret = -ENOMEM; | ||
262 | goto out; | ||
263 | } | ||
264 | ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); | ||
265 | if (! ret) { | ||
266 | unlock_page(page); | ||
267 | } else { | ||
268 | hugetlb_put_quota(mapping); | ||
269 | free_huge_page(page); | ||
270 | goto out; | ||
271 | } | ||
272 | } | ||
273 | set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); | ||
274 | } | ||
275 | out: | ||
276 | spin_unlock(&mm->page_table_lock); | ||
277 | return ret; | ||
278 | } | ||
diff --git a/arch/sh64/mm/ioremap.c b/arch/sh64/mm/ioremap.c index f4003da556bc..fb1866fa2c9d 100644 --- a/arch/sh64/mm/ioremap.c +++ b/arch/sh64/mm/ioremap.c | |||
@@ -79,7 +79,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo | |||
79 | BUG(); | 79 | BUG(); |
80 | 80 | ||
81 | do { | 81 | do { |
82 | pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); | 82 | pte_t * pte = pte_alloc_kernel(pmd, address); |
83 | if (!pte) | 83 | if (!pte) |
84 | return -ENOMEM; | 84 | return -ENOMEM; |
85 | remap_area_pte(pte, address, end - address, address + phys_addr, flags); | 85 | remap_area_pte(pte, address, end - address, address + phys_addr, flags); |
@@ -101,7 +101,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, | |||
101 | flush_cache_all(); | 101 | flush_cache_all(); |
102 | if (address >= end) | 102 | if (address >= end) |
103 | BUG(); | 103 | BUG(); |
104 | spin_lock(&init_mm.page_table_lock); | ||
105 | do { | 104 | do { |
106 | pmd_t *pmd = pmd_alloc(&init_mm, dir, address); | 105 | pmd_t *pmd = pmd_alloc(&init_mm, dir, address); |
107 | error = -ENOMEM; | 106 | error = -ENOMEM; |
@@ -115,7 +114,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, | |||
115 | address = (address + PGDIR_SIZE) & PGDIR_MASK; | 114 | address = (address + PGDIR_SIZE) & PGDIR_MASK; |
116 | dir++; | 115 | dir++; |
117 | } while (address && (address < end)); | 116 | } while (address && (address < end)); |
118 | spin_unlock(&init_mm.page_table_lock); | ||
119 | flush_tlb_all(); | 117 | flush_tlb_all(); |
120 | return 0; | 118 | return 0; |
121 | } | 119 | } |
diff --git a/arch/sparc/mm/generic.c b/arch/sparc/mm/generic.c index 20ccb957fb77..9604893ffdbd 100644 --- a/arch/sparc/mm/generic.c +++ b/arch/sparc/mm/generic.c | |||
@@ -73,14 +73,16 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, | |||
73 | int space = GET_IOSPACE(pfn); | 73 | int space = GET_IOSPACE(pfn); |
74 | unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; | 74 | unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; |
75 | 75 | ||
76 | /* See comment in mm/memory.c remap_pfn_range */ | ||
77 | vma->vm_flags |= VM_IO | VM_RESERVED; | ||
78 | |||
76 | prot = __pgprot(pg_iobits); | 79 | prot = __pgprot(pg_iobits); |
77 | offset -= from; | 80 | offset -= from; |
78 | dir = pgd_offset(mm, from); | 81 | dir = pgd_offset(mm, from); |
79 | flush_cache_range(vma, beg, end); | 82 | flush_cache_range(vma, beg, end); |
80 | 83 | ||
81 | spin_lock(&mm->page_table_lock); | ||
82 | while (from < end) { | 84 | while (from < end) { |
83 | pmd_t *pmd = pmd_alloc(current->mm, dir, from); | 85 | pmd_t *pmd = pmd_alloc(mm, dir, from); |
84 | error = -ENOMEM; | 86 | error = -ENOMEM; |
85 | if (!pmd) | 87 | if (!pmd) |
86 | break; | 88 | break; |
@@ -90,7 +92,6 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, | |||
90 | from = (from + PGDIR_SIZE) & PGDIR_MASK; | 92 | from = (from + PGDIR_SIZE) & PGDIR_MASK; |
91 | dir++; | 93 | dir++; |
92 | } | 94 | } |
93 | spin_unlock(&mm->page_table_lock); | ||
94 | 95 | ||
95 | flush_tlb_range(vma, beg, end); | 96 | flush_tlb_range(vma, beg, end); |
96 | return error; | 97 | return error; |
diff --git a/arch/sparc64/kernel/binfmt_aout32.c b/arch/sparc64/kernel/binfmt_aout32.c index b2854ef221d0..edf52d06b280 100644 --- a/arch/sparc64/kernel/binfmt_aout32.c +++ b/arch/sparc64/kernel/binfmt_aout32.c | |||
@@ -241,7 +241,6 @@ static int load_aout32_binary(struct linux_binprm * bprm, struct pt_regs * regs) | |||
241 | current->mm->brk = ex.a_bss + | 241 | current->mm->brk = ex.a_bss + |
242 | (current->mm->start_brk = N_BSSADDR(ex)); | 242 | (current->mm->start_brk = N_BSSADDR(ex)); |
243 | 243 | ||
244 | set_mm_counter(current->mm, rss, 0); | ||
245 | current->mm->mmap = NULL; | 244 | current->mm->mmap = NULL; |
246 | compute_creds(bprm); | 245 | compute_creds(bprm); |
247 | current->flags &= ~PF_FORKNOEXEC; | 246 | current->flags &= ~PF_FORKNOEXEC; |
diff --git a/arch/sparc64/mm/generic.c b/arch/sparc64/mm/generic.c index c954d91f01d0..112c316e7cd2 100644 --- a/arch/sparc64/mm/generic.c +++ b/arch/sparc64/mm/generic.c | |||
@@ -127,14 +127,16 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, | |||
127 | int space = GET_IOSPACE(pfn); | 127 | int space = GET_IOSPACE(pfn); |
128 | unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; | 128 | unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; |
129 | 129 | ||
130 | /* See comment in mm/memory.c remap_pfn_range */ | ||
131 | vma->vm_flags |= VM_IO | VM_RESERVED; | ||
132 | |||
130 | prot = __pgprot(pg_iobits); | 133 | prot = __pgprot(pg_iobits); |
131 | offset -= from; | 134 | offset -= from; |
132 | dir = pgd_offset(mm, from); | 135 | dir = pgd_offset(mm, from); |
133 | flush_cache_range(vma, beg, end); | 136 | flush_cache_range(vma, beg, end); |
134 | 137 | ||
135 | spin_lock(&mm->page_table_lock); | ||
136 | while (from < end) { | 138 | while (from < end) { |
137 | pud_t *pud = pud_alloc(current->mm, dir, from); | 139 | pud_t *pud = pud_alloc(mm, dir, from); |
138 | error = -ENOMEM; | 140 | error = -ENOMEM; |
139 | if (!pud) | 141 | if (!pud) |
140 | break; | 142 | break; |
@@ -144,8 +146,7 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, | |||
144 | from = (from + PGDIR_SIZE) & PGDIR_MASK; | 146 | from = (from + PGDIR_SIZE) & PGDIR_MASK; |
145 | dir++; | 147 | dir++; |
146 | } | 148 | } |
147 | flush_tlb_range(vma, beg, end); | ||
148 | spin_unlock(&mm->page_table_lock); | ||
149 | 149 | ||
150 | flush_tlb_range(vma, beg, end); | ||
150 | return error; | 151 | return error; |
151 | } | 152 | } |
diff --git a/arch/sparc64/mm/tlb.c b/arch/sparc64/mm/tlb.c index 90ca99d0b89c..8b104be4662b 100644 --- a/arch/sparc64/mm/tlb.c +++ b/arch/sparc64/mm/tlb.c | |||
@@ -18,8 +18,7 @@ | |||
18 | 18 | ||
19 | /* Heavily inspired by the ppc64 code. */ | 19 | /* Heavily inspired by the ppc64 code. */ |
20 | 20 | ||
21 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers) = | 21 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers) = { 0, }; |
22 | { NULL, 0, 0, 0, 0, 0, { 0 }, { NULL }, }; | ||
23 | 22 | ||
24 | void flush_tlb_pending(void) | 23 | void flush_tlb_pending(void) |
25 | { | 24 | { |
@@ -72,7 +71,7 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t | |||
72 | 71 | ||
73 | no_cache_flush: | 72 | no_cache_flush: |
74 | 73 | ||
75 | if (mp->tlb_frozen) | 74 | if (mp->fullmm) |
76 | return; | 75 | return; |
77 | 76 | ||
78 | nr = mp->tlb_nr; | 77 | nr = mp->tlb_nr; |
@@ -97,7 +96,7 @@ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long | |||
97 | unsigned long nr = mp->tlb_nr; | 96 | unsigned long nr = mp->tlb_nr; |
98 | long s = start, e = end, vpte_base; | 97 | long s = start, e = end, vpte_base; |
99 | 98 | ||
100 | if (mp->tlb_frozen) | 99 | if (mp->fullmm) |
101 | return; | 100 | return; |
102 | 101 | ||
103 | /* If start is greater than end, that is a real problem. */ | 102 | /* If start is greater than end, that is a real problem. */ |
diff --git a/arch/um/include/tlb.h b/arch/um/include/tlb.h index 45d7da6c3b2c..8efc1e0f1b84 100644 --- a/arch/um/include/tlb.h +++ b/arch/um/include/tlb.h | |||
@@ -34,7 +34,6 @@ struct host_vm_op { | |||
34 | } u; | 34 | } u; |
35 | }; | 35 | }; |
36 | 36 | ||
37 | extern void mprotect_kernel_vm(int w); | ||
38 | extern void force_flush_all(void); | 37 | extern void force_flush_all(void); |
39 | extern void fix_range_common(struct mm_struct *mm, unsigned long start_addr, | 38 | extern void fix_range_common(struct mm_struct *mm, unsigned long start_addr, |
40 | unsigned long end_addr, int force, | 39 | unsigned long end_addr, int force, |
diff --git a/arch/um/kernel/process_kern.c b/arch/um/kernel/process_kern.c index 0d73ceeece72..34b54a3e2132 100644 --- a/arch/um/kernel/process_kern.c +++ b/arch/um/kernel/process_kern.c | |||
@@ -222,6 +222,7 @@ void *um_virt_to_phys(struct task_struct *task, unsigned long addr, | |||
222 | pud_t *pud; | 222 | pud_t *pud; |
223 | pmd_t *pmd; | 223 | pmd_t *pmd; |
224 | pte_t *pte; | 224 | pte_t *pte; |
225 | pte_t ptent; | ||
225 | 226 | ||
226 | if(task->mm == NULL) | 227 | if(task->mm == NULL) |
227 | return(ERR_PTR(-EINVAL)); | 228 | return(ERR_PTR(-EINVAL)); |
@@ -238,12 +239,13 @@ void *um_virt_to_phys(struct task_struct *task, unsigned long addr, | |||
238 | return(ERR_PTR(-EINVAL)); | 239 | return(ERR_PTR(-EINVAL)); |
239 | 240 | ||
240 | pte = pte_offset_kernel(pmd, addr); | 241 | pte = pte_offset_kernel(pmd, addr); |
241 | if(!pte_present(*pte)) | 242 | ptent = *pte; |
243 | if(!pte_present(ptent)) | ||
242 | return(ERR_PTR(-EINVAL)); | 244 | return(ERR_PTR(-EINVAL)); |
243 | 245 | ||
244 | if(pte_out != NULL) | 246 | if(pte_out != NULL) |
245 | *pte_out = *pte; | 247 | *pte_out = ptent; |
246 | return((void *) (pte_val(*pte) & PAGE_MASK) + (addr & ~PAGE_MASK)); | 248 | return((void *) (pte_val(ptent) & PAGE_MASK) + (addr & ~PAGE_MASK)); |
247 | } | 249 | } |
248 | 250 | ||
249 | char *current_cmd(void) | 251 | char *current_cmd(void) |
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 240143b616a2..9e5e39cea821 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c | |||
@@ -28,7 +28,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, | |||
28 | pmd_t *pmd; | 28 | pmd_t *pmd; |
29 | pte_t *pte; | 29 | pte_t *pte; |
30 | 30 | ||
31 | spin_lock(&mm->page_table_lock); | ||
32 | pgd = pgd_offset(mm, proc); | 31 | pgd = pgd_offset(mm, proc); |
33 | pud = pud_alloc(mm, pgd, proc); | 32 | pud = pud_alloc(mm, pgd, proc); |
34 | if (!pud) | 33 | if (!pud) |
@@ -63,7 +62,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, | |||
63 | *pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT)); | 62 | *pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT)); |
64 | *pte = pte_mkexec(*pte); | 63 | *pte = pte_mkexec(*pte); |
65 | *pte = pte_wrprotect(*pte); | 64 | *pte = pte_wrprotect(*pte); |
66 | spin_unlock(&mm->page_table_lock); | ||
67 | return(0); | 65 | return(0); |
68 | 66 | ||
69 | out_pmd: | 67 | out_pmd: |
@@ -71,7 +69,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, | |||
71 | out_pte: | 69 | out_pte: |
72 | pmd_free(pmd); | 70 | pmd_free(pmd); |
73 | out: | 71 | out: |
74 | spin_unlock(&mm->page_table_lock); | ||
75 | return(-ENOMEM); | 72 | return(-ENOMEM); |
76 | } | 73 | } |
77 | 74 | ||
@@ -147,6 +144,7 @@ void destroy_context_skas(struct mm_struct *mm) | |||
147 | 144 | ||
148 | if(!proc_mm || !ptrace_faultinfo){ | 145 | if(!proc_mm || !ptrace_faultinfo){ |
149 | free_page(mmu->id.stack); | 146 | free_page(mmu->id.stack); |
147 | pte_lock_deinit(virt_to_page(mmu->last_page_table)); | ||
150 | pte_free_kernel((pte_t *) mmu->last_page_table); | 148 | pte_free_kernel((pte_t *) mmu->last_page_table); |
151 | dec_page_state(nr_page_table_pages); | 149 | dec_page_state(nr_page_table_pages); |
152 | #ifdef CONFIG_3_LEVEL_PGTABLES | 150 | #ifdef CONFIG_3_LEVEL_PGTABLES |
diff --git a/arch/um/kernel/tt/tlb.c b/arch/um/kernel/tt/tlb.c index f1d85dbb45b9..ae6217c86135 100644 --- a/arch/um/kernel/tt/tlb.c +++ b/arch/um/kernel/tt/tlb.c | |||
@@ -74,42 +74,6 @@ void flush_tlb_kernel_range_tt(unsigned long start, unsigned long end) | |||
74 | atomic_inc(&vmchange_seq); | 74 | atomic_inc(&vmchange_seq); |
75 | } | 75 | } |
76 | 76 | ||
77 | static void protect_vm_page(unsigned long addr, int w, int must_succeed) | ||
78 | { | ||
79 | int err; | ||
80 | |||
81 | err = protect_memory(addr, PAGE_SIZE, 1, w, 1, must_succeed); | ||
82 | if(err == 0) return; | ||
83 | else if((err == -EFAULT) || (err == -ENOMEM)){ | ||
84 | flush_tlb_kernel_range(addr, addr + PAGE_SIZE); | ||
85 | protect_vm_page(addr, w, 1); | ||
86 | } | ||
87 | else panic("protect_vm_page : protect failed, errno = %d\n", err); | ||
88 | } | ||
89 | |||
90 | void mprotect_kernel_vm(int w) | ||
91 | { | ||
92 | struct mm_struct *mm; | ||
93 | pgd_t *pgd; | ||
94 | pud_t *pud; | ||
95 | pmd_t *pmd; | ||
96 | pte_t *pte; | ||
97 | unsigned long addr; | ||
98 | |||
99 | mm = &init_mm; | ||
100 | for(addr = start_vm; addr < end_vm;){ | ||
101 | pgd = pgd_offset(mm, addr); | ||
102 | pud = pud_offset(pgd, addr); | ||
103 | pmd = pmd_offset(pud, addr); | ||
104 | if(pmd_present(*pmd)){ | ||
105 | pte = pte_offset_kernel(pmd, addr); | ||
106 | if(pte_present(*pte)) protect_vm_page(addr, w, 0); | ||
107 | addr += PAGE_SIZE; | ||
108 | } | ||
109 | else addr += PMD_SIZE; | ||
110 | } | ||
111 | } | ||
112 | |||
113 | void flush_tlb_kernel_vm_tt(void) | 77 | void flush_tlb_kernel_vm_tt(void) |
114 | { | 78 | { |
115 | flush_tlb_kernel_range(start_vm, end_vm); | 79 | flush_tlb_kernel_range(start_vm, end_vm); |
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c index 3e6780fa0186..93c60f4aa47a 100644 --- a/arch/x86_64/ia32/ia32_aout.c +++ b/arch/x86_64/ia32/ia32_aout.c | |||
@@ -314,7 +314,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) | |||
314 | current->mm->free_area_cache = TASK_UNMAPPED_BASE; | 314 | current->mm->free_area_cache = TASK_UNMAPPED_BASE; |
315 | current->mm->cached_hole_size = 0; | 315 | current->mm->cached_hole_size = 0; |
316 | 316 | ||
317 | set_mm_counter(current->mm, rss, 0); | ||
318 | current->mm->mmap = NULL; | 317 | current->mm->mmap = NULL; |
319 | compute_creds(bprm); | 318 | compute_creds(bprm); |
320 | current->flags &= ~PF_FORKNOEXEC; | 319 | current->flags &= ~PF_FORKNOEXEC; |
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c index 6972df480d2b..ecf7acb5db9b 100644 --- a/arch/x86_64/mm/ioremap.c +++ b/arch/x86_64/mm/ioremap.c | |||
@@ -60,7 +60,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo | |||
60 | if (address >= end) | 60 | if (address >= end) |
61 | BUG(); | 61 | BUG(); |
62 | do { | 62 | do { |
63 | pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); | 63 | pte_t * pte = pte_alloc_kernel(pmd, address); |
64 | if (!pte) | 64 | if (!pte) |
65 | return -ENOMEM; | 65 | return -ENOMEM; |
66 | remap_area_pte(pte, address, end - address, address + phys_addr, flags); | 66 | remap_area_pte(pte, address, end - address, address + phys_addr, flags); |
@@ -105,7 +105,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, | |||
105 | flush_cache_all(); | 105 | flush_cache_all(); |
106 | if (address >= end) | 106 | if (address >= end) |
107 | BUG(); | 107 | BUG(); |
108 | spin_lock(&init_mm.page_table_lock); | ||
109 | do { | 108 | do { |
110 | pud_t *pud; | 109 | pud_t *pud; |
111 | pud = pud_alloc(&init_mm, pgd, address); | 110 | pud = pud_alloc(&init_mm, pgd, address); |
@@ -119,7 +118,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, | |||
119 | address = (address + PGDIR_SIZE) & PGDIR_MASK; | 118 | address = (address + PGDIR_SIZE) & PGDIR_MASK; |
120 | pgd++; | 119 | pgd++; |
121 | } while (address && (address < end)); | 120 | } while (address && (address < end)); |
122 | spin_unlock(&init_mm.page_table_lock); | ||
123 | flush_tlb_all(); | 121 | flush_tlb_all(); |
124 | return error; | 122 | return error; |
125 | } | 123 | } |
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 01a1bd239263..2143609d2936 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c | |||
@@ -200,8 +200,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) | |||
200 | * Note: Assume that this function returns zero on success | 200 | * Note: Assume that this function returns zero on success |
201 | */ | 201 | */ |
202 | result = add_memory(mem_device->start_addr, | 202 | result = add_memory(mem_device->start_addr, |
203 | (mem_device->end_addr - mem_device->start_addr) + 1, | 203 | (mem_device->end_addr - mem_device->start_addr) + 1); |
204 | mem_device->read_write_attribute); | ||
205 | if (result) { | 204 | if (result) { |
206 | ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "\nadd_memory failed\n")); | 205 | ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "\nadd_memory failed\n")); |
207 | mem_device->state = MEMORY_INVALID_STATE; | 206 | mem_device->state = MEMORY_INVALID_STATE; |
@@ -259,7 +258,7 @@ static int acpi_memory_disable_device(struct acpi_memory_device *mem_device) | |||
259 | * Ask the VM to offline this memory range. | 258 | * Ask the VM to offline this memory range. |
260 | * Note: Assume that this function returns zero on success | 259 | * Note: Assume that this function returns zero on success |
261 | */ | 260 | */ |
262 | result = remove_memory(start, len, attr); | 261 | result = remove_memory(start, len); |
263 | if (result) { | 262 | if (result) { |
264 | ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Hot-Remove failed.\n")); | 263 | ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Hot-Remove failed.\n")); |
265 | return_VALUE(result); | 264 | return_VALUE(result); |
diff --git a/drivers/base/Makefile b/drivers/base/Makefile index 66d9c4643fc1..f12898d53078 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile | |||
@@ -7,6 +7,7 @@ obj-y := core.o sys.o bus.o dd.o \ | |||
7 | obj-y += power/ | 7 | obj-y += power/ |
8 | obj-$(CONFIG_FW_LOADER) += firmware_class.o | 8 | obj-$(CONFIG_FW_LOADER) += firmware_class.o |
9 | obj-$(CONFIG_NUMA) += node.o | 9 | obj-$(CONFIG_NUMA) += node.o |
10 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o | ||
10 | 11 | ||
11 | ifeq ($(CONFIG_DEBUG_DRIVER),y) | 12 | ifeq ($(CONFIG_DEBUG_DRIVER),y) |
12 | EXTRA_CFLAGS += -DDEBUG | 13 | EXTRA_CFLAGS += -DDEBUG |
diff --git a/drivers/base/init.c b/drivers/base/init.c index 84e604e25c4f..c648914b9cde 100644 --- a/drivers/base/init.c +++ b/drivers/base/init.c | |||
@@ -9,6 +9,7 @@ | |||
9 | 9 | ||
10 | #include <linux/device.h> | 10 | #include <linux/device.h> |
11 | #include <linux/init.h> | 11 | #include <linux/init.h> |
12 | #include <linux/memory.h> | ||
12 | 13 | ||
13 | #include "base.h" | 14 | #include "base.h" |
14 | 15 | ||
@@ -33,5 +34,6 @@ void __init driver_init(void) | |||
33 | platform_bus_init(); | 34 | platform_bus_init(); |
34 | system_bus_init(); | 35 | system_bus_init(); |
35 | cpu_dev_init(); | 36 | cpu_dev_init(); |
37 | memory_dev_init(); | ||
36 | attribute_container_init(); | 38 | attribute_container_init(); |
37 | } | 39 | } |
diff --git a/drivers/base/memory.c b/drivers/base/memory.c new file mode 100644 index 000000000000..b7ddd651d664 --- /dev/null +++ b/drivers/base/memory.c | |||
@@ -0,0 +1,452 @@ | |||
1 | /* | ||
2 | * drivers/base/memory.c - basic Memory class support | ||
3 | * | ||
4 | * Written by Matt Tolentino <matthew.e.tolentino@intel.com> | ||
5 | * Dave Hansen <haveblue@us.ibm.com> | ||
6 | * | ||
7 | * This file provides the necessary infrastructure to represent | ||
8 | * a SPARSEMEM-memory-model system's physical memory in /sysfs. | ||
9 | * All arch-independent code that assumes MEMORY_HOTPLUG requires | ||
10 | * SPARSEMEM should be contained here, or in mm/memory_hotplug.c. | ||
11 | */ | ||
12 | |||
13 | #include <linux/sysdev.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/init.h> | ||
16 | #include <linux/sched.h> /* capable() */ | ||
17 | #include <linux/topology.h> | ||
18 | #include <linux/device.h> | ||
19 | #include <linux/memory.h> | ||
20 | #include <linux/kobject.h> | ||
21 | #include <linux/memory_hotplug.h> | ||
22 | #include <linux/mm.h> | ||
23 | #include <asm/atomic.h> | ||
24 | #include <asm/uaccess.h> | ||
25 | |||
26 | #define MEMORY_CLASS_NAME "memory" | ||
27 | |||
28 | static struct sysdev_class memory_sysdev_class = { | ||
29 | set_kset_name(MEMORY_CLASS_NAME), | ||
30 | }; | ||
31 | EXPORT_SYMBOL(memory_sysdev_class); | ||
32 | |||
33 | static char *memory_hotplug_name(struct kset *kset, struct kobject *kobj) | ||
34 | { | ||
35 | return MEMORY_CLASS_NAME; | ||
36 | } | ||
37 | |||
38 | static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp, | ||
39 | int num_envp, char *buffer, int buffer_size) | ||
40 | { | ||
41 | int retval = 0; | ||
42 | |||
43 | return retval; | ||
44 | } | ||
45 | |||
46 | static struct kset_hotplug_ops memory_hotplug_ops = { | ||
47 | .name = memory_hotplug_name, | ||
48 | .hotplug = memory_hotplug, | ||
49 | }; | ||
50 | |||
51 | static struct notifier_block *memory_chain; | ||
52 | |||
53 | static int register_memory_notifier(struct notifier_block *nb) | ||
54 | { | ||
55 | return notifier_chain_register(&memory_chain, nb); | ||
56 | } | ||
57 | |||
58 | static void unregister_memory_notifier(struct notifier_block *nb) | ||
59 | { | ||
60 | notifier_chain_unregister(&memory_chain, nb); | ||
61 | } | ||
62 | |||
63 | /* | ||
64 | * register_memory - Setup a sysfs device for a memory block | ||
65 | */ | ||
66 | static int | ||
67 | register_memory(struct memory_block *memory, struct mem_section *section, | ||
68 | struct node *root) | ||
69 | { | ||
70 | int error; | ||
71 | |||
72 | memory->sysdev.cls = &memory_sysdev_class; | ||
73 | memory->sysdev.id = __section_nr(section); | ||
74 | |||
75 | error = sysdev_register(&memory->sysdev); | ||
76 | |||
77 | if (root && !error) | ||
78 | error = sysfs_create_link(&root->sysdev.kobj, | ||
79 | &memory->sysdev.kobj, | ||
80 | kobject_name(&memory->sysdev.kobj)); | ||
81 | |||
82 | return error; | ||
83 | } | ||
84 | |||
85 | static void | ||
86 | unregister_memory(struct memory_block *memory, struct mem_section *section, | ||
87 | struct node *root) | ||
88 | { | ||
89 | BUG_ON(memory->sysdev.cls != &memory_sysdev_class); | ||
90 | BUG_ON(memory->sysdev.id != __section_nr(section)); | ||
91 | |||
92 | sysdev_unregister(&memory->sysdev); | ||
93 | if (root) | ||
94 | sysfs_remove_link(&root->sysdev.kobj, | ||
95 | kobject_name(&memory->sysdev.kobj)); | ||
96 | } | ||
97 | |||
98 | /* | ||
99 | * use this as the physical section index that this memsection | ||
100 | * uses. | ||
101 | */ | ||
102 | |||
103 | static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf) | ||
104 | { | ||
105 | struct memory_block *mem = | ||
106 | container_of(dev, struct memory_block, sysdev); | ||
107 | return sprintf(buf, "%08lx\n", mem->phys_index); | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * online, offline, going offline, etc. | ||
112 | */ | ||
113 | static ssize_t show_mem_state(struct sys_device *dev, char *buf) | ||
114 | { | ||
115 | struct memory_block *mem = | ||
116 | container_of(dev, struct memory_block, sysdev); | ||
117 | ssize_t len = 0; | ||
118 | |||
119 | /* | ||
120 | * We can probably put these states in a nice little array | ||
121 | * so that they're not open-coded | ||
122 | */ | ||
123 | switch (mem->state) { | ||
124 | case MEM_ONLINE: | ||
125 | len = sprintf(buf, "online\n"); | ||
126 | break; | ||
127 | case MEM_OFFLINE: | ||
128 | len = sprintf(buf, "offline\n"); | ||
129 | break; | ||
130 | case MEM_GOING_OFFLINE: | ||
131 | len = sprintf(buf, "going-offline\n"); | ||
132 | break; | ||
133 | default: | ||
134 | len = sprintf(buf, "ERROR-UNKNOWN-%ld\n", | ||
135 | mem->state); | ||
136 | WARN_ON(1); | ||
137 | break; | ||
138 | } | ||
139 | |||
140 | return len; | ||
141 | } | ||
142 | |||
143 | static inline int memory_notify(unsigned long val, void *v) | ||
144 | { | ||
145 | return notifier_call_chain(&memory_chain, val, v); | ||
146 | } | ||
147 | |||
148 | /* | ||
149 | * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is | ||
150 | * OK to have direct references to sparsemem variables in here. | ||
151 | */ | ||
152 | static int | ||
153 | memory_block_action(struct memory_block *mem, unsigned long action) | ||
154 | { | ||
155 | int i; | ||
156 | unsigned long psection; | ||
157 | unsigned long start_pfn, start_paddr; | ||
158 | struct page *first_page; | ||
159 | int ret; | ||
160 | int old_state = mem->state; | ||
161 | |||
162 | psection = mem->phys_index; | ||
163 | first_page = pfn_to_page(psection << PFN_SECTION_SHIFT); | ||
164 | |||
165 | /* | ||
166 | * The probe routines leave the pages reserved, just | ||
167 | * as the bootmem code does. Make sure they're still | ||
168 | * that way. | ||
169 | */ | ||
170 | if (action == MEM_ONLINE) { | ||
171 | for (i = 0; i < PAGES_PER_SECTION; i++) { | ||
172 | if (PageReserved(first_page+i)) | ||
173 | continue; | ||
174 | |||
175 | printk(KERN_WARNING "section number %ld page number %d " | ||
176 | "not reserved, was it already online? \n", | ||
177 | psection, i); | ||
178 | return -EBUSY; | ||
179 | } | ||
180 | } | ||
181 | |||
182 | switch (action) { | ||
183 | case MEM_ONLINE: | ||
184 | start_pfn = page_to_pfn(first_page); | ||
185 | ret = online_pages(start_pfn, PAGES_PER_SECTION); | ||
186 | break; | ||
187 | case MEM_OFFLINE: | ||
188 | mem->state = MEM_GOING_OFFLINE; | ||
189 | memory_notify(MEM_GOING_OFFLINE, NULL); | ||
190 | start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; | ||
191 | ret = remove_memory(start_paddr, | ||
192 | PAGES_PER_SECTION << PAGE_SHIFT); | ||
193 | if (ret) { | ||
194 | mem->state = old_state; | ||
195 | break; | ||
196 | } | ||
197 | memory_notify(MEM_MAPPING_INVALID, NULL); | ||
198 | break; | ||
199 | default: | ||
200 | printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n", | ||
201 | __FUNCTION__, mem, action, action); | ||
202 | WARN_ON(1); | ||
203 | ret = -EINVAL; | ||
204 | } | ||
205 | /* | ||
206 | * For now, only notify on successful memory operations | ||
207 | */ | ||
208 | if (!ret) | ||
209 | memory_notify(action, NULL); | ||
210 | |||
211 | return ret; | ||
212 | } | ||
213 | |||
214 | static int memory_block_change_state(struct memory_block *mem, | ||
215 | unsigned long to_state, unsigned long from_state_req) | ||
216 | { | ||
217 | int ret = 0; | ||
218 | down(&mem->state_sem); | ||
219 | |||
220 | if (mem->state != from_state_req) { | ||
221 | ret = -EINVAL; | ||
222 | goto out; | ||
223 | } | ||
224 | |||
225 | ret = memory_block_action(mem, to_state); | ||
226 | if (!ret) | ||
227 | mem->state = to_state; | ||
228 | |||
229 | out: | ||
230 | up(&mem->state_sem); | ||
231 | return ret; | ||
232 | } | ||
233 | |||
234 | static ssize_t | ||
235 | store_mem_state(struct sys_device *dev, const char *buf, size_t count) | ||
236 | { | ||
237 | struct memory_block *mem; | ||
238 | unsigned int phys_section_nr; | ||
239 | int ret = -EINVAL; | ||
240 | |||
241 | mem = container_of(dev, struct memory_block, sysdev); | ||
242 | phys_section_nr = mem->phys_index; | ||
243 | |||
244 | if (!valid_section_nr(phys_section_nr)) | ||
245 | goto out; | ||
246 | |||
247 | if (!strncmp(buf, "online", min((int)count, 6))) | ||
248 | ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); | ||
249 | else if(!strncmp(buf, "offline", min((int)count, 7))) | ||
250 | ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); | ||
251 | out: | ||
252 | if (ret) | ||
253 | return ret; | ||
254 | return count; | ||
255 | } | ||
256 | |||
257 | /* | ||
258 | * phys_device is a bad name for this. What I really want | ||
259 | * is a way to differentiate between memory ranges that | ||
260 | * are part of physical devices that constitute | ||
261 | * a complete removable unit or fru. | ||
262 | * i.e. do these ranges belong to the same physical device, | ||
263 | * s.t. if I offline all of these sections I can then | ||
264 | * remove the physical device? | ||
265 | */ | ||
266 | static ssize_t show_phys_device(struct sys_device *dev, char *buf) | ||
267 | { | ||
268 | struct memory_block *mem = | ||
269 | container_of(dev, struct memory_block, sysdev); | ||
270 | return sprintf(buf, "%d\n", mem->phys_device); | ||
271 | } | ||
272 | |||
273 | static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL); | ||
274 | static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state); | ||
275 | static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL); | ||
276 | |||
277 | #define mem_create_simple_file(mem, attr_name) \ | ||
278 | sysdev_create_file(&mem->sysdev, &attr_##attr_name) | ||
279 | #define mem_remove_simple_file(mem, attr_name) \ | ||
280 | sysdev_remove_file(&mem->sysdev, &attr_##attr_name) | ||
281 | |||
282 | /* | ||
283 | * Block size attribute stuff | ||
284 | */ | ||
285 | static ssize_t | ||
286 | print_block_size(struct class *class, char *buf) | ||
287 | { | ||
288 | return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE); | ||
289 | } | ||
290 | |||
291 | static CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL); | ||
292 | |||
293 | static int block_size_init(void) | ||
294 | { | ||
295 | sysfs_create_file(&memory_sysdev_class.kset.kobj, | ||
296 | &class_attr_block_size_bytes.attr); | ||
297 | return 0; | ||
298 | } | ||
299 | |||
300 | /* | ||
301 | * Some architectures will have custom drivers to do this, and | ||
302 | * will not need to do it from userspace. The fake hot-add code | ||
303 | * as well as ppc64 will do all of their discovery in userspace | ||
304 | * and will require this interface. | ||
305 | */ | ||
306 | #ifdef CONFIG_ARCH_MEMORY_PROBE | ||
307 | static ssize_t | ||
308 | memory_probe_store(struct class *class, const char __user *buf, size_t count) | ||
309 | { | ||
310 | u64 phys_addr; | ||
311 | int ret; | ||
312 | |||
313 | phys_addr = simple_strtoull(buf, NULL, 0); | ||
314 | |||
315 | ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT); | ||
316 | |||
317 | if (ret) | ||
318 | count = ret; | ||
319 | |||
320 | return count; | ||
321 | } | ||
322 | static CLASS_ATTR(probe, 0700, NULL, memory_probe_store); | ||
323 | |||
324 | static int memory_probe_init(void) | ||
325 | { | ||
326 | sysfs_create_file(&memory_sysdev_class.kset.kobj, | ||
327 | &class_attr_probe.attr); | ||
328 | return 0; | ||
329 | } | ||
330 | #else | ||
331 | #define memory_probe_init(...) do {} while (0) | ||
332 | #endif | ||
333 | |||
334 | /* | ||
335 | * Note that phys_device is optional. It is here to allow for | ||
336 | * differentiation between which *physical* devices each | ||
337 | * section belongs to... | ||
338 | */ | ||
339 | |||
340 | static int add_memory_block(unsigned long node_id, struct mem_section *section, | ||
341 | unsigned long state, int phys_device) | ||
342 | { | ||
343 | struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL); | ||
344 | int ret = 0; | ||
345 | |||
346 | if (!mem) | ||
347 | return -ENOMEM; | ||
348 | |||
349 | mem->phys_index = __section_nr(section); | ||
350 | mem->state = state; | ||
351 | init_MUTEX(&mem->state_sem); | ||
352 | mem->phys_device = phys_device; | ||
353 | |||
354 | ret = register_memory(mem, section, NULL); | ||
355 | if (!ret) | ||
356 | ret = mem_create_simple_file(mem, phys_index); | ||
357 | if (!ret) | ||
358 | ret = mem_create_simple_file(mem, state); | ||
359 | if (!ret) | ||
360 | ret = mem_create_simple_file(mem, phys_device); | ||
361 | |||
362 | return ret; | ||
363 | } | ||
364 | |||
365 | /* | ||
366 | * For now, we have a linear search to go find the appropriate | ||
367 | * memory_block corresponding to a particular phys_index. If | ||
368 | * this gets to be a real problem, we can always use a radix | ||
369 | * tree or something here. | ||
370 | * | ||
371 | * This could be made generic for all sysdev classes. | ||
372 | */ | ||
373 | static struct memory_block *find_memory_block(struct mem_section *section) | ||
374 | { | ||
375 | struct kobject *kobj; | ||
376 | struct sys_device *sysdev; | ||
377 | struct memory_block *mem; | ||
378 | char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1]; | ||
379 | |||
380 | /* | ||
381 | * This only works because we know that section == sysdev->id | ||
382 | * slightly redundant with sysdev_register() | ||
383 | */ | ||
384 | sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section)); | ||
385 | |||
386 | kobj = kset_find_obj(&memory_sysdev_class.kset, name); | ||
387 | if (!kobj) | ||
388 | return NULL; | ||
389 | |||
390 | sysdev = container_of(kobj, struct sys_device, kobj); | ||
391 | mem = container_of(sysdev, struct memory_block, sysdev); | ||
392 | |||
393 | return mem; | ||
394 | } | ||
395 | |||
396 | int remove_memory_block(unsigned long node_id, struct mem_section *section, | ||
397 | int phys_device) | ||
398 | { | ||
399 | struct memory_block *mem; | ||
400 | |||
401 | mem = find_memory_block(section); | ||
402 | mem_remove_simple_file(mem, phys_index); | ||
403 | mem_remove_simple_file(mem, state); | ||
404 | mem_remove_simple_file(mem, phys_device); | ||
405 | unregister_memory(mem, section, NULL); | ||
406 | |||
407 | return 0; | ||
408 | } | ||
409 | |||
410 | /* | ||
411 | * need an interface for the VM to add new memory regions, | ||
412 | * but without onlining it. | ||
413 | */ | ||
414 | int register_new_memory(struct mem_section *section) | ||
415 | { | ||
416 | return add_memory_block(0, section, MEM_OFFLINE, 0); | ||
417 | } | ||
418 | |||
419 | int unregister_memory_section(struct mem_section *section) | ||
420 | { | ||
421 | if (!valid_section(section)) | ||
422 | return -EINVAL; | ||
423 | |||
424 | return remove_memory_block(0, section, 0); | ||
425 | } | ||
426 | |||
427 | /* | ||
428 | * Initialize the sysfs support for memory devices... | ||
429 | */ | ||
430 | int __init memory_dev_init(void) | ||
431 | { | ||
432 | unsigned int i; | ||
433 | int ret; | ||
434 | |||
435 | memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops; | ||
436 | ret = sysdev_class_register(&memory_sysdev_class); | ||
437 | |||
438 | /* | ||
439 | * Create entries for memory sections that were found | ||
440 | * during boot and have been initialized | ||
441 | */ | ||
442 | for (i = 0; i < NR_MEM_SECTIONS; i++) { | ||
443 | if (!valid_section_nr(i)) | ||
444 | continue; | ||
445 | add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0); | ||
446 | } | ||
447 | |||
448 | memory_probe_init(); | ||
449 | block_size_init(); | ||
450 | |||
451 | return ret; | ||
452 | } | ||
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 07fee811c09e..d86d5c26061d 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c | |||
@@ -1887,13 +1887,17 @@ st_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_pages, | |||
1887 | int i; | 1887 | int i; |
1888 | 1888 | ||
1889 | for (i=0; i < nr_pages; i++) { | 1889 | for (i=0; i < nr_pages; i++) { |
1890 | if (dirtied && !PageReserved(sgl[i].page)) | 1890 | struct page *page = sgl[i].page; |
1891 | SetPageDirty(sgl[i].page); | 1891 | |
1892 | /* unlock_page(sgl[i].page); */ | 1892 | /* XXX: just for debug. Remove when PageReserved is removed */ |
1893 | BUG_ON(PageReserved(page)); | ||
1894 | if (dirtied) | ||
1895 | SetPageDirty(page); | ||
1896 | /* unlock_page(page); */ | ||
1893 | /* FIXME: cache flush missing for rw==READ | 1897 | /* FIXME: cache flush missing for rw==READ |
1894 | * FIXME: call the correct reference counting function | 1898 | * FIXME: call the correct reference counting function |
1895 | */ | 1899 | */ |
1896 | page_cache_release(sgl[i].page); | 1900 | page_cache_release(page); |
1897 | } | 1901 | } |
1898 | 1902 | ||
1899 | return 0; | 1903 | return 0; |
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 5eb54d8019b4..da9766283bd7 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c | |||
@@ -4526,12 +4526,16 @@ static int sgl_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_p | |||
4526 | int i; | 4526 | int i; |
4527 | 4527 | ||
4528 | for (i=0; i < nr_pages; i++) { | 4528 | for (i=0; i < nr_pages; i++) { |
4529 | if (dirtied && !PageReserved(sgl[i].page)) | 4529 | struct page *page = sgl[i].page; |
4530 | SetPageDirty(sgl[i].page); | 4530 | |
4531 | /* XXX: just for debug. Remove when PageReserved is removed */ | ||
4532 | BUG_ON(PageReserved(page)); | ||
4533 | if (dirtied) | ||
4534 | SetPageDirty(page); | ||
4531 | /* FIXME: cache flush missing for rw==READ | 4535 | /* FIXME: cache flush missing for rw==READ |
4532 | * FIXME: call the correct reference counting function | 4536 | * FIXME: call the correct reference counting function |
4533 | */ | 4537 | */ |
4534 | page_cache_release(sgl[i].page); | 4538 | page_cache_release(page); |
4535 | } | 4539 | } |
4536 | 4540 | ||
4537 | return 0; | 4541 | return 0; |
diff --git a/fs/afs/file.c b/fs/afs/file.c index 0d576987ec67..4975c9c193dd 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c | |||
@@ -291,8 +291,8 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags) | |||
291 | cachefs_uncache_page(vnode->cache, page); | 291 | cachefs_uncache_page(vnode->cache, page); |
292 | #endif | 292 | #endif |
293 | 293 | ||
294 | pageio = (struct cachefs_page *) page->private; | 294 | pageio = (struct cachefs_page *) page_private(page); |
295 | page->private = 0; | 295 | set_page_private(page, 0); |
296 | ClearPagePrivate(page); | 296 | ClearPagePrivate(page); |
297 | 297 | ||
298 | if (pageio) | 298 | if (pageio) |
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index dd9baabaf016..72011826f0cb 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c | |||
@@ -318,7 +318,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) | |||
318 | current->mm->free_area_cache = current->mm->mmap_base; | 318 | current->mm->free_area_cache = current->mm->mmap_base; |
319 | current->mm->cached_hole_size = 0; | 319 | current->mm->cached_hole_size = 0; |
320 | 320 | ||
321 | set_mm_counter(current->mm, rss, 0); | ||
322 | current->mm->mmap = NULL; | 321 | current->mm->mmap = NULL; |
323 | compute_creds(bprm); | 322 | compute_creds(bprm); |
324 | current->flags &= ~PF_FORKNOEXEC; | 323 | current->flags &= ~PF_FORKNOEXEC; |
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d4b15576e584..918ccc267e41 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c | |||
@@ -773,7 +773,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs) | |||
773 | 773 | ||
774 | /* Do this so that we can load the interpreter, if need be. We will | 774 | /* Do this so that we can load the interpreter, if need be. We will |
775 | change some of these later */ | 775 | change some of these later */ |
776 | set_mm_counter(current->mm, rss, 0); | ||
777 | current->mm->free_area_cache = current->mm->mmap_base; | 776 | current->mm->free_area_cache = current->mm->mmap_base; |
778 | current->mm->cached_hole_size = 0; | 777 | current->mm->cached_hole_size = 0; |
779 | retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), | 778 | retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), |
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 134c9c0d1f54..dda87c4c82a3 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c | |||
@@ -294,14 +294,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs | |||
294 | &interp_params, | 294 | &interp_params, |
295 | ¤t->mm->start_stack, | 295 | ¤t->mm->start_stack, |
296 | ¤t->mm->start_brk); | 296 | ¤t->mm->start_brk); |
297 | #endif | ||
298 | |||
299 | /* do this so that we can load the interpreter, if need be | ||
300 | * - we will change some of these later | ||
301 | */ | ||
302 | set_mm_counter(current->mm, rss, 0); | ||
303 | 297 | ||
304 | #ifdef CONFIG_MMU | ||
305 | retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack); | 298 | retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack); |
306 | if (retval < 0) { | 299 | if (retval < 0) { |
307 | send_sig(SIGKILL, current, 0); | 300 | send_sig(SIGKILL, current, 0); |
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 7974efa107bc..9d6625829b99 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c | |||
@@ -650,7 +650,6 @@ static int load_flat_file(struct linux_binprm * bprm, | |||
650 | current->mm->start_brk = datapos + data_len + bss_len; | 650 | current->mm->start_brk = datapos + data_len + bss_len; |
651 | current->mm->brk = (current->mm->start_brk + 3) & ~3; | 651 | current->mm->brk = (current->mm->start_brk + 3) & ~3; |
652 | current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len; | 652 | current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len; |
653 | set_mm_counter(current->mm, rss, 0); | ||
654 | } | 653 | } |
655 | 654 | ||
656 | if (flags & FLAT_FLAG_KTRACE) | 655 | if (flags & FLAT_FLAG_KTRACE) |
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 227a2682d2bf..00a91dc25d16 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c | |||
@@ -259,7 +259,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) | |||
259 | create_som_tables(bprm); | 259 | create_som_tables(bprm); |
260 | 260 | ||
261 | current->mm->start_stack = bprm->p; | 261 | current->mm->start_stack = bprm->p; |
262 | set_mm_counter(current->mm, rss, 0); | ||
263 | 262 | ||
264 | #if 0 | 263 | #if 0 |
265 | printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk); | 264 | printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk); |
diff --git a/fs/buffer.c b/fs/buffer.c index b1667986442f..2066e4cb700c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -96,7 +96,7 @@ static void | |||
96 | __clear_page_buffers(struct page *page) | 96 | __clear_page_buffers(struct page *page) |
97 | { | 97 | { |
98 | ClearPagePrivate(page); | 98 | ClearPagePrivate(page); |
99 | page->private = 0; | 99 | set_page_private(page, 0); |
100 | page_cache_release(page); | 100 | page_cache_release(page); |
101 | } | 101 | } |
102 | 102 | ||
diff --git a/fs/compat.c b/fs/compat.c index a719e158e002..8e71cdbecc7c 100644 --- a/fs/compat.c +++ b/fs/compat.c | |||
@@ -1490,7 +1490,6 @@ int compat_do_execve(char * filename, | |||
1490 | /* execve success */ | 1490 | /* execve success */ |
1491 | security_bprm_free(bprm); | 1491 | security_bprm_free(bprm); |
1492 | acct_update_integrals(current); | 1492 | acct_update_integrals(current); |
1493 | update_mem_hiwater(current); | ||
1494 | kfree(bprm); | 1493 | kfree(bprm); |
1495 | return retval; | 1494 | return retval; |
1496 | } | 1495 | } |
diff --git a/fs/direct-io.c b/fs/direct-io.c index 0d06097bc995..3931e7f1e6bf 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c | |||
@@ -162,6 +162,7 @@ static int dio_refill_pages(struct dio *dio) | |||
162 | up_read(&current->mm->mmap_sem); | 162 | up_read(&current->mm->mmap_sem); |
163 | 163 | ||
164 | if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) { | 164 | if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) { |
165 | struct page *page = ZERO_PAGE(dio->curr_user_address); | ||
165 | /* | 166 | /* |
166 | * A memory fault, but the filesystem has some outstanding | 167 | * A memory fault, but the filesystem has some outstanding |
167 | * mapped blocks. We need to use those blocks up to avoid | 168 | * mapped blocks. We need to use those blocks up to avoid |
@@ -169,7 +170,8 @@ static int dio_refill_pages(struct dio *dio) | |||
169 | */ | 170 | */ |
170 | if (dio->page_errors == 0) | 171 | if (dio->page_errors == 0) |
171 | dio->page_errors = ret; | 172 | dio->page_errors = ret; |
172 | dio->pages[0] = ZERO_PAGE(dio->curr_user_address); | 173 | page_cache_get(page); |
174 | dio->pages[0] = page; | ||
173 | dio->head = 0; | 175 | dio->head = 0; |
174 | dio->tail = 1; | 176 | dio->tail = 1; |
175 | ret = 0; | 177 | ret = 0; |
@@ -309,40 +309,36 @@ void install_arg_page(struct vm_area_struct *vma, | |||
309 | pud_t * pud; | 309 | pud_t * pud; |
310 | pmd_t * pmd; | 310 | pmd_t * pmd; |
311 | pte_t * pte; | 311 | pte_t * pte; |
312 | spinlock_t *ptl; | ||
312 | 313 | ||
313 | if (unlikely(anon_vma_prepare(vma))) | 314 | if (unlikely(anon_vma_prepare(vma))) |
314 | goto out_sig; | 315 | goto out; |
315 | 316 | ||
316 | flush_dcache_page(page); | 317 | flush_dcache_page(page); |
317 | pgd = pgd_offset(mm, address); | 318 | pgd = pgd_offset(mm, address); |
318 | |||
319 | spin_lock(&mm->page_table_lock); | ||
320 | pud = pud_alloc(mm, pgd, address); | 319 | pud = pud_alloc(mm, pgd, address); |
321 | if (!pud) | 320 | if (!pud) |
322 | goto out; | 321 | goto out; |
323 | pmd = pmd_alloc(mm, pud, address); | 322 | pmd = pmd_alloc(mm, pud, address); |
324 | if (!pmd) | 323 | if (!pmd) |
325 | goto out; | 324 | goto out; |
326 | pte = pte_alloc_map(mm, pmd, address); | 325 | pte = pte_alloc_map_lock(mm, pmd, address, &ptl); |
327 | if (!pte) | 326 | if (!pte) |
328 | goto out; | 327 | goto out; |
329 | if (!pte_none(*pte)) { | 328 | if (!pte_none(*pte)) { |
330 | pte_unmap(pte); | 329 | pte_unmap_unlock(pte, ptl); |
331 | goto out; | 330 | goto out; |
332 | } | 331 | } |
333 | inc_mm_counter(mm, rss); | 332 | inc_mm_counter(mm, anon_rss); |
334 | lru_cache_add_active(page); | 333 | lru_cache_add_active(page); |
335 | set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( | 334 | set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( |
336 | page, vma->vm_page_prot)))); | 335 | page, vma->vm_page_prot)))); |
337 | page_add_anon_rmap(page, vma, address); | 336 | page_add_anon_rmap(page, vma, address); |
338 | pte_unmap(pte); | 337 | pte_unmap_unlock(pte, ptl); |
339 | spin_unlock(&mm->page_table_lock); | ||
340 | 338 | ||
341 | /* no need for flush_tlb */ | 339 | /* no need for flush_tlb */ |
342 | return; | 340 | return; |
343 | out: | 341 | out: |
344 | spin_unlock(&mm->page_table_lock); | ||
345 | out_sig: | ||
346 | __free_page(page); | 342 | __free_page(page); |
347 | force_sig(SIGKILL, current); | 343 | force_sig(SIGKILL, current); |
348 | } | 344 | } |
@@ -1207,7 +1203,6 @@ int do_execve(char * filename, | |||
1207 | /* execve success */ | 1203 | /* execve success */ |
1208 | security_bprm_free(bprm); | 1204 | security_bprm_free(bprm); |
1209 | acct_update_integrals(current); | 1205 | acct_update_integrals(current); |
1210 | update_mem_hiwater(current); | ||
1211 | kfree(bprm); | 1206 | kfree(bprm); |
1212 | return retval; | 1207 | return retval; |
1213 | } | 1208 | } |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3a9b6d179cbd..e026c807e6b3 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -45,10 +45,58 @@ static struct backing_dev_info hugetlbfs_backing_dev_info = { | |||
45 | 45 | ||
46 | int sysctl_hugetlb_shm_group; | 46 | int sysctl_hugetlb_shm_group; |
47 | 47 | ||
48 | static void huge_pagevec_release(struct pagevec *pvec) | ||
49 | { | ||
50 | int i; | ||
51 | |||
52 | for (i = 0; i < pagevec_count(pvec); ++i) | ||
53 | put_page(pvec->pages[i]); | ||
54 | |||
55 | pagevec_reinit(pvec); | ||
56 | } | ||
57 | |||
58 | /* | ||
59 | * huge_pages_needed tries to determine the number of new huge pages that | ||
60 | * will be required to fully populate this VMA. This will be equal to | ||
61 | * the size of the VMA in huge pages minus the number of huge pages | ||
62 | * (covered by this VMA) that are found in the page cache. | ||
63 | * | ||
64 | * Result is in bytes to be compatible with is_hugepage_mem_enough() | ||
65 | */ | ||
66 | unsigned long | ||
67 | huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma) | ||
68 | { | ||
69 | int i; | ||
70 | struct pagevec pvec; | ||
71 | unsigned long start = vma->vm_start; | ||
72 | unsigned long end = vma->vm_end; | ||
73 | unsigned long hugepages = (end - start) >> HPAGE_SHIFT; | ||
74 | pgoff_t next = vma->vm_pgoff; | ||
75 | pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT); | ||
76 | |||
77 | pagevec_init(&pvec, 0); | ||
78 | while (next < endpg) { | ||
79 | if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) | ||
80 | break; | ||
81 | for (i = 0; i < pagevec_count(&pvec); i++) { | ||
82 | struct page *page = pvec.pages[i]; | ||
83 | if (page->index > next) | ||
84 | next = page->index; | ||
85 | if (page->index >= endpg) | ||
86 | break; | ||
87 | next++; | ||
88 | hugepages--; | ||
89 | } | ||
90 | huge_pagevec_release(&pvec); | ||
91 | } | ||
92 | return hugepages << HPAGE_SHIFT; | ||
93 | } | ||
94 | |||
48 | static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) | 95 | static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) |
49 | { | 96 | { |
50 | struct inode *inode = file->f_dentry->d_inode; | 97 | struct inode *inode = file->f_dentry->d_inode; |
51 | struct address_space *mapping = inode->i_mapping; | 98 | struct address_space *mapping = inode->i_mapping; |
99 | unsigned long bytes; | ||
52 | loff_t len, vma_len; | 100 | loff_t len, vma_len; |
53 | int ret; | 101 | int ret; |
54 | 102 | ||
@@ -67,6 +115,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) | |||
67 | if (vma->vm_end - vma->vm_start < HPAGE_SIZE) | 115 | if (vma->vm_end - vma->vm_start < HPAGE_SIZE) |
68 | return -EINVAL; | 116 | return -EINVAL; |
69 | 117 | ||
118 | bytes = huge_pages_needed(mapping, vma); | ||
119 | if (!is_hugepage_mem_enough(bytes)) | ||
120 | return -ENOMEM; | ||
121 | |||
70 | vma_len = (loff_t)(vma->vm_end - vma->vm_start); | 122 | vma_len = (loff_t)(vma->vm_end - vma->vm_start); |
71 | 123 | ||
72 | down(&inode->i_sem); | 124 | down(&inode->i_sem); |
@@ -79,10 +131,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) | |||
79 | if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) | 131 | if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) |
80 | goto out; | 132 | goto out; |
81 | 133 | ||
82 | ret = hugetlb_prefault(mapping, vma); | 134 | ret = 0; |
83 | if (ret) | 135 | hugetlb_prefault_arch_hook(vma->vm_mm); |
84 | goto out; | ||
85 | |||
86 | if (inode->i_size < len) | 136 | if (inode->i_size < len) |
87 | inode->i_size = len; | 137 | inode->i_size = len; |
88 | out: | 138 | out: |
@@ -92,7 +142,7 @@ out: | |||
92 | } | 142 | } |
93 | 143 | ||
94 | /* | 144 | /* |
95 | * Called under down_write(mmap_sem), page_table_lock is not held | 145 | * Called under down_write(mmap_sem). |
96 | */ | 146 | */ |
97 | 147 | ||
98 | #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA | 148 | #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA |
@@ -171,16 +221,6 @@ static int hugetlbfs_commit_write(struct file *file, | |||
171 | return -EINVAL; | 221 | return -EINVAL; |
172 | } | 222 | } |
173 | 223 | ||
174 | static void huge_pagevec_release(struct pagevec *pvec) | ||
175 | { | ||
176 | int i; | ||
177 | |||
178 | for (i = 0; i < pagevec_count(pvec); ++i) | ||
179 | put_page(pvec->pages[i]); | ||
180 | |||
181 | pagevec_reinit(pvec); | ||
182 | } | ||
183 | |||
184 | static void truncate_huge_page(struct page *page) | 224 | static void truncate_huge_page(struct page *page) |
185 | { | 225 | { |
186 | clear_page_dirty(page); | 226 | clear_page_dirty(page); |
@@ -224,52 +264,35 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart) | |||
224 | 264 | ||
225 | static void hugetlbfs_delete_inode(struct inode *inode) | 265 | static void hugetlbfs_delete_inode(struct inode *inode) |
226 | { | 266 | { |
227 | struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb); | ||
228 | |||
229 | hlist_del_init(&inode->i_hash); | ||
230 | list_del_init(&inode->i_list); | ||
231 | list_del_init(&inode->i_sb_list); | ||
232 | inode->i_state |= I_FREEING; | ||
233 | inodes_stat.nr_inodes--; | ||
234 | spin_unlock(&inode_lock); | ||
235 | |||
236 | if (inode->i_data.nrpages) | 267 | if (inode->i_data.nrpages) |
237 | truncate_hugepages(&inode->i_data, 0); | 268 | truncate_hugepages(&inode->i_data, 0); |
238 | |||
239 | security_inode_delete(inode); | ||
240 | |||
241 | if (sbinfo->free_inodes >= 0) { | ||
242 | spin_lock(&sbinfo->stat_lock); | ||
243 | sbinfo->free_inodes++; | ||
244 | spin_unlock(&sbinfo->stat_lock); | ||
245 | } | ||
246 | |||
247 | clear_inode(inode); | 269 | clear_inode(inode); |
248 | destroy_inode(inode); | ||
249 | } | 270 | } |
250 | 271 | ||
251 | static void hugetlbfs_forget_inode(struct inode *inode) | 272 | static void hugetlbfs_forget_inode(struct inode *inode) |
252 | { | 273 | { |
253 | struct super_block *super_block = inode->i_sb; | 274 | struct super_block *sb = inode->i_sb; |
254 | struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(super_block); | ||
255 | 275 | ||
256 | if (hlist_unhashed(&inode->i_hash)) | 276 | if (!hlist_unhashed(&inode->i_hash)) { |
257 | goto out_truncate; | 277 | if (!(inode->i_state & (I_DIRTY|I_LOCK))) |
258 | 278 | list_move(&inode->i_list, &inode_unused); | |
259 | if (!(inode->i_state & (I_DIRTY|I_LOCK))) { | 279 | inodes_stat.nr_unused++; |
260 | list_del(&inode->i_list); | 280 | if (!sb || (sb->s_flags & MS_ACTIVE)) { |
261 | list_add(&inode->i_list, &inode_unused); | 281 | spin_unlock(&inode_lock); |
262 | } | 282 | return; |
263 | inodes_stat.nr_unused++; | 283 | } |
264 | if (!super_block || (super_block->s_flags & MS_ACTIVE)) { | 284 | inode->i_state |= I_WILL_FREE; |
265 | spin_unlock(&inode_lock); | 285 | spin_unlock(&inode_lock); |
266 | return; | 286 | /* |
287 | * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK | ||
288 | * in our backing_dev_info. | ||
289 | */ | ||
290 | write_inode_now(inode, 1); | ||
291 | spin_lock(&inode_lock); | ||
292 | inode->i_state &= ~I_WILL_FREE; | ||
293 | inodes_stat.nr_unused--; | ||
294 | hlist_del_init(&inode->i_hash); | ||
267 | } | 295 | } |
268 | |||
269 | /* write_inode_now() ? */ | ||
270 | inodes_stat.nr_unused--; | ||
271 | hlist_del_init(&inode->i_hash); | ||
272 | out_truncate: | ||
273 | list_del_init(&inode->i_list); | 296 | list_del_init(&inode->i_list); |
274 | list_del_init(&inode->i_sb_list); | 297 | list_del_init(&inode->i_sb_list); |
275 | inode->i_state |= I_FREEING; | 298 | inode->i_state |= I_FREEING; |
@@ -277,13 +300,6 @@ out_truncate: | |||
277 | spin_unlock(&inode_lock); | 300 | spin_unlock(&inode_lock); |
278 | if (inode->i_data.nrpages) | 301 | if (inode->i_data.nrpages) |
279 | truncate_hugepages(&inode->i_data, 0); | 302 | truncate_hugepages(&inode->i_data, 0); |
280 | |||
281 | if (sbinfo->free_inodes >= 0) { | ||
282 | spin_lock(&sbinfo->stat_lock); | ||
283 | sbinfo->free_inodes++; | ||
284 | spin_unlock(&sbinfo->stat_lock); | ||
285 | } | ||
286 | |||
287 | clear_inode(inode); | 303 | clear_inode(inode); |
288 | destroy_inode(inode); | 304 | destroy_inode(inode); |
289 | } | 305 | } |
@@ -291,7 +307,7 @@ out_truncate: | |||
291 | static void hugetlbfs_drop_inode(struct inode *inode) | 307 | static void hugetlbfs_drop_inode(struct inode *inode) |
292 | { | 308 | { |
293 | if (!inode->i_nlink) | 309 | if (!inode->i_nlink) |
294 | hugetlbfs_delete_inode(inode); | 310 | generic_delete_inode(inode); |
295 | else | 311 | else |
296 | hugetlbfs_forget_inode(inode); | 312 | hugetlbfs_forget_inode(inode); |
297 | } | 313 | } |
@@ -308,7 +324,6 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff) | |||
308 | 324 | ||
309 | vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) { | 325 | vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) { |
310 | unsigned long h_vm_pgoff; | 326 | unsigned long h_vm_pgoff; |
311 | unsigned long v_length; | ||
312 | unsigned long v_offset; | 327 | unsigned long v_offset; |
313 | 328 | ||
314 | h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT); | 329 | h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT); |
@@ -319,11 +334,8 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff) | |||
319 | if (h_vm_pgoff >= h_pgoff) | 334 | if (h_vm_pgoff >= h_pgoff) |
320 | v_offset = 0; | 335 | v_offset = 0; |
321 | 336 | ||
322 | v_length = vma->vm_end - vma->vm_start; | 337 | unmap_hugepage_range(vma, |
323 | 338 | vma->vm_start + v_offset, vma->vm_end); | |
324 | zap_hugepage_range(vma, | ||
325 | vma->vm_start + v_offset, | ||
326 | v_length - v_offset); | ||
327 | } | 339 | } |
328 | } | 340 | } |
329 | 341 | ||
@@ -379,17 +391,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, | |||
379 | gid_t gid, int mode, dev_t dev) | 391 | gid_t gid, int mode, dev_t dev) |
380 | { | 392 | { |
381 | struct inode *inode; | 393 | struct inode *inode; |
382 | struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); | ||
383 | |||
384 | if (sbinfo->free_inodes >= 0) { | ||
385 | spin_lock(&sbinfo->stat_lock); | ||
386 | if (!sbinfo->free_inodes) { | ||
387 | spin_unlock(&sbinfo->stat_lock); | ||
388 | return NULL; | ||
389 | } | ||
390 | sbinfo->free_inodes--; | ||
391 | spin_unlock(&sbinfo->stat_lock); | ||
392 | } | ||
393 | 394 | ||
394 | inode = new_inode(sb); | 395 | inode = new_inode(sb); |
395 | if (inode) { | 396 | if (inode) { |
@@ -531,29 +532,51 @@ static void hugetlbfs_put_super(struct super_block *sb) | |||
531 | } | 532 | } |
532 | } | 533 | } |
533 | 534 | ||
535 | static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) | ||
536 | { | ||
537 | if (sbinfo->free_inodes >= 0) { | ||
538 | spin_lock(&sbinfo->stat_lock); | ||
539 | if (unlikely(!sbinfo->free_inodes)) { | ||
540 | spin_unlock(&sbinfo->stat_lock); | ||
541 | return 0; | ||
542 | } | ||
543 | sbinfo->free_inodes--; | ||
544 | spin_unlock(&sbinfo->stat_lock); | ||
545 | } | ||
546 | |||
547 | return 1; | ||
548 | } | ||
549 | |||
550 | static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) | ||
551 | { | ||
552 | if (sbinfo->free_inodes >= 0) { | ||
553 | spin_lock(&sbinfo->stat_lock); | ||
554 | sbinfo->free_inodes++; | ||
555 | spin_unlock(&sbinfo->stat_lock); | ||
556 | } | ||
557 | } | ||
558 | |||
559 | |||
534 | static kmem_cache_t *hugetlbfs_inode_cachep; | 560 | static kmem_cache_t *hugetlbfs_inode_cachep; |
535 | 561 | ||
536 | static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) | 562 | static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) |
537 | { | 563 | { |
564 | struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); | ||
538 | struct hugetlbfs_inode_info *p; | 565 | struct hugetlbfs_inode_info *p; |
539 | 566 | ||
567 | if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) | ||
568 | return NULL; | ||
540 | p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL); | 569 | p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL); |
541 | if (!p) | 570 | if (unlikely(!p)) { |
571 | hugetlbfs_inc_free_inodes(sbinfo); | ||
542 | return NULL; | 572 | return NULL; |
573 | } | ||
543 | return &p->vfs_inode; | 574 | return &p->vfs_inode; |
544 | } | 575 | } |
545 | 576 | ||
546 | static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) | ||
547 | { | ||
548 | struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; | ||
549 | |||
550 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == | ||
551 | SLAB_CTOR_CONSTRUCTOR) | ||
552 | inode_init_once(&ei->vfs_inode); | ||
553 | } | ||
554 | |||
555 | static void hugetlbfs_destroy_inode(struct inode *inode) | 577 | static void hugetlbfs_destroy_inode(struct inode *inode) |
556 | { | 578 | { |
579 | hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); | ||
557 | mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); | 580 | mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); |
558 | kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); | 581 | kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); |
559 | } | 582 | } |
@@ -565,6 +588,16 @@ static struct address_space_operations hugetlbfs_aops = { | |||
565 | .set_page_dirty = hugetlbfs_set_page_dirty, | 588 | .set_page_dirty = hugetlbfs_set_page_dirty, |
566 | }; | 589 | }; |
567 | 590 | ||
591 | |||
592 | static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) | ||
593 | { | ||
594 | struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; | ||
595 | |||
596 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == | ||
597 | SLAB_CTOR_CONSTRUCTOR) | ||
598 | inode_init_once(&ei->vfs_inode); | ||
599 | } | ||
600 | |||
568 | struct file_operations hugetlbfs_file_operations = { | 601 | struct file_operations hugetlbfs_file_operations = { |
569 | .mmap = hugetlbfs_file_mmap, | 602 | .mmap = hugetlbfs_file_mmap, |
570 | .fsync = simple_sync_file, | 603 | .fsync = simple_sync_file, |
@@ -592,6 +625,7 @@ static struct super_operations hugetlbfs_ops = { | |||
592 | .alloc_inode = hugetlbfs_alloc_inode, | 625 | .alloc_inode = hugetlbfs_alloc_inode, |
593 | .destroy_inode = hugetlbfs_destroy_inode, | 626 | .destroy_inode = hugetlbfs_destroy_inode, |
594 | .statfs = hugetlbfs_statfs, | 627 | .statfs = hugetlbfs_statfs, |
628 | .delete_inode = hugetlbfs_delete_inode, | ||
595 | .drop_inode = hugetlbfs_drop_inode, | 629 | .drop_inode = hugetlbfs_drop_inode, |
596 | .put_super = hugetlbfs_put_super, | 630 | .put_super = hugetlbfs_put_super, |
597 | }; | 631 | }; |
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 26091a5f88d4..8a53981f9f27 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c | |||
@@ -86,7 +86,7 @@ struct meta_anchor { | |||
86 | atomic_t io_count; | 86 | atomic_t io_count; |
87 | struct metapage *mp[MPS_PER_PAGE]; | 87 | struct metapage *mp[MPS_PER_PAGE]; |
88 | }; | 88 | }; |
89 | #define mp_anchor(page) ((struct meta_anchor *)page->private) | 89 | #define mp_anchor(page) ((struct meta_anchor *)page_private(page)) |
90 | 90 | ||
91 | static inline struct metapage *page_to_mp(struct page *page, uint offset) | 91 | static inline struct metapage *page_to_mp(struct page *page, uint offset) |
92 | { | 92 | { |
@@ -108,7 +108,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp) | |||
108 | if (!a) | 108 | if (!a) |
109 | return -ENOMEM; | 109 | return -ENOMEM; |
110 | memset(a, 0, sizeof(struct meta_anchor)); | 110 | memset(a, 0, sizeof(struct meta_anchor)); |
111 | page->private = (unsigned long)a; | 111 | set_page_private(page, (unsigned long)a); |
112 | SetPagePrivate(page); | 112 | SetPagePrivate(page); |
113 | kmap(page); | 113 | kmap(page); |
114 | } | 114 | } |
@@ -136,7 +136,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp) | |||
136 | a->mp[index] = NULL; | 136 | a->mp[index] = NULL; |
137 | if (--a->mp_count == 0) { | 137 | if (--a->mp_count == 0) { |
138 | kfree(a); | 138 | kfree(a); |
139 | page->private = 0; | 139 | set_page_private(page, 0); |
140 | ClearPagePrivate(page); | 140 | ClearPagePrivate(page); |
141 | kunmap(page); | 141 | kunmap(page); |
142 | } | 142 | } |
@@ -156,13 +156,13 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *)) | |||
156 | #else | 156 | #else |
157 | static inline struct metapage *page_to_mp(struct page *page, uint offset) | 157 | static inline struct metapage *page_to_mp(struct page *page, uint offset) |
158 | { | 158 | { |
159 | return PagePrivate(page) ? (struct metapage *)page->private : NULL; | 159 | return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL; |
160 | } | 160 | } |
161 | 161 | ||
162 | static inline int insert_metapage(struct page *page, struct metapage *mp) | 162 | static inline int insert_metapage(struct page *page, struct metapage *mp) |
163 | { | 163 | { |
164 | if (mp) { | 164 | if (mp) { |
165 | page->private = (unsigned long)mp; | 165 | set_page_private(page, (unsigned long)mp); |
166 | SetPagePrivate(page); | 166 | SetPagePrivate(page); |
167 | kmap(page); | 167 | kmap(page); |
168 | } | 168 | } |
@@ -171,7 +171,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp) | |||
171 | 171 | ||
172 | static inline void remove_metapage(struct page *page, struct metapage *mp) | 172 | static inline void remove_metapage(struct page *page, struct metapage *mp) |
173 | { | 173 | { |
174 | page->private = 0; | 174 | set_page_private(page, 0); |
175 | ClearPagePrivate(page); | 175 | ClearPagePrivate(page); |
176 | kunmap(page); | 176 | kunmap(page); |
177 | } | 177 | } |
diff --git a/fs/proc/array.c b/fs/proc/array.c index d84eecacbeaf..3e1239e4b303 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c | |||
@@ -438,7 +438,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) | |||
438 | jiffies_to_clock_t(it_real_value), | 438 | jiffies_to_clock_t(it_real_value), |
439 | start_time, | 439 | start_time, |
440 | vsize, | 440 | vsize, |
441 | mm ? get_mm_counter(mm, rss) : 0, /* you might want to shift this left 3 */ | 441 | mm ? get_mm_rss(mm) : 0, |
442 | rsslim, | 442 | rsslim, |
443 | mm ? mm->start_code : 0, | 443 | mm ? mm->start_code : 0, |
444 | mm ? mm->end_code : 0, | 444 | mm ? mm->end_code : 0, |
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c7ef3e48e35b..d2fa42006d8f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -14,22 +14,41 @@ | |||
14 | char *task_mem(struct mm_struct *mm, char *buffer) | 14 | char *task_mem(struct mm_struct *mm, char *buffer) |
15 | { | 15 | { |
16 | unsigned long data, text, lib; | 16 | unsigned long data, text, lib; |
17 | unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; | ||
18 | |||
19 | /* | ||
20 | * Note: to minimize their overhead, mm maintains hiwater_vm and | ||
21 | * hiwater_rss only when about to *lower* total_vm or rss. Any | ||
22 | * collector of these hiwater stats must therefore get total_vm | ||
23 | * and rss too, which will usually be the higher. Barriers? not | ||
24 | * worth the effort, such snapshots can always be inconsistent. | ||
25 | */ | ||
26 | hiwater_vm = total_vm = mm->total_vm; | ||
27 | if (hiwater_vm < mm->hiwater_vm) | ||
28 | hiwater_vm = mm->hiwater_vm; | ||
29 | hiwater_rss = total_rss = get_mm_rss(mm); | ||
30 | if (hiwater_rss < mm->hiwater_rss) | ||
31 | hiwater_rss = mm->hiwater_rss; | ||
17 | 32 | ||
18 | data = mm->total_vm - mm->shared_vm - mm->stack_vm; | 33 | data = mm->total_vm - mm->shared_vm - mm->stack_vm; |
19 | text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; | 34 | text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; |
20 | lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; | 35 | lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; |
21 | buffer += sprintf(buffer, | 36 | buffer += sprintf(buffer, |
37 | "VmPeak:\t%8lu kB\n" | ||
22 | "VmSize:\t%8lu kB\n" | 38 | "VmSize:\t%8lu kB\n" |
23 | "VmLck:\t%8lu kB\n" | 39 | "VmLck:\t%8lu kB\n" |
40 | "VmHWM:\t%8lu kB\n" | ||
24 | "VmRSS:\t%8lu kB\n" | 41 | "VmRSS:\t%8lu kB\n" |
25 | "VmData:\t%8lu kB\n" | 42 | "VmData:\t%8lu kB\n" |
26 | "VmStk:\t%8lu kB\n" | 43 | "VmStk:\t%8lu kB\n" |
27 | "VmExe:\t%8lu kB\n" | 44 | "VmExe:\t%8lu kB\n" |
28 | "VmLib:\t%8lu kB\n" | 45 | "VmLib:\t%8lu kB\n" |
29 | "VmPTE:\t%8lu kB\n", | 46 | "VmPTE:\t%8lu kB\n", |
30 | (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), | 47 | hiwater_vm << (PAGE_SHIFT-10), |
48 | (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), | ||
31 | mm->locked_vm << (PAGE_SHIFT-10), | 49 | mm->locked_vm << (PAGE_SHIFT-10), |
32 | get_mm_counter(mm, rss) << (PAGE_SHIFT-10), | 50 | hiwater_rss << (PAGE_SHIFT-10), |
51 | total_rss << (PAGE_SHIFT-10), | ||
33 | data << (PAGE_SHIFT-10), | 52 | data << (PAGE_SHIFT-10), |
34 | mm->stack_vm << (PAGE_SHIFT-10), text, lib, | 53 | mm->stack_vm << (PAGE_SHIFT-10), text, lib, |
35 | (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); | 54 | (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); |
@@ -44,13 +63,11 @@ unsigned long task_vsize(struct mm_struct *mm) | |||
44 | int task_statm(struct mm_struct *mm, int *shared, int *text, | 63 | int task_statm(struct mm_struct *mm, int *shared, int *text, |
45 | int *data, int *resident) | 64 | int *data, int *resident) |
46 | { | 65 | { |
47 | int rss = get_mm_counter(mm, rss); | 66 | *shared = get_mm_counter(mm, file_rss); |
48 | |||
49 | *shared = rss - get_mm_counter(mm, anon_rss); | ||
50 | *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) | 67 | *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) |
51 | >> PAGE_SHIFT; | 68 | >> PAGE_SHIFT; |
52 | *data = mm->total_vm - mm->shared_vm; | 69 | *data = mm->total_vm - mm->shared_vm; |
53 | *resident = rss; | 70 | *resident = *shared + get_mm_counter(mm, anon_rss); |
54 | return mm->total_vm; | 71 | return mm->total_vm; |
55 | } | 72 | } |
56 | 73 | ||
@@ -186,13 +203,14 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
186 | struct mem_size_stats *mss) | 203 | struct mem_size_stats *mss) |
187 | { | 204 | { |
188 | pte_t *pte, ptent; | 205 | pte_t *pte, ptent; |
206 | spinlock_t *ptl; | ||
189 | unsigned long pfn; | 207 | unsigned long pfn; |
190 | struct page *page; | 208 | struct page *page; |
191 | 209 | ||
192 | pte = pte_offset_map(pmd, addr); | 210 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
193 | do { | 211 | do { |
194 | ptent = *pte; | 212 | ptent = *pte; |
195 | if (pte_none(ptent) || !pte_present(ptent)) | 213 | if (!pte_present(ptent)) |
196 | continue; | 214 | continue; |
197 | 215 | ||
198 | mss->resident += PAGE_SIZE; | 216 | mss->resident += PAGE_SIZE; |
@@ -213,8 +231,8 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
213 | mss->private_clean += PAGE_SIZE; | 231 | mss->private_clean += PAGE_SIZE; |
214 | } | 232 | } |
215 | } while (pte++, addr += PAGE_SIZE, addr != end); | 233 | } while (pte++, addr += PAGE_SIZE, addr != end); |
216 | pte_unmap(pte - 1); | 234 | pte_unmap_unlock(pte - 1, ptl); |
217 | cond_resched_lock(&vma->vm_mm->page_table_lock); | 235 | cond_resched(); |
218 | } | 236 | } |
219 | 237 | ||
220 | static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 238 | static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
@@ -268,17 +286,11 @@ static inline void smaps_pgd_range(struct vm_area_struct *vma, | |||
268 | static int show_smap(struct seq_file *m, void *v) | 286 | static int show_smap(struct seq_file *m, void *v) |
269 | { | 287 | { |
270 | struct vm_area_struct *vma = v; | 288 | struct vm_area_struct *vma = v; |
271 | struct mm_struct *mm = vma->vm_mm; | ||
272 | struct mem_size_stats mss; | 289 | struct mem_size_stats mss; |
273 | 290 | ||
274 | memset(&mss, 0, sizeof mss); | 291 | memset(&mss, 0, sizeof mss); |
275 | 292 | if (vma->vm_mm) | |
276 | if (mm) { | ||
277 | spin_lock(&mm->page_table_lock); | ||
278 | smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss); | 293 | smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss); |
279 | spin_unlock(&mm->page_table_lock); | ||
280 | } | ||
281 | |||
282 | return show_map_internal(m, v, &mss); | 294 | return show_map_internal(m, v, &mss); |
283 | } | 295 | } |
284 | 296 | ||
@@ -407,7 +419,6 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) | |||
407 | for_each_node(i) | 419 | for_each_node(i) |
408 | md->node[i] =0; | 420 | md->node[i] =0; |
409 | 421 | ||
410 | spin_lock(&mm->page_table_lock); | ||
411 | for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { | 422 | for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { |
412 | page = follow_page(mm, vaddr, 0); | 423 | page = follow_page(mm, vaddr, 0); |
413 | if (page) { | 424 | if (page) { |
@@ -422,8 +433,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) | |||
422 | md->anon++; | 433 | md->anon++; |
423 | md->node[page_to_nid(page)]++; | 434 | md->node[page_to_nid(page)]++; |
424 | } | 435 | } |
436 | cond_resched(); | ||
425 | } | 437 | } |
426 | spin_unlock(&mm->page_table_lock); | ||
427 | return md; | 438 | return md; |
428 | } | 439 | } |
429 | 440 | ||
@@ -469,7 +480,7 @@ static int show_numa_map(struct seq_file *m, void *v) | |||
469 | seq_printf(m, " interleave={"); | 480 | seq_printf(m, " interleave={"); |
470 | first = 1; | 481 | first = 1; |
471 | for_each_node(n) { | 482 | for_each_node(n) { |
472 | if (test_bit(n, pol->v.nodes)) { | 483 | if (node_isset(n, pol->v.nodes)) { |
473 | if (!first) | 484 | if (!first) |
474 | seq_putc(m,','); | 485 | seq_putc(m,','); |
475 | else | 486 | else |
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index ba4767c04adf..4cd46abe8434 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c | |||
@@ -181,8 +181,9 @@ set_page_region( | |||
181 | size_t offset, | 181 | size_t offset, |
182 | size_t length) | 182 | size_t length) |
183 | { | 183 | { |
184 | page->private |= page_region_mask(offset, length); | 184 | set_page_private(page, |
185 | if (page->private == ~0UL) | 185 | page_private(page) | page_region_mask(offset, length)); |
186 | if (page_private(page) == ~0UL) | ||
186 | SetPageUptodate(page); | 187 | SetPageUptodate(page); |
187 | } | 188 | } |
188 | 189 | ||
@@ -194,7 +195,7 @@ test_page_region( | |||
194 | { | 195 | { |
195 | unsigned long mask = page_region_mask(offset, length); | 196 | unsigned long mask = page_region_mask(offset, length); |
196 | 197 | ||
197 | return (mask && (page->private & mask) == mask); | 198 | return (mask && (page_private(page) & mask) == mask); |
198 | } | 199 | } |
199 | 200 | ||
200 | /* | 201 | /* |
diff --git a/include/asm-alpha/barrier.h b/include/asm-alpha/barrier.h index 229c83fe77cb..681ff581afa5 100644 --- a/include/asm-alpha/barrier.h +++ b/include/asm-alpha/barrier.h | |||
@@ -1,6 +1,8 @@ | |||
1 | #ifndef __BARRIER_H | 1 | #ifndef __BARRIER_H |
2 | #define __BARRIER_H | 2 | #define __BARRIER_H |
3 | 3 | ||
4 | #include <asm/compiler.h> | ||
5 | |||
4 | #define mb() \ | 6 | #define mb() \ |
5 | __asm__ __volatile__("mb": : :"memory") | 7 | __asm__ __volatile__("mb": : :"memory") |
6 | 8 | ||
diff --git a/include/asm-alpha/rwsem.h b/include/asm-alpha/rwsem.h index 8e058a67c9a4..fafdd4f7010a 100644 --- a/include/asm-alpha/rwsem.h +++ b/include/asm-alpha/rwsem.h | |||
@@ -262,5 +262,10 @@ static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem) | |||
262 | #endif | 262 | #endif |
263 | } | 263 | } |
264 | 264 | ||
265 | static inline int rwsem_is_locked(struct rw_semaphore *sem) | ||
266 | { | ||
267 | return (sem->count != 0); | ||
268 | } | ||
269 | |||
265 | #endif /* __KERNEL__ */ | 270 | #endif /* __KERNEL__ */ |
266 | #endif /* _ALPHA_RWSEM_H */ | 271 | #endif /* _ALPHA_RWSEM_H */ |
diff --git a/include/asm-arm/tlb.h b/include/asm-arm/tlb.h index 9bb325c54645..f49bfb78c221 100644 --- a/include/asm-arm/tlb.h +++ b/include/asm-arm/tlb.h | |||
@@ -27,11 +27,7 @@ | |||
27 | */ | 27 | */ |
28 | struct mmu_gather { | 28 | struct mmu_gather { |
29 | struct mm_struct *mm; | 29 | struct mm_struct *mm; |
30 | unsigned int freed; | ||
31 | unsigned int fullmm; | 30 | unsigned int fullmm; |
32 | |||
33 | unsigned int flushes; | ||
34 | unsigned int avoided_flushes; | ||
35 | }; | 31 | }; |
36 | 32 | ||
37 | DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); | 33 | DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); |
@@ -39,11 +35,9 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); | |||
39 | static inline struct mmu_gather * | 35 | static inline struct mmu_gather * |
40 | tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) | 36 | tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) |
41 | { | 37 | { |
42 | int cpu = smp_processor_id(); | 38 | struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); |
43 | struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu); | ||
44 | 39 | ||
45 | tlb->mm = mm; | 40 | tlb->mm = mm; |
46 | tlb->freed = 0; | ||
47 | tlb->fullmm = full_mm_flush; | 41 | tlb->fullmm = full_mm_flush; |
48 | 42 | ||
49 | return tlb; | 43 | return tlb; |
@@ -52,24 +46,13 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) | |||
52 | static inline void | 46 | static inline void |
53 | tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) | 47 | tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) |
54 | { | 48 | { |
55 | struct mm_struct *mm = tlb->mm; | ||
56 | unsigned long freed = tlb->freed; | ||
57 | int rss = get_mm_counter(mm, rss); | ||
58 | |||
59 | if (rss < freed) | ||
60 | freed = rss; | ||
61 | add_mm_counter(mm, rss, -freed); | ||
62 | |||
63 | if (tlb->fullmm) | 49 | if (tlb->fullmm) |
64 | flush_tlb_mm(mm); | 50 | flush_tlb_mm(tlb->mm); |
65 | 51 | ||
66 | /* keep the page table cache within bounds */ | 52 | /* keep the page table cache within bounds */ |
67 | check_pgt_cache(); | 53 | check_pgt_cache(); |
68 | } | ||
69 | 54 | ||
70 | static inline unsigned int tlb_is_full_mm(struct mmu_gather *tlb) | 55 | put_cpu_var(mmu_gathers); |
71 | { | ||
72 | return tlb->fullmm; | ||
73 | } | 56 | } |
74 | 57 | ||
75 | #define tlb_remove_tlb_entry(tlb,ptep,address) do { } while (0) | 58 | #define tlb_remove_tlb_entry(tlb,ptep,address) do { } while (0) |
diff --git a/include/asm-arm26/tlb.h b/include/asm-arm26/tlb.h index 1316352a58f3..08ddd85b8d35 100644 --- a/include/asm-arm26/tlb.h +++ b/include/asm-arm26/tlb.h | |||
@@ -10,24 +10,20 @@ | |||
10 | */ | 10 | */ |
11 | struct mmu_gather { | 11 | struct mmu_gather { |
12 | struct mm_struct *mm; | 12 | struct mm_struct *mm; |
13 | unsigned int freed; | 13 | unsigned int need_flush; |
14 | unsigned int fullmm; | 14 | unsigned int fullmm; |
15 | |||
16 | unsigned int flushes; | ||
17 | unsigned int avoided_flushes; | ||
18 | }; | 15 | }; |
19 | 16 | ||
20 | extern struct mmu_gather mmu_gathers[NR_CPUS]; | 17 | DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); |
21 | 18 | ||
22 | static inline struct mmu_gather * | 19 | static inline struct mmu_gather * |
23 | tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) | 20 | tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) |
24 | { | 21 | { |
25 | int cpu = smp_processor_id(); | 22 | struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); |
26 | struct mmu_gather *tlb = &mmu_gathers[cpu]; | ||
27 | 23 | ||
28 | tlb->mm = mm; | 24 | tlb->mm = mm; |
29 | tlb->freed = 0; | 25 | tlb->need_flush = 0; |
30 | tlb->fullmm = full_mm_flush; | 26 | tlb->fullmm = full_mm_flush; |
31 | 27 | ||
32 | return tlb; | 28 | return tlb; |
33 | } | 29 | } |
@@ -35,30 +31,13 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) | |||
35 | static inline void | 31 | static inline void |
36 | tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) | 32 | tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) |
37 | { | 33 | { |
38 | struct mm_struct *mm = tlb->mm; | 34 | if (tlb->need_flush) |
39 | unsigned long freed = tlb->freed; | 35 | flush_tlb_mm(tlb->mm); |
40 | int rss = get_mm_counter(mm, rss); | ||
41 | |||
42 | if (rss < freed) | ||
43 | freed = rss; | ||
44 | add_mm_counter(mm, rss, -freed); | ||
45 | |||
46 | if (freed) { | ||
47 | flush_tlb_mm(mm); | ||
48 | tlb->flushes++; | ||
49 | } else { | ||
50 | tlb->avoided_flushes++; | ||
51 | } | ||
52 | 36 | ||
53 | /* keep the page table cache within bounds */ | 37 | /* keep the page table cache within bounds */ |
54 | check_pgt_cache(); | 38 | check_pgt_cache(); |
55 | } | ||
56 | |||
57 | 39 | ||
58 | static inline unsigned int | 40 | put_cpu_var(mmu_gathers); |
59 | tlb_is_full_mm(struct mmu_gather *tlb) | ||
60 | { | ||
61 | return tlb->fullmm; | ||
62 | } | 41 | } |
63 | 42 | ||
64 | #define tlb_remove_tlb_entry(tlb,ptep,address) do { } while (0) | 43 | #define tlb_remove_tlb_entry(tlb,ptep,address) do { } while (0) |
@@ -71,7 +50,13 @@ tlb_is_full_mm(struct mmu_gather *tlb) | |||
71 | } while (0) | 50 | } while (0) |
72 | #define tlb_end_vma(tlb,vma) do { } while (0) | 51 | #define tlb_end_vma(tlb,vma) do { } while (0) |
73 | 52 | ||
74 | #define tlb_remove_page(tlb,page) free_page_and_swap_cache(page) | 53 | static inline void |
54 | tlb_remove_page(struct mmu_gather *tlb, struct page *page) | ||
55 | { | ||
56 | tlb->need_flush = 1; | ||
57 | free_page_and_swap_cache(page); | ||
58 | } | ||
59 | |||
75 | #define pte_free_tlb(tlb,ptep) pte_free(ptep) | 60 | #define pte_free_tlb(tlb,ptep) pte_free(ptep) |
76 | #define pmd_free_tlb(tlb,pmdp) pmd_free(pmdp) | 61 | #define pmd_free_tlb(tlb,pmdp) pmd_free(pmdp) |
77 | 62 | ||
diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h index c20ec257ecc0..68c6fea994d9 100644 --- a/include/asm-generic/4level-fixup.h +++ b/include/asm-generic/4level-fixup.h | |||
@@ -10,14 +10,9 @@ | |||
10 | 10 | ||
11 | #define pud_t pgd_t | 11 | #define pud_t pgd_t |
12 | 12 | ||
13 | #define pmd_alloc(mm, pud, address) \ | 13 | #define pmd_alloc(mm, pud, address) \ |
14 | ({ pmd_t *ret; \ | 14 | ((unlikely(pgd_none(*(pud))) && __pmd_alloc(mm, pud, address))? \ |
15 | if (pgd_none(*pud)) \ | 15 | NULL: pmd_offset(pud, address)) |
16 | ret = __pmd_alloc(mm, pud, address); \ | ||
17 | else \ | ||
18 | ret = pmd_offset(pud, address); \ | ||
19 | ret; \ | ||
20 | }) | ||
21 | 16 | ||
22 | #define pud_alloc(mm, pgd, address) (pgd) | 17 | #define pud_alloc(mm, pgd, address) (pgd) |
23 | #define pud_offset(pgd, start) (pgd) | 18 | #define pud_offset(pgd, start) (pgd) |
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index ff28c8b31f58..7dca30a26c53 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h | |||
@@ -8,7 +8,7 @@ | |||
8 | * - update the page tables | 8 | * - update the page tables |
9 | * - inform the TLB about the new one | 9 | * - inform the TLB about the new one |
10 | * | 10 | * |
11 | * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock. | 11 | * We hold the mm semaphore for reading, and the pte lock. |
12 | * | 12 | * |
13 | * Note: the old pte is known to not be writable, so we don't need to | 13 | * Note: the old pte is known to not be writable, so we don't need to |
14 | * worry about dirty bits etc getting lost. | 14 | * worry about dirty bits etc getting lost. |
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 7d0298347ee7..cdd4145243cd 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h | |||
@@ -35,16 +35,13 @@ | |||
35 | #endif | 35 | #endif |
36 | 36 | ||
37 | /* struct mmu_gather is an opaque type used by the mm code for passing around | 37 | /* struct mmu_gather is an opaque type used by the mm code for passing around |
38 | * any data needed by arch specific code for tlb_remove_page. This structure | 38 | * any data needed by arch specific code for tlb_remove_page. |
39 | * can be per-CPU or per-MM as the page table lock is held for the duration of | ||
40 | * TLB shootdown. | ||
41 | */ | 39 | */ |
42 | struct mmu_gather { | 40 | struct mmu_gather { |
43 | struct mm_struct *mm; | 41 | struct mm_struct *mm; |
44 | unsigned int nr; /* set to ~0U means fast mode */ | 42 | unsigned int nr; /* set to ~0U means fast mode */ |
45 | unsigned int need_flush;/* Really unmapped some ptes? */ | 43 | unsigned int need_flush;/* Really unmapped some ptes? */ |
46 | unsigned int fullmm; /* non-zero means full mm flush */ | 44 | unsigned int fullmm; /* non-zero means full mm flush */ |
47 | unsigned long freed; | ||
48 | struct page * pages[FREE_PTE_NR]; | 45 | struct page * pages[FREE_PTE_NR]; |
49 | }; | 46 | }; |
50 | 47 | ||
@@ -57,7 +54,7 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); | |||
57 | static inline struct mmu_gather * | 54 | static inline struct mmu_gather * |
58 | tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) | 55 | tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) |
59 | { | 56 | { |
60 | struct mmu_gather *tlb = &per_cpu(mmu_gathers, smp_processor_id()); | 57 | struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); |
61 | 58 | ||
62 | tlb->mm = mm; | 59 | tlb->mm = mm; |
63 | 60 | ||
@@ -65,7 +62,6 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) | |||
65 | tlb->nr = num_online_cpus() > 1 ? 0U : ~0U; | 62 | tlb->nr = num_online_cpus() > 1 ? 0U : ~0U; |
66 | 63 | ||
67 | tlb->fullmm = full_mm_flush; | 64 | tlb->fullmm = full_mm_flush; |
68 | tlb->freed = 0; | ||
69 | 65 | ||
70 | return tlb; | 66 | return tlb; |
71 | } | 67 | } |
@@ -85,28 +81,17 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) | |||
85 | 81 | ||
86 | /* tlb_finish_mmu | 82 | /* tlb_finish_mmu |
87 | * Called at the end of the shootdown operation to free up any resources | 83 | * Called at the end of the shootdown operation to free up any resources |
88 | * that were required. The page table lock is still held at this point. | 84 | * that were required. |
89 | */ | 85 | */ |
90 | static inline void | 86 | static inline void |
91 | tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) | 87 | tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) |
92 | { | 88 | { |
93 | int freed = tlb->freed; | ||
94 | struct mm_struct *mm = tlb->mm; | ||
95 | int rss = get_mm_counter(mm, rss); | ||
96 | |||
97 | if (rss < freed) | ||
98 | freed = rss; | ||
99 | add_mm_counter(mm, rss, -freed); | ||
100 | tlb_flush_mmu(tlb, start, end); | 89 | tlb_flush_mmu(tlb, start, end); |
101 | 90 | ||
102 | /* keep the page table cache within bounds */ | 91 | /* keep the page table cache within bounds */ |
103 | check_pgt_cache(); | 92 | check_pgt_cache(); |
104 | } | ||
105 | 93 | ||
106 | static inline unsigned int | 94 | put_cpu_var(mmu_gathers); |
107 | tlb_is_full_mm(struct mmu_gather *tlb) | ||
108 | { | ||
109 | return tlb->fullmm; | ||
110 | } | 95 | } |
111 | 96 | ||
112 | /* tlb_remove_page | 97 | /* tlb_remove_page |
diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h index 348fe3a4879d..620a90641ea8 100644 --- a/include/asm-i386/mmzone.h +++ b/include/asm-i386/mmzone.h | |||
@@ -88,12 +88,6 @@ static inline int pfn_to_nid(unsigned long pfn) | |||
88 | __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ | 88 | __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ |
89 | }) | 89 | }) |
90 | 90 | ||
91 | #define local_mapnr(kvaddr) \ | ||
92 | ({ \ | ||
93 | unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ | ||
94 | (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ | ||
95 | }) | ||
96 | |||
97 | /* XXX: FIXME -- wli */ | 91 | /* XXX: FIXME -- wli */ |
98 | #define kern_addr_valid(kaddr) (0) | 92 | #define kern_addr_valid(kaddr) (0) |
99 | 93 | ||
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index d101ac414f07..0e3ec809352d 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h | |||
@@ -203,7 +203,8 @@ extern unsigned long pg0[]; | |||
203 | #define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) | 203 | #define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) |
204 | #define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) | 204 | #define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) |
205 | 205 | ||
206 | #define pmd_none(x) (!pmd_val(x)) | 206 | /* To avoid harmful races, pmd_none(x) should check only the lower when PAE */ |
207 | #define pmd_none(x) (!(unsigned long)pmd_val(x)) | ||
207 | #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) | 208 | #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) |
208 | #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) | 209 | #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) |
209 | #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) | 210 | #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) |
diff --git a/include/asm-i386/rwsem.h b/include/asm-i386/rwsem.h index 7625a675852f..be4ab859238e 100644 --- a/include/asm-i386/rwsem.h +++ b/include/asm-i386/rwsem.h | |||
@@ -284,5 +284,10 @@ LOCK_PREFIX "xadd %0,(%2)" | |||
284 | return tmp+delta; | 284 | return tmp+delta; |
285 | } | 285 | } |
286 | 286 | ||
287 | static inline int rwsem_is_locked(struct rw_semaphore *sem) | ||
288 | { | ||
289 | return (sem->count != 0); | ||
290 | } | ||
291 | |||
287 | #endif /* __KERNEL__ */ | 292 | #endif /* __KERNEL__ */ |
288 | #endif /* _I386_RWSEM_H */ | 293 | #endif /* _I386_RWSEM_H */ |
diff --git a/include/asm-ia64/rwsem.h b/include/asm-ia64/rwsem.h index e18b5ab0cb75..1327c91ea39c 100644 --- a/include/asm-ia64/rwsem.h +++ b/include/asm-ia64/rwsem.h | |||
@@ -186,4 +186,9 @@ __downgrade_write (struct rw_semaphore *sem) | |||
186 | #define rwsem_atomic_add(delta, sem) atomic64_add(delta, (atomic64_t *)(&(sem)->count)) | 186 | #define rwsem_atomic_add(delta, sem) atomic64_add(delta, (atomic64_t *)(&(sem)->count)) |
187 | #define rwsem_atomic_update(delta, sem) atomic64_add_return(delta, (atomic64_t *)(&(sem)->count)) | 187 | #define rwsem_atomic_update(delta, sem) atomic64_add_return(delta, (atomic64_t *)(&(sem)->count)) |
188 | 188 | ||
189 | static inline int rwsem_is_locked(struct rw_semaphore *sem) | ||
190 | { | ||
191 | return (sem->count != 0); | ||
192 | } | ||
193 | |||
189 | #endif /* _ASM_IA64_RWSEM_H */ | 194 | #endif /* _ASM_IA64_RWSEM_H */ |
diff --git a/include/asm-ia64/tlb.h b/include/asm-ia64/tlb.h index 3a9a6d1be75c..834370b9dea1 100644 --- a/include/asm-ia64/tlb.h +++ b/include/asm-ia64/tlb.h | |||
@@ -60,7 +60,6 @@ struct mmu_gather { | |||
60 | unsigned int nr; /* == ~0U => fast mode */ | 60 | unsigned int nr; /* == ~0U => fast mode */ |
61 | unsigned char fullmm; /* non-zero means full mm flush */ | 61 | unsigned char fullmm; /* non-zero means full mm flush */ |
62 | unsigned char need_flush; /* really unmapped some PTEs? */ | 62 | unsigned char need_flush; /* really unmapped some PTEs? */ |
63 | unsigned long freed; /* number of pages freed */ | ||
64 | unsigned long start_addr; | 63 | unsigned long start_addr; |
65 | unsigned long end_addr; | 64 | unsigned long end_addr; |
66 | struct page *pages[FREE_PTE_NR]; | 65 | struct page *pages[FREE_PTE_NR]; |
@@ -129,7 +128,7 @@ ia64_tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long e | |||
129 | static inline struct mmu_gather * | 128 | static inline struct mmu_gather * |
130 | tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush) | 129 | tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush) |
131 | { | 130 | { |
132 | struct mmu_gather *tlb = &__get_cpu_var(mmu_gathers); | 131 | struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); |
133 | 132 | ||
134 | tlb->mm = mm; | 133 | tlb->mm = mm; |
135 | /* | 134 | /* |
@@ -147,25 +146,17 @@ tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush) | |||
147 | */ | 146 | */ |
148 | tlb->nr = (num_online_cpus() == 1) ? ~0U : 0; | 147 | tlb->nr = (num_online_cpus() == 1) ? ~0U : 0; |
149 | tlb->fullmm = full_mm_flush; | 148 | tlb->fullmm = full_mm_flush; |
150 | tlb->freed = 0; | ||
151 | tlb->start_addr = ~0UL; | 149 | tlb->start_addr = ~0UL; |
152 | return tlb; | 150 | return tlb; |
153 | } | 151 | } |
154 | 152 | ||
155 | /* | 153 | /* |
156 | * Called at the end of the shootdown operation to free up any resources that were | 154 | * Called at the end of the shootdown operation to free up any resources that were |
157 | * collected. The page table lock is still held at this point. | 155 | * collected. |
158 | */ | 156 | */ |
159 | static inline void | 157 | static inline void |
160 | tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end) | 158 | tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end) |
161 | { | 159 | { |
162 | unsigned long freed = tlb->freed; | ||
163 | struct mm_struct *mm = tlb->mm; | ||
164 | unsigned long rss = get_mm_counter(mm, rss); | ||
165 | |||
166 | if (rss < freed) | ||
167 | freed = rss; | ||
168 | add_mm_counter(mm, rss, -freed); | ||
169 | /* | 160 | /* |
170 | * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and | 161 | * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and |
171 | * tlb->end_addr. | 162 | * tlb->end_addr. |
@@ -174,12 +165,8 @@ tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end) | |||
174 | 165 | ||
175 | /* keep the page table cache within bounds */ | 166 | /* keep the page table cache within bounds */ |
176 | check_pgt_cache(); | 167 | check_pgt_cache(); |
177 | } | ||
178 | 168 | ||
179 | static inline unsigned int | 169 | put_cpu_var(mmu_gathers); |
180 | tlb_is_full_mm(struct mmu_gather *tlb) | ||
181 | { | ||
182 | return tlb->fullmm; | ||
183 | } | 170 | } |
184 | 171 | ||
185 | /* | 172 | /* |
diff --git a/include/asm-m32r/mmzone.h b/include/asm-m32r/mmzone.h index d58878ec899e..adc7970a77ec 100644 --- a/include/asm-m32r/mmzone.h +++ b/include/asm-m32r/mmzone.h | |||
@@ -21,12 +21,6 @@ extern struct pglist_data *node_data[]; | |||
21 | __pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1; \ | 21 | __pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1; \ |
22 | }) | 22 | }) |
23 | 23 | ||
24 | #define local_mapnr(kvaddr) \ | ||
25 | ({ \ | ||
26 | unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ | ||
27 | (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ | ||
28 | }) | ||
29 | |||
30 | #define pfn_to_page(pfn) \ | 24 | #define pfn_to_page(pfn) \ |
31 | ({ \ | 25 | ({ \ |
32 | unsigned long __pfn = pfn; \ | 26 | unsigned long __pfn = pfn; \ |
diff --git a/include/asm-parisc/cacheflush.h b/include/asm-parisc/cacheflush.h index aa592d8c0e39..1bc3c83ee74b 100644 --- a/include/asm-parisc/cacheflush.h +++ b/include/asm-parisc/cacheflush.h | |||
@@ -100,30 +100,34 @@ static inline void flush_cache_range(struct vm_area_struct *vma, | |||
100 | 100 | ||
101 | /* Simple function to work out if we have an existing address translation | 101 | /* Simple function to work out if we have an existing address translation |
102 | * for a user space vma. */ | 102 | * for a user space vma. */ |
103 | static inline pte_t *__translation_exists(struct mm_struct *mm, | 103 | static inline int translation_exists(struct vm_area_struct *vma, |
104 | unsigned long addr) | 104 | unsigned long addr, unsigned long pfn) |
105 | { | 105 | { |
106 | pgd_t *pgd = pgd_offset(mm, addr); | 106 | pgd_t *pgd = pgd_offset(vma->vm_mm, addr); |
107 | pmd_t *pmd; | 107 | pmd_t *pmd; |
108 | pte_t *pte; | 108 | pte_t pte; |
109 | 109 | ||
110 | if(pgd_none(*pgd)) | 110 | if(pgd_none(*pgd)) |
111 | return NULL; | 111 | return 0; |
112 | 112 | ||
113 | pmd = pmd_offset(pgd, addr); | 113 | pmd = pmd_offset(pgd, addr); |
114 | if(pmd_none(*pmd) || pmd_bad(*pmd)) | 114 | if(pmd_none(*pmd) || pmd_bad(*pmd)) |
115 | return NULL; | 115 | return 0; |
116 | 116 | ||
117 | pte = pte_offset_map(pmd, addr); | 117 | /* We cannot take the pte lock here: flush_cache_page is usually |
118 | * called with pte lock already held. Whereas flush_dcache_page | ||
119 | * takes flush_dcache_mmap_lock, which is lower in the hierarchy: | ||
120 | * the vma itself is secure, but the pte might come or go racily. | ||
121 | */ | ||
122 | pte = *pte_offset_map(pmd, addr); | ||
123 | /* But pte_unmap() does nothing on this architecture */ | ||
118 | 124 | ||
119 | /* The PA flush mappings show up as pte_none, but they're | 125 | /* Filter out coincidental file entries and swap entries */ |
120 | * valid none the less */ | 126 | if (!(pte_val(pte) & (_PAGE_FLUSH|_PAGE_PRESENT))) |
121 | if(pte_none(*pte) && ((pte_val(*pte) & _PAGE_FLUSH) == 0)) | 127 | return 0; |
122 | return NULL; | ||
123 | return pte; | ||
124 | } | ||
125 | #define translation_exists(vma, addr) __translation_exists((vma)->vm_mm, addr) | ||
126 | 128 | ||
129 | return pte_pfn(pte) == pfn; | ||
130 | } | ||
127 | 131 | ||
128 | /* Private function to flush a page from the cache of a non-current | 132 | /* Private function to flush a page from the cache of a non-current |
129 | * process. cr25 contains the Page Directory of the current user | 133 | * process. cr25 contains the Page Directory of the current user |
@@ -175,9 +179,8 @@ flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long | |||
175 | { | 179 | { |
176 | BUG_ON(!vma->vm_mm->context); | 180 | BUG_ON(!vma->vm_mm->context); |
177 | 181 | ||
178 | if(likely(translation_exists(vma, vmaddr))) | 182 | if (likely(translation_exists(vma, vmaddr, pfn))) |
179 | __flush_cache_page(vma, vmaddr); | 183 | __flush_cache_page(vma, vmaddr); |
180 | 184 | ||
181 | } | 185 | } |
182 | #endif | 186 | #endif |
183 | |||
diff --git a/include/asm-parisc/mmzone.h b/include/asm-parisc/mmzone.h index 595d3dce120a..ae039f4fd711 100644 --- a/include/asm-parisc/mmzone.h +++ b/include/asm-parisc/mmzone.h | |||
@@ -27,12 +27,6 @@ extern struct node_map_data node_data[]; | |||
27 | }) | 27 | }) |
28 | #define node_localnr(pfn, nid) ((pfn) - node_start_pfn(nid)) | 28 | #define node_localnr(pfn, nid) ((pfn) - node_start_pfn(nid)) |
29 | 29 | ||
30 | #define local_mapnr(kvaddr) \ | ||
31 | ({ \ | ||
32 | unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ | ||
33 | (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ | ||
34 | }) | ||
35 | |||
36 | #define pfn_to_page(pfn) \ | 30 | #define pfn_to_page(pfn) \ |
37 | ({ \ | 31 | ({ \ |
38 | unsigned long __pfn = (pfn); \ | 32 | unsigned long __pfn = (pfn); \ |
diff --git a/include/asm-parisc/tlbflush.h b/include/asm-parisc/tlbflush.h index 84af4ab1fe51..e97aa8d1eff5 100644 --- a/include/asm-parisc/tlbflush.h +++ b/include/asm-parisc/tlbflush.h | |||
@@ -88,7 +88,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, | |||
88 | if (npages >= 512) /* 2MB of space: arbitrary, should be tuned */ | 88 | if (npages >= 512) /* 2MB of space: arbitrary, should be tuned */ |
89 | flush_tlb_all(); | 89 | flush_tlb_all(); |
90 | else { | 90 | else { |
91 | 91 | preempt_disable(); | |
92 | mtsp(vma->vm_mm->context,1); | 92 | mtsp(vma->vm_mm->context,1); |
93 | purge_tlb_start(); | 93 | purge_tlb_start(); |
94 | if (split_tlb) { | 94 | if (split_tlb) { |
@@ -102,6 +102,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, | |||
102 | pdtlb(start); | 102 | pdtlb(start); |
103 | start += PAGE_SIZE; | 103 | start += PAGE_SIZE; |
104 | } | 104 | } |
105 | preempt_enable(); | ||
105 | } | 106 | } |
106 | purge_tlb_end(); | 107 | purge_tlb_end(); |
107 | } | 108 | } |
diff --git a/include/asm-ppc/rwsem.h b/include/asm-ppc/rwsem.h index 3e738f483c11..3501ea72f88c 100644 --- a/include/asm-ppc/rwsem.h +++ b/include/asm-ppc/rwsem.h | |||
@@ -168,5 +168,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) | |||
168 | return atomic_add_return(delta, (atomic_t *)(&sem->count)); | 168 | return atomic_add_return(delta, (atomic_t *)(&sem->count)); |
169 | } | 169 | } |
170 | 170 | ||
171 | static inline int rwsem_is_locked(struct rw_semaphore *sem) | ||
172 | { | ||
173 | return (sem->count != 0); | ||
174 | } | ||
175 | |||
171 | #endif /* __KERNEL__ */ | 176 | #endif /* __KERNEL__ */ |
172 | #endif /* _PPC_RWSEM_XADD_H */ | 177 | #endif /* _PPC_RWSEM_XADD_H */ |
diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h index ed473f4b0152..80a708e7093a 100644 --- a/include/asm-ppc64/mmzone.h +++ b/include/asm-ppc64/mmzone.h | |||
@@ -67,9 +67,6 @@ static inline int pa_to_nid(unsigned long pa) | |||
67 | #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) | 67 | #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) |
68 | #define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) | 68 | #define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) |
69 | 69 | ||
70 | #define local_mapnr(kvaddr) \ | ||
71 | ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) | ||
72 | |||
73 | #ifdef CONFIG_DISCONTIGMEM | 70 | #ifdef CONFIG_DISCONTIGMEM |
74 | 71 | ||
75 | /* | 72 | /* |
diff --git a/include/asm-ppc64/pgtable.h b/include/asm-ppc64/pgtable.h index c83679c9d2b0..2eb1778a3a15 100644 --- a/include/asm-ppc64/pgtable.h +++ b/include/asm-ppc64/pgtable.h | |||
@@ -478,10 +478,12 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, | |||
478 | #define __HAVE_ARCH_PTE_SAME | 478 | #define __HAVE_ARCH_PTE_SAME |
479 | #define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) | 479 | #define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) |
480 | 480 | ||
481 | #define pte_ERROR(e) \ | ||
482 | printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e)) | ||
481 | #define pmd_ERROR(e) \ | 483 | #define pmd_ERROR(e) \ |
482 | printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) | 484 | printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) |
483 | #define pud_ERROR(e) \ | 485 | #define pud_ERROR(e) \ |
484 | printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pud_val(e)) | 486 | printk("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e)) |
485 | #define pgd_ERROR(e) \ | 487 | #define pgd_ERROR(e) \ |
486 | printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) | 488 | printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) |
487 | 489 | ||
diff --git a/include/asm-ppc64/rwsem.h b/include/asm-ppc64/rwsem.h index bd5c2f093575..7a647fae3765 100644 --- a/include/asm-ppc64/rwsem.h +++ b/include/asm-ppc64/rwsem.h | |||
@@ -163,5 +163,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) | |||
163 | return atomic_add_return(delta, (atomic_t *)(&sem->count)); | 163 | return atomic_add_return(delta, (atomic_t *)(&sem->count)); |
164 | } | 164 | } |
165 | 165 | ||
166 | static inline int rwsem_is_locked(struct rw_semaphore *sem) | ||
167 | { | ||
168 | return (sem->count != 0); | ||
169 | } | ||
170 | |||
166 | #endif /* __KERNEL__ */ | 171 | #endif /* __KERNEL__ */ |
167 | #endif /* _PPC_RWSEM_XADD_H */ | 172 | #endif /* _PPC_RWSEM_XADD_H */ |
diff --git a/include/asm-s390/rwsem.h b/include/asm-s390/rwsem.h index 8c0cebbfc034..0422a085dd56 100644 --- a/include/asm-s390/rwsem.h +++ b/include/asm-s390/rwsem.h | |||
@@ -351,5 +351,10 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) | |||
351 | return new; | 351 | return new; |
352 | } | 352 | } |
353 | 353 | ||
354 | static inline int rwsem_is_locked(struct rw_semaphore *sem) | ||
355 | { | ||
356 | return (sem->count != 0); | ||
357 | } | ||
358 | |||
354 | #endif /* __KERNEL__ */ | 359 | #endif /* __KERNEL__ */ |
355 | #endif /* _S390_RWSEM_H */ | 360 | #endif /* _S390_RWSEM_H */ |
diff --git a/include/asm-sh/rwsem.h b/include/asm-sh/rwsem.h index 1be4337f5259..0262d3d1e5e0 100644 --- a/include/asm-sh/rwsem.h +++ b/include/asm-sh/rwsem.h | |||
@@ -166,5 +166,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) | |||
166 | return atomic_add_return(delta, (atomic_t *)(&sem->count)); | 166 | return atomic_add_return(delta, (atomic_t *)(&sem->count)); |
167 | } | 167 | } |
168 | 168 | ||
169 | static inline int rwsem_is_locked(struct rw_semaphore *sem) | ||
170 | { | ||
171 | return (sem->count != 0); | ||
172 | } | ||
173 | |||
169 | #endif /* __KERNEL__ */ | 174 | #endif /* __KERNEL__ */ |
170 | #endif /* _ASM_SH_RWSEM_H */ | 175 | #endif /* _ASM_SH_RWSEM_H */ |
diff --git a/include/asm-sparc64/rwsem.h b/include/asm-sparc64/rwsem.h index 4568ee4022df..cef5e8270421 100644 --- a/include/asm-sparc64/rwsem.h +++ b/include/asm-sparc64/rwsem.h | |||
@@ -56,6 +56,11 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) | |||
56 | atomic_add(delta, (atomic_t *)(&sem->count)); | 56 | atomic_add(delta, (atomic_t *)(&sem->count)); |
57 | } | 57 | } |
58 | 58 | ||
59 | static inline int rwsem_is_locked(struct rw_semaphore *sem) | ||
60 | { | ||
61 | return (sem->count != 0); | ||
62 | } | ||
63 | |||
59 | #endif /* __KERNEL__ */ | 64 | #endif /* __KERNEL__ */ |
60 | 65 | ||
61 | #endif /* _SPARC64_RWSEM_H */ | 66 | #endif /* _SPARC64_RWSEM_H */ |
diff --git a/include/asm-sparc64/tlb.h b/include/asm-sparc64/tlb.h index 9baf57db01d2..66138d959df5 100644 --- a/include/asm-sparc64/tlb.h +++ b/include/asm-sparc64/tlb.h | |||
@@ -25,9 +25,8 @@ struct mmu_gather { | |||
25 | struct mm_struct *mm; | 25 | struct mm_struct *mm; |
26 | unsigned int pages_nr; | 26 | unsigned int pages_nr; |
27 | unsigned int need_flush; | 27 | unsigned int need_flush; |
28 | unsigned int tlb_frozen; | 28 | unsigned int fullmm; |
29 | unsigned int tlb_nr; | 29 | unsigned int tlb_nr; |
30 | unsigned long freed; | ||
31 | unsigned long vaddrs[TLB_BATCH_NR]; | 30 | unsigned long vaddrs[TLB_BATCH_NR]; |
32 | struct page *pages[FREE_PTE_NR]; | 31 | struct page *pages[FREE_PTE_NR]; |
33 | }; | 32 | }; |
@@ -44,14 +43,13 @@ extern void flush_tlb_pending(void); | |||
44 | 43 | ||
45 | static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) | 44 | static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) |
46 | { | 45 | { |
47 | struct mmu_gather *mp = &__get_cpu_var(mmu_gathers); | 46 | struct mmu_gather *mp = &get_cpu_var(mmu_gathers); |
48 | 47 | ||
49 | BUG_ON(mp->tlb_nr); | 48 | BUG_ON(mp->tlb_nr); |
50 | 49 | ||
51 | mp->mm = mm; | 50 | mp->mm = mm; |
52 | mp->pages_nr = num_online_cpus() > 1 ? 0U : ~0U; | 51 | mp->pages_nr = num_online_cpus() > 1 ? 0U : ~0U; |
53 | mp->tlb_frozen = full_mm_flush; | 52 | mp->fullmm = full_mm_flush; |
54 | mp->freed = 0; | ||
55 | 53 | ||
56 | return mp; | 54 | return mp; |
57 | } | 55 | } |
@@ -78,30 +76,19 @@ extern void smp_flush_tlb_mm(struct mm_struct *mm); | |||
78 | 76 | ||
79 | static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, unsigned long end) | 77 | static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, unsigned long end) |
80 | { | 78 | { |
81 | unsigned long freed = mp->freed; | ||
82 | struct mm_struct *mm = mp->mm; | ||
83 | unsigned long rss = get_mm_counter(mm, rss); | ||
84 | |||
85 | if (rss < freed) | ||
86 | freed = rss; | ||
87 | add_mm_counter(mm, rss, -freed); | ||
88 | |||
89 | tlb_flush_mmu(mp); | 79 | tlb_flush_mmu(mp); |
90 | 80 | ||
91 | if (mp->tlb_frozen) { | 81 | if (mp->fullmm) { |
92 | if (CTX_VALID(mm->context)) | 82 | if (CTX_VALID(mp->mm->context)) |
93 | do_flush_tlb_mm(mm); | 83 | do_flush_tlb_mm(mp->mm); |
94 | mp->tlb_frozen = 0; | 84 | mp->fullmm = 0; |
95 | } else | 85 | } else |
96 | flush_tlb_pending(); | 86 | flush_tlb_pending(); |
97 | 87 | ||
98 | /* keep the page table cache within bounds */ | 88 | /* keep the page table cache within bounds */ |
99 | check_pgt_cache(); | 89 | check_pgt_cache(); |
100 | } | ||
101 | 90 | ||
102 | static inline unsigned int tlb_is_full_mm(struct mmu_gather *mp) | 91 | put_cpu_var(mmu_gathers); |
103 | { | ||
104 | return mp->tlb_frozen; | ||
105 | } | 92 | } |
106 | 93 | ||
107 | static inline void tlb_remove_page(struct mmu_gather *mp, struct page *page) | 94 | static inline void tlb_remove_page(struct mmu_gather *mp, struct page *page) |
diff --git a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h index 616d02b57ea9..ac64eb955868 100644 --- a/include/asm-um/pgtable.h +++ b/include/asm-um/pgtable.h | |||
@@ -138,7 +138,7 @@ extern unsigned long pg0[1024]; | |||
138 | 138 | ||
139 | #define pte_clear(mm,addr,xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEWPAGE)) | 139 | #define pte_clear(mm,addr,xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEWPAGE)) |
140 | 140 | ||
141 | #define pmd_none(x) (!(pmd_val(x) & ~_PAGE_NEWPAGE)) | 141 | #define pmd_none(x) (!((unsigned long)pmd_val(x) & ~_PAGE_NEWPAGE)) |
142 | #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) | 142 | #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) |
143 | #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) | 143 | #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) |
144 | #define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0) | 144 | #define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0) |
diff --git a/include/asm-x86_64/rwsem.h b/include/asm-x86_64/rwsem.h index c002175b6e82..46077e9c1910 100644 --- a/include/asm-x86_64/rwsem.h +++ b/include/asm-x86_64/rwsem.h | |||
@@ -274,5 +274,10 @@ LOCK_PREFIX "xaddl %0,(%2)" | |||
274 | return tmp+delta; | 274 | return tmp+delta; |
275 | } | 275 | } |
276 | 276 | ||
277 | static inline int rwsem_is_locked(struct rw_semaphore *sem) | ||
278 | { | ||
279 | return (sem->count != 0); | ||
280 | } | ||
281 | |||
277 | #endif /* __KERNEL__ */ | 282 | #endif /* __KERNEL__ */ |
278 | #endif /* _X8664_RWSEM_H */ | 283 | #endif /* _X8664_RWSEM_H */ |
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 88af42f5e04a..c937d6e65502 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h | |||
@@ -126,8 +126,8 @@ BUFFER_FNS(Eopnotsupp, eopnotsupp) | |||
126 | /* If we *know* page->private refers to buffer_heads */ | 126 | /* If we *know* page->private refers to buffer_heads */ |
127 | #define page_buffers(page) \ | 127 | #define page_buffers(page) \ |
128 | ({ \ | 128 | ({ \ |
129 | BUG_ON(!PagePrivate(page)); \ | 129 | BUG_ON(!PagePrivate(page)); \ |
130 | ((struct buffer_head *)(page)->private); \ | 130 | ((struct buffer_head *)page_private(page)); \ |
131 | }) | 131 | }) |
132 | #define page_has_buffers(page) PagePrivate(page) | 132 | #define page_has_buffers(page) PagePrivate(page) |
133 | 133 | ||
@@ -219,7 +219,7 @@ static inline void attach_page_buffers(struct page *page, | |||
219 | { | 219 | { |
220 | page_cache_get(page); | 220 | page_cache_get(page); |
221 | SetPagePrivate(page); | 221 | SetPagePrivate(page); |
222 | page->private = (unsigned long)head; | 222 | set_page_private(page, (unsigned long)head); |
223 | } | 223 | } |
224 | 224 | ||
225 | static inline void get_bh(struct buffer_head *bh) | 225 | static inline void get_bh(struct buffer_head *bh) |
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d664330d900e..0cea162b08c0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
@@ -16,7 +16,6 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) | |||
16 | int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); | 16 | int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); |
17 | int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); | 17 | int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); |
18 | int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int); | 18 | int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int); |
19 | void zap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long); | ||
20 | void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long); | 19 | void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long); |
21 | int hugetlb_prefault(struct address_space *, struct vm_area_struct *); | 20 | int hugetlb_prefault(struct address_space *, struct vm_area_struct *); |
22 | int hugetlb_report_meminfo(char *); | 21 | int hugetlb_report_meminfo(char *); |
@@ -87,7 +86,6 @@ static inline unsigned long hugetlb_total_pages(void) | |||
87 | #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) | 86 | #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) |
88 | #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) | 87 | #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) |
89 | #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) | 88 | #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) |
90 | #define zap_hugepage_range(vma, start, len) BUG() | ||
91 | #define unmap_hugepage_range(vma, start, end) BUG() | 89 | #define unmap_hugepage_range(vma, start, end) BUG() |
92 | #define is_hugepage_mem_enough(size) 0 | 90 | #define is_hugepage_mem_enough(size) 0 |
93 | #define hugetlb_report_meminfo(buf) 0 | 91 | #define hugetlb_report_meminfo(buf) 0 |
diff --git a/include/linux/memory.h b/include/linux/memory.h new file mode 100644 index 000000000000..0def328ab5cf --- /dev/null +++ b/include/linux/memory.h | |||
@@ -0,0 +1,94 @@ | |||
1 | /* | ||
2 | * include/linux/memory.h - generic memory definition | ||
3 | * | ||
4 | * This is mainly for topological representation. We define the | ||
5 | * basic "struct memory_block" here, which can be embedded in per-arch | ||
6 | * definitions or NUMA information. | ||
7 | * | ||
8 | * Basic handling of the devices is done in drivers/base/memory.c | ||
9 | * and system devices are handled in drivers/base/sys.c. | ||
10 | * | ||
11 | * Memory blocks are exported via sysfs in the class/memory/devices/ | ||
12 | * directory. | ||
13 | * | ||
14 | */ | ||
15 | #ifndef _LINUX_MEMORY_H_ | ||
16 | #define _LINUX_MEMORY_H_ | ||
17 | |||
18 | #include <linux/sysdev.h> | ||
19 | #include <linux/node.h> | ||
20 | #include <linux/compiler.h> | ||
21 | |||
22 | #include <asm/semaphore.h> | ||
23 | |||
24 | struct memory_block { | ||
25 | unsigned long phys_index; | ||
26 | unsigned long state; | ||
27 | /* | ||
28 | * This serializes all state change requests. It isn't | ||
29 | * held during creation because the control files are | ||
30 | * created long after the critical areas during | ||
31 | * initialization. | ||
32 | */ | ||
33 | struct semaphore state_sem; | ||
34 | int phys_device; /* to which fru does this belong? */ | ||
35 | void *hw; /* optional pointer to fw/hw data */ | ||
36 | int (*phys_callback)(struct memory_block *); | ||
37 | struct sys_device sysdev; | ||
38 | }; | ||
39 | |||
40 | /* These states are exposed to userspace as text strings in sysfs */ | ||
41 | #define MEM_ONLINE (1<<0) /* exposed to userspace */ | ||
42 | #define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */ | ||
43 | #define MEM_OFFLINE (1<<2) /* exposed to userspace */ | ||
44 | |||
45 | /* | ||
46 | * All of these states are currently kernel-internal for notifying | ||
47 | * kernel components and architectures. | ||
48 | * | ||
49 | * For MEM_MAPPING_INVALID, all notifier chains with priority >0 | ||
50 | * are called before pfn_to_page() becomes invalid. The priority=0 | ||
51 | * entry is reserved for the function that actually makes | ||
52 | * pfn_to_page() stop working. Any notifiers that want to be called | ||
53 | * after that should have priority <0. | ||
54 | */ | ||
55 | #define MEM_MAPPING_INVALID (1<<3) | ||
56 | |||
57 | #ifndef CONFIG_MEMORY_HOTPLUG | ||
58 | static inline int memory_dev_init(void) | ||
59 | { | ||
60 | return 0; | ||
61 | } | ||
62 | static inline int register_memory_notifier(struct notifier_block *nb) | ||
63 | { | ||
64 | return 0; | ||
65 | } | ||
66 | static inline void unregister_memory_notifier(struct notifier_block *nb) | ||
67 | { | ||
68 | } | ||
69 | #else | ||
70 | extern int register_memory(struct memory_block *, struct mem_section *section, struct node *); | ||
71 | extern int register_new_memory(struct mem_section *); | ||
72 | extern int unregister_memory_section(struct mem_section *); | ||
73 | extern int memory_dev_init(void); | ||
74 | extern int register_memory_notifier(struct notifier_block *nb); | ||
75 | extern void unregister_memory_notifier(struct notifier_block *nb); | ||
76 | |||
77 | #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT) | ||
78 | |||
79 | extern int invalidate_phys_mapping(unsigned long, unsigned long); | ||
80 | struct notifier_block; | ||
81 | |||
82 | extern int register_memory_notifier(struct notifier_block *nb); | ||
83 | extern void unregister_memory_notifier(struct notifier_block *nb); | ||
84 | |||
85 | extern struct sysdev_class memory_sysdev_class; | ||
86 | #endif /* CONFIG_MEMORY_HOTPLUG */ | ||
87 | |||
88 | #define hotplug_memory_notifier(fn, pri) { \ | ||
89 | static struct notifier_block fn##_mem_nb = \ | ||
90 | { .notifier_call = fn, .priority = pri }; \ | ||
91 | register_memory_notifier(&fn##_mem_nb); \ | ||
92 | } | ||
93 | |||
94 | #endif /* _LINUX_MEMORY_H_ */ | ||
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h new file mode 100644 index 000000000000..01f03bc06eff --- /dev/null +++ b/include/linux/memory_hotplug.h | |||
@@ -0,0 +1,104 @@ | |||
1 | #ifndef __LINUX_MEMORY_HOTPLUG_H | ||
2 | #define __LINUX_MEMORY_HOTPLUG_H | ||
3 | |||
4 | #include <linux/mmzone.h> | ||
5 | #include <linux/spinlock.h> | ||
6 | #include <linux/mmzone.h> | ||
7 | #include <linux/notifier.h> | ||
8 | |||
9 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
10 | /* | ||
11 | * pgdat resizing functions | ||
12 | */ | ||
13 | static inline | ||
14 | void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags) | ||
15 | { | ||
16 | spin_lock_irqsave(&pgdat->node_size_lock, *flags); | ||
17 | } | ||
18 | static inline | ||
19 | void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags) | ||
20 | { | ||
21 | spin_unlock_irqrestore(&pgdat->node_size_lock, *flags); | ||
22 | } | ||
23 | static inline | ||
24 | void pgdat_resize_init(struct pglist_data *pgdat) | ||
25 | { | ||
26 | spin_lock_init(&pgdat->node_size_lock); | ||
27 | } | ||
28 | /* | ||
29 | * Zone resizing functions | ||
30 | */ | ||
31 | static inline unsigned zone_span_seqbegin(struct zone *zone) | ||
32 | { | ||
33 | return read_seqbegin(&zone->span_seqlock); | ||
34 | } | ||
35 | static inline int zone_span_seqretry(struct zone *zone, unsigned iv) | ||
36 | { | ||
37 | return read_seqretry(&zone->span_seqlock, iv); | ||
38 | } | ||
39 | static inline void zone_span_writelock(struct zone *zone) | ||
40 | { | ||
41 | write_seqlock(&zone->span_seqlock); | ||
42 | } | ||
43 | static inline void zone_span_writeunlock(struct zone *zone) | ||
44 | { | ||
45 | write_sequnlock(&zone->span_seqlock); | ||
46 | } | ||
47 | static inline void zone_seqlock_init(struct zone *zone) | ||
48 | { | ||
49 | seqlock_init(&zone->span_seqlock); | ||
50 | } | ||
51 | extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); | ||
52 | extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); | ||
53 | extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); | ||
54 | /* need some defines for these for archs that don't support it */ | ||
55 | extern void online_page(struct page *page); | ||
56 | /* VM interface that may be used by firmware interface */ | ||
57 | extern int add_memory(u64 start, u64 size); | ||
58 | extern int remove_memory(u64 start, u64 size); | ||
59 | extern int online_pages(unsigned long, unsigned long); | ||
60 | |||
61 | /* reasonably generic interface to expand the physical pages in a zone */ | ||
62 | extern int __add_pages(struct zone *zone, unsigned long start_pfn, | ||
63 | unsigned long nr_pages); | ||
64 | #else /* ! CONFIG_MEMORY_HOTPLUG */ | ||
65 | /* | ||
66 | * Stub functions for when hotplug is off | ||
67 | */ | ||
68 | static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {} | ||
69 | static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {} | ||
70 | static inline void pgdat_resize_init(struct pglist_data *pgdat) {} | ||
71 | |||
72 | static inline unsigned zone_span_seqbegin(struct zone *zone) | ||
73 | { | ||
74 | return 0; | ||
75 | } | ||
76 | static inline int zone_span_seqretry(struct zone *zone, unsigned iv) | ||
77 | { | ||
78 | return 0; | ||
79 | } | ||
80 | static inline void zone_span_writelock(struct zone *zone) {} | ||
81 | static inline void zone_span_writeunlock(struct zone *zone) {} | ||
82 | static inline void zone_seqlock_init(struct zone *zone) {} | ||
83 | |||
84 | static inline int mhp_notimplemented(const char *func) | ||
85 | { | ||
86 | printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func); | ||
87 | dump_stack(); | ||
88 | return -ENOSYS; | ||
89 | } | ||
90 | |||
91 | static inline int __add_pages(struct zone *zone, unsigned long start_pfn, | ||
92 | unsigned long nr_pages) | ||
93 | { | ||
94 | return mhp_notimplemented(__FUNCTION__); | ||
95 | } | ||
96 | #endif /* ! CONFIG_MEMORY_HOTPLUG */ | ||
97 | static inline int __remove_pages(struct zone *zone, unsigned long start_pfn, | ||
98 | unsigned long nr_pages) | ||
99 | { | ||
100 | printk(KERN_WARNING "%s() called, not yet supported\n", __FUNCTION__); | ||
101 | dump_stack(); | ||
102 | return -ENOSYS; | ||
103 | } | ||
104 | #endif /* __LINUX_MEMORY_HOTPLUG_H */ | ||
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 58385ee1c0ac..7af8cb836e78 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h | |||
@@ -27,10 +27,10 @@ | |||
27 | 27 | ||
28 | #include <linux/config.h> | 28 | #include <linux/config.h> |
29 | #include <linux/mmzone.h> | 29 | #include <linux/mmzone.h> |
30 | #include <linux/bitmap.h> | ||
31 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
32 | #include <linux/rbtree.h> | 31 | #include <linux/rbtree.h> |
33 | #include <linux/spinlock.h> | 32 | #include <linux/spinlock.h> |
33 | #include <linux/nodemask.h> | ||
34 | 34 | ||
35 | struct vm_area_struct; | 35 | struct vm_area_struct; |
36 | 36 | ||
@@ -47,8 +47,7 @@ struct vm_area_struct; | |||
47 | * Locking policy for interlave: | 47 | * Locking policy for interlave: |
48 | * In process context there is no locking because only the process accesses | 48 | * In process context there is no locking because only the process accesses |
49 | * its own state. All vma manipulation is somewhat protected by a down_read on | 49 | * its own state. All vma manipulation is somewhat protected by a down_read on |
50 | * mmap_sem. For allocating in the interleave policy the page_table_lock | 50 | * mmap_sem. |
51 | * must be also aquired to protect il_next. | ||
52 | * | 51 | * |
53 | * Freeing policy: | 52 | * Freeing policy: |
54 | * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd. | 53 | * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd. |
@@ -63,7 +62,7 @@ struct mempolicy { | |||
63 | union { | 62 | union { |
64 | struct zonelist *zonelist; /* bind */ | 63 | struct zonelist *zonelist; /* bind */ |
65 | short preferred_node; /* preferred */ | 64 | short preferred_node; /* preferred */ |
66 | DECLARE_BITMAP(nodes, MAX_NUMNODES); /* interleave */ | 65 | nodemask_t nodes; /* interleave */ |
67 | /* undefined for default */ | 66 | /* undefined for default */ |
68 | } v; | 67 | } v; |
69 | }; | 68 | }; |
diff --git a/include/linux/mm.h b/include/linux/mm.h index e1649578fb0c..5c1fb0a2e806 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -157,7 +157,7 @@ extern unsigned int kobjsize(const void *objp); | |||
157 | 157 | ||
158 | #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ | 158 | #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ |
159 | #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ | 159 | #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ |
160 | #define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ | 160 | #define VM_RESERVED 0x00080000 /* Pages managed in a special way */ |
161 | #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ | 161 | #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ |
162 | #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ | 162 | #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ |
163 | #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ | 163 | #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ |
@@ -226,13 +226,18 @@ struct page { | |||
226 | * to show when page is mapped | 226 | * to show when page is mapped |
227 | * & limit reverse map searches. | 227 | * & limit reverse map searches. |
228 | */ | 228 | */ |
229 | unsigned long private; /* Mapping-private opaque data: | 229 | union { |
230 | unsigned long private; /* Mapping-private opaque data: | ||
230 | * usually used for buffer_heads | 231 | * usually used for buffer_heads |
231 | * if PagePrivate set; used for | 232 | * if PagePrivate set; used for |
232 | * swp_entry_t if PageSwapCache | 233 | * swp_entry_t if PageSwapCache |
233 | * When page is free, this indicates | 234 | * When page is free, this indicates |
234 | * order in the buddy system. | 235 | * order in the buddy system. |
235 | */ | 236 | */ |
237 | #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS | ||
238 | spinlock_t ptl; | ||
239 | #endif | ||
240 | } u; | ||
236 | struct address_space *mapping; /* If low bit clear, points to | 241 | struct address_space *mapping; /* If low bit clear, points to |
237 | * inode address_space, or NULL. | 242 | * inode address_space, or NULL. |
238 | * If page mapped as anonymous | 243 | * If page mapped as anonymous |
@@ -260,6 +265,9 @@ struct page { | |||
260 | #endif /* WANT_PAGE_VIRTUAL */ | 265 | #endif /* WANT_PAGE_VIRTUAL */ |
261 | }; | 266 | }; |
262 | 267 | ||
268 | #define page_private(page) ((page)->u.private) | ||
269 | #define set_page_private(page, v) ((page)->u.private = (v)) | ||
270 | |||
263 | /* | 271 | /* |
264 | * FIXME: take this include out, include page-flags.h in | 272 | * FIXME: take this include out, include page-flags.h in |
265 | * files which need it (119 of them) | 273 | * files which need it (119 of them) |
@@ -311,17 +319,17 @@ extern void FASTCALL(__page_cache_release(struct page *)); | |||
311 | 319 | ||
312 | #ifdef CONFIG_HUGETLB_PAGE | 320 | #ifdef CONFIG_HUGETLB_PAGE |
313 | 321 | ||
314 | static inline int page_count(struct page *p) | 322 | static inline int page_count(struct page *page) |
315 | { | 323 | { |
316 | if (PageCompound(p)) | 324 | if (PageCompound(page)) |
317 | p = (struct page *)p->private; | 325 | page = (struct page *)page_private(page); |
318 | return atomic_read(&(p)->_count) + 1; | 326 | return atomic_read(&page->_count) + 1; |
319 | } | 327 | } |
320 | 328 | ||
321 | static inline void get_page(struct page *page) | 329 | static inline void get_page(struct page *page) |
322 | { | 330 | { |
323 | if (unlikely(PageCompound(page))) | 331 | if (unlikely(PageCompound(page))) |
324 | page = (struct page *)page->private; | 332 | page = (struct page *)page_private(page); |
325 | atomic_inc(&page->_count); | 333 | atomic_inc(&page->_count); |
326 | } | 334 | } |
327 | 335 | ||
@@ -338,7 +346,7 @@ static inline void get_page(struct page *page) | |||
338 | 346 | ||
339 | static inline void put_page(struct page *page) | 347 | static inline void put_page(struct page *page) |
340 | { | 348 | { |
341 | if (!PageReserved(page) && put_page_testzero(page)) | 349 | if (put_page_testzero(page)) |
342 | __page_cache_release(page); | 350 | __page_cache_release(page); |
343 | } | 351 | } |
344 | 352 | ||
@@ -587,7 +595,7 @@ static inline int PageAnon(struct page *page) | |||
587 | static inline pgoff_t page_index(struct page *page) | 595 | static inline pgoff_t page_index(struct page *page) |
588 | { | 596 | { |
589 | if (unlikely(PageSwapCache(page))) | 597 | if (unlikely(PageSwapCache(page))) |
590 | return page->private; | 598 | return page_private(page); |
591 | return page->index; | 599 | return page->index; |
592 | } | 600 | } |
593 | 601 | ||
@@ -682,7 +690,7 @@ struct zap_details { | |||
682 | 690 | ||
683 | unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, | 691 | unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, |
684 | unsigned long size, struct zap_details *); | 692 | unsigned long size, struct zap_details *); |
685 | unsigned long unmap_vmas(struct mmu_gather **tlb, struct mm_struct *mm, | 693 | unsigned long unmap_vmas(struct mmu_gather **tlb, |
686 | struct vm_area_struct *start_vma, unsigned long start_addr, | 694 | struct vm_area_struct *start_vma, unsigned long start_addr, |
687 | unsigned long end_addr, unsigned long *nr_accounted, | 695 | unsigned long end_addr, unsigned long *nr_accounted, |
688 | struct zap_details *); | 696 | struct zap_details *); |
@@ -704,10 +712,6 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, | |||
704 | } | 712 | } |
705 | 713 | ||
706 | extern int vmtruncate(struct inode * inode, loff_t offset); | 714 | extern int vmtruncate(struct inode * inode, loff_t offset); |
707 | extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); | ||
708 | extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)); | ||
709 | extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); | ||
710 | extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); | ||
711 | extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); | 715 | extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); |
712 | extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot); | 716 | extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot); |
713 | extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); | 717 | extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); |
@@ -723,6 +727,7 @@ void install_arg_page(struct vm_area_struct *, struct page *, unsigned long); | |||
723 | 727 | ||
724 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, | 728 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, |
725 | int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); | 729 | int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); |
730 | void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long); | ||
726 | 731 | ||
727 | int __set_page_dirty_buffers(struct page *page); | 732 | int __set_page_dirty_buffers(struct page *page); |
728 | int __set_page_dirty_nobuffers(struct page *page); | 733 | int __set_page_dirty_nobuffers(struct page *page); |
@@ -759,38 +764,83 @@ struct shrinker; | |||
759 | extern struct shrinker *set_shrinker(int, shrinker_t); | 764 | extern struct shrinker *set_shrinker(int, shrinker_t); |
760 | extern void remove_shrinker(struct shrinker *shrinker); | 765 | extern void remove_shrinker(struct shrinker *shrinker); |
761 | 766 | ||
762 | /* | 767 | int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); |
763 | * On a two-level or three-level page table, this ends up being trivial. Thus | 768 | int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); |
764 | * the inlining and the symmetry break with pte_alloc_map() that does all | 769 | int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); |
765 | * of this out-of-line. | 770 | int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); |
766 | */ | 771 | |
767 | /* | 772 | /* |
768 | * The following ifdef needed to get the 4level-fixup.h header to work. | 773 | * The following ifdef needed to get the 4level-fixup.h header to work. |
769 | * Remove it when 4level-fixup.h has been removed. | 774 | * Remove it when 4level-fixup.h has been removed. |
770 | */ | 775 | */ |
771 | #ifdef CONFIG_MMU | 776 | #if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK) |
772 | #ifndef __ARCH_HAS_4LEVEL_HACK | ||
773 | static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) | 777 | static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) |
774 | { | 778 | { |
775 | if (pgd_none(*pgd)) | 779 | return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))? |
776 | return __pud_alloc(mm, pgd, address); | 780 | NULL: pud_offset(pgd, address); |
777 | return pud_offset(pgd, address); | ||
778 | } | 781 | } |
779 | 782 | ||
780 | static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | 783 | static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) |
781 | { | 784 | { |
782 | if (pud_none(*pud)) | 785 | return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? |
783 | return __pmd_alloc(mm, pud, address); | 786 | NULL: pmd_offset(pud, address); |
784 | return pmd_offset(pud, address); | ||
785 | } | 787 | } |
786 | #endif | 788 | #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ |
787 | #endif /* CONFIG_MMU */ | 789 | |
790 | #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS | ||
791 | /* | ||
792 | * We tuck a spinlock to guard each pagetable page into its struct page, | ||
793 | * at page->u.ptl (sharing a union with page->private), with BUILD_BUG_ON to make sure that this will not | ||
794 | * overflow into the next struct page (as it might with DEBUG_SPINLOCK). | ||
795 | * When freeing, reset page->mapping so free_pages_check won't complain. | ||
796 | */ | ||
797 | #define __pte_lockptr(page) &((page)->u.ptl) | ||
798 | #define pte_lock_init(_page) do { \ | ||
799 | spin_lock_init(__pte_lockptr(_page)); \ | ||
800 | } while (0) | ||
801 | #define pte_lock_deinit(page) ((page)->mapping = NULL) | ||
802 | #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) | ||
803 | #else | ||
804 | /* | ||
805 | * We use mm->page_table_lock to guard all pagetable pages of the mm. | ||
806 | */ | ||
807 | #define pte_lock_init(page) do {} while (0) | ||
808 | #define pte_lock_deinit(page) do {} while (0) | ||
809 | #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) | ||
810 | #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ | ||
811 | |||
812 | #define pte_offset_map_lock(mm, pmd, address, ptlp) \ | ||
813 | ({ \ | ||
814 | spinlock_t *__ptl = pte_lockptr(mm, pmd); \ | ||
815 | pte_t *__pte = pte_offset_map(pmd, address); \ | ||
816 | *(ptlp) = __ptl; \ | ||
817 | spin_lock(__ptl); \ | ||
818 | __pte; \ | ||
819 | }) | ||
820 | |||
821 | #define pte_unmap_unlock(pte, ptl) do { \ | ||
822 | spin_unlock(ptl); \ | ||
823 | pte_unmap(pte); \ | ||
824 | } while (0) | ||
825 | |||
826 | #define pte_alloc_map(mm, pmd, address) \ | ||
827 | ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ | ||
828 | NULL: pte_offset_map(pmd, address)) | ||
829 | |||
830 | #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ | ||
831 | ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ | ||
832 | NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) | ||
833 | |||
834 | #define pte_alloc_kernel(pmd, address) \ | ||
835 | ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ | ||
836 | NULL: pte_offset_kernel(pmd, address)) | ||
788 | 837 | ||
789 | extern void free_area_init(unsigned long * zones_size); | 838 | extern void free_area_init(unsigned long * zones_size); |
790 | extern void free_area_init_node(int nid, pg_data_t *pgdat, | 839 | extern void free_area_init_node(int nid, pg_data_t *pgdat, |
791 | unsigned long * zones_size, unsigned long zone_start_pfn, | 840 | unsigned long * zones_size, unsigned long zone_start_pfn, |
792 | unsigned long *zholes_size); | 841 | unsigned long *zholes_size); |
793 | extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long); | 842 | extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long); |
843 | extern void setup_per_zone_pages_min(void); | ||
794 | extern void mem_init(void); | 844 | extern void mem_init(void); |
795 | extern void show_mem(void); | 845 | extern void show_mem(void); |
796 | extern void si_meminfo(struct sysinfo * val); | 846 | extern void si_meminfo(struct sysinfo * val); |
@@ -834,6 +884,7 @@ extern int split_vma(struct mm_struct *, | |||
834 | extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); | 884 | extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); |
835 | extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, | 885 | extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, |
836 | struct rb_node **, struct rb_node *); | 886 | struct rb_node **, struct rb_node *); |
887 | extern void unlink_file_vma(struct vm_area_struct *); | ||
837 | extern struct vm_area_struct *copy_vma(struct vm_area_struct **, | 888 | extern struct vm_area_struct *copy_vma(struct vm_area_struct **, |
838 | unsigned long addr, unsigned long len, pgoff_t pgoff); | 889 | unsigned long addr, unsigned long len, pgoff_t pgoff); |
839 | extern void exit_mmap(struct mm_struct *); | 890 | extern void exit_mmap(struct mm_struct *); |
@@ -894,7 +945,8 @@ void handle_ra_miss(struct address_space *mapping, | |||
894 | unsigned long max_sane_readahead(unsigned long nr); | 945 | unsigned long max_sane_readahead(unsigned long nr); |
895 | 946 | ||
896 | /* Do stack extension */ | 947 | /* Do stack extension */ |
897 | extern int expand_stack(struct vm_area_struct * vma, unsigned long address); | 948 | extern int expand_stack(struct vm_area_struct *vma, unsigned long address); |
949 | extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); | ||
898 | 950 | ||
899 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ | 951 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ |
900 | extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); | 952 | extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); |
@@ -917,40 +969,28 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) | |||
917 | return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; | 969 | return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; |
918 | } | 970 | } |
919 | 971 | ||
920 | extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); | 972 | struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); |
973 | struct page *vmalloc_to_page(void *addr); | ||
974 | unsigned long vmalloc_to_pfn(void *addr); | ||
975 | int remap_pfn_range(struct vm_area_struct *, unsigned long addr, | ||
976 | unsigned long pfn, unsigned long size, pgprot_t); | ||
921 | 977 | ||
922 | extern struct page * vmalloc_to_page(void *addr); | 978 | struct page *follow_page(struct mm_struct *, unsigned long address, |
923 | extern unsigned long vmalloc_to_pfn(void *addr); | 979 | unsigned int foll_flags); |
924 | extern struct page * follow_page(struct mm_struct *mm, unsigned long address, | 980 | #define FOLL_WRITE 0x01 /* check pte is writable */ |
925 | int write); | 981 | #define FOLL_TOUCH 0x02 /* mark page accessed */ |
926 | extern int check_user_page_readable(struct mm_struct *mm, unsigned long address); | 982 | #define FOLL_GET 0x04 /* do get_page on page */ |
927 | int remap_pfn_range(struct vm_area_struct *, unsigned long, | 983 | #define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ |
928 | unsigned long, unsigned long, pgprot_t); | ||
929 | 984 | ||
930 | #ifdef CONFIG_PROC_FS | 985 | #ifdef CONFIG_PROC_FS |
931 | void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); | 986 | void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); |
932 | #else | 987 | #else |
933 | static inline void __vm_stat_account(struct mm_struct *mm, | 988 | static inline void vm_stat_account(struct mm_struct *mm, |
934 | unsigned long flags, struct file *file, long pages) | 989 | unsigned long flags, struct file *file, long pages) |
935 | { | 990 | { |
936 | } | 991 | } |
937 | #endif /* CONFIG_PROC_FS */ | 992 | #endif /* CONFIG_PROC_FS */ |
938 | 993 | ||
939 | static inline void vm_stat_account(struct vm_area_struct *vma) | ||
940 | { | ||
941 | __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, | ||
942 | vma_pages(vma)); | ||
943 | } | ||
944 | |||
945 | static inline void vm_stat_unaccount(struct vm_area_struct *vma) | ||
946 | { | ||
947 | __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, | ||
948 | -vma_pages(vma)); | ||
949 | } | ||
950 | |||
951 | /* update per process rss and vm hiwater data */ | ||
952 | extern void update_mem_hiwater(struct task_struct *tsk); | ||
953 | |||
954 | #ifndef CONFIG_DEBUG_PAGEALLOC | 994 | #ifndef CONFIG_DEBUG_PAGEALLOC |
955 | static inline void | 995 | static inline void |
956 | kernel_map_pages(struct page *page, int numpages, int enable) | 996 | kernel_map_pages(struct page *page, int numpages, int enable) |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7519eb4191e7..f5fa3082fd6a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/threads.h> | 12 | #include <linux/threads.h> |
13 | #include <linux/numa.h> | 13 | #include <linux/numa.h> |
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/seqlock.h> | ||
15 | #include <asm/atomic.h> | 16 | #include <asm/atomic.h> |
16 | 17 | ||
17 | /* Free memory management - zoned buddy allocator. */ | 18 | /* Free memory management - zoned buddy allocator. */ |
@@ -137,6 +138,10 @@ struct zone { | |||
137 | * free areas of different sizes | 138 | * free areas of different sizes |
138 | */ | 139 | */ |
139 | spinlock_t lock; | 140 | spinlock_t lock; |
141 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
142 | /* see spanned/present_pages for more description */ | ||
143 | seqlock_t span_seqlock; | ||
144 | #endif | ||
140 | struct free_area free_area[MAX_ORDER]; | 145 | struct free_area free_area[MAX_ORDER]; |
141 | 146 | ||
142 | 147 | ||
@@ -220,6 +225,16 @@ struct zone { | |||
220 | /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ | 225 | /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ |
221 | unsigned long zone_start_pfn; | 226 | unsigned long zone_start_pfn; |
222 | 227 | ||
228 | /* | ||
229 | * zone_start_pfn, spanned_pages and present_pages are all | ||
230 | * protected by span_seqlock. It is a seqlock because it has | ||
231 | * to be read outside of zone->lock, and it is done in the main | ||
232 | * allocator path. But, it is written quite infrequently. | ||
233 | * | ||
234 | * The lock is declared along with zone->lock because it is | ||
235 | * frequently read in proximity to zone->lock. It's good to | ||
236 | * give them a chance of being in the same cacheline. | ||
237 | */ | ||
223 | unsigned long spanned_pages; /* total size, including holes */ | 238 | unsigned long spanned_pages; /* total size, including holes */ |
224 | unsigned long present_pages; /* amount of memory (excluding holes) */ | 239 | unsigned long present_pages; /* amount of memory (excluding holes) */ |
225 | 240 | ||
@@ -273,6 +288,16 @@ typedef struct pglist_data { | |||
273 | struct page *node_mem_map; | 288 | struct page *node_mem_map; |
274 | #endif | 289 | #endif |
275 | struct bootmem_data *bdata; | 290 | struct bootmem_data *bdata; |
291 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
292 | /* | ||
293 | * Must be held any time you expect node_start_pfn, node_present_pages | ||
294 | * or node_spanned_pages to stay constant. Holding this will also | ||
295 | * guarantee that any pfn_valid() stays that way. | ||
296 | * | ||
297 | * Nests above zone->lock and zone->span_seqlock. | ||
298 | */ | ||
299 | spinlock_t node_size_lock; | ||
300 | #endif | ||
276 | unsigned long node_start_pfn; | 301 | unsigned long node_start_pfn; |
277 | unsigned long node_present_pages; /* total number of physical pages */ | 302 | unsigned long node_present_pages; /* total number of physical pages */ |
278 | unsigned long node_spanned_pages; /* total size of physical page | 303 | unsigned long node_spanned_pages; /* total size of physical page |
@@ -293,6 +318,8 @@ typedef struct pglist_data { | |||
293 | #endif | 318 | #endif |
294 | #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) | 319 | #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) |
295 | 320 | ||
321 | #include <linux/memory_hotplug.h> | ||
322 | |||
296 | extern struct pglist_data *pgdat_list; | 323 | extern struct pglist_data *pgdat_list; |
297 | 324 | ||
298 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, | 325 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, |
@@ -509,6 +536,7 @@ static inline struct mem_section *__nr_to_section(unsigned long nr) | |||
509 | return NULL; | 536 | return NULL; |
510 | return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; | 537 | return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; |
511 | } | 538 | } |
539 | extern int __section_nr(struct mem_section* ms); | ||
512 | 540 | ||
513 | /* | 541 | /* |
514 | * We use the lower bits of the mem_map pointer to store | 542 | * We use the lower bits of the mem_map pointer to store |
diff --git a/include/linux/rmap.h b/include/linux/rmap.h index e80fb7ee6efd..35b30e6c8cf8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h | |||
@@ -95,8 +95,8 @@ int try_to_unmap(struct page *); | |||
95 | /* | 95 | /* |
96 | * Called from mm/filemap_xip.c to unmap empty zero page | 96 | * Called from mm/filemap_xip.c to unmap empty zero page |
97 | */ | 97 | */ |
98 | pte_t *page_check_address(struct page *, struct mm_struct *, unsigned long); | 98 | pte_t *page_check_address(struct page *, struct mm_struct *, |
99 | 99 | unsigned long, spinlock_t **); | |
100 | 100 | ||
101 | /* | 101 | /* |
102 | * Used by swapoff to help locate where page is expected in vma. | 102 | * Used by swapoff to help locate where page is expected in vma. |
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index b52a2af25f1f..f30f805080ae 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h | |||
@@ -61,5 +61,10 @@ extern void FASTCALL(__up_read(struct rw_semaphore *sem)); | |||
61 | extern void FASTCALL(__up_write(struct rw_semaphore *sem)); | 61 | extern void FASTCALL(__up_write(struct rw_semaphore *sem)); |
62 | extern void FASTCALL(__downgrade_write(struct rw_semaphore *sem)); | 62 | extern void FASTCALL(__downgrade_write(struct rw_semaphore *sem)); |
63 | 63 | ||
64 | static inline int rwsem_is_locked(struct rw_semaphore *sem) | ||
65 | { | ||
66 | return (sem->activity != 0); | ||
67 | } | ||
68 | |||
64 | #endif /* __KERNEL__ */ | 69 | #endif /* __KERNEL__ */ |
65 | #endif /* _LINUX_RWSEM_SPINLOCK_H */ | 70 | #endif /* _LINUX_RWSEM_SPINLOCK_H */ |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 27519df0f987..1c30bc308ef1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -249,6 +249,36 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, | |||
249 | extern void arch_unmap_area(struct mm_struct *, unsigned long); | 249 | extern void arch_unmap_area(struct mm_struct *, unsigned long); |
250 | extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); | 250 | extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); |
251 | 251 | ||
252 | #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS | ||
253 | /* | ||
254 | * The mm counters are not protected by its page_table_lock, | ||
255 | * so must be incremented atomically. | ||
256 | */ | ||
257 | #ifdef ATOMIC64_INIT | ||
258 | #define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value) | ||
259 | #define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member)) | ||
260 | #define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member) | ||
261 | #define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member) | ||
262 | #define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member) | ||
263 | typedef atomic64_t mm_counter_t; | ||
264 | #else /* !ATOMIC64_INIT */ | ||
265 | /* | ||
266 | * The counters wrap back to 0 at 2^32 * PAGE_SIZE, | ||
267 | * that is, at 16TB if using 4kB page size. | ||
268 | */ | ||
269 | #define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value) | ||
270 | #define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member)) | ||
271 | #define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member) | ||
272 | #define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member) | ||
273 | #define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member) | ||
274 | typedef atomic_t mm_counter_t; | ||
275 | #endif /* !ATOMIC64_INIT */ | ||
276 | |||
277 | #else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ | ||
278 | /* | ||
279 | * The mm counters are protected by its page_table_lock, | ||
280 | * so can be incremented directly. | ||
281 | */ | ||
252 | #define set_mm_counter(mm, member, value) (mm)->_##member = (value) | 282 | #define set_mm_counter(mm, member, value) (mm)->_##member = (value) |
253 | #define get_mm_counter(mm, member) ((mm)->_##member) | 283 | #define get_mm_counter(mm, member) ((mm)->_##member) |
254 | #define add_mm_counter(mm, member, value) (mm)->_##member += (value) | 284 | #define add_mm_counter(mm, member, value) (mm)->_##member += (value) |
@@ -256,6 +286,20 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); | |||
256 | #define dec_mm_counter(mm, member) (mm)->_##member-- | 286 | #define dec_mm_counter(mm, member) (mm)->_##member-- |
257 | typedef unsigned long mm_counter_t; | 287 | typedef unsigned long mm_counter_t; |
258 | 288 | ||
289 | #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ | ||
290 | |||
291 | #define get_mm_rss(mm) \ | ||
292 | (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) | ||
293 | #define update_hiwater_rss(mm) do { \ | ||
294 | unsigned long _rss = get_mm_rss(mm); \ | ||
295 | if ((mm)->hiwater_rss < _rss) \ | ||
296 | (mm)->hiwater_rss = _rss; \ | ||
297 | } while (0) | ||
298 | #define update_hiwater_vm(mm) do { \ | ||
299 | if ((mm)->hiwater_vm < (mm)->total_vm) \ | ||
300 | (mm)->hiwater_vm = (mm)->total_vm; \ | ||
301 | } while (0) | ||
302 | |||
259 | struct mm_struct { | 303 | struct mm_struct { |
260 | struct vm_area_struct * mmap; /* list of VMAs */ | 304 | struct vm_area_struct * mmap; /* list of VMAs */ |
261 | struct rb_root mm_rb; | 305 | struct rb_root mm_rb; |
@@ -279,15 +323,20 @@ struct mm_struct { | |||
279 | * by mmlist_lock | 323 | * by mmlist_lock |
280 | */ | 324 | */ |
281 | 325 | ||
326 | /* Special counters, in some configurations protected by the | ||
327 | * page_table_lock, in other configurations by being atomic. | ||
328 | */ | ||
329 | mm_counter_t _file_rss; | ||
330 | mm_counter_t _anon_rss; | ||
331 | |||
332 | unsigned long hiwater_rss; /* High-watermark of RSS usage */ | ||
333 | unsigned long hiwater_vm; /* High-water virtual memory usage */ | ||
334 | |||
335 | unsigned long total_vm, locked_vm, shared_vm, exec_vm; | ||
336 | unsigned long stack_vm, reserved_vm, def_flags, nr_ptes; | ||
282 | unsigned long start_code, end_code, start_data, end_data; | 337 | unsigned long start_code, end_code, start_data, end_data; |
283 | unsigned long start_brk, brk, start_stack; | 338 | unsigned long start_brk, brk, start_stack; |
284 | unsigned long arg_start, arg_end, env_start, env_end; | 339 | unsigned long arg_start, arg_end, env_start, env_end; |
285 | unsigned long total_vm, locked_vm, shared_vm; | ||
286 | unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes; | ||
287 | |||
288 | /* Special counters protected by the page_table_lock */ | ||
289 | mm_counter_t _rss; | ||
290 | mm_counter_t _anon_rss; | ||
291 | 340 | ||
292 | unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ | 341 | unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ |
293 | 342 | ||
@@ -308,11 +357,7 @@ struct mm_struct { | |||
308 | /* aio bits */ | 357 | /* aio bits */ |
309 | rwlock_t ioctx_list_lock; | 358 | rwlock_t ioctx_list_lock; |
310 | struct kioctx *ioctx_list; | 359 | struct kioctx *ioctx_list; |
311 | |||
312 | struct kioctx default_kioctx; | 360 | struct kioctx default_kioctx; |
313 | |||
314 | unsigned long hiwater_rss; /* High-water RSS usage */ | ||
315 | unsigned long hiwater_vm; /* High-water virtual memory usage */ | ||
316 | }; | 361 | }; |
317 | 362 | ||
318 | struct sighand_struct { | 363 | struct sighand_struct { |
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 3701a0673d2c..1d5577b2b752 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h | |||
@@ -32,10 +32,14 @@ struct vm_struct { | |||
32 | * Highlevel APIs for driver use | 32 | * Highlevel APIs for driver use |
33 | */ | 33 | */ |
34 | extern void *vmalloc(unsigned long size); | 34 | extern void *vmalloc(unsigned long size); |
35 | extern void *vmalloc_node(unsigned long size, int node); | ||
35 | extern void *vmalloc_exec(unsigned long size); | 36 | extern void *vmalloc_exec(unsigned long size); |
36 | extern void *vmalloc_32(unsigned long size); | 37 | extern void *vmalloc_32(unsigned long size); |
37 | extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); | 38 | extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); |
38 | extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot); | 39 | extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, |
40 | pgprot_t prot); | ||
41 | extern void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, | ||
42 | pgprot_t prot, int node); | ||
39 | extern void vfree(void *addr); | 43 | extern void vfree(void *addr); |
40 | 44 | ||
41 | extern void *vmap(struct page **pages, unsigned int count, | 45 | extern void *vmap(struct page **pages, unsigned int count, |
@@ -48,6 +52,8 @@ extern void vunmap(void *addr); | |||
48 | extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); | 52 | extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); |
49 | extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | 53 | extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, |
50 | unsigned long start, unsigned long end); | 54 | unsigned long start, unsigned long end); |
55 | extern struct vm_struct *get_vm_area_node(unsigned long size, | ||
56 | unsigned long flags, int node); | ||
51 | extern struct vm_struct *remove_vm_area(void *addr); | 57 | extern struct vm_struct *remove_vm_area(void *addr); |
52 | extern struct vm_struct *__remove_vm_area(void *addr); | 58 | extern struct vm_struct *__remove_vm_area(void *addr); |
53 | extern int map_vm_area(struct vm_struct *area, pgprot_t prot, | 59 | extern int map_vm_area(struct vm_struct *area, pgprot_t prot, |
@@ -233,10 +233,11 @@ static int newseg (key_t key, int shmflg, size_t size) | |||
233 | shp->id = shm_buildid(id,shp->shm_perm.seq); | 233 | shp->id = shm_buildid(id,shp->shm_perm.seq); |
234 | shp->shm_file = file; | 234 | shp->shm_file = file; |
235 | file->f_dentry->d_inode->i_ino = shp->id; | 235 | file->f_dentry->d_inode->i_ino = shp->id; |
236 | if (shmflg & SHM_HUGETLB) | 236 | |
237 | set_file_hugepages(file); | 237 | /* Hugetlb ops would have already been assigned. */ |
238 | else | 238 | if (!(shmflg & SHM_HUGETLB)) |
239 | file->f_op = &shm_file_operations; | 239 | file->f_op = &shm_file_operations; |
240 | |||
240 | shm_tot += numpages; | 241 | shm_tot += numpages; |
241 | shm_unlock(shp); | 242 | shm_unlock(shp); |
242 | return shp->id; | 243 | return shp->id; |
diff --git a/kernel/acct.c b/kernel/acct.c index b756f527497e..2e3f4a47e7d0 100644 --- a/kernel/acct.c +++ b/kernel/acct.c | |||
@@ -553,7 +553,7 @@ void acct_update_integrals(struct task_struct *tsk) | |||
553 | if (delta == 0) | 553 | if (delta == 0) |
554 | return; | 554 | return; |
555 | tsk->acct_stimexpd = tsk->stime; | 555 | tsk->acct_stimexpd = tsk->stime; |
556 | tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss); | 556 | tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); |
557 | tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; | 557 | tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; |
558 | } | 558 | } |
559 | } | 559 | } |
diff --git a/kernel/exit.c b/kernel/exit.c index 3b25b182d2be..79f52b85d6ed 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -839,7 +839,10 @@ fastcall NORET_TYPE void do_exit(long code) | |||
839 | preempt_count()); | 839 | preempt_count()); |
840 | 840 | ||
841 | acct_update_integrals(tsk); | 841 | acct_update_integrals(tsk); |
842 | update_mem_hiwater(tsk); | 842 | if (tsk->mm) { |
843 | update_hiwater_rss(tsk->mm); | ||
844 | update_hiwater_vm(tsk->mm); | ||
845 | } | ||
843 | group_dead = atomic_dec_and_test(&tsk->signal->live); | 846 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
844 | if (group_dead) { | 847 | if (group_dead) { |
845 | del_timer_sync(&tsk->signal->real_timer); | 848 | del_timer_sync(&tsk->signal->real_timer); |
diff --git a/kernel/fork.c b/kernel/fork.c index 280bd44ac441..8a069612eac3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -182,37 +182,37 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
182 | } | 182 | } |
183 | 183 | ||
184 | #ifdef CONFIG_MMU | 184 | #ifdef CONFIG_MMU |
185 | static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | 185 | static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) |
186 | { | 186 | { |
187 | struct vm_area_struct * mpnt, *tmp, **pprev; | 187 | struct vm_area_struct *mpnt, *tmp, **pprev; |
188 | struct rb_node **rb_link, *rb_parent; | 188 | struct rb_node **rb_link, *rb_parent; |
189 | int retval; | 189 | int retval; |
190 | unsigned long charge; | 190 | unsigned long charge; |
191 | struct mempolicy *pol; | 191 | struct mempolicy *pol; |
192 | 192 | ||
193 | down_write(&oldmm->mmap_sem); | 193 | down_write(&oldmm->mmap_sem); |
194 | flush_cache_mm(current->mm); | 194 | flush_cache_mm(oldmm); |
195 | down_write(&mm->mmap_sem); | ||
196 | |||
195 | mm->locked_vm = 0; | 197 | mm->locked_vm = 0; |
196 | mm->mmap = NULL; | 198 | mm->mmap = NULL; |
197 | mm->mmap_cache = NULL; | 199 | mm->mmap_cache = NULL; |
198 | mm->free_area_cache = oldmm->mmap_base; | 200 | mm->free_area_cache = oldmm->mmap_base; |
199 | mm->cached_hole_size = ~0UL; | 201 | mm->cached_hole_size = ~0UL; |
200 | mm->map_count = 0; | 202 | mm->map_count = 0; |
201 | set_mm_counter(mm, rss, 0); | ||
202 | set_mm_counter(mm, anon_rss, 0); | ||
203 | cpus_clear(mm->cpu_vm_mask); | 203 | cpus_clear(mm->cpu_vm_mask); |
204 | mm->mm_rb = RB_ROOT; | 204 | mm->mm_rb = RB_ROOT; |
205 | rb_link = &mm->mm_rb.rb_node; | 205 | rb_link = &mm->mm_rb.rb_node; |
206 | rb_parent = NULL; | 206 | rb_parent = NULL; |
207 | pprev = &mm->mmap; | 207 | pprev = &mm->mmap; |
208 | 208 | ||
209 | for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { | 209 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { |
210 | struct file *file; | 210 | struct file *file; |
211 | 211 | ||
212 | if (mpnt->vm_flags & VM_DONTCOPY) { | 212 | if (mpnt->vm_flags & VM_DONTCOPY) { |
213 | long pages = vma_pages(mpnt); | 213 | long pages = vma_pages(mpnt); |
214 | mm->total_vm -= pages; | 214 | mm->total_vm -= pages; |
215 | __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, | 215 | vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, |
216 | -pages); | 216 | -pages); |
217 | continue; | 217 | continue; |
218 | } | 218 | } |
@@ -253,12 +253,8 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | |||
253 | } | 253 | } |
254 | 254 | ||
255 | /* | 255 | /* |
256 | * Link in the new vma and copy the page table entries: | 256 | * Link in the new vma and copy the page table entries. |
257 | * link in first so that swapoff can see swap entries. | ||
258 | * Note that, exceptionally, here the vma is inserted | ||
259 | * without holding mm->mmap_sem. | ||
260 | */ | 257 | */ |
261 | spin_lock(&mm->page_table_lock); | ||
262 | *pprev = tmp; | 258 | *pprev = tmp; |
263 | pprev = &tmp->vm_next; | 259 | pprev = &tmp->vm_next; |
264 | 260 | ||
@@ -267,8 +263,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | |||
267 | rb_parent = &tmp->vm_rb; | 263 | rb_parent = &tmp->vm_rb; |
268 | 264 | ||
269 | mm->map_count++; | 265 | mm->map_count++; |
270 | retval = copy_page_range(mm, current->mm, tmp); | 266 | retval = copy_page_range(mm, oldmm, tmp); |
271 | spin_unlock(&mm->page_table_lock); | ||
272 | 267 | ||
273 | if (tmp->vm_ops && tmp->vm_ops->open) | 268 | if (tmp->vm_ops && tmp->vm_ops->open) |
274 | tmp->vm_ops->open(tmp); | 269 | tmp->vm_ops->open(tmp); |
@@ -277,9 +272,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | |||
277 | goto out; | 272 | goto out; |
278 | } | 273 | } |
279 | retval = 0; | 274 | retval = 0; |
280 | |||
281 | out: | 275 | out: |
282 | flush_tlb_mm(current->mm); | 276 | up_write(&mm->mmap_sem); |
277 | flush_tlb_mm(oldmm); | ||
283 | up_write(&oldmm->mmap_sem); | 278 | up_write(&oldmm->mmap_sem); |
284 | return retval; | 279 | return retval; |
285 | fail_nomem_policy: | 280 | fail_nomem_policy: |
@@ -323,6 +318,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm) | |||
323 | INIT_LIST_HEAD(&mm->mmlist); | 318 | INIT_LIST_HEAD(&mm->mmlist); |
324 | mm->core_waiters = 0; | 319 | mm->core_waiters = 0; |
325 | mm->nr_ptes = 0; | 320 | mm->nr_ptes = 0; |
321 | set_mm_counter(mm, file_rss, 0); | ||
322 | set_mm_counter(mm, anon_rss, 0); | ||
326 | spin_lock_init(&mm->page_table_lock); | 323 | spin_lock_init(&mm->page_table_lock); |
327 | rwlock_init(&mm->ioctx_list_lock); | 324 | rwlock_init(&mm->ioctx_list_lock); |
328 | mm->ioctx_list = NULL; | 325 | mm->ioctx_list = NULL; |
@@ -499,7 +496,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) | |||
499 | if (retval) | 496 | if (retval) |
500 | goto free_pt; | 497 | goto free_pt; |
501 | 498 | ||
502 | mm->hiwater_rss = get_mm_counter(mm,rss); | 499 | mm->hiwater_rss = get_mm_rss(mm); |
503 | mm->hiwater_vm = mm->total_vm; | 500 | mm->hiwater_vm = mm->total_vm; |
504 | 501 | ||
505 | good_mm: | 502 | good_mm: |
diff --git a/kernel/futex.c b/kernel/futex.c index ca05fe6a70b2..3b4d5ad44cc6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -205,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
205 | /* | 205 | /* |
206 | * Do a quick atomic lookup first - this is the fastpath. | 206 | * Do a quick atomic lookup first - this is the fastpath. |
207 | */ | 207 | */ |
208 | spin_lock(&current->mm->page_table_lock); | 208 | page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET); |
209 | page = follow_page(mm, uaddr, 0); | ||
210 | if (likely(page != NULL)) { | 209 | if (likely(page != NULL)) { |
211 | key->shared.pgoff = | 210 | key->shared.pgoff = |
212 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 211 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
213 | spin_unlock(&current->mm->page_table_lock); | 212 | put_page(page); |
214 | return 0; | 213 | return 0; |
215 | } | 214 | } |
216 | spin_unlock(&current->mm->page_table_lock); | ||
217 | 215 | ||
218 | /* | 216 | /* |
219 | * Do it the general way. | 217 | * Do it the general way. |
diff --git a/kernel/kexec.c b/kernel/kexec.c index 36c5d9cd4cc1..2c95848fbce8 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -334,7 +334,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) | |||
334 | if (pages) { | 334 | if (pages) { |
335 | unsigned int count, i; | 335 | unsigned int count, i; |
336 | pages->mapping = NULL; | 336 | pages->mapping = NULL; |
337 | pages->private = order; | 337 | set_page_private(pages, order); |
338 | count = 1 << order; | 338 | count = 1 << order; |
339 | for (i = 0; i < count; i++) | 339 | for (i = 0; i < count; i++) |
340 | SetPageReserved(pages + i); | 340 | SetPageReserved(pages + i); |
@@ -347,7 +347,7 @@ static void kimage_free_pages(struct page *page) | |||
347 | { | 347 | { |
348 | unsigned int order, count, i; | 348 | unsigned int order, count, i; |
349 | 349 | ||
350 | order = page->private; | 350 | order = page_private(page); |
351 | count = 1 << order; | 351 | count = 1 << order; |
352 | for (i = 0; i < count; i++) | 352 | for (i = 0; i < count; i++) |
353 | ClearPageReserved(page + i); | 353 | ClearPageReserved(page + i); |
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 10bc5ec496d7..016504ccfccf 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
@@ -578,15 +578,23 @@ static int save_highmem_zone(struct zone *zone) | |||
578 | continue; | 578 | continue; |
579 | page = pfn_to_page(pfn); | 579 | page = pfn_to_page(pfn); |
580 | /* | 580 | /* |
581 | * This condition results from rvmalloc() sans vmalloc_32() | 581 | * PageReserved results from rvmalloc() sans vmalloc_32() |
582 | * and architectural memory reservations. This should be | 582 | * and architectural memory reservations. |
583 | * corrected eventually when the cases giving rise to this | 583 | * |
584 | * are better understood. | 584 | * rvmalloc should not cause this, because all implementations |
585 | * appear to always be using vmalloc_32 on architectures with | ||
586 | * highmem. This is a good thing, because we would like to save | ||
587 | * rvmalloc pages. | ||
588 | * | ||
589 | * It appears to be triggered by pages which do not point to | ||
590 | * valid memory (see arch/i386/mm/init.c:one_highpage_init(), | ||
591 | * which sets PageReserved if the page does not point to valid | ||
592 | * RAM). | ||
593 | * | ||
594 | * XXX: must remove usage of PageReserved! | ||
585 | */ | 595 | */ |
586 | if (PageReserved(page)) { | 596 | if (PageReserved(page)) |
587 | printk("highmem reserved page?!\n"); | ||
588 | continue; | 597 | continue; |
589 | } | ||
590 | BUG_ON(PageNosave(page)); | 598 | BUG_ON(PageNosave(page)); |
591 | if (PageNosaveFree(page)) | 599 | if (PageNosaveFree(page)) |
592 | continue; | 600 | continue; |
@@ -672,10 +680,9 @@ static int saveable(struct zone * zone, unsigned long * zone_pfn) | |||
672 | return 0; | 680 | return 0; |
673 | 681 | ||
674 | page = pfn_to_page(pfn); | 682 | page = pfn_to_page(pfn); |
675 | BUG_ON(PageReserved(page) && PageNosave(page)); | ||
676 | if (PageNosave(page)) | 683 | if (PageNosave(page)) |
677 | return 0; | 684 | return 0; |
678 | if (PageReserved(page) && pfn_is_nosave(pfn)) { | 685 | if (pfn_is_nosave(pfn)) { |
679 | pr_debug("[nosave pfn 0x%lx]", pfn); | 686 | pr_debug("[nosave pfn 0x%lx]", pfn); |
680 | return 0; | 687 | return 0; |
681 | } | 688 | } |
diff --git a/kernel/sched.c b/kernel/sched.c index 1e5cafdf4e27..4f26c544d02c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -2511,8 +2511,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
2511 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | 2511 | cpustat->idle = cputime64_add(cpustat->idle, tmp); |
2512 | /* Account for system time used */ | 2512 | /* Account for system time used */ |
2513 | acct_update_integrals(p); | 2513 | acct_update_integrals(p); |
2514 | /* Update rss highwater mark */ | ||
2515 | update_mem_hiwater(p); | ||
2516 | } | 2514 | } |
2517 | 2515 | ||
2518 | /* | 2516 | /* |
diff --git a/kernel/timer.c b/kernel/timer.c index 3ba10fa35b60..6a2e5f8dc725 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -752,6 +752,15 @@ static void second_overflow(void) | |||
752 | else | 752 | else |
753 | time_adj += (time_adj >> 2) + (time_adj >> 5); | 753 | time_adj += (time_adj >> 2) + (time_adj >> 5); |
754 | #endif | 754 | #endif |
755 | #if HZ == 250 | ||
756 | /* Compensate for (HZ==250) != (1 << SHIFT_HZ). | ||
757 | * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14) | ||
758 | */ | ||
759 | if (time_adj < 0) | ||
760 | time_adj -= (-time_adj >> 6) + (-time_adj >> 7); | ||
761 | else | ||
762 | time_adj += (time_adj >> 6) + (time_adj >> 7); | ||
763 | #endif | ||
755 | #if HZ == 1000 | 764 | #if HZ == 1000 |
756 | /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). | 765 | /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). |
757 | * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) | 766 | * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) |
diff --git a/mm/Kconfig b/mm/Kconfig index 391ffc54d136..1a4473fcb2ca 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -111,3 +111,24 @@ config SPARSEMEM_STATIC | |||
111 | config SPARSEMEM_EXTREME | 111 | config SPARSEMEM_EXTREME |
112 | def_bool y | 112 | def_bool y |
113 | depends on SPARSEMEM && !SPARSEMEM_STATIC | 113 | depends on SPARSEMEM && !SPARSEMEM_STATIC |
114 | |||
115 | # eventually, we can have this option just 'select SPARSEMEM' | ||
116 | config MEMORY_HOTPLUG | ||
117 | bool "Allow for memory hot-add" | ||
118 | depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND | ||
119 | |||
120 | comment "Memory hotplug is currently incompatible with Software Suspend" | ||
121 | depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND | ||
122 | |||
123 | # Heavily threaded applications may benefit from splitting the mm-wide | ||
124 | # page_table_lock, so that faults on different parts of the user address | ||
125 | # space can be handled with less contention: split it at this NR_CPUS. | ||
126 | # Default to 4 for wider testing, though 8 might be more appropriate. | ||
127 | # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. | ||
128 | # PA-RISC's debug spinlock_t is too large for the 32-bit struct page. | ||
129 | # | ||
130 | config SPLIT_PTLOCK_CPUS | ||
131 | int | ||
132 | default "4096" if ARM && !CPU_CACHE_VIPT | ||
133 | default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT | ||
134 | default "4" | ||
diff --git a/mm/Makefile b/mm/Makefile index 4cd69e3ce421..2fa6d2ca9f28 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -18,5 +18,5 @@ obj-$(CONFIG_NUMA) += mempolicy.o | |||
18 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 18 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
19 | obj-$(CONFIG_SHMEM) += shmem.o | 19 | obj-$(CONFIG_SHMEM) += shmem.o |
20 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o | 20 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o |
21 | 21 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | |
22 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 22 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
diff --git a/mm/bootmem.c b/mm/bootmem.c index a58699b6579e..e8c567177dcf 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -305,6 +305,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) | |||
305 | if (j + 16 < BITS_PER_LONG) | 305 | if (j + 16 < BITS_PER_LONG) |
306 | prefetchw(page + j + 16); | 306 | prefetchw(page + j + 16); |
307 | __ClearPageReserved(page + j); | 307 | __ClearPageReserved(page + j); |
308 | set_page_count(page + j, 0); | ||
308 | } | 309 | } |
309 | __free_pages(page, order); | 310 | __free_pages(page, order); |
310 | i += BITS_PER_LONG; | 311 | i += BITS_PER_LONG; |
diff --git a/mm/filemap.c b/mm/filemap.c index 1c31b2fd2ca5..768687f1d46b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -66,7 +66,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
66 | * | 66 | * |
67 | * ->mmap_sem | 67 | * ->mmap_sem |
68 | * ->i_mmap_lock | 68 | * ->i_mmap_lock |
69 | * ->page_table_lock (various places, mainly in mmap.c) | 69 | * ->page_table_lock or pte_lock (various, mainly in memory.c) |
70 | * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) | 70 | * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) |
71 | * | 71 | * |
72 | * ->mmap_sem | 72 | * ->mmap_sem |
@@ -86,9 +86,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
86 | * ->anon_vma.lock (vma_adjust) | 86 | * ->anon_vma.lock (vma_adjust) |
87 | * | 87 | * |
88 | * ->anon_vma.lock | 88 | * ->anon_vma.lock |
89 | * ->page_table_lock (anon_vma_prepare and various) | 89 | * ->page_table_lock or pte_lock (anon_vma_prepare and various) |
90 | * | 90 | * |
91 | * ->page_table_lock | 91 | * ->page_table_lock or pte_lock |
92 | * ->swap_lock (try_to_unmap_one) | 92 | * ->swap_lock (try_to_unmap_one) |
93 | * ->private_lock (try_to_unmap_one) | 93 | * ->private_lock (try_to_unmap_one) |
94 | * ->tree_lock (try_to_unmap_one) | 94 | * ->tree_lock (try_to_unmap_one) |
@@ -152,7 +152,7 @@ static int sync_page(void *word) | |||
152 | * in the ->sync_page() methods make essential use of the | 152 | * in the ->sync_page() methods make essential use of the |
153 | * page_mapping(), merely passing the page down to the backing | 153 | * page_mapping(), merely passing the page down to the backing |
154 | * device's unplug functions when it's non-NULL, which in turn | 154 | * device's unplug functions when it's non-NULL, which in turn |
155 | * ignore it for all cases but swap, where only page->private is | 155 | * ignore it for all cases but swap, where only page_private(page) is |
156 | * of interest. When page_mapping() does go NULL, the entire | 156 | * of interest. When page_mapping() does go NULL, the entire |
157 | * call stack gracefully ignores the page and returns. | 157 | * call stack gracefully ignores the page and returns. |
158 | * -- wli | 158 | * -- wli |
@@ -1520,7 +1520,7 @@ repeat: | |||
1520 | page_cache_release(page); | 1520 | page_cache_release(page); |
1521 | return err; | 1521 | return err; |
1522 | } | 1522 | } |
1523 | } else { | 1523 | } else if (vma->vm_flags & VM_NONLINEAR) { |
1524 | /* No page was found just because we can't read it in now (being | 1524 | /* No page was found just because we can't read it in now (being |
1525 | * here implies nonblock != 0), but the page may exist, so set | 1525 | * here implies nonblock != 0), but the page may exist, so set |
1526 | * the PTE to fault it in later. */ | 1526 | * the PTE to fault it in later. */ |
@@ -1537,6 +1537,7 @@ repeat: | |||
1537 | 1537 | ||
1538 | return 0; | 1538 | return 0; |
1539 | } | 1539 | } |
1540 | EXPORT_SYMBOL(filemap_populate); | ||
1540 | 1541 | ||
1541 | struct vm_operations_struct generic_file_vm_ops = { | 1542 | struct vm_operations_struct generic_file_vm_ops = { |
1542 | .nopage = filemap_nopage, | 1543 | .nopage = filemap_nopage, |
@@ -1555,7 +1556,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) | |||
1555 | vma->vm_ops = &generic_file_vm_ops; | 1556 | vma->vm_ops = &generic_file_vm_ops; |
1556 | return 0; | 1557 | return 0; |
1557 | } | 1558 | } |
1558 | EXPORT_SYMBOL(filemap_populate); | ||
1559 | 1559 | ||
1560 | /* | 1560 | /* |
1561 | * This is for filesystems which do not implement ->writepage. | 1561 | * This is for filesystems which do not implement ->writepage. |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 8c199f537732..9cf687e4a29a 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -174,6 +174,8 @@ __xip_unmap (struct address_space * mapping, | |||
174 | unsigned long address; | 174 | unsigned long address; |
175 | pte_t *pte; | 175 | pte_t *pte; |
176 | pte_t pteval; | 176 | pte_t pteval; |
177 | spinlock_t *ptl; | ||
178 | struct page *page; | ||
177 | 179 | ||
178 | spin_lock(&mapping->i_mmap_lock); | 180 | spin_lock(&mapping->i_mmap_lock); |
179 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 181 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
@@ -181,19 +183,17 @@ __xip_unmap (struct address_space * mapping, | |||
181 | address = vma->vm_start + | 183 | address = vma->vm_start + |
182 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 184 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
183 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 185 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
184 | /* | 186 | page = ZERO_PAGE(address); |
185 | * We need the page_table_lock to protect us from page faults, | 187 | pte = page_check_address(page, mm, address, &ptl); |
186 | * munmap, fork, etc... | 188 | if (pte) { |
187 | */ | ||
188 | pte = page_check_address(ZERO_PAGE(address), mm, | ||
189 | address); | ||
190 | if (!IS_ERR(pte)) { | ||
191 | /* Nuke the page table entry. */ | 189 | /* Nuke the page table entry. */ |
192 | flush_cache_page(vma, address, pte_pfn(*pte)); | 190 | flush_cache_page(vma, address, pte_pfn(*pte)); |
193 | pteval = ptep_clear_flush(vma, address, pte); | 191 | pteval = ptep_clear_flush(vma, address, pte); |
192 | page_remove_rmap(page); | ||
193 | dec_mm_counter(mm, file_rss); | ||
194 | BUG_ON(pte_dirty(pteval)); | 194 | BUG_ON(pte_dirty(pteval)); |
195 | pte_unmap(pte); | 195 | pte_unmap_unlock(pte, ptl); |
196 | spin_unlock(&mm->page_table_lock); | 196 | page_cache_release(page); |
197 | } | 197 | } |
198 | } | 198 | } |
199 | spin_unlock(&mapping->i_mmap_lock); | 199 | spin_unlock(&mapping->i_mmap_lock); |
@@ -228,7 +228,7 @@ xip_file_nopage(struct vm_area_struct * area, | |||
228 | 228 | ||
229 | page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0); | 229 | page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0); |
230 | if (!IS_ERR(page)) { | 230 | if (!IS_ERR(page)) { |
231 | return page; | 231 | goto out; |
232 | } | 232 | } |
233 | if (PTR_ERR(page) != -ENODATA) | 233 | if (PTR_ERR(page) != -ENODATA) |
234 | return NULL; | 234 | return NULL; |
@@ -249,6 +249,8 @@ xip_file_nopage(struct vm_area_struct * area, | |||
249 | page = ZERO_PAGE(address); | 249 | page = ZERO_PAGE(address); |
250 | } | 250 | } |
251 | 251 | ||
252 | out: | ||
253 | page_cache_get(page); | ||
252 | return page; | 254 | return page; |
253 | } | 255 | } |
254 | 256 | ||
diff --git a/mm/fremap.c b/mm/fremap.c index ab23a0673c35..d862be3bc3e3 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -20,33 +20,32 @@ | |||
20 | #include <asm/cacheflush.h> | 20 | #include <asm/cacheflush.h> |
21 | #include <asm/tlbflush.h> | 21 | #include <asm/tlbflush.h> |
22 | 22 | ||
23 | static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | 23 | static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, |
24 | unsigned long addr, pte_t *ptep) | 24 | unsigned long addr, pte_t *ptep) |
25 | { | 25 | { |
26 | pte_t pte = *ptep; | 26 | pte_t pte = *ptep; |
27 | struct page *page = NULL; | ||
27 | 28 | ||
28 | if (pte_none(pte)) | ||
29 | return; | ||
30 | if (pte_present(pte)) { | 29 | if (pte_present(pte)) { |
31 | unsigned long pfn = pte_pfn(pte); | 30 | unsigned long pfn = pte_pfn(pte); |
32 | |||
33 | flush_cache_page(vma, addr, pfn); | 31 | flush_cache_page(vma, addr, pfn); |
34 | pte = ptep_clear_flush(vma, addr, ptep); | 32 | pte = ptep_clear_flush(vma, addr, ptep); |
35 | if (pfn_valid(pfn)) { | 33 | if (unlikely(!pfn_valid(pfn))) { |
36 | struct page *page = pfn_to_page(pfn); | 34 | print_bad_pte(vma, pte, addr); |
37 | if (!PageReserved(page)) { | 35 | goto out; |
38 | if (pte_dirty(pte)) | ||
39 | set_page_dirty(page); | ||
40 | page_remove_rmap(page); | ||
41 | page_cache_release(page); | ||
42 | dec_mm_counter(mm, rss); | ||
43 | } | ||
44 | } | 36 | } |
37 | page = pfn_to_page(pfn); | ||
38 | if (pte_dirty(pte)) | ||
39 | set_page_dirty(page); | ||
40 | page_remove_rmap(page); | ||
41 | page_cache_release(page); | ||
45 | } else { | 42 | } else { |
46 | if (!pte_file(pte)) | 43 | if (!pte_file(pte)) |
47 | free_swap_and_cache(pte_to_swp_entry(pte)); | 44 | free_swap_and_cache(pte_to_swp_entry(pte)); |
48 | pte_clear(mm, addr, ptep); | 45 | pte_clear(mm, addr, ptep); |
49 | } | 46 | } |
47 | out: | ||
48 | return !!page; | ||
50 | } | 49 | } |
51 | 50 | ||
52 | /* | 51 | /* |
@@ -64,21 +63,20 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
64 | pud_t *pud; | 63 | pud_t *pud; |
65 | pgd_t *pgd; | 64 | pgd_t *pgd; |
66 | pte_t pte_val; | 65 | pte_t pte_val; |
66 | spinlock_t *ptl; | ||
67 | |||
68 | BUG_ON(vma->vm_flags & VM_RESERVED); | ||
67 | 69 | ||
68 | pgd = pgd_offset(mm, addr); | 70 | pgd = pgd_offset(mm, addr); |
69 | spin_lock(&mm->page_table_lock); | ||
70 | |||
71 | pud = pud_alloc(mm, pgd, addr); | 71 | pud = pud_alloc(mm, pgd, addr); |
72 | if (!pud) | 72 | if (!pud) |
73 | goto err_unlock; | 73 | goto out; |
74 | |||
75 | pmd = pmd_alloc(mm, pud, addr); | 74 | pmd = pmd_alloc(mm, pud, addr); |
76 | if (!pmd) | 75 | if (!pmd) |
77 | goto err_unlock; | 76 | goto out; |
78 | 77 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); | |
79 | pte = pte_alloc_map(mm, pmd, addr); | ||
80 | if (!pte) | 78 | if (!pte) |
81 | goto err_unlock; | 79 | goto out; |
82 | 80 | ||
83 | /* | 81 | /* |
84 | * This page may have been truncated. Tell the | 82 | * This page may have been truncated. Tell the |
@@ -88,29 +86,27 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
88 | inode = vma->vm_file->f_mapping->host; | 86 | inode = vma->vm_file->f_mapping->host; |
89 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 87 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
90 | if (!page->mapping || page->index >= size) | 88 | if (!page->mapping || page->index >= size) |
91 | goto err_unlock; | 89 | goto unlock; |
92 | err = -ENOMEM; | 90 | err = -ENOMEM; |
93 | if (page_mapcount(page) > INT_MAX/2) | 91 | if (page_mapcount(page) > INT_MAX/2) |
94 | goto err_unlock; | 92 | goto unlock; |
95 | 93 | ||
96 | zap_pte(mm, vma, addr, pte); | 94 | if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) |
95 | inc_mm_counter(mm, file_rss); | ||
97 | 96 | ||
98 | inc_mm_counter(mm,rss); | ||
99 | flush_icache_page(vma, page); | 97 | flush_icache_page(vma, page); |
100 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); | 98 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); |
101 | page_add_file_rmap(page); | 99 | page_add_file_rmap(page); |
102 | pte_val = *pte; | 100 | pte_val = *pte; |
103 | pte_unmap(pte); | ||
104 | update_mmu_cache(vma, addr, pte_val); | 101 | update_mmu_cache(vma, addr, pte_val); |
105 | |||
106 | err = 0; | 102 | err = 0; |
107 | err_unlock: | 103 | unlock: |
108 | spin_unlock(&mm->page_table_lock); | 104 | pte_unmap_unlock(pte, ptl); |
105 | out: | ||
109 | return err; | 106 | return err; |
110 | } | 107 | } |
111 | EXPORT_SYMBOL(install_page); | 108 | EXPORT_SYMBOL(install_page); |
112 | 109 | ||
113 | |||
114 | /* | 110 | /* |
115 | * Install a file pte to a given virtual memory address, release any | 111 | * Install a file pte to a given virtual memory address, release any |
116 | * previously existing mapping. | 112 | * previously existing mapping. |
@@ -124,37 +120,35 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
124 | pud_t *pud; | 120 | pud_t *pud; |
125 | pgd_t *pgd; | 121 | pgd_t *pgd; |
126 | pte_t pte_val; | 122 | pte_t pte_val; |
123 | spinlock_t *ptl; | ||
124 | |||
125 | BUG_ON(vma->vm_flags & VM_RESERVED); | ||
127 | 126 | ||
128 | pgd = pgd_offset(mm, addr); | 127 | pgd = pgd_offset(mm, addr); |
129 | spin_lock(&mm->page_table_lock); | ||
130 | |||
131 | pud = pud_alloc(mm, pgd, addr); | 128 | pud = pud_alloc(mm, pgd, addr); |
132 | if (!pud) | 129 | if (!pud) |
133 | goto err_unlock; | 130 | goto out; |
134 | |||
135 | pmd = pmd_alloc(mm, pud, addr); | 131 | pmd = pmd_alloc(mm, pud, addr); |
136 | if (!pmd) | 132 | if (!pmd) |
137 | goto err_unlock; | 133 | goto out; |
138 | 134 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); | |
139 | pte = pte_alloc_map(mm, pmd, addr); | ||
140 | if (!pte) | 135 | if (!pte) |
141 | goto err_unlock; | 136 | goto out; |
142 | 137 | ||
143 | zap_pte(mm, vma, addr, pte); | 138 | if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { |
139 | update_hiwater_rss(mm); | ||
140 | dec_mm_counter(mm, file_rss); | ||
141 | } | ||
144 | 142 | ||
145 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); | 143 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); |
146 | pte_val = *pte; | 144 | pte_val = *pte; |
147 | pte_unmap(pte); | ||
148 | update_mmu_cache(vma, addr, pte_val); | 145 | update_mmu_cache(vma, addr, pte_val); |
149 | spin_unlock(&mm->page_table_lock); | 146 | pte_unmap_unlock(pte, ptl); |
150 | return 0; | 147 | err = 0; |
151 | 148 | out: | |
152 | err_unlock: | ||
153 | spin_unlock(&mm->page_table_lock); | ||
154 | return err; | 149 | return err; |
155 | } | 150 | } |
156 | 151 | ||
157 | |||
158 | /*** | 152 | /*** |
159 | * sys_remap_file_pages - remap arbitrary pages of a shared backing store | 153 | * sys_remap_file_pages - remap arbitrary pages of a shared backing store |
160 | * file within an existing vma. | 154 | * file within an existing vma. |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 61d380678030..c9b43360fd33 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -277,19 +277,23 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
277 | unsigned long addr; | 277 | unsigned long addr; |
278 | 278 | ||
279 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { | 279 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { |
280 | src_pte = huge_pte_offset(src, addr); | ||
281 | if (!src_pte) | ||
282 | continue; | ||
280 | dst_pte = huge_pte_alloc(dst, addr); | 283 | dst_pte = huge_pte_alloc(dst, addr); |
281 | if (!dst_pte) | 284 | if (!dst_pte) |
282 | goto nomem; | 285 | goto nomem; |
286 | spin_lock(&dst->page_table_lock); | ||
283 | spin_lock(&src->page_table_lock); | 287 | spin_lock(&src->page_table_lock); |
284 | src_pte = huge_pte_offset(src, addr); | 288 | if (!pte_none(*src_pte)) { |
285 | if (src_pte && !pte_none(*src_pte)) { | ||
286 | entry = *src_pte; | 289 | entry = *src_pte; |
287 | ptepage = pte_page(entry); | 290 | ptepage = pte_page(entry); |
288 | get_page(ptepage); | 291 | get_page(ptepage); |
289 | add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); | 292 | add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE); |
290 | set_huge_pte_at(dst, addr, dst_pte, entry); | 293 | set_huge_pte_at(dst, addr, dst_pte, entry); |
291 | } | 294 | } |
292 | spin_unlock(&src->page_table_lock); | 295 | spin_unlock(&src->page_table_lock); |
296 | spin_unlock(&dst->page_table_lock); | ||
293 | } | 297 | } |
294 | return 0; | 298 | return 0; |
295 | 299 | ||
@@ -310,12 +314,14 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
310 | BUG_ON(start & ~HPAGE_MASK); | 314 | BUG_ON(start & ~HPAGE_MASK); |
311 | BUG_ON(end & ~HPAGE_MASK); | 315 | BUG_ON(end & ~HPAGE_MASK); |
312 | 316 | ||
317 | spin_lock(&mm->page_table_lock); | ||
318 | |||
319 | /* Update high watermark before we lower rss */ | ||
320 | update_hiwater_rss(mm); | ||
321 | |||
313 | for (address = start; address < end; address += HPAGE_SIZE) { | 322 | for (address = start; address < end; address += HPAGE_SIZE) { |
314 | ptep = huge_pte_offset(mm, address); | 323 | ptep = huge_pte_offset(mm, address); |
315 | if (! ptep) | 324 | if (!ptep) |
316 | /* This can happen on truncate, or if an | ||
317 | * mmap() is aborted due to an error before | ||
318 | * the prefault */ | ||
319 | continue; | 325 | continue; |
320 | 326 | ||
321 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 327 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
@@ -324,96 +330,99 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
324 | 330 | ||
325 | page = pte_page(pte); | 331 | page = pte_page(pte); |
326 | put_page(page); | 332 | put_page(page); |
327 | add_mm_counter(mm, rss, - (HPAGE_SIZE / PAGE_SIZE)); | 333 | add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE)); |
328 | } | 334 | } |
329 | flush_tlb_range(vma, start, end); | ||
330 | } | ||
331 | |||
332 | void zap_hugepage_range(struct vm_area_struct *vma, | ||
333 | unsigned long start, unsigned long length) | ||
334 | { | ||
335 | struct mm_struct *mm = vma->vm_mm; | ||
336 | 335 | ||
337 | spin_lock(&mm->page_table_lock); | ||
338 | unmap_hugepage_range(vma, start, start + length); | ||
339 | spin_unlock(&mm->page_table_lock); | 336 | spin_unlock(&mm->page_table_lock); |
337 | flush_tlb_range(vma, start, end); | ||
340 | } | 338 | } |
341 | 339 | ||
342 | int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) | 340 | static struct page *find_lock_huge_page(struct address_space *mapping, |
341 | unsigned long idx) | ||
343 | { | 342 | { |
344 | struct mm_struct *mm = current->mm; | 343 | struct page *page; |
345 | unsigned long addr; | 344 | int err; |
346 | int ret = 0; | 345 | struct inode *inode = mapping->host; |
347 | 346 | unsigned long size; | |
348 | WARN_ON(!is_vm_hugetlb_page(vma)); | 347 | |
349 | BUG_ON(vma->vm_start & ~HPAGE_MASK); | 348 | retry: |
350 | BUG_ON(vma->vm_end & ~HPAGE_MASK); | 349 | page = find_lock_page(mapping, idx); |
351 | 350 | if (page) | |
352 | hugetlb_prefault_arch_hook(mm); | 351 | goto out; |
353 | 352 | ||
354 | spin_lock(&mm->page_table_lock); | 353 | /* Check to make sure the mapping hasn't been truncated */ |
355 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { | 354 | size = i_size_read(inode) >> HPAGE_SHIFT; |
356 | unsigned long idx; | 355 | if (idx >= size) |
357 | pte_t *pte = huge_pte_alloc(mm, addr); | 356 | goto out; |
358 | struct page *page; | 357 | |
359 | 358 | if (hugetlb_get_quota(mapping)) | |
360 | if (!pte) { | 359 | goto out; |
361 | ret = -ENOMEM; | 360 | page = alloc_huge_page(); |
362 | goto out; | 361 | if (!page) { |
363 | } | 362 | hugetlb_put_quota(mapping); |
363 | goto out; | ||
364 | } | ||
364 | 365 | ||
365 | idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) | 366 | err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); |
366 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | 367 | if (err) { |
367 | page = find_get_page(mapping, idx); | 368 | put_page(page); |
368 | if (!page) { | 369 | hugetlb_put_quota(mapping); |
369 | /* charge the fs quota first */ | 370 | if (err == -EEXIST) |
370 | if (hugetlb_get_quota(mapping)) { | 371 | goto retry; |
371 | ret = -ENOMEM; | 372 | page = NULL; |
372 | goto out; | ||
373 | } | ||
374 | page = alloc_huge_page(); | ||
375 | if (!page) { | ||
376 | hugetlb_put_quota(mapping); | ||
377 | ret = -ENOMEM; | ||
378 | goto out; | ||
379 | } | ||
380 | ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); | ||
381 | if (! ret) { | ||
382 | unlock_page(page); | ||
383 | } else { | ||
384 | hugetlb_put_quota(mapping); | ||
385 | free_huge_page(page); | ||
386 | goto out; | ||
387 | } | ||
388 | } | ||
389 | add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); | ||
390 | set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page)); | ||
391 | } | 373 | } |
392 | out: | 374 | out: |
393 | spin_unlock(&mm->page_table_lock); | 375 | return page; |
394 | return ret; | ||
395 | } | 376 | } |
396 | 377 | ||
397 | /* | ||
398 | * On ia64 at least, it is possible to receive a hugetlb fault from a | ||
399 | * stale zero entry left in the TLB from earlier hardware prefetching. | ||
400 | * Low-level arch code should already have flushed the stale entry as | ||
401 | * part of its fault handling, but we do need to accept this minor fault | ||
402 | * and return successfully. Whereas the "normal" case is that this is | ||
403 | * an access to a hugetlb page which has been truncated off since mmap. | ||
404 | */ | ||
405 | int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 378 | int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
406 | unsigned long address, int write_access) | 379 | unsigned long address, int write_access) |
407 | { | 380 | { |
408 | int ret = VM_FAULT_SIGBUS; | 381 | int ret = VM_FAULT_SIGBUS; |
382 | unsigned long idx; | ||
383 | unsigned long size; | ||
409 | pte_t *pte; | 384 | pte_t *pte; |
385 | struct page *page; | ||
386 | struct address_space *mapping; | ||
387 | |||
388 | pte = huge_pte_alloc(mm, address); | ||
389 | if (!pte) | ||
390 | goto out; | ||
391 | |||
392 | mapping = vma->vm_file->f_mapping; | ||
393 | idx = ((address - vma->vm_start) >> HPAGE_SHIFT) | ||
394 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | ||
395 | |||
396 | /* | ||
397 | * Use page lock to guard against racing truncation | ||
398 | * before we get page_table_lock. | ||
399 | */ | ||
400 | page = find_lock_huge_page(mapping, idx); | ||
401 | if (!page) | ||
402 | goto out; | ||
410 | 403 | ||
411 | spin_lock(&mm->page_table_lock); | 404 | spin_lock(&mm->page_table_lock); |
412 | pte = huge_pte_offset(mm, address); | 405 | size = i_size_read(mapping->host) >> HPAGE_SHIFT; |
413 | if (pte && !pte_none(*pte)) | 406 | if (idx >= size) |
414 | ret = VM_FAULT_MINOR; | 407 | goto backout; |
408 | |||
409 | ret = VM_FAULT_MINOR; | ||
410 | if (!pte_none(*pte)) | ||
411 | goto backout; | ||
412 | |||
413 | add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); | ||
414 | set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page)); | ||
415 | spin_unlock(&mm->page_table_lock); | 415 | spin_unlock(&mm->page_table_lock); |
416 | unlock_page(page); | ||
417 | out: | ||
416 | return ret; | 418 | return ret; |
419 | |||
420 | backout: | ||
421 | spin_unlock(&mm->page_table_lock); | ||
422 | hugetlb_put_quota(mapping); | ||
423 | unlock_page(page); | ||
424 | put_page(page); | ||
425 | goto out; | ||
417 | } | 426 | } |
418 | 427 | ||
419 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | 428 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, |
@@ -423,34 +432,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
423 | unsigned long vpfn, vaddr = *position; | 432 | unsigned long vpfn, vaddr = *position; |
424 | int remainder = *length; | 433 | int remainder = *length; |
425 | 434 | ||
426 | BUG_ON(!is_vm_hugetlb_page(vma)); | ||
427 | |||
428 | vpfn = vaddr/PAGE_SIZE; | 435 | vpfn = vaddr/PAGE_SIZE; |
429 | spin_lock(&mm->page_table_lock); | 436 | spin_lock(&mm->page_table_lock); |
430 | while (vaddr < vma->vm_end && remainder) { | 437 | while (vaddr < vma->vm_end && remainder) { |
438 | pte_t *pte; | ||
439 | struct page *page; | ||
431 | 440 | ||
432 | if (pages) { | 441 | /* |
433 | pte_t *pte; | 442 | * Some archs (sparc64, sh*) have multiple pte_ts to |
434 | struct page *page; | 443 | * each hugepage. We have to make sure we get the |
435 | 444 | * first, for the page indexing below to work. | |
436 | /* Some archs (sparc64, sh*) have multiple | 445 | */ |
437 | * pte_ts to each hugepage. We have to make | 446 | pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); |
438 | * sure we get the first, for the page | ||
439 | * indexing below to work. */ | ||
440 | pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); | ||
441 | |||
442 | /* the hugetlb file might have been truncated */ | ||
443 | if (!pte || pte_none(*pte)) { | ||
444 | remainder = 0; | ||
445 | if (!i) | ||
446 | i = -EFAULT; | ||
447 | break; | ||
448 | } | ||
449 | 447 | ||
450 | page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; | 448 | if (!pte || pte_none(*pte)) { |
449 | int ret; | ||
451 | 450 | ||
452 | WARN_ON(!PageCompound(page)); | 451 | spin_unlock(&mm->page_table_lock); |
452 | ret = hugetlb_fault(mm, vma, vaddr, 0); | ||
453 | spin_lock(&mm->page_table_lock); | ||
454 | if (ret == VM_FAULT_MINOR) | ||
455 | continue; | ||
456 | |||
457 | remainder = 0; | ||
458 | if (!i) | ||
459 | i = -EFAULT; | ||
460 | break; | ||
461 | } | ||
453 | 462 | ||
463 | if (pages) { | ||
464 | page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; | ||
454 | get_page(page); | 465 | get_page(page); |
455 | pages[i] = page; | 466 | pages[i] = page; |
456 | } | 467 | } |
diff --git a/mm/madvise.c b/mm/madvise.c index 20e075d1c64c..17aaf3e16449 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma, | |||
126 | unsigned long start, unsigned long end) | 126 | unsigned long start, unsigned long end) |
127 | { | 127 | { |
128 | *prev = vma; | 128 | *prev = vma; |
129 | if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) | 129 | if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED)) |
130 | return -EINVAL; | 130 | return -EINVAL; |
131 | 131 | ||
132 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) { | 132 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) { |
diff --git a/mm/memory.c b/mm/memory.c index 1db40e935e55..0f60baf6f69b 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) | |||
114 | { | 114 | { |
115 | struct page *page = pmd_page(*pmd); | 115 | struct page *page = pmd_page(*pmd); |
116 | pmd_clear(pmd); | 116 | pmd_clear(pmd); |
117 | pte_lock_deinit(page); | ||
117 | pte_free_tlb(tlb, page); | 118 | pte_free_tlb(tlb, page); |
118 | dec_page_state(nr_page_table_pages); | 119 | dec_page_state(nr_page_table_pages); |
119 | tlb->mm->nr_ptes--; | 120 | tlb->mm->nr_ptes--; |
@@ -249,7 +250,7 @@ void free_pgd_range(struct mmu_gather **tlb, | |||
249 | free_pud_range(*tlb, pgd, addr, next, floor, ceiling); | 250 | free_pud_range(*tlb, pgd, addr, next, floor, ceiling); |
250 | } while (pgd++, addr = next, addr != end); | 251 | } while (pgd++, addr = next, addr != end); |
251 | 252 | ||
252 | if (!tlb_is_full_mm(*tlb)) | 253 | if (!(*tlb)->fullmm) |
253 | flush_tlb_pgtables((*tlb)->mm, start, end); | 254 | flush_tlb_pgtables((*tlb)->mm, start, end); |
254 | } | 255 | } |
255 | 256 | ||
@@ -260,6 +261,12 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | |||
260 | struct vm_area_struct *next = vma->vm_next; | 261 | struct vm_area_struct *next = vma->vm_next; |
261 | unsigned long addr = vma->vm_start; | 262 | unsigned long addr = vma->vm_start; |
262 | 263 | ||
264 | /* | ||
265 | * Hide vma from rmap and vmtruncate before freeing pgtables | ||
266 | */ | ||
267 | anon_vma_unlink(vma); | ||
268 | unlink_file_vma(vma); | ||
269 | |||
263 | if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { | 270 | if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { |
264 | hugetlb_free_pgd_range(tlb, addr, vma->vm_end, | 271 | hugetlb_free_pgd_range(tlb, addr, vma->vm_end, |
265 | floor, next? next->vm_start: ceiling); | 272 | floor, next? next->vm_start: ceiling); |
@@ -272,6 +279,8 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | |||
272 | HPAGE_SIZE)) { | 279 | HPAGE_SIZE)) { |
273 | vma = next; | 280 | vma = next; |
274 | next = vma->vm_next; | 281 | next = vma->vm_next; |
282 | anon_vma_unlink(vma); | ||
283 | unlink_file_vma(vma); | ||
275 | } | 284 | } |
276 | free_pgd_range(tlb, addr, vma->vm_end, | 285 | free_pgd_range(tlb, addr, vma->vm_end, |
277 | floor, next? next->vm_start: ceiling); | 286 | floor, next? next->vm_start: ceiling); |
@@ -280,72 +289,78 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | |||
280 | } | 289 | } |
281 | } | 290 | } |
282 | 291 | ||
283 | pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, | 292 | int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) |
284 | unsigned long address) | ||
285 | { | 293 | { |
286 | if (!pmd_present(*pmd)) { | 294 | struct page *new = pte_alloc_one(mm, address); |
287 | struct page *new; | 295 | if (!new) |
288 | 296 | return -ENOMEM; | |
289 | spin_unlock(&mm->page_table_lock); | 297 | |
290 | new = pte_alloc_one(mm, address); | 298 | pte_lock_init(new); |
291 | spin_lock(&mm->page_table_lock); | 299 | spin_lock(&mm->page_table_lock); |
292 | if (!new) | 300 | if (pmd_present(*pmd)) { /* Another has populated it */ |
293 | return NULL; | 301 | pte_lock_deinit(new); |
294 | /* | 302 | pte_free(new); |
295 | * Because we dropped the lock, we should re-check the | 303 | } else { |
296 | * entry, as somebody else could have populated it.. | ||
297 | */ | ||
298 | if (pmd_present(*pmd)) { | ||
299 | pte_free(new); | ||
300 | goto out; | ||
301 | } | ||
302 | mm->nr_ptes++; | 304 | mm->nr_ptes++; |
303 | inc_page_state(nr_page_table_pages); | 305 | inc_page_state(nr_page_table_pages); |
304 | pmd_populate(mm, pmd, new); | 306 | pmd_populate(mm, pmd, new); |
305 | } | 307 | } |
306 | out: | 308 | spin_unlock(&mm->page_table_lock); |
307 | return pte_offset_map(pmd, address); | 309 | return 0; |
308 | } | 310 | } |
309 | 311 | ||
310 | pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | 312 | int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) |
311 | { | 313 | { |
312 | if (!pmd_present(*pmd)) { | 314 | pte_t *new = pte_alloc_one_kernel(&init_mm, address); |
313 | pte_t *new; | 315 | if (!new) |
316 | return -ENOMEM; | ||
314 | 317 | ||
315 | spin_unlock(&mm->page_table_lock); | 318 | spin_lock(&init_mm.page_table_lock); |
316 | new = pte_alloc_one_kernel(mm, address); | 319 | if (pmd_present(*pmd)) /* Another has populated it */ |
317 | spin_lock(&mm->page_table_lock); | 320 | pte_free_kernel(new); |
318 | if (!new) | 321 | else |
319 | return NULL; | 322 | pmd_populate_kernel(&init_mm, pmd, new); |
323 | spin_unlock(&init_mm.page_table_lock); | ||
324 | return 0; | ||
325 | } | ||
320 | 326 | ||
321 | /* | 327 | static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) |
322 | * Because we dropped the lock, we should re-check the | 328 | { |
323 | * entry, as somebody else could have populated it.. | 329 | if (file_rss) |
324 | */ | 330 | add_mm_counter(mm, file_rss, file_rss); |
325 | if (pmd_present(*pmd)) { | 331 | if (anon_rss) |
326 | pte_free_kernel(new); | 332 | add_mm_counter(mm, anon_rss, anon_rss); |
327 | goto out; | 333 | } |
328 | } | 334 | |
329 | pmd_populate_kernel(mm, pmd, new); | 335 | /* |
330 | } | 336 | * This function is called to print an error when a pte in a |
331 | out: | 337 | * !VM_RESERVED region is found pointing to an invalid pfn (which |
332 | return pte_offset_kernel(pmd, address); | 338 | * is an error). |
339 | * | ||
340 | * The calling function must still handle the error. | ||
341 | */ | ||
342 | void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) | ||
343 | { | ||
344 | printk(KERN_ERR "Bad pte = %08llx, process = %s, " | ||
345 | "vm_flags = %lx, vaddr = %lx\n", | ||
346 | (long long)pte_val(pte), | ||
347 | (vma->vm_mm == current->mm ? current->comm : "???"), | ||
348 | vma->vm_flags, vaddr); | ||
349 | dump_stack(); | ||
333 | } | 350 | } |
334 | 351 | ||
335 | /* | 352 | /* |
336 | * copy one vm_area from one task to the other. Assumes the page tables | 353 | * copy one vm_area from one task to the other. Assumes the page tables |
337 | * already present in the new task to be cleared in the whole range | 354 | * already present in the new task to be cleared in the whole range |
338 | * covered by this vma. | 355 | * covered by this vma. |
339 | * | ||
340 | * dst->page_table_lock is held on entry and exit, | ||
341 | * but may be dropped within p[mg]d_alloc() and pte_alloc_map(). | ||
342 | */ | 356 | */ |
343 | 357 | ||
344 | static inline void | 358 | static inline void |
345 | copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 359 | copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
346 | pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, | 360 | pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, |
347 | unsigned long addr) | 361 | unsigned long addr, int *rss) |
348 | { | 362 | { |
363 | unsigned long vm_flags = vma->vm_flags; | ||
349 | pte_t pte = *src_pte; | 364 | pte_t pte = *src_pte; |
350 | struct page *page; | 365 | struct page *page; |
351 | unsigned long pfn; | 366 | unsigned long pfn; |
@@ -357,29 +372,32 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
357 | /* make sure dst_mm is on swapoff's mmlist. */ | 372 | /* make sure dst_mm is on swapoff's mmlist. */ |
358 | if (unlikely(list_empty(&dst_mm->mmlist))) { | 373 | if (unlikely(list_empty(&dst_mm->mmlist))) { |
359 | spin_lock(&mmlist_lock); | 374 | spin_lock(&mmlist_lock); |
360 | list_add(&dst_mm->mmlist, &src_mm->mmlist); | 375 | if (list_empty(&dst_mm->mmlist)) |
376 | list_add(&dst_mm->mmlist, | ||
377 | &src_mm->mmlist); | ||
361 | spin_unlock(&mmlist_lock); | 378 | spin_unlock(&mmlist_lock); |
362 | } | 379 | } |
363 | } | 380 | } |
364 | set_pte_at(dst_mm, addr, dst_pte, pte); | 381 | goto out_set_pte; |
365 | return; | ||
366 | } | 382 | } |
367 | 383 | ||
368 | pfn = pte_pfn(pte); | 384 | /* If the region is VM_RESERVED, the mapping is not |
369 | /* the pte points outside of valid memory, the | 385 | * mapped via rmap - duplicate the pte as is. |
370 | * mapping is assumed to be good, meaningful | ||
371 | * and not mapped via rmap - duplicate the | ||
372 | * mapping as is. | ||
373 | */ | 386 | */ |
374 | page = NULL; | 387 | if (vm_flags & VM_RESERVED) |
375 | if (pfn_valid(pfn)) | 388 | goto out_set_pte; |
376 | page = pfn_to_page(pfn); | ||
377 | 389 | ||
378 | if (!page || PageReserved(page)) { | 390 | pfn = pte_pfn(pte); |
379 | set_pte_at(dst_mm, addr, dst_pte, pte); | 391 | /* If the pte points outside of valid memory but |
380 | return; | 392 | * the region is not VM_RESERVED, we have a problem. |
393 | */ | ||
394 | if (unlikely(!pfn_valid(pfn))) { | ||
395 | print_bad_pte(vma, pte, addr); | ||
396 | goto out_set_pte; /* try to do something sane */ | ||
381 | } | 397 | } |
382 | 398 | ||
399 | page = pfn_to_page(pfn); | ||
400 | |||
383 | /* | 401 | /* |
384 | * If it's a COW mapping, write protect it both | 402 | * If it's a COW mapping, write protect it both |
385 | * in the parent and the child | 403 | * in the parent and the child |
@@ -397,11 +415,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
397 | pte = pte_mkclean(pte); | 415 | pte = pte_mkclean(pte); |
398 | pte = pte_mkold(pte); | 416 | pte = pte_mkold(pte); |
399 | get_page(page); | 417 | get_page(page); |
400 | inc_mm_counter(dst_mm, rss); | ||
401 | if (PageAnon(page)) | ||
402 | inc_mm_counter(dst_mm, anon_rss); | ||
403 | set_pte_at(dst_mm, addr, dst_pte, pte); | ||
404 | page_dup_rmap(page); | 418 | page_dup_rmap(page); |
419 | rss[!!PageAnon(page)]++; | ||
420 | |||
421 | out_set_pte: | ||
422 | set_pte_at(dst_mm, addr, dst_pte, pte); | ||
405 | } | 423 | } |
406 | 424 | ||
407 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 425 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
@@ -409,38 +427,44 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
409 | unsigned long addr, unsigned long end) | 427 | unsigned long addr, unsigned long end) |
410 | { | 428 | { |
411 | pte_t *src_pte, *dst_pte; | 429 | pte_t *src_pte, *dst_pte; |
412 | unsigned long vm_flags = vma->vm_flags; | 430 | spinlock_t *src_ptl, *dst_ptl; |
413 | int progress; | 431 | int progress = 0; |
432 | int rss[2]; | ||
414 | 433 | ||
415 | again: | 434 | again: |
416 | dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); | 435 | rss[1] = rss[0] = 0; |
436 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); | ||
417 | if (!dst_pte) | 437 | if (!dst_pte) |
418 | return -ENOMEM; | 438 | return -ENOMEM; |
419 | src_pte = pte_offset_map_nested(src_pmd, addr); | 439 | src_pte = pte_offset_map_nested(src_pmd, addr); |
440 | src_ptl = pte_lockptr(src_mm, src_pmd); | ||
441 | spin_lock(src_ptl); | ||
420 | 442 | ||
421 | progress = 0; | ||
422 | spin_lock(&src_mm->page_table_lock); | ||
423 | do { | 443 | do { |
424 | /* | 444 | /* |
425 | * We are holding two locks at this point - either of them | 445 | * We are holding two locks at this point - either of them |
426 | * could generate latencies in another task on another CPU. | 446 | * could generate latencies in another task on another CPU. |
427 | */ | 447 | */ |
428 | if (progress >= 32 && (need_resched() || | 448 | if (progress >= 32) { |
429 | need_lockbreak(&src_mm->page_table_lock) || | 449 | progress = 0; |
430 | need_lockbreak(&dst_mm->page_table_lock))) | 450 | if (need_resched() || |
431 | break; | 451 | need_lockbreak(src_ptl) || |
452 | need_lockbreak(dst_ptl)) | ||
453 | break; | ||
454 | } | ||
432 | if (pte_none(*src_pte)) { | 455 | if (pte_none(*src_pte)) { |
433 | progress++; | 456 | progress++; |
434 | continue; | 457 | continue; |
435 | } | 458 | } |
436 | copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr); | 459 | copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); |
437 | progress += 8; | 460 | progress += 8; |
438 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); | 461 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); |
439 | spin_unlock(&src_mm->page_table_lock); | ||
440 | 462 | ||
463 | spin_unlock(src_ptl); | ||
441 | pte_unmap_nested(src_pte - 1); | 464 | pte_unmap_nested(src_pte - 1); |
442 | pte_unmap(dst_pte - 1); | 465 | add_mm_rss(dst_mm, rss[0], rss[1]); |
443 | cond_resched_lock(&dst_mm->page_table_lock); | 466 | pte_unmap_unlock(dst_pte - 1, dst_ptl); |
467 | cond_resched(); | ||
444 | if (addr != end) | 468 | if (addr != end) |
445 | goto again; | 469 | goto again; |
446 | return 0; | 470 | return 0; |
@@ -525,24 +549,30 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
525 | return 0; | 549 | return 0; |
526 | } | 550 | } |
527 | 551 | ||
528 | static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, | 552 | static void zap_pte_range(struct mmu_gather *tlb, |
553 | struct vm_area_struct *vma, pmd_t *pmd, | ||
529 | unsigned long addr, unsigned long end, | 554 | unsigned long addr, unsigned long end, |
530 | struct zap_details *details) | 555 | struct zap_details *details) |
531 | { | 556 | { |
557 | struct mm_struct *mm = tlb->mm; | ||
532 | pte_t *pte; | 558 | pte_t *pte; |
559 | spinlock_t *ptl; | ||
560 | int file_rss = 0; | ||
561 | int anon_rss = 0; | ||
533 | 562 | ||
534 | pte = pte_offset_map(pmd, addr); | 563 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
535 | do { | 564 | do { |
536 | pte_t ptent = *pte; | 565 | pte_t ptent = *pte; |
537 | if (pte_none(ptent)) | 566 | if (pte_none(ptent)) |
538 | continue; | 567 | continue; |
539 | if (pte_present(ptent)) { | 568 | if (pte_present(ptent)) { |
540 | struct page *page = NULL; | 569 | struct page *page = NULL; |
541 | unsigned long pfn = pte_pfn(ptent); | 570 | if (!(vma->vm_flags & VM_RESERVED)) { |
542 | if (pfn_valid(pfn)) { | 571 | unsigned long pfn = pte_pfn(ptent); |
543 | page = pfn_to_page(pfn); | 572 | if (unlikely(!pfn_valid(pfn))) |
544 | if (PageReserved(page)) | 573 | print_bad_pte(vma, ptent, addr); |
545 | page = NULL; | 574 | else |
575 | page = pfn_to_page(pfn); | ||
546 | } | 576 | } |
547 | if (unlikely(details) && page) { | 577 | if (unlikely(details) && page) { |
548 | /* | 578 | /* |
@@ -562,7 +592,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, | |||
562 | page->index > details->last_index)) | 592 | page->index > details->last_index)) |
563 | continue; | 593 | continue; |
564 | } | 594 | } |
565 | ptent = ptep_get_and_clear_full(tlb->mm, addr, pte, | 595 | ptent = ptep_get_and_clear_full(mm, addr, pte, |
566 | tlb->fullmm); | 596 | tlb->fullmm); |
567 | tlb_remove_tlb_entry(tlb, pte, addr); | 597 | tlb_remove_tlb_entry(tlb, pte, addr); |
568 | if (unlikely(!page)) | 598 | if (unlikely(!page)) |
@@ -570,15 +600,17 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, | |||
570 | if (unlikely(details) && details->nonlinear_vma | 600 | if (unlikely(details) && details->nonlinear_vma |
571 | && linear_page_index(details->nonlinear_vma, | 601 | && linear_page_index(details->nonlinear_vma, |
572 | addr) != page->index) | 602 | addr) != page->index) |
573 | set_pte_at(tlb->mm, addr, pte, | 603 | set_pte_at(mm, addr, pte, |
574 | pgoff_to_pte(page->index)); | 604 | pgoff_to_pte(page->index)); |
575 | if (pte_dirty(ptent)) | ||
576 | set_page_dirty(page); | ||
577 | if (PageAnon(page)) | 605 | if (PageAnon(page)) |
578 | dec_mm_counter(tlb->mm, anon_rss); | 606 | anon_rss--; |
579 | else if (pte_young(ptent)) | 607 | else { |
580 | mark_page_accessed(page); | 608 | if (pte_dirty(ptent)) |
581 | tlb->freed++; | 609 | set_page_dirty(page); |
610 | if (pte_young(ptent)) | ||
611 | mark_page_accessed(page); | ||
612 | file_rss--; | ||
613 | } | ||
582 | page_remove_rmap(page); | 614 | page_remove_rmap(page); |
583 | tlb_remove_page(tlb, page); | 615 | tlb_remove_page(tlb, page); |
584 | continue; | 616 | continue; |
@@ -591,12 +623,15 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, | |||
591 | continue; | 623 | continue; |
592 | if (!pte_file(ptent)) | 624 | if (!pte_file(ptent)) |
593 | free_swap_and_cache(pte_to_swp_entry(ptent)); | 625 | free_swap_and_cache(pte_to_swp_entry(ptent)); |
594 | pte_clear_full(tlb->mm, addr, pte, tlb->fullmm); | 626 | pte_clear_full(mm, addr, pte, tlb->fullmm); |
595 | } while (pte++, addr += PAGE_SIZE, addr != end); | 627 | } while (pte++, addr += PAGE_SIZE, addr != end); |
596 | pte_unmap(pte - 1); | 628 | |
629 | add_mm_rss(mm, file_rss, anon_rss); | ||
630 | pte_unmap_unlock(pte - 1, ptl); | ||
597 | } | 631 | } |
598 | 632 | ||
599 | static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, | 633 | static inline void zap_pmd_range(struct mmu_gather *tlb, |
634 | struct vm_area_struct *vma, pud_t *pud, | ||
600 | unsigned long addr, unsigned long end, | 635 | unsigned long addr, unsigned long end, |
601 | struct zap_details *details) | 636 | struct zap_details *details) |
602 | { | 637 | { |
@@ -608,11 +643,12 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, | |||
608 | next = pmd_addr_end(addr, end); | 643 | next = pmd_addr_end(addr, end); |
609 | if (pmd_none_or_clear_bad(pmd)) | 644 | if (pmd_none_or_clear_bad(pmd)) |
610 | continue; | 645 | continue; |
611 | zap_pte_range(tlb, pmd, addr, next, details); | 646 | zap_pte_range(tlb, vma, pmd, addr, next, details); |
612 | } while (pmd++, addr = next, addr != end); | 647 | } while (pmd++, addr = next, addr != end); |
613 | } | 648 | } |
614 | 649 | ||
615 | static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | 650 | static inline void zap_pud_range(struct mmu_gather *tlb, |
651 | struct vm_area_struct *vma, pgd_t *pgd, | ||
616 | unsigned long addr, unsigned long end, | 652 | unsigned long addr, unsigned long end, |
617 | struct zap_details *details) | 653 | struct zap_details *details) |
618 | { | 654 | { |
@@ -624,7 +660,7 @@ static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | |||
624 | next = pud_addr_end(addr, end); | 660 | next = pud_addr_end(addr, end); |
625 | if (pud_none_or_clear_bad(pud)) | 661 | if (pud_none_or_clear_bad(pud)) |
626 | continue; | 662 | continue; |
627 | zap_pmd_range(tlb, pud, addr, next, details); | 663 | zap_pmd_range(tlb, vma, pud, addr, next, details); |
628 | } while (pud++, addr = next, addr != end); | 664 | } while (pud++, addr = next, addr != end); |
629 | } | 665 | } |
630 | 666 | ||
@@ -645,7 +681,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
645 | next = pgd_addr_end(addr, end); | 681 | next = pgd_addr_end(addr, end); |
646 | if (pgd_none_or_clear_bad(pgd)) | 682 | if (pgd_none_or_clear_bad(pgd)) |
647 | continue; | 683 | continue; |
648 | zap_pud_range(tlb, pgd, addr, next, details); | 684 | zap_pud_range(tlb, vma, pgd, addr, next, details); |
649 | } while (pgd++, addr = next, addr != end); | 685 | } while (pgd++, addr = next, addr != end); |
650 | tlb_end_vma(tlb, vma); | 686 | tlb_end_vma(tlb, vma); |
651 | } | 687 | } |
@@ -660,7 +696,6 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
660 | /** | 696 | /** |
661 | * unmap_vmas - unmap a range of memory covered by a list of vma's | 697 | * unmap_vmas - unmap a range of memory covered by a list of vma's |
662 | * @tlbp: address of the caller's struct mmu_gather | 698 | * @tlbp: address of the caller's struct mmu_gather |
663 | * @mm: the controlling mm_struct | ||
664 | * @vma: the starting vma | 699 | * @vma: the starting vma |
665 | * @start_addr: virtual address at which to start unmapping | 700 | * @start_addr: virtual address at which to start unmapping |
666 | * @end_addr: virtual address at which to end unmapping | 701 | * @end_addr: virtual address at which to end unmapping |
@@ -669,10 +704,10 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
669 | * | 704 | * |
670 | * Returns the end address of the unmapping (restart addr if interrupted). | 705 | * Returns the end address of the unmapping (restart addr if interrupted). |
671 | * | 706 | * |
672 | * Unmap all pages in the vma list. Called under page_table_lock. | 707 | * Unmap all pages in the vma list. |
673 | * | 708 | * |
674 | * We aim to not hold page_table_lock for too long (for scheduling latency | 709 | * We aim to not hold locks for too long (for scheduling latency reasons). |
675 | * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to | 710 | * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to |
676 | * return the ending mmu_gather to the caller. | 711 | * return the ending mmu_gather to the caller. |
677 | * | 712 | * |
678 | * Only addresses between `start' and `end' will be unmapped. | 713 | * Only addresses between `start' and `end' will be unmapped. |
@@ -684,7 +719,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
684 | * ensure that any thus-far unmapped pages are flushed before unmap_vmas() | 719 | * ensure that any thus-far unmapped pages are flushed before unmap_vmas() |
685 | * drops the lock and schedules. | 720 | * drops the lock and schedules. |
686 | */ | 721 | */ |
687 | unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, | 722 | unsigned long unmap_vmas(struct mmu_gather **tlbp, |
688 | struct vm_area_struct *vma, unsigned long start_addr, | 723 | struct vm_area_struct *vma, unsigned long start_addr, |
689 | unsigned long end_addr, unsigned long *nr_accounted, | 724 | unsigned long end_addr, unsigned long *nr_accounted, |
690 | struct zap_details *details) | 725 | struct zap_details *details) |
@@ -694,7 +729,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, | |||
694 | int tlb_start_valid = 0; | 729 | int tlb_start_valid = 0; |
695 | unsigned long start = start_addr; | 730 | unsigned long start = start_addr; |
696 | spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; | 731 | spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; |
697 | int fullmm = tlb_is_full_mm(*tlbp); | 732 | int fullmm = (*tlbp)->fullmm; |
698 | 733 | ||
699 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { | 734 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { |
700 | unsigned long end; | 735 | unsigned long end; |
@@ -734,19 +769,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, | |||
734 | tlb_finish_mmu(*tlbp, tlb_start, start); | 769 | tlb_finish_mmu(*tlbp, tlb_start, start); |
735 | 770 | ||
736 | if (need_resched() || | 771 | if (need_resched() || |
737 | need_lockbreak(&mm->page_table_lock) || | ||
738 | (i_mmap_lock && need_lockbreak(i_mmap_lock))) { | 772 | (i_mmap_lock && need_lockbreak(i_mmap_lock))) { |
739 | if (i_mmap_lock) { | 773 | if (i_mmap_lock) { |
740 | /* must reset count of rss freed */ | 774 | *tlbp = NULL; |
741 | *tlbp = tlb_gather_mmu(mm, fullmm); | ||
742 | goto out; | 775 | goto out; |
743 | } | 776 | } |
744 | spin_unlock(&mm->page_table_lock); | ||
745 | cond_resched(); | 777 | cond_resched(); |
746 | spin_lock(&mm->page_table_lock); | ||
747 | } | 778 | } |
748 | 779 | ||
749 | *tlbp = tlb_gather_mmu(mm, fullmm); | 780 | *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); |
750 | tlb_start_valid = 0; | 781 | tlb_start_valid = 0; |
751 | zap_bytes = ZAP_BLOCK_SIZE; | 782 | zap_bytes = ZAP_BLOCK_SIZE; |
752 | } | 783 | } |
@@ -770,123 +801,93 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, | |||
770 | unsigned long end = address + size; | 801 | unsigned long end = address + size; |
771 | unsigned long nr_accounted = 0; | 802 | unsigned long nr_accounted = 0; |
772 | 803 | ||
773 | if (is_vm_hugetlb_page(vma)) { | ||
774 | zap_hugepage_range(vma, address, size); | ||
775 | return end; | ||
776 | } | ||
777 | |||
778 | lru_add_drain(); | 804 | lru_add_drain(); |
779 | spin_lock(&mm->page_table_lock); | ||
780 | tlb = tlb_gather_mmu(mm, 0); | 805 | tlb = tlb_gather_mmu(mm, 0); |
781 | end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); | 806 | update_hiwater_rss(mm); |
782 | tlb_finish_mmu(tlb, address, end); | 807 | end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); |
783 | spin_unlock(&mm->page_table_lock); | 808 | if (tlb) |
809 | tlb_finish_mmu(tlb, address, end); | ||
784 | return end; | 810 | return end; |
785 | } | 811 | } |
786 | 812 | ||
787 | /* | 813 | /* |
788 | * Do a quick page-table lookup for a single page. | 814 | * Do a quick page-table lookup for a single page. |
789 | * mm->page_table_lock must be held. | ||
790 | */ | 815 | */ |
791 | static struct page *__follow_page(struct mm_struct *mm, unsigned long address, | 816 | struct page *follow_page(struct mm_struct *mm, unsigned long address, |
792 | int read, int write, int accessed) | 817 | unsigned int flags) |
793 | { | 818 | { |
794 | pgd_t *pgd; | 819 | pgd_t *pgd; |
795 | pud_t *pud; | 820 | pud_t *pud; |
796 | pmd_t *pmd; | 821 | pmd_t *pmd; |
797 | pte_t *ptep, pte; | 822 | pte_t *ptep, pte; |
823 | spinlock_t *ptl; | ||
798 | unsigned long pfn; | 824 | unsigned long pfn; |
799 | struct page *page; | 825 | struct page *page; |
800 | 826 | ||
801 | page = follow_huge_addr(mm, address, write); | 827 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); |
802 | if (! IS_ERR(page)) | 828 | if (!IS_ERR(page)) { |
803 | return page; | 829 | BUG_ON(flags & FOLL_GET); |
830 | goto out; | ||
831 | } | ||
804 | 832 | ||
833 | page = NULL; | ||
805 | pgd = pgd_offset(mm, address); | 834 | pgd = pgd_offset(mm, address); |
806 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | 835 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) |
807 | goto out; | 836 | goto no_page_table; |
808 | 837 | ||
809 | pud = pud_offset(pgd, address); | 838 | pud = pud_offset(pgd, address); |
810 | if (pud_none(*pud) || unlikely(pud_bad(*pud))) | 839 | if (pud_none(*pud) || unlikely(pud_bad(*pud))) |
811 | goto out; | 840 | goto no_page_table; |
812 | 841 | ||
813 | pmd = pmd_offset(pud, address); | 842 | pmd = pmd_offset(pud, address); |
814 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) | 843 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) |
844 | goto no_page_table; | ||
845 | |||
846 | if (pmd_huge(*pmd)) { | ||
847 | BUG_ON(flags & FOLL_GET); | ||
848 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | ||
815 | goto out; | 849 | goto out; |
816 | if (pmd_huge(*pmd)) | 850 | } |
817 | return follow_huge_pmd(mm, address, pmd, write); | ||
818 | 851 | ||
819 | ptep = pte_offset_map(pmd, address); | 852 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); |
820 | if (!ptep) | 853 | if (!ptep) |
821 | goto out; | 854 | goto out; |
822 | 855 | ||
823 | pte = *ptep; | 856 | pte = *ptep; |
824 | pte_unmap(ptep); | 857 | if (!pte_present(pte)) |
825 | if (pte_present(pte)) { | 858 | goto unlock; |
826 | if (write && !pte_write(pte)) | 859 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
827 | goto out; | 860 | goto unlock; |
828 | if (read && !pte_read(pte)) | 861 | pfn = pte_pfn(pte); |
829 | goto out; | 862 | if (!pfn_valid(pfn)) |
830 | pfn = pte_pfn(pte); | 863 | goto unlock; |
831 | if (pfn_valid(pfn)) { | 864 | |
832 | page = pfn_to_page(pfn); | 865 | page = pfn_to_page(pfn); |
833 | if (accessed) { | 866 | if (flags & FOLL_GET) |
834 | if (write && !pte_dirty(pte) &&!PageDirty(page)) | 867 | get_page(page); |
835 | set_page_dirty(page); | 868 | if (flags & FOLL_TOUCH) { |
836 | mark_page_accessed(page); | 869 | if ((flags & FOLL_WRITE) && |
837 | } | 870 | !pte_dirty(pte) && !PageDirty(page)) |
838 | return page; | 871 | set_page_dirty(page); |
839 | } | 872 | mark_page_accessed(page); |
840 | } | 873 | } |
841 | 874 | unlock: | |
875 | pte_unmap_unlock(ptep, ptl); | ||
842 | out: | 876 | out: |
843 | return NULL; | 877 | return page; |
844 | } | ||
845 | |||
846 | inline struct page * | ||
847 | follow_page(struct mm_struct *mm, unsigned long address, int write) | ||
848 | { | ||
849 | return __follow_page(mm, address, 0, write, 1); | ||
850 | } | ||
851 | |||
852 | /* | ||
853 | * check_user_page_readable() can be called from interrupt context by oprofile, | ||
854 | * so we need to avoid taking any non-irq-safe locks | ||
855 | */ | ||
856 | int check_user_page_readable(struct mm_struct *mm, unsigned long address) | ||
857 | { | ||
858 | return __follow_page(mm, address, 1, 0, 0) != NULL; | ||
859 | } | ||
860 | EXPORT_SYMBOL(check_user_page_readable); | ||
861 | |||
862 | static inline int | ||
863 | untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, | ||
864 | unsigned long address) | ||
865 | { | ||
866 | pgd_t *pgd; | ||
867 | pud_t *pud; | ||
868 | pmd_t *pmd; | ||
869 | |||
870 | /* Check if the vma is for an anonymous mapping. */ | ||
871 | if (vma->vm_ops && vma->vm_ops->nopage) | ||
872 | return 0; | ||
873 | |||
874 | /* Check if page directory entry exists. */ | ||
875 | pgd = pgd_offset(mm, address); | ||
876 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | ||
877 | return 1; | ||
878 | |||
879 | pud = pud_offset(pgd, address); | ||
880 | if (pud_none(*pud) || unlikely(pud_bad(*pud))) | ||
881 | return 1; | ||
882 | |||
883 | /* Check if page middle directory entry exists. */ | ||
884 | pmd = pmd_offset(pud, address); | ||
885 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) | ||
886 | return 1; | ||
887 | 878 | ||
888 | /* There is a pte slot for 'address' in 'mm'. */ | 879 | no_page_table: |
889 | return 0; | 880 | /* |
881 | * When core dumping an enormous anonymous area that nobody | ||
882 | * has touched so far, we don't want to allocate page tables. | ||
883 | */ | ||
884 | if (flags & FOLL_ANON) { | ||
885 | page = ZERO_PAGE(address); | ||
886 | if (flags & FOLL_GET) | ||
887 | get_page(page); | ||
888 | BUG_ON(flags & FOLL_WRITE); | ||
889 | } | ||
890 | return page; | ||
890 | } | 891 | } |
891 | 892 | ||
892 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 893 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
@@ -894,18 +895,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
894 | struct page **pages, struct vm_area_struct **vmas) | 895 | struct page **pages, struct vm_area_struct **vmas) |
895 | { | 896 | { |
896 | int i; | 897 | int i; |
897 | unsigned int flags; | 898 | unsigned int vm_flags; |
898 | 899 | ||
899 | /* | 900 | /* |
900 | * Require read or write permissions. | 901 | * Require read or write permissions. |
901 | * If 'force' is set, we only require the "MAY" flags. | 902 | * If 'force' is set, we only require the "MAY" flags. |
902 | */ | 903 | */ |
903 | flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); | 904 | vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); |
904 | flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | 905 | vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); |
905 | i = 0; | 906 | i = 0; |
906 | 907 | ||
907 | do { | 908 | do { |
908 | struct vm_area_struct * vma; | 909 | struct vm_area_struct *vma; |
910 | unsigned int foll_flags; | ||
909 | 911 | ||
910 | vma = find_extend_vma(mm, start); | 912 | vma = find_extend_vma(mm, start); |
911 | if (!vma && in_gate_area(tsk, start)) { | 913 | if (!vma && in_gate_area(tsk, start)) { |
@@ -945,8 +947,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
945 | continue; | 947 | continue; |
946 | } | 948 | } |
947 | 949 | ||
948 | if (!vma || (vma->vm_flags & VM_IO) | 950 | if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED)) |
949 | || !(flags & vma->vm_flags)) | 951 | || !(vm_flags & vma->vm_flags)) |
950 | return i ? : -EFAULT; | 952 | return i ? : -EFAULT; |
951 | 953 | ||
952 | if (is_vm_hugetlb_page(vma)) { | 954 | if (is_vm_hugetlb_page(vma)) { |
@@ -954,29 +956,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
954 | &start, &len, i); | 956 | &start, &len, i); |
955 | continue; | 957 | continue; |
956 | } | 958 | } |
957 | spin_lock(&mm->page_table_lock); | 959 | |
960 | foll_flags = FOLL_TOUCH; | ||
961 | if (pages) | ||
962 | foll_flags |= FOLL_GET; | ||
963 | if (!write && !(vma->vm_flags & VM_LOCKED) && | ||
964 | (!vma->vm_ops || !vma->vm_ops->nopage)) | ||
965 | foll_flags |= FOLL_ANON; | ||
966 | |||
958 | do { | 967 | do { |
959 | int write_access = write; | ||
960 | struct page *page; | 968 | struct page *page; |
961 | 969 | ||
962 | cond_resched_lock(&mm->page_table_lock); | 970 | if (write) |
963 | while (!(page = follow_page(mm, start, write_access))) { | 971 | foll_flags |= FOLL_WRITE; |
964 | int ret; | ||
965 | |||
966 | /* | ||
967 | * Shortcut for anonymous pages. We don't want | ||
968 | * to force the creation of page tables for | ||
969 | * insanely big anonymously mapped areas that | ||
970 | * nobody touched so far. This is important | ||
971 | * for doing a core dump for these mappings. | ||
972 | */ | ||
973 | if (!write && untouched_anonymous_page(mm,vma,start)) { | ||
974 | page = ZERO_PAGE(start); | ||
975 | break; | ||
976 | } | ||
977 | spin_unlock(&mm->page_table_lock); | ||
978 | ret = __handle_mm_fault(mm, vma, start, write_access); | ||
979 | 972 | ||
973 | cond_resched(); | ||
974 | while (!(page = follow_page(mm, start, foll_flags))) { | ||
975 | int ret; | ||
976 | ret = __handle_mm_fault(mm, vma, start, | ||
977 | foll_flags & FOLL_WRITE); | ||
980 | /* | 978 | /* |
981 | * The VM_FAULT_WRITE bit tells us that do_wp_page has | 979 | * The VM_FAULT_WRITE bit tells us that do_wp_page has |
982 | * broken COW when necessary, even if maybe_mkwrite | 980 | * broken COW when necessary, even if maybe_mkwrite |
@@ -984,7 +982,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
984 | * subsequent page lookups as if they were reads. | 982 | * subsequent page lookups as if they were reads. |
985 | */ | 983 | */ |
986 | if (ret & VM_FAULT_WRITE) | 984 | if (ret & VM_FAULT_WRITE) |
987 | write_access = 0; | 985 | foll_flags &= ~FOLL_WRITE; |
988 | 986 | ||
989 | switch (ret & ~VM_FAULT_WRITE) { | 987 | switch (ret & ~VM_FAULT_WRITE) { |
990 | case VM_FAULT_MINOR: | 988 | case VM_FAULT_MINOR: |
@@ -1000,13 +998,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1000 | default: | 998 | default: |
1001 | BUG(); | 999 | BUG(); |
1002 | } | 1000 | } |
1003 | spin_lock(&mm->page_table_lock); | ||
1004 | } | 1001 | } |
1005 | if (pages) { | 1002 | if (pages) { |
1006 | pages[i] = page; | 1003 | pages[i] = page; |
1007 | flush_dcache_page(page); | 1004 | flush_dcache_page(page); |
1008 | if (!PageReserved(page)) | ||
1009 | page_cache_get(page); | ||
1010 | } | 1005 | } |
1011 | if (vmas) | 1006 | if (vmas) |
1012 | vmas[i] = vma; | 1007 | vmas[i] = vma; |
@@ -1014,7 +1009,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1014 | start += PAGE_SIZE; | 1009 | start += PAGE_SIZE; |
1015 | len--; | 1010 | len--; |
1016 | } while (len && start < vma->vm_end); | 1011 | } while (len && start < vma->vm_end); |
1017 | spin_unlock(&mm->page_table_lock); | ||
1018 | } while (len); | 1012 | } while (len); |
1019 | return i; | 1013 | return i; |
1020 | } | 1014 | } |
@@ -1024,16 +1018,21 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
1024 | unsigned long addr, unsigned long end, pgprot_t prot) | 1018 | unsigned long addr, unsigned long end, pgprot_t prot) |
1025 | { | 1019 | { |
1026 | pte_t *pte; | 1020 | pte_t *pte; |
1021 | spinlock_t *ptl; | ||
1027 | 1022 | ||
1028 | pte = pte_alloc_map(mm, pmd, addr); | 1023 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); |
1029 | if (!pte) | 1024 | if (!pte) |
1030 | return -ENOMEM; | 1025 | return -ENOMEM; |
1031 | do { | 1026 | do { |
1032 | pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot)); | 1027 | struct page *page = ZERO_PAGE(addr); |
1028 | pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); | ||
1029 | page_cache_get(page); | ||
1030 | page_add_file_rmap(page); | ||
1031 | inc_mm_counter(mm, file_rss); | ||
1033 | BUG_ON(!pte_none(*pte)); | 1032 | BUG_ON(!pte_none(*pte)); |
1034 | set_pte_at(mm, addr, pte, zero_pte); | 1033 | set_pte_at(mm, addr, pte, zero_pte); |
1035 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1034 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1036 | pte_unmap(pte - 1); | 1035 | pte_unmap_unlock(pte - 1, ptl); |
1037 | return 0; | 1036 | return 0; |
1038 | } | 1037 | } |
1039 | 1038 | ||
@@ -1083,14 +1082,12 @@ int zeromap_page_range(struct vm_area_struct *vma, | |||
1083 | BUG_ON(addr >= end); | 1082 | BUG_ON(addr >= end); |
1084 | pgd = pgd_offset(mm, addr); | 1083 | pgd = pgd_offset(mm, addr); |
1085 | flush_cache_range(vma, addr, end); | 1084 | flush_cache_range(vma, addr, end); |
1086 | spin_lock(&mm->page_table_lock); | ||
1087 | do { | 1085 | do { |
1088 | next = pgd_addr_end(addr, end); | 1086 | next = pgd_addr_end(addr, end); |
1089 | err = zeromap_pud_range(mm, pgd, addr, next, prot); | 1087 | err = zeromap_pud_range(mm, pgd, addr, next, prot); |
1090 | if (err) | 1088 | if (err) |
1091 | break; | 1089 | break; |
1092 | } while (pgd++, addr = next, addr != end); | 1090 | } while (pgd++, addr = next, addr != end); |
1093 | spin_unlock(&mm->page_table_lock); | ||
1094 | return err; | 1091 | return err; |
1095 | } | 1092 | } |
1096 | 1093 | ||
@@ -1104,17 +1101,17 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
1104 | unsigned long pfn, pgprot_t prot) | 1101 | unsigned long pfn, pgprot_t prot) |
1105 | { | 1102 | { |
1106 | pte_t *pte; | 1103 | pte_t *pte; |
1104 | spinlock_t *ptl; | ||
1107 | 1105 | ||
1108 | pte = pte_alloc_map(mm, pmd, addr); | 1106 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); |
1109 | if (!pte) | 1107 | if (!pte) |
1110 | return -ENOMEM; | 1108 | return -ENOMEM; |
1111 | do { | 1109 | do { |
1112 | BUG_ON(!pte_none(*pte)); | 1110 | BUG_ON(!pte_none(*pte)); |
1113 | if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) | 1111 | set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); |
1114 | set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); | ||
1115 | pfn++; | 1112 | pfn++; |
1116 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1113 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1117 | pte_unmap(pte - 1); | 1114 | pte_unmap_unlock(pte - 1, ptl); |
1118 | return 0; | 1115 | return 0; |
1119 | } | 1116 | } |
1120 | 1117 | ||
@@ -1173,8 +1170,8 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
1173 | * rest of the world about it: | 1170 | * rest of the world about it: |
1174 | * VM_IO tells people not to look at these pages | 1171 | * VM_IO tells people not to look at these pages |
1175 | * (accesses can have side effects). | 1172 | * (accesses can have side effects). |
1176 | * VM_RESERVED tells swapout not to try to touch | 1173 | * VM_RESERVED tells the core MM not to "manage" these pages |
1177 | * this region. | 1174 | * (e.g. refcount, mapcount, try to swap them out). |
1178 | */ | 1175 | */ |
1179 | vma->vm_flags |= VM_IO | VM_RESERVED; | 1176 | vma->vm_flags |= VM_IO | VM_RESERVED; |
1180 | 1177 | ||
@@ -1182,7 +1179,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
1182 | pfn -= addr >> PAGE_SHIFT; | 1179 | pfn -= addr >> PAGE_SHIFT; |
1183 | pgd = pgd_offset(mm, addr); | 1180 | pgd = pgd_offset(mm, addr); |
1184 | flush_cache_range(vma, addr, end); | 1181 | flush_cache_range(vma, addr, end); |
1185 | spin_lock(&mm->page_table_lock); | ||
1186 | do { | 1182 | do { |
1187 | next = pgd_addr_end(addr, end); | 1183 | next = pgd_addr_end(addr, end); |
1188 | err = remap_pud_range(mm, pgd, addr, next, | 1184 | err = remap_pud_range(mm, pgd, addr, next, |
@@ -1190,12 +1186,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
1190 | if (err) | 1186 | if (err) |
1191 | break; | 1187 | break; |
1192 | } while (pgd++, addr = next, addr != end); | 1188 | } while (pgd++, addr = next, addr != end); |
1193 | spin_unlock(&mm->page_table_lock); | ||
1194 | return err; | 1189 | return err; |
1195 | } | 1190 | } |
1196 | EXPORT_SYMBOL(remap_pfn_range); | 1191 | EXPORT_SYMBOL(remap_pfn_range); |
1197 | 1192 | ||
1198 | /* | 1193 | /* |
1194 | * handle_pte_fault chooses page fault handler according to an entry | ||
1195 | * which was read non-atomically. Before making any commitment, on | ||
1196 | * those architectures or configurations (e.g. i386 with PAE) which | ||
1197 | * might give a mix of unmatched parts, do_swap_page and do_file_page | ||
1198 | * must check under lock before unmapping the pte and proceeding | ||
1199 | * (but do_wp_page is only called after already making such a check; | ||
1200 | * and do_anonymous_page and do_no_page can safely check later on). | ||
1201 | */ | ||
1202 | static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | ||
1203 | pte_t *page_table, pte_t orig_pte) | ||
1204 | { | ||
1205 | int same = 1; | ||
1206 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) | ||
1207 | if (sizeof(pte_t) > sizeof(unsigned long)) { | ||
1208 | spinlock_t *ptl = pte_lockptr(mm, pmd); | ||
1209 | spin_lock(ptl); | ||
1210 | same = pte_same(*page_table, orig_pte); | ||
1211 | spin_unlock(ptl); | ||
1212 | } | ||
1213 | #endif | ||
1214 | pte_unmap(page_table); | ||
1215 | return same; | ||
1216 | } | ||
1217 | |||
1218 | /* | ||
1199 | * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when | 1219 | * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when |
1200 | * servicing faults for write access. In the normal case, we do always want | 1220 | * servicing faults for write access. In the normal case, we do always want |
1201 | * pte_mkwrite. But get_user_pages can cause write faults for mappings | 1221 | * pte_mkwrite. But get_user_pages can cause write faults for mappings |
@@ -1209,28 +1229,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
1209 | } | 1229 | } |
1210 | 1230 | ||
1211 | /* | 1231 | /* |
1212 | * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock | ||
1213 | */ | ||
1214 | static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, | ||
1215 | pte_t *page_table) | ||
1216 | { | ||
1217 | pte_t entry; | ||
1218 | |||
1219 | entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)), | ||
1220 | vma); | ||
1221 | ptep_establish(vma, address, page_table, entry); | ||
1222 | update_mmu_cache(vma, address, entry); | ||
1223 | lazy_mmu_prot_update(entry); | ||
1224 | } | ||
1225 | |||
1226 | /* | ||
1227 | * This routine handles present pages, when users try to write | 1232 | * This routine handles present pages, when users try to write |
1228 | * to a shared page. It is done by copying the page to a new address | 1233 | * to a shared page. It is done by copying the page to a new address |
1229 | * and decrementing the shared-page counter for the old page. | 1234 | * and decrementing the shared-page counter for the old page. |
1230 | * | 1235 | * |
1231 | * Goto-purists beware: the only reason for goto's here is that it results | ||
1232 | * in better assembly code.. The "default" path will see no jumps at all. | ||
1233 | * | ||
1234 | * Note that this routine assumes that the protection checks have been | 1236 | * Note that this routine assumes that the protection checks have been |
1235 | * done by the caller (the low-level page fault routine in most cases). | 1237 | * done by the caller (the low-level page fault routine in most cases). |
1236 | * Thus we can safely just mark it writable once we've done any necessary | 1238 | * Thus we can safely just mark it writable once we've done any necessary |
@@ -1240,28 +1242,28 @@ static inline void break_cow(struct vm_area_struct * vma, struct page * new_page | |||
1240 | * change only once the write actually happens. This avoids a few races, | 1242 | * change only once the write actually happens. This avoids a few races, |
1241 | * and potentially makes it more efficient. | 1243 | * and potentially makes it more efficient. |
1242 | * | 1244 | * |
1243 | * We hold the mm semaphore and the page_table_lock on entry and exit | 1245 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
1244 | * with the page_table_lock released. | 1246 | * but allow concurrent faults), with pte both mapped and locked. |
1247 | * We return with mmap_sem still held, but pte unmapped and unlocked. | ||
1245 | */ | 1248 | */ |
1246 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, | 1249 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1247 | unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) | 1250 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
1251 | spinlock_t *ptl, pte_t orig_pte) | ||
1248 | { | 1252 | { |
1249 | struct page *old_page, *new_page; | 1253 | struct page *old_page, *new_page; |
1250 | unsigned long pfn = pte_pfn(pte); | 1254 | unsigned long pfn = pte_pfn(orig_pte); |
1251 | pte_t entry; | 1255 | pte_t entry; |
1252 | int ret; | 1256 | int ret = VM_FAULT_MINOR; |
1257 | |||
1258 | BUG_ON(vma->vm_flags & VM_RESERVED); | ||
1253 | 1259 | ||
1254 | if (unlikely(!pfn_valid(pfn))) { | 1260 | if (unlikely(!pfn_valid(pfn))) { |
1255 | /* | 1261 | /* |
1256 | * This should really halt the system so it can be debugged or | 1262 | * Page table corrupted: show pte and kill process. |
1257 | * at least the kernel stops what it's doing before it corrupts | ||
1258 | * data, but for the moment just pretend this is OOM. | ||
1259 | */ | 1263 | */ |
1260 | pte_unmap(page_table); | 1264 | print_bad_pte(vma, orig_pte, address); |
1261 | printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", | 1265 | ret = VM_FAULT_OOM; |
1262 | address); | 1266 | goto unlock; |
1263 | spin_unlock(&mm->page_table_lock); | ||
1264 | return VM_FAULT_OOM; | ||
1265 | } | 1267 | } |
1266 | old_page = pfn_to_page(pfn); | 1268 | old_page = pfn_to_page(pfn); |
1267 | 1269 | ||
@@ -1270,52 +1272,51 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, | |||
1270 | unlock_page(old_page); | 1272 | unlock_page(old_page); |
1271 | if (reuse) { | 1273 | if (reuse) { |
1272 | flush_cache_page(vma, address, pfn); | 1274 | flush_cache_page(vma, address, pfn); |
1273 | entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), | 1275 | entry = pte_mkyoung(orig_pte); |
1274 | vma); | 1276 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1275 | ptep_set_access_flags(vma, address, page_table, entry, 1); | 1277 | ptep_set_access_flags(vma, address, page_table, entry, 1); |
1276 | update_mmu_cache(vma, address, entry); | 1278 | update_mmu_cache(vma, address, entry); |
1277 | lazy_mmu_prot_update(entry); | 1279 | lazy_mmu_prot_update(entry); |
1278 | pte_unmap(page_table); | 1280 | ret |= VM_FAULT_WRITE; |
1279 | spin_unlock(&mm->page_table_lock); | 1281 | goto unlock; |
1280 | return VM_FAULT_MINOR|VM_FAULT_WRITE; | ||
1281 | } | 1282 | } |
1282 | } | 1283 | } |
1283 | pte_unmap(page_table); | ||
1284 | 1284 | ||
1285 | /* | 1285 | /* |
1286 | * Ok, we need to copy. Oh, well.. | 1286 | * Ok, we need to copy. Oh, well.. |
1287 | */ | 1287 | */ |
1288 | if (!PageReserved(old_page)) | 1288 | page_cache_get(old_page); |
1289 | page_cache_get(old_page); | 1289 | pte_unmap_unlock(page_table, ptl); |
1290 | spin_unlock(&mm->page_table_lock); | ||
1291 | 1290 | ||
1292 | if (unlikely(anon_vma_prepare(vma))) | 1291 | if (unlikely(anon_vma_prepare(vma))) |
1293 | goto no_new_page; | 1292 | goto oom; |
1294 | if (old_page == ZERO_PAGE(address)) { | 1293 | if (old_page == ZERO_PAGE(address)) { |
1295 | new_page = alloc_zeroed_user_highpage(vma, address); | 1294 | new_page = alloc_zeroed_user_highpage(vma, address); |
1296 | if (!new_page) | 1295 | if (!new_page) |
1297 | goto no_new_page; | 1296 | goto oom; |
1298 | } else { | 1297 | } else { |
1299 | new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); | 1298 | new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); |
1300 | if (!new_page) | 1299 | if (!new_page) |
1301 | goto no_new_page; | 1300 | goto oom; |
1302 | copy_user_highpage(new_page, old_page, address); | 1301 | copy_user_highpage(new_page, old_page, address); |
1303 | } | 1302 | } |
1303 | |||
1304 | /* | 1304 | /* |
1305 | * Re-check the pte - we dropped the lock | 1305 | * Re-check the pte - we dropped the lock |
1306 | */ | 1306 | */ |
1307 | ret = VM_FAULT_MINOR; | 1307 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
1308 | spin_lock(&mm->page_table_lock); | 1308 | if (likely(pte_same(*page_table, orig_pte))) { |
1309 | page_table = pte_offset_map(pmd, address); | 1309 | page_remove_rmap(old_page); |
1310 | if (likely(pte_same(*page_table, pte))) { | 1310 | if (!PageAnon(old_page)) { |
1311 | if (PageAnon(old_page)) | 1311 | inc_mm_counter(mm, anon_rss); |
1312 | dec_mm_counter(mm, anon_rss); | 1312 | dec_mm_counter(mm, file_rss); |
1313 | if (PageReserved(old_page)) | 1313 | } |
1314 | inc_mm_counter(mm, rss); | ||
1315 | else | ||
1316 | page_remove_rmap(old_page); | ||
1317 | flush_cache_page(vma, address, pfn); | 1314 | flush_cache_page(vma, address, pfn); |
1318 | break_cow(vma, new_page, address, page_table); | 1315 | entry = mk_pte(new_page, vma->vm_page_prot); |
1316 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
1317 | ptep_establish(vma, address, page_table, entry); | ||
1318 | update_mmu_cache(vma, address, entry); | ||
1319 | lazy_mmu_prot_update(entry); | ||
1319 | lru_cache_add_active(new_page); | 1320 | lru_cache_add_active(new_page); |
1320 | page_add_anon_rmap(new_page, vma, address); | 1321 | page_add_anon_rmap(new_page, vma, address); |
1321 | 1322 | ||
@@ -1323,13 +1324,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, | |||
1323 | new_page = old_page; | 1324 | new_page = old_page; |
1324 | ret |= VM_FAULT_WRITE; | 1325 | ret |= VM_FAULT_WRITE; |
1325 | } | 1326 | } |
1326 | pte_unmap(page_table); | ||
1327 | page_cache_release(new_page); | 1327 | page_cache_release(new_page); |
1328 | page_cache_release(old_page); | 1328 | page_cache_release(old_page); |
1329 | spin_unlock(&mm->page_table_lock); | 1329 | unlock: |
1330 | pte_unmap_unlock(page_table, ptl); | ||
1330 | return ret; | 1331 | return ret; |
1331 | 1332 | oom: | |
1332 | no_new_page: | ||
1333 | page_cache_release(old_page); | 1333 | page_cache_release(old_page); |
1334 | return VM_FAULT_OOM; | 1334 | return VM_FAULT_OOM; |
1335 | } | 1335 | } |
@@ -1399,13 +1399,6 @@ again: | |||
1399 | 1399 | ||
1400 | restart_addr = zap_page_range(vma, start_addr, | 1400 | restart_addr = zap_page_range(vma, start_addr, |
1401 | end_addr - start_addr, details); | 1401 | end_addr - start_addr, details); |
1402 | |||
1403 | /* | ||
1404 | * We cannot rely on the break test in unmap_vmas: | ||
1405 | * on the one hand, we don't want to restart our loop | ||
1406 | * just because that broke out for the page_table_lock; | ||
1407 | * on the other hand, it does no test when vma is small. | ||
1408 | */ | ||
1409 | need_break = need_resched() || | 1402 | need_break = need_resched() || |
1410 | need_lockbreak(details->i_mmap_lock); | 1403 | need_lockbreak(details->i_mmap_lock); |
1411 | 1404 | ||
@@ -1654,38 +1647,37 @@ void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struc | |||
1654 | } | 1647 | } |
1655 | 1648 | ||
1656 | /* | 1649 | /* |
1657 | * We hold the mm semaphore and the page_table_lock on entry and | 1650 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
1658 | * should release the pagetable lock on exit.. | 1651 | * but allow concurrent faults), and pte mapped but not yet locked. |
1652 | * We return with mmap_sem still held, but pte unmapped and unlocked. | ||
1659 | */ | 1653 | */ |
1660 | static int do_swap_page(struct mm_struct * mm, | 1654 | static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1661 | struct vm_area_struct * vma, unsigned long address, | 1655 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
1662 | pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) | 1656 | int write_access, pte_t orig_pte) |
1663 | { | 1657 | { |
1658 | spinlock_t *ptl; | ||
1664 | struct page *page; | 1659 | struct page *page; |
1665 | swp_entry_t entry = pte_to_swp_entry(orig_pte); | 1660 | swp_entry_t entry; |
1666 | pte_t pte; | 1661 | pte_t pte; |
1667 | int ret = VM_FAULT_MINOR; | 1662 | int ret = VM_FAULT_MINOR; |
1668 | 1663 | ||
1669 | pte_unmap(page_table); | 1664 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) |
1670 | spin_unlock(&mm->page_table_lock); | 1665 | goto out; |
1666 | |||
1667 | entry = pte_to_swp_entry(orig_pte); | ||
1671 | page = lookup_swap_cache(entry); | 1668 | page = lookup_swap_cache(entry); |
1672 | if (!page) { | 1669 | if (!page) { |
1673 | swapin_readahead(entry, address, vma); | 1670 | swapin_readahead(entry, address, vma); |
1674 | page = read_swap_cache_async(entry, vma, address); | 1671 | page = read_swap_cache_async(entry, vma, address); |
1675 | if (!page) { | 1672 | if (!page) { |
1676 | /* | 1673 | /* |
1677 | * Back out if somebody else faulted in this pte while | 1674 | * Back out if somebody else faulted in this pte |
1678 | * we released the page table lock. | 1675 | * while we released the pte lock. |
1679 | */ | 1676 | */ |
1680 | spin_lock(&mm->page_table_lock); | 1677 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
1681 | page_table = pte_offset_map(pmd, address); | ||
1682 | if (likely(pte_same(*page_table, orig_pte))) | 1678 | if (likely(pte_same(*page_table, orig_pte))) |
1683 | ret = VM_FAULT_OOM; | 1679 | ret = VM_FAULT_OOM; |
1684 | else | 1680 | goto unlock; |
1685 | ret = VM_FAULT_MINOR; | ||
1686 | pte_unmap(page_table); | ||
1687 | spin_unlock(&mm->page_table_lock); | ||
1688 | goto out; | ||
1689 | } | 1681 | } |
1690 | 1682 | ||
1691 | /* Had to read the page from swap area: Major fault */ | 1683 | /* Had to read the page from swap area: Major fault */ |
@@ -1698,15 +1690,11 @@ static int do_swap_page(struct mm_struct * mm, | |||
1698 | lock_page(page); | 1690 | lock_page(page); |
1699 | 1691 | ||
1700 | /* | 1692 | /* |
1701 | * Back out if somebody else faulted in this pte while we | 1693 | * Back out if somebody else already faulted in this pte. |
1702 | * released the page table lock. | ||
1703 | */ | 1694 | */ |
1704 | spin_lock(&mm->page_table_lock); | 1695 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
1705 | page_table = pte_offset_map(pmd, address); | 1696 | if (unlikely(!pte_same(*page_table, orig_pte))) |
1706 | if (unlikely(!pte_same(*page_table, orig_pte))) { | ||
1707 | ret = VM_FAULT_MINOR; | ||
1708 | goto out_nomap; | 1697 | goto out_nomap; |
1709 | } | ||
1710 | 1698 | ||
1711 | if (unlikely(!PageUptodate(page))) { | 1699 | if (unlikely(!PageUptodate(page))) { |
1712 | ret = VM_FAULT_SIGBUS; | 1700 | ret = VM_FAULT_SIGBUS; |
@@ -1715,7 +1703,7 @@ static int do_swap_page(struct mm_struct * mm, | |||
1715 | 1703 | ||
1716 | /* The page isn't present yet, go ahead with the fault. */ | 1704 | /* The page isn't present yet, go ahead with the fault. */ |
1717 | 1705 | ||
1718 | inc_mm_counter(mm, rss); | 1706 | inc_mm_counter(mm, anon_rss); |
1719 | pte = mk_pte(page, vma->vm_page_prot); | 1707 | pte = mk_pte(page, vma->vm_page_prot); |
1720 | if (write_access && can_share_swap_page(page)) { | 1708 | if (write_access && can_share_swap_page(page)) { |
1721 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 1709 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
@@ -1733,7 +1721,7 @@ static int do_swap_page(struct mm_struct * mm, | |||
1733 | 1721 | ||
1734 | if (write_access) { | 1722 | if (write_access) { |
1735 | if (do_wp_page(mm, vma, address, | 1723 | if (do_wp_page(mm, vma, address, |
1736 | page_table, pmd, pte) == VM_FAULT_OOM) | 1724 | page_table, pmd, ptl, pte) == VM_FAULT_OOM) |
1737 | ret = VM_FAULT_OOM; | 1725 | ret = VM_FAULT_OOM; |
1738 | goto out; | 1726 | goto out; |
1739 | } | 1727 | } |
@@ -1741,74 +1729,76 @@ static int do_swap_page(struct mm_struct * mm, | |||
1741 | /* No need to invalidate - it was non-present before */ | 1729 | /* No need to invalidate - it was non-present before */ |
1742 | update_mmu_cache(vma, address, pte); | 1730 | update_mmu_cache(vma, address, pte); |
1743 | lazy_mmu_prot_update(pte); | 1731 | lazy_mmu_prot_update(pte); |
1744 | pte_unmap(page_table); | 1732 | unlock: |
1745 | spin_unlock(&mm->page_table_lock); | 1733 | pte_unmap_unlock(page_table, ptl); |
1746 | out: | 1734 | out: |
1747 | return ret; | 1735 | return ret; |
1748 | out_nomap: | 1736 | out_nomap: |
1749 | pte_unmap(page_table); | 1737 | pte_unmap_unlock(page_table, ptl); |
1750 | spin_unlock(&mm->page_table_lock); | ||
1751 | unlock_page(page); | 1738 | unlock_page(page); |
1752 | page_cache_release(page); | 1739 | page_cache_release(page); |
1753 | goto out; | 1740 | return ret; |
1754 | } | 1741 | } |
1755 | 1742 | ||
1756 | /* | 1743 | /* |
1757 | * We are called with the MM semaphore and page_table_lock | 1744 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
1758 | * spinlock held to protect against concurrent faults in | 1745 | * but allow concurrent faults), and pte mapped but not yet locked. |
1759 | * multithreaded programs. | 1746 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
1760 | */ | 1747 | */ |
1761 | static int | 1748 | static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1762 | do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1749 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
1763 | pte_t *page_table, pmd_t *pmd, int write_access, | 1750 | int write_access) |
1764 | unsigned long addr) | ||
1765 | { | 1751 | { |
1752 | struct page *page; | ||
1753 | spinlock_t *ptl; | ||
1766 | pte_t entry; | 1754 | pte_t entry; |
1767 | struct page * page = ZERO_PAGE(addr); | ||
1768 | |||
1769 | /* Read-only mapping of ZERO_PAGE. */ | ||
1770 | entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); | ||
1771 | 1755 | ||
1772 | /* ..except if it's a write access */ | ||
1773 | if (write_access) { | 1756 | if (write_access) { |
1774 | /* Allocate our own private page. */ | 1757 | /* Allocate our own private page. */ |
1775 | pte_unmap(page_table); | 1758 | pte_unmap(page_table); |
1776 | spin_unlock(&mm->page_table_lock); | ||
1777 | 1759 | ||
1778 | if (unlikely(anon_vma_prepare(vma))) | 1760 | if (unlikely(anon_vma_prepare(vma))) |
1779 | goto no_mem; | 1761 | goto oom; |
1780 | page = alloc_zeroed_user_highpage(vma, addr); | 1762 | page = alloc_zeroed_user_highpage(vma, address); |
1781 | if (!page) | 1763 | if (!page) |
1782 | goto no_mem; | 1764 | goto oom; |
1783 | 1765 | ||
1784 | spin_lock(&mm->page_table_lock); | 1766 | entry = mk_pte(page, vma->vm_page_prot); |
1785 | page_table = pte_offset_map(pmd, addr); | 1767 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1786 | 1768 | ||
1787 | if (!pte_none(*page_table)) { | 1769 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
1788 | pte_unmap(page_table); | 1770 | if (!pte_none(*page_table)) |
1789 | page_cache_release(page); | 1771 | goto release; |
1790 | spin_unlock(&mm->page_table_lock); | 1772 | inc_mm_counter(mm, anon_rss); |
1791 | goto out; | ||
1792 | } | ||
1793 | inc_mm_counter(mm, rss); | ||
1794 | entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, | ||
1795 | vma->vm_page_prot)), | ||
1796 | vma); | ||
1797 | lru_cache_add_active(page); | 1773 | lru_cache_add_active(page); |
1798 | SetPageReferenced(page); | 1774 | SetPageReferenced(page); |
1799 | page_add_anon_rmap(page, vma, addr); | 1775 | page_add_anon_rmap(page, vma, address); |
1776 | } else { | ||
1777 | /* Map the ZERO_PAGE - vm_page_prot is readonly */ | ||
1778 | page = ZERO_PAGE(address); | ||
1779 | page_cache_get(page); | ||
1780 | entry = mk_pte(page, vma->vm_page_prot); | ||
1781 | |||
1782 | ptl = pte_lockptr(mm, pmd); | ||
1783 | spin_lock(ptl); | ||
1784 | if (!pte_none(*page_table)) | ||
1785 | goto release; | ||
1786 | inc_mm_counter(mm, file_rss); | ||
1787 | page_add_file_rmap(page); | ||
1800 | } | 1788 | } |
1801 | 1789 | ||
1802 | set_pte_at(mm, addr, page_table, entry); | 1790 | set_pte_at(mm, address, page_table, entry); |
1803 | pte_unmap(page_table); | ||
1804 | 1791 | ||
1805 | /* No need to invalidate - it was non-present before */ | 1792 | /* No need to invalidate - it was non-present before */ |
1806 | update_mmu_cache(vma, addr, entry); | 1793 | update_mmu_cache(vma, address, entry); |
1807 | lazy_mmu_prot_update(entry); | 1794 | lazy_mmu_prot_update(entry); |
1808 | spin_unlock(&mm->page_table_lock); | 1795 | unlock: |
1809 | out: | 1796 | pte_unmap_unlock(page_table, ptl); |
1810 | return VM_FAULT_MINOR; | 1797 | return VM_FAULT_MINOR; |
1811 | no_mem: | 1798 | release: |
1799 | page_cache_release(page); | ||
1800 | goto unlock; | ||
1801 | oom: | ||
1812 | return VM_FAULT_OOM; | 1802 | return VM_FAULT_OOM; |
1813 | } | 1803 | } |
1814 | 1804 | ||
@@ -1821,25 +1811,23 @@ no_mem: | |||
1821 | * As this is called only for pages that do not currently exist, we | 1811 | * As this is called only for pages that do not currently exist, we |
1822 | * do not need to flush old virtual caches or the TLB. | 1812 | * do not need to flush old virtual caches or the TLB. |
1823 | * | 1813 | * |
1824 | * This is called with the MM semaphore held and the page table | 1814 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
1825 | * spinlock held. Exit with the spinlock released. | 1815 | * but allow concurrent faults), and pte mapped but not yet locked. |
1816 | * We return with mmap_sem still held, but pte unmapped and unlocked. | ||
1826 | */ | 1817 | */ |
1827 | static int | 1818 | static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1828 | do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1819 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
1829 | unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) | 1820 | int write_access) |
1830 | { | 1821 | { |
1831 | struct page * new_page; | 1822 | spinlock_t *ptl; |
1823 | struct page *new_page; | ||
1832 | struct address_space *mapping = NULL; | 1824 | struct address_space *mapping = NULL; |
1833 | pte_t entry; | 1825 | pte_t entry; |
1834 | unsigned int sequence = 0; | 1826 | unsigned int sequence = 0; |
1835 | int ret = VM_FAULT_MINOR; | 1827 | int ret = VM_FAULT_MINOR; |
1836 | int anon = 0; | 1828 | int anon = 0; |
1837 | 1829 | ||
1838 | if (!vma->vm_ops || !vma->vm_ops->nopage) | ||
1839 | return do_anonymous_page(mm, vma, page_table, | ||
1840 | pmd, write_access, address); | ||
1841 | pte_unmap(page_table); | 1830 | pte_unmap(page_table); |
1842 | spin_unlock(&mm->page_table_lock); | ||
1843 | 1831 | ||
1844 | if (vma->vm_file) { | 1832 | if (vma->vm_file) { |
1845 | mapping = vma->vm_file->f_mapping; | 1833 | mapping = vma->vm_file->f_mapping; |
@@ -1847,7 +1835,6 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1847 | smp_rmb(); /* serializes i_size against truncate_count */ | 1835 | smp_rmb(); /* serializes i_size against truncate_count */ |
1848 | } | 1836 | } |
1849 | retry: | 1837 | retry: |
1850 | cond_resched(); | ||
1851 | new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); | 1838 | new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); |
1852 | /* | 1839 | /* |
1853 | * No smp_rmb is needed here as long as there's a full | 1840 | * No smp_rmb is needed here as long as there's a full |
@@ -1880,19 +1867,20 @@ retry: | |||
1880 | anon = 1; | 1867 | anon = 1; |
1881 | } | 1868 | } |
1882 | 1869 | ||
1883 | spin_lock(&mm->page_table_lock); | 1870 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
1884 | /* | 1871 | /* |
1885 | * For a file-backed vma, someone could have truncated or otherwise | 1872 | * For a file-backed vma, someone could have truncated or otherwise |
1886 | * invalidated this page. If unmap_mapping_range got called, | 1873 | * invalidated this page. If unmap_mapping_range got called, |
1887 | * retry getting the page. | 1874 | * retry getting the page. |
1888 | */ | 1875 | */ |
1889 | if (mapping && unlikely(sequence != mapping->truncate_count)) { | 1876 | if (mapping && unlikely(sequence != mapping->truncate_count)) { |
1890 | sequence = mapping->truncate_count; | 1877 | pte_unmap_unlock(page_table, ptl); |
1891 | spin_unlock(&mm->page_table_lock); | ||
1892 | page_cache_release(new_page); | 1878 | page_cache_release(new_page); |
1879 | cond_resched(); | ||
1880 | sequence = mapping->truncate_count; | ||
1881 | smp_rmb(); | ||
1893 | goto retry; | 1882 | goto retry; |
1894 | } | 1883 | } |
1895 | page_table = pte_offset_map(pmd, address); | ||
1896 | 1884 | ||
1897 | /* | 1885 | /* |
1898 | * This silly early PAGE_DIRTY setting removes a race | 1886 | * This silly early PAGE_DIRTY setting removes a race |
@@ -1906,68 +1894,67 @@ retry: | |||
1906 | */ | 1894 | */ |
1907 | /* Only go through if we didn't race with anybody else... */ | 1895 | /* Only go through if we didn't race with anybody else... */ |
1908 | if (pte_none(*page_table)) { | 1896 | if (pte_none(*page_table)) { |
1909 | if (!PageReserved(new_page)) | ||
1910 | inc_mm_counter(mm, rss); | ||
1911 | |||
1912 | flush_icache_page(vma, new_page); | 1897 | flush_icache_page(vma, new_page); |
1913 | entry = mk_pte(new_page, vma->vm_page_prot); | 1898 | entry = mk_pte(new_page, vma->vm_page_prot); |
1914 | if (write_access) | 1899 | if (write_access) |
1915 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1900 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1916 | set_pte_at(mm, address, page_table, entry); | 1901 | set_pte_at(mm, address, page_table, entry); |
1917 | if (anon) { | 1902 | if (anon) { |
1903 | inc_mm_counter(mm, anon_rss); | ||
1918 | lru_cache_add_active(new_page); | 1904 | lru_cache_add_active(new_page); |
1919 | page_add_anon_rmap(new_page, vma, address); | 1905 | page_add_anon_rmap(new_page, vma, address); |
1920 | } else | 1906 | } else if (!(vma->vm_flags & VM_RESERVED)) { |
1907 | inc_mm_counter(mm, file_rss); | ||
1921 | page_add_file_rmap(new_page); | 1908 | page_add_file_rmap(new_page); |
1922 | pte_unmap(page_table); | 1909 | } |
1923 | } else { | 1910 | } else { |
1924 | /* One of our sibling threads was faster, back out. */ | 1911 | /* One of our sibling threads was faster, back out. */ |
1925 | pte_unmap(page_table); | ||
1926 | page_cache_release(new_page); | 1912 | page_cache_release(new_page); |
1927 | spin_unlock(&mm->page_table_lock); | 1913 | goto unlock; |
1928 | goto out; | ||
1929 | } | 1914 | } |
1930 | 1915 | ||
1931 | /* no need to invalidate: a not-present page shouldn't be cached */ | 1916 | /* no need to invalidate: a not-present page shouldn't be cached */ |
1932 | update_mmu_cache(vma, address, entry); | 1917 | update_mmu_cache(vma, address, entry); |
1933 | lazy_mmu_prot_update(entry); | 1918 | lazy_mmu_prot_update(entry); |
1934 | spin_unlock(&mm->page_table_lock); | 1919 | unlock: |
1935 | out: | 1920 | pte_unmap_unlock(page_table, ptl); |
1936 | return ret; | 1921 | return ret; |
1937 | oom: | 1922 | oom: |
1938 | page_cache_release(new_page); | 1923 | page_cache_release(new_page); |
1939 | ret = VM_FAULT_OOM; | 1924 | return VM_FAULT_OOM; |
1940 | goto out; | ||
1941 | } | 1925 | } |
1942 | 1926 | ||
1943 | /* | 1927 | /* |
1944 | * Fault of a previously existing named mapping. Repopulate the pte | 1928 | * Fault of a previously existing named mapping. Repopulate the pte |
1945 | * from the encoded file_pte if possible. This enables swappable | 1929 | * from the encoded file_pte if possible. This enables swappable |
1946 | * nonlinear vmas. | 1930 | * nonlinear vmas. |
1931 | * | ||
1932 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | ||
1933 | * but allow concurrent faults), and pte mapped but not yet locked. | ||
1934 | * We return with mmap_sem still held, but pte unmapped and unlocked. | ||
1947 | */ | 1935 | */ |
1948 | static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, | 1936 | static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1949 | unsigned long address, int write_access, pte_t *pte, pmd_t *pmd) | 1937 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
1938 | int write_access, pte_t orig_pte) | ||
1950 | { | 1939 | { |
1951 | unsigned long pgoff; | 1940 | pgoff_t pgoff; |
1952 | int err; | 1941 | int err; |
1953 | 1942 | ||
1954 | BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage); | 1943 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) |
1955 | /* | 1944 | return VM_FAULT_MINOR; |
1956 | * Fall back to the linear mapping if the fs does not support | ||
1957 | * ->populate: | ||
1958 | */ | ||
1959 | if (!vma->vm_ops->populate || | ||
1960 | (write_access && !(vma->vm_flags & VM_SHARED))) { | ||
1961 | pte_clear(mm, address, pte); | ||
1962 | return do_no_page(mm, vma, address, write_access, pte, pmd); | ||
1963 | } | ||
1964 | |||
1965 | pgoff = pte_to_pgoff(*pte); | ||
1966 | 1945 | ||
1967 | pte_unmap(pte); | 1946 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { |
1968 | spin_unlock(&mm->page_table_lock); | 1947 | /* |
1948 | * Page table corrupted: show pte and kill process. | ||
1949 | */ | ||
1950 | print_bad_pte(vma, orig_pte, address); | ||
1951 | return VM_FAULT_OOM; | ||
1952 | } | ||
1953 | /* We can then assume vma->vm_ops && vma->vm_ops->populate */ | ||
1969 | 1954 | ||
1970 | err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); | 1955 | pgoff = pte_to_pgoff(orig_pte); |
1956 | err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, | ||
1957 | vma->vm_page_prot, pgoff, 0); | ||
1971 | if (err == -ENOMEM) | 1958 | if (err == -ENOMEM) |
1972 | return VM_FAULT_OOM; | 1959 | return VM_FAULT_OOM; |
1973 | if (err) | 1960 | if (err) |
@@ -1984,56 +1971,68 @@ static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1984 | * with external mmu caches can use to update those (ie the Sparc or | 1971 | * with external mmu caches can use to update those (ie the Sparc or |
1985 | * PowerPC hashed page tables that act as extended TLBs). | 1972 | * PowerPC hashed page tables that act as extended TLBs). |
1986 | * | 1973 | * |
1987 | * Note the "page_table_lock". It is to protect against kswapd removing | 1974 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
1988 | * pages from under us. Note that kswapd only ever _removes_ pages, never | 1975 | * but allow concurrent faults), and pte mapped but not yet locked. |
1989 | * adds them. As such, once we have noticed that the page is not present, | 1976 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
1990 | * we can drop the lock early. | ||
1991 | * | ||
1992 | * The adding of pages is protected by the MM semaphore (which we hold), | ||
1993 | * so we don't need to worry about a page being suddenly been added into | ||
1994 | * our VM. | ||
1995 | * | ||
1996 | * We enter with the pagetable spinlock held, we are supposed to | ||
1997 | * release it when done. | ||
1998 | */ | 1977 | */ |
1999 | static inline int handle_pte_fault(struct mm_struct *mm, | 1978 | static inline int handle_pte_fault(struct mm_struct *mm, |
2000 | struct vm_area_struct * vma, unsigned long address, | 1979 | struct vm_area_struct *vma, unsigned long address, |
2001 | int write_access, pte_t *pte, pmd_t *pmd) | 1980 | pte_t *pte, pmd_t *pmd, int write_access) |
2002 | { | 1981 | { |
2003 | pte_t entry; | 1982 | pte_t entry; |
1983 | pte_t old_entry; | ||
1984 | spinlock_t *ptl; | ||
2004 | 1985 | ||
2005 | entry = *pte; | 1986 | old_entry = entry = *pte; |
2006 | if (!pte_present(entry)) { | 1987 | if (!pte_present(entry)) { |
2007 | /* | 1988 | if (pte_none(entry)) { |
2008 | * If it truly wasn't present, we know that kswapd | 1989 | if (!vma->vm_ops || !vma->vm_ops->nopage) |
2009 | * and the PTE updates will not touch it later. So | 1990 | return do_anonymous_page(mm, vma, address, |
2010 | * drop the lock. | 1991 | pte, pmd, write_access); |
2011 | */ | 1992 | return do_no_page(mm, vma, address, |
2012 | if (pte_none(entry)) | 1993 | pte, pmd, write_access); |
2013 | return do_no_page(mm, vma, address, write_access, pte, pmd); | 1994 | } |
2014 | if (pte_file(entry)) | 1995 | if (pte_file(entry)) |
2015 | return do_file_page(mm, vma, address, write_access, pte, pmd); | 1996 | return do_file_page(mm, vma, address, |
2016 | return do_swap_page(mm, vma, address, pte, pmd, entry, write_access); | 1997 | pte, pmd, write_access, entry); |
1998 | return do_swap_page(mm, vma, address, | ||
1999 | pte, pmd, write_access, entry); | ||
2017 | } | 2000 | } |
2018 | 2001 | ||
2002 | ptl = pte_lockptr(mm, pmd); | ||
2003 | spin_lock(ptl); | ||
2004 | if (unlikely(!pte_same(*pte, entry))) | ||
2005 | goto unlock; | ||
2019 | if (write_access) { | 2006 | if (write_access) { |
2020 | if (!pte_write(entry)) | 2007 | if (!pte_write(entry)) |
2021 | return do_wp_page(mm, vma, address, pte, pmd, entry); | 2008 | return do_wp_page(mm, vma, address, |
2009 | pte, pmd, ptl, entry); | ||
2022 | entry = pte_mkdirty(entry); | 2010 | entry = pte_mkdirty(entry); |
2023 | } | 2011 | } |
2024 | entry = pte_mkyoung(entry); | 2012 | entry = pte_mkyoung(entry); |
2025 | ptep_set_access_flags(vma, address, pte, entry, write_access); | 2013 | if (!pte_same(old_entry, entry)) { |
2026 | update_mmu_cache(vma, address, entry); | 2014 | ptep_set_access_flags(vma, address, pte, entry, write_access); |
2027 | lazy_mmu_prot_update(entry); | 2015 | update_mmu_cache(vma, address, entry); |
2028 | pte_unmap(pte); | 2016 | lazy_mmu_prot_update(entry); |
2029 | spin_unlock(&mm->page_table_lock); | 2017 | } else { |
2018 | /* | ||
2019 | * This is needed only for protection faults but the arch code | ||
2020 | * is not yet telling us if this is a protection fault or not. | ||
2021 | * This still avoids useless tlb flushes for .text page faults | ||
2022 | * with threads. | ||
2023 | */ | ||
2024 | if (write_access) | ||
2025 | flush_tlb_page(vma, address); | ||
2026 | } | ||
2027 | unlock: | ||
2028 | pte_unmap_unlock(pte, ptl); | ||
2030 | return VM_FAULT_MINOR; | 2029 | return VM_FAULT_MINOR; |
2031 | } | 2030 | } |
2032 | 2031 | ||
2033 | /* | 2032 | /* |
2034 | * By the time we get here, we already hold the mm semaphore | 2033 | * By the time we get here, we already hold the mm semaphore |
2035 | */ | 2034 | */ |
2036 | int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, | 2035 | int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
2037 | unsigned long address, int write_access) | 2036 | unsigned long address, int write_access) |
2038 | { | 2037 | { |
2039 | pgd_t *pgd; | 2038 | pgd_t *pgd; |
@@ -2048,100 +2047,66 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, | |||
2048 | if (unlikely(is_vm_hugetlb_page(vma))) | 2047 | if (unlikely(is_vm_hugetlb_page(vma))) |
2049 | return hugetlb_fault(mm, vma, address, write_access); | 2048 | return hugetlb_fault(mm, vma, address, write_access); |
2050 | 2049 | ||
2051 | /* | ||
2052 | * We need the page table lock to synchronize with kswapd | ||
2053 | * and the SMP-safe atomic PTE updates. | ||
2054 | */ | ||
2055 | pgd = pgd_offset(mm, address); | 2050 | pgd = pgd_offset(mm, address); |
2056 | spin_lock(&mm->page_table_lock); | ||
2057 | |||
2058 | pud = pud_alloc(mm, pgd, address); | 2051 | pud = pud_alloc(mm, pgd, address); |
2059 | if (!pud) | 2052 | if (!pud) |
2060 | goto oom; | 2053 | return VM_FAULT_OOM; |
2061 | |||
2062 | pmd = pmd_alloc(mm, pud, address); | 2054 | pmd = pmd_alloc(mm, pud, address); |
2063 | if (!pmd) | 2055 | if (!pmd) |
2064 | goto oom; | 2056 | return VM_FAULT_OOM; |
2065 | |||
2066 | pte = pte_alloc_map(mm, pmd, address); | 2057 | pte = pte_alloc_map(mm, pmd, address); |
2067 | if (!pte) | 2058 | if (!pte) |
2068 | goto oom; | 2059 | return VM_FAULT_OOM; |
2069 | |||
2070 | return handle_pte_fault(mm, vma, address, write_access, pte, pmd); | ||
2071 | 2060 | ||
2072 | oom: | 2061 | return handle_pte_fault(mm, vma, address, pte, pmd, write_access); |
2073 | spin_unlock(&mm->page_table_lock); | ||
2074 | return VM_FAULT_OOM; | ||
2075 | } | 2062 | } |
2076 | 2063 | ||
2077 | #ifndef __PAGETABLE_PUD_FOLDED | 2064 | #ifndef __PAGETABLE_PUD_FOLDED |
2078 | /* | 2065 | /* |
2079 | * Allocate page upper directory. | 2066 | * Allocate page upper directory. |
2080 | * | 2067 | * We've already handled the fast-path in-line. |
2081 | * We've already handled the fast-path in-line, and we own the | ||
2082 | * page table lock. | ||
2083 | */ | 2068 | */ |
2084 | pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) | 2069 | int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) |
2085 | { | 2070 | { |
2086 | pud_t *new; | 2071 | pud_t *new = pud_alloc_one(mm, address); |
2087 | |||
2088 | spin_unlock(&mm->page_table_lock); | ||
2089 | new = pud_alloc_one(mm, address); | ||
2090 | spin_lock(&mm->page_table_lock); | ||
2091 | if (!new) | 2072 | if (!new) |
2092 | return NULL; | 2073 | return -ENOMEM; |
2093 | 2074 | ||
2094 | /* | 2075 | spin_lock(&mm->page_table_lock); |
2095 | * Because we dropped the lock, we should re-check the | 2076 | if (pgd_present(*pgd)) /* Another has populated it */ |
2096 | * entry, as somebody else could have populated it.. | ||
2097 | */ | ||
2098 | if (pgd_present(*pgd)) { | ||
2099 | pud_free(new); | 2077 | pud_free(new); |
2100 | goto out; | 2078 | else |
2101 | } | 2079 | pgd_populate(mm, pgd, new); |
2102 | pgd_populate(mm, pgd, new); | 2080 | spin_unlock(&mm->page_table_lock); |
2103 | out: | 2081 | return 0; |
2104 | return pud_offset(pgd, address); | ||
2105 | } | 2082 | } |
2106 | #endif /* __PAGETABLE_PUD_FOLDED */ | 2083 | #endif /* __PAGETABLE_PUD_FOLDED */ |
2107 | 2084 | ||
2108 | #ifndef __PAGETABLE_PMD_FOLDED | 2085 | #ifndef __PAGETABLE_PMD_FOLDED |
2109 | /* | 2086 | /* |
2110 | * Allocate page middle directory. | 2087 | * Allocate page middle directory. |
2111 | * | 2088 | * We've already handled the fast-path in-line. |
2112 | * We've already handled the fast-path in-line, and we own the | ||
2113 | * page table lock. | ||
2114 | */ | 2089 | */ |
2115 | pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | 2090 | int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) |
2116 | { | 2091 | { |
2117 | pmd_t *new; | 2092 | pmd_t *new = pmd_alloc_one(mm, address); |
2118 | |||
2119 | spin_unlock(&mm->page_table_lock); | ||
2120 | new = pmd_alloc_one(mm, address); | ||
2121 | spin_lock(&mm->page_table_lock); | ||
2122 | if (!new) | 2093 | if (!new) |
2123 | return NULL; | 2094 | return -ENOMEM; |
2124 | 2095 | ||
2125 | /* | 2096 | spin_lock(&mm->page_table_lock); |
2126 | * Because we dropped the lock, we should re-check the | ||
2127 | * entry, as somebody else could have populated it.. | ||
2128 | */ | ||
2129 | #ifndef __ARCH_HAS_4LEVEL_HACK | 2097 | #ifndef __ARCH_HAS_4LEVEL_HACK |
2130 | if (pud_present(*pud)) { | 2098 | if (pud_present(*pud)) /* Another has populated it */ |
2131 | pmd_free(new); | 2099 | pmd_free(new); |
2132 | goto out; | 2100 | else |
2133 | } | 2101 | pud_populate(mm, pud, new); |
2134 | pud_populate(mm, pud, new); | ||
2135 | #else | 2102 | #else |
2136 | if (pgd_present(*pud)) { | 2103 | if (pgd_present(*pud)) /* Another has populated it */ |
2137 | pmd_free(new); | 2104 | pmd_free(new); |
2138 | goto out; | 2105 | else |
2139 | } | 2106 | pgd_populate(mm, pud, new); |
2140 | pgd_populate(mm, pud, new); | ||
2141 | #endif /* __ARCH_HAS_4LEVEL_HACK */ | 2107 | #endif /* __ARCH_HAS_4LEVEL_HACK */ |
2142 | 2108 | spin_unlock(&mm->page_table_lock); | |
2143 | out: | 2109 | return 0; |
2144 | return pmd_offset(pud, address); | ||
2145 | } | 2110 | } |
2146 | #endif /* __PAGETABLE_PMD_FOLDED */ | 2111 | #endif /* __PAGETABLE_PMD_FOLDED */ |
2147 | 2112 | ||
@@ -2206,22 +2171,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr) | |||
2206 | 2171 | ||
2207 | EXPORT_SYMBOL(vmalloc_to_pfn); | 2172 | EXPORT_SYMBOL(vmalloc_to_pfn); |
2208 | 2173 | ||
2209 | /* | ||
2210 | * update_mem_hiwater | ||
2211 | * - update per process rss and vm high water data | ||
2212 | */ | ||
2213 | void update_mem_hiwater(struct task_struct *tsk) | ||
2214 | { | ||
2215 | if (tsk->mm) { | ||
2216 | unsigned long rss = get_mm_counter(tsk->mm, rss); | ||
2217 | |||
2218 | if (tsk->mm->hiwater_rss < rss) | ||
2219 | tsk->mm->hiwater_rss = rss; | ||
2220 | if (tsk->mm->hiwater_vm < tsk->mm->total_vm) | ||
2221 | tsk->mm->hiwater_vm = tsk->mm->total_vm; | ||
2222 | } | ||
2223 | } | ||
2224 | |||
2225 | #if !defined(__HAVE_ARCH_GATE_AREA) | 2174 | #if !defined(__HAVE_ARCH_GATE_AREA) |
2226 | 2175 | ||
2227 | #if defined(AT_SYSINFO_EHDR) | 2176 | #if defined(AT_SYSINFO_EHDR) |
@@ -2233,7 +2182,7 @@ static int __init gate_vma_init(void) | |||
2233 | gate_vma.vm_start = FIXADDR_USER_START; | 2182 | gate_vma.vm_start = FIXADDR_USER_START; |
2234 | gate_vma.vm_end = FIXADDR_USER_END; | 2183 | gate_vma.vm_end = FIXADDR_USER_END; |
2235 | gate_vma.vm_page_prot = PAGE_READONLY; | 2184 | gate_vma.vm_page_prot = PAGE_READONLY; |
2236 | gate_vma.vm_flags = 0; | 2185 | gate_vma.vm_flags = VM_RESERVED; |
2237 | return 0; | 2186 | return 0; |
2238 | } | 2187 | } |
2239 | __initcall(gate_vma_init); | 2188 | __initcall(gate_vma_init); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c new file mode 100644 index 000000000000..431a64f021c0 --- /dev/null +++ b/mm/memory_hotplug.c | |||
@@ -0,0 +1,138 @@ | |||
1 | /* | ||
2 | * linux/mm/memory_hotplug.c | ||
3 | * | ||
4 | * Copyright (C) | ||
5 | */ | ||
6 | |||
7 | #include <linux/config.h> | ||
8 | #include <linux/stddef.h> | ||
9 | #include <linux/mm.h> | ||
10 | #include <linux/swap.h> | ||
11 | #include <linux/interrupt.h> | ||
12 | #include <linux/pagemap.h> | ||
13 | #include <linux/bootmem.h> | ||
14 | #include <linux/compiler.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/pagevec.h> | ||
17 | #include <linux/slab.h> | ||
18 | #include <linux/sysctl.h> | ||
19 | #include <linux/cpu.h> | ||
20 | #include <linux/memory.h> | ||
21 | #include <linux/memory_hotplug.h> | ||
22 | #include <linux/highmem.h> | ||
23 | #include <linux/vmalloc.h> | ||
24 | |||
25 | #include <asm/tlbflush.h> | ||
26 | |||
27 | extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, | ||
28 | unsigned long size); | ||
29 | static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) | ||
30 | { | ||
31 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
32 | int nr_pages = PAGES_PER_SECTION; | ||
33 | int nid = pgdat->node_id; | ||
34 | int zone_type; | ||
35 | |||
36 | zone_type = zone - pgdat->node_zones; | ||
37 | memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); | ||
38 | zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); | ||
39 | } | ||
40 | |||
41 | extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | ||
42 | int nr_pages); | ||
43 | static int __add_section(struct zone *zone, unsigned long phys_start_pfn) | ||
44 | { | ||
45 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
46 | int nr_pages = PAGES_PER_SECTION; | ||
47 | int ret; | ||
48 | |||
49 | ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); | ||
50 | |||
51 | if (ret < 0) | ||
52 | return ret; | ||
53 | |||
54 | __add_zone(zone, phys_start_pfn); | ||
55 | return register_new_memory(__pfn_to_section(phys_start_pfn)); | ||
56 | } | ||
57 | |||
58 | /* | ||
59 | * Reasonably generic function for adding memory. It is | ||
60 | * expected that archs that support memory hotplug will | ||
61 | * call this function after deciding the zone to which to | ||
62 | * add the new pages. | ||
63 | */ | ||
64 | int __add_pages(struct zone *zone, unsigned long phys_start_pfn, | ||
65 | unsigned long nr_pages) | ||
66 | { | ||
67 | unsigned long i; | ||
68 | int err = 0; | ||
69 | |||
70 | for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) { | ||
71 | err = __add_section(zone, phys_start_pfn + i); | ||
72 | |||
73 | if (err) | ||
74 | break; | ||
75 | } | ||
76 | |||
77 | return err; | ||
78 | } | ||
79 | |||
80 | static void grow_zone_span(struct zone *zone, | ||
81 | unsigned long start_pfn, unsigned long end_pfn) | ||
82 | { | ||
83 | unsigned long old_zone_end_pfn; | ||
84 | |||
85 | zone_span_writelock(zone); | ||
86 | |||
87 | old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
88 | if (start_pfn < zone->zone_start_pfn) | ||
89 | zone->zone_start_pfn = start_pfn; | ||
90 | |||
91 | if (end_pfn > old_zone_end_pfn) | ||
92 | zone->spanned_pages = end_pfn - zone->zone_start_pfn; | ||
93 | |||
94 | zone_span_writeunlock(zone); | ||
95 | } | ||
96 | |||
97 | static void grow_pgdat_span(struct pglist_data *pgdat, | ||
98 | unsigned long start_pfn, unsigned long end_pfn) | ||
99 | { | ||
100 | unsigned long old_pgdat_end_pfn = | ||
101 | pgdat->node_start_pfn + pgdat->node_spanned_pages; | ||
102 | |||
103 | if (start_pfn < pgdat->node_start_pfn) | ||
104 | pgdat->node_start_pfn = start_pfn; | ||
105 | |||
106 | if (end_pfn > old_pgdat_end_pfn) | ||
107 | pgdat->node_spanned_pages = end_pfn - pgdat->node_spanned_pages; | ||
108 | } | ||
109 | |||
110 | int online_pages(unsigned long pfn, unsigned long nr_pages) | ||
111 | { | ||
112 | unsigned long i; | ||
113 | unsigned long flags; | ||
114 | unsigned long onlined_pages = 0; | ||
115 | struct zone *zone; | ||
116 | |||
117 | /* | ||
118 | * This doesn't need a lock to do pfn_to_page(). | ||
119 | * The section can't be removed here because of the | ||
120 | * memory_block->state_sem. | ||
121 | */ | ||
122 | zone = page_zone(pfn_to_page(pfn)); | ||
123 | pgdat_resize_lock(zone->zone_pgdat, &flags); | ||
124 | grow_zone_span(zone, pfn, pfn + nr_pages); | ||
125 | grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); | ||
126 | pgdat_resize_unlock(zone->zone_pgdat, &flags); | ||
127 | |||
128 | for (i = 0; i < nr_pages; i++) { | ||
129 | struct page *page = pfn_to_page(pfn + i); | ||
130 | online_page(page); | ||
131 | onlined_pages++; | ||
132 | } | ||
133 | zone->present_pages += onlined_pages; | ||
134 | |||
135 | setup_per_zone_pages_min(); | ||
136 | |||
137 | return 0; | ||
138 | } | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1d5c64df1653..2076b1542b8a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -2,6 +2,7 @@ | |||
2 | * Simple NUMA memory policy for the Linux kernel. | 2 | * Simple NUMA memory policy for the Linux kernel. |
3 | * | 3 | * |
4 | * Copyright 2003,2004 Andi Kleen, SuSE Labs. | 4 | * Copyright 2003,2004 Andi Kleen, SuSE Labs. |
5 | * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. | ||
5 | * Subject to the GNU Public License, version 2. | 6 | * Subject to the GNU Public License, version 2. |
6 | * | 7 | * |
7 | * NUMA policy allows the user to give hints in which node(s) memory should | 8 | * NUMA policy allows the user to give hints in which node(s) memory should |
@@ -17,13 +18,19 @@ | |||
17 | * offset into the backing object or offset into the mapping | 18 | * offset into the backing object or offset into the mapping |
18 | * for anonymous memory. For process policy a process counter | 19 | * for anonymous memory. For process policy a process counter |
19 | * is used. | 20 | * is used. |
21 | * | ||
20 | * bind Only allocate memory on a specific set of nodes, | 22 | * bind Only allocate memory on a specific set of nodes, |
21 | * no fallback. | 23 | * no fallback. |
24 | * FIXME: memory is allocated starting with the first node | ||
25 | * to the last. It would be better if bind would truly restrict | ||
26 | * the allocation to memory nodes instead | ||
27 | * | ||
22 | * preferred Try a specific node first before normal fallback. | 28 | * preferred Try a specific node first before normal fallback. |
23 | * As a special case node -1 here means do the allocation | 29 | * As a special case node -1 here means do the allocation |
24 | * on the local CPU. This is normally identical to default, | 30 | * on the local CPU. This is normally identical to default, |
25 | * but useful to set in a VMA when you have a non default | 31 | * but useful to set in a VMA when you have a non default |
26 | * process policy. | 32 | * process policy. |
33 | * | ||
27 | * default Allocate on the local node first, or when on a VMA | 34 | * default Allocate on the local node first, or when on a VMA |
28 | * use the process policy. This is what Linux always did | 35 | * use the process policy. This is what Linux always did |
29 | * in a NUMA aware kernel and still does by, ahem, default. | 36 | * in a NUMA aware kernel and still does by, ahem, default. |
@@ -93,23 +100,10 @@ struct mempolicy default_policy = { | |||
93 | .policy = MPOL_DEFAULT, | 100 | .policy = MPOL_DEFAULT, |
94 | }; | 101 | }; |
95 | 102 | ||
96 | /* Check if all specified nodes are online */ | ||
97 | static int nodes_online(unsigned long *nodes) | ||
98 | { | ||
99 | DECLARE_BITMAP(online2, MAX_NUMNODES); | ||
100 | |||
101 | bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES); | ||
102 | if (bitmap_empty(online2, MAX_NUMNODES)) | ||
103 | set_bit(0, online2); | ||
104 | if (!bitmap_subset(nodes, online2, MAX_NUMNODES)) | ||
105 | return -EINVAL; | ||
106 | return 0; | ||
107 | } | ||
108 | |||
109 | /* Do sanity checking on a policy */ | 103 | /* Do sanity checking on a policy */ |
110 | static int mpol_check_policy(int mode, unsigned long *nodes) | 104 | static int mpol_check_policy(int mode, nodemask_t *nodes) |
111 | { | 105 | { |
112 | int empty = bitmap_empty(nodes, MAX_NUMNODES); | 106 | int empty = nodes_empty(*nodes); |
113 | 107 | ||
114 | switch (mode) { | 108 | switch (mode) { |
115 | case MPOL_DEFAULT: | 109 | case MPOL_DEFAULT: |
@@ -124,71 +118,20 @@ static int mpol_check_policy(int mode, unsigned long *nodes) | |||
124 | return -EINVAL; | 118 | return -EINVAL; |
125 | break; | 119 | break; |
126 | } | 120 | } |
127 | return nodes_online(nodes); | 121 | return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; |
128 | } | ||
129 | |||
130 | /* Copy a node mask from user space. */ | ||
131 | static int get_nodes(unsigned long *nodes, unsigned long __user *nmask, | ||
132 | unsigned long maxnode, int mode) | ||
133 | { | ||
134 | unsigned long k; | ||
135 | unsigned long nlongs; | ||
136 | unsigned long endmask; | ||
137 | |||
138 | --maxnode; | ||
139 | bitmap_zero(nodes, MAX_NUMNODES); | ||
140 | if (maxnode == 0 || !nmask) | ||
141 | return 0; | ||
142 | |||
143 | nlongs = BITS_TO_LONGS(maxnode); | ||
144 | if ((maxnode % BITS_PER_LONG) == 0) | ||
145 | endmask = ~0UL; | ||
146 | else | ||
147 | endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; | ||
148 | |||
149 | /* When the user specified more nodes than supported just check | ||
150 | if the non supported part is all zero. */ | ||
151 | if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { | ||
152 | if (nlongs > PAGE_SIZE/sizeof(long)) | ||
153 | return -EINVAL; | ||
154 | for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { | ||
155 | unsigned long t; | ||
156 | if (get_user(t, nmask + k)) | ||
157 | return -EFAULT; | ||
158 | if (k == nlongs - 1) { | ||
159 | if (t & endmask) | ||
160 | return -EINVAL; | ||
161 | } else if (t) | ||
162 | return -EINVAL; | ||
163 | } | ||
164 | nlongs = BITS_TO_LONGS(MAX_NUMNODES); | ||
165 | endmask = ~0UL; | ||
166 | } | ||
167 | |||
168 | if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long))) | ||
169 | return -EFAULT; | ||
170 | nodes[nlongs-1] &= endmask; | ||
171 | /* Update current mems_allowed */ | ||
172 | cpuset_update_current_mems_allowed(); | ||
173 | /* Ignore nodes not set in current->mems_allowed */ | ||
174 | cpuset_restrict_to_mems_allowed(nodes); | ||
175 | return mpol_check_policy(mode, nodes); | ||
176 | } | 122 | } |
177 | |||
178 | /* Generate a custom zonelist for the BIND policy. */ | 123 | /* Generate a custom zonelist for the BIND policy. */ |
179 | static struct zonelist *bind_zonelist(unsigned long *nodes) | 124 | static struct zonelist *bind_zonelist(nodemask_t *nodes) |
180 | { | 125 | { |
181 | struct zonelist *zl; | 126 | struct zonelist *zl; |
182 | int num, max, nd; | 127 | int num, max, nd; |
183 | 128 | ||
184 | max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES); | 129 | max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); |
185 | zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); | 130 | zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); |
186 | if (!zl) | 131 | if (!zl) |
187 | return NULL; | 132 | return NULL; |
188 | num = 0; | 133 | num = 0; |
189 | for (nd = find_first_bit(nodes, MAX_NUMNODES); | 134 | for_each_node_mask(nd, *nodes) { |
190 | nd < MAX_NUMNODES; | ||
191 | nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) { | ||
192 | int k; | 135 | int k; |
193 | for (k = MAX_NR_ZONES-1; k >= 0; k--) { | 136 | for (k = MAX_NR_ZONES-1; k >= 0; k--) { |
194 | struct zone *z = &NODE_DATA(nd)->node_zones[k]; | 137 | struct zone *z = &NODE_DATA(nd)->node_zones[k]; |
@@ -199,17 +142,16 @@ static struct zonelist *bind_zonelist(unsigned long *nodes) | |||
199 | policy_zone = k; | 142 | policy_zone = k; |
200 | } | 143 | } |
201 | } | 144 | } |
202 | BUG_ON(num >= max); | ||
203 | zl->zones[num] = NULL; | 145 | zl->zones[num] = NULL; |
204 | return zl; | 146 | return zl; |
205 | } | 147 | } |
206 | 148 | ||
207 | /* Create a new policy */ | 149 | /* Create a new policy */ |
208 | static struct mempolicy *mpol_new(int mode, unsigned long *nodes) | 150 | static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) |
209 | { | 151 | { |
210 | struct mempolicy *policy; | 152 | struct mempolicy *policy; |
211 | 153 | ||
212 | PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]); | 154 | PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]); |
213 | if (mode == MPOL_DEFAULT) | 155 | if (mode == MPOL_DEFAULT) |
214 | return NULL; | 156 | return NULL; |
215 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); | 157 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); |
@@ -218,10 +160,10 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes) | |||
218 | atomic_set(&policy->refcnt, 1); | 160 | atomic_set(&policy->refcnt, 1); |
219 | switch (mode) { | 161 | switch (mode) { |
220 | case MPOL_INTERLEAVE: | 162 | case MPOL_INTERLEAVE: |
221 | bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES); | 163 | policy->v.nodes = *nodes; |
222 | break; | 164 | break; |
223 | case MPOL_PREFERRED: | 165 | case MPOL_PREFERRED: |
224 | policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES); | 166 | policy->v.preferred_node = first_node(*nodes); |
225 | if (policy->v.preferred_node >= MAX_NUMNODES) | 167 | if (policy->v.preferred_node >= MAX_NUMNODES) |
226 | policy->v.preferred_node = -1; | 168 | policy->v.preferred_node = -1; |
227 | break; | 169 | break; |
@@ -238,14 +180,14 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes) | |||
238 | } | 180 | } |
239 | 181 | ||
240 | /* Ensure all existing pages follow the policy. */ | 182 | /* Ensure all existing pages follow the policy. */ |
241 | static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, | 183 | static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
242 | unsigned long addr, unsigned long end, unsigned long *nodes) | 184 | unsigned long addr, unsigned long end, nodemask_t *nodes) |
243 | { | 185 | { |
244 | pte_t *orig_pte; | 186 | pte_t *orig_pte; |
245 | pte_t *pte; | 187 | pte_t *pte; |
188 | spinlock_t *ptl; | ||
246 | 189 | ||
247 | spin_lock(&mm->page_table_lock); | 190 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
248 | orig_pte = pte = pte_offset_map(pmd, addr); | ||
249 | do { | 191 | do { |
250 | unsigned long pfn; | 192 | unsigned long pfn; |
251 | unsigned int nid; | 193 | unsigned int nid; |
@@ -253,19 +195,20 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
253 | if (!pte_present(*pte)) | 195 | if (!pte_present(*pte)) |
254 | continue; | 196 | continue; |
255 | pfn = pte_pfn(*pte); | 197 | pfn = pte_pfn(*pte); |
256 | if (!pfn_valid(pfn)) | 198 | if (!pfn_valid(pfn)) { |
199 | print_bad_pte(vma, *pte, addr); | ||
257 | continue; | 200 | continue; |
201 | } | ||
258 | nid = pfn_to_nid(pfn); | 202 | nid = pfn_to_nid(pfn); |
259 | if (!test_bit(nid, nodes)) | 203 | if (!node_isset(nid, *nodes)) |
260 | break; | 204 | break; |
261 | } while (pte++, addr += PAGE_SIZE, addr != end); | 205 | } while (pte++, addr += PAGE_SIZE, addr != end); |
262 | pte_unmap(orig_pte); | 206 | pte_unmap_unlock(orig_pte, ptl); |
263 | spin_unlock(&mm->page_table_lock); | ||
264 | return addr != end; | 207 | return addr != end; |
265 | } | 208 | } |
266 | 209 | ||
267 | static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, | 210 | static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
268 | unsigned long addr, unsigned long end, unsigned long *nodes) | 211 | unsigned long addr, unsigned long end, nodemask_t *nodes) |
269 | { | 212 | { |
270 | pmd_t *pmd; | 213 | pmd_t *pmd; |
271 | unsigned long next; | 214 | unsigned long next; |
@@ -275,14 +218,14 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, | |||
275 | next = pmd_addr_end(addr, end); | 218 | next = pmd_addr_end(addr, end); |
276 | if (pmd_none_or_clear_bad(pmd)) | 219 | if (pmd_none_or_clear_bad(pmd)) |
277 | continue; | 220 | continue; |
278 | if (check_pte_range(mm, pmd, addr, next, nodes)) | 221 | if (check_pte_range(vma, pmd, addr, next, nodes)) |
279 | return -EIO; | 222 | return -EIO; |
280 | } while (pmd++, addr = next, addr != end); | 223 | } while (pmd++, addr = next, addr != end); |
281 | return 0; | 224 | return 0; |
282 | } | 225 | } |
283 | 226 | ||
284 | static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, | 227 | static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
285 | unsigned long addr, unsigned long end, unsigned long *nodes) | 228 | unsigned long addr, unsigned long end, nodemask_t *nodes) |
286 | { | 229 | { |
287 | pud_t *pud; | 230 | pud_t *pud; |
288 | unsigned long next; | 231 | unsigned long next; |
@@ -292,24 +235,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, | |||
292 | next = pud_addr_end(addr, end); | 235 | next = pud_addr_end(addr, end); |
293 | if (pud_none_or_clear_bad(pud)) | 236 | if (pud_none_or_clear_bad(pud)) |
294 | continue; | 237 | continue; |
295 | if (check_pmd_range(mm, pud, addr, next, nodes)) | 238 | if (check_pmd_range(vma, pud, addr, next, nodes)) |
296 | return -EIO; | 239 | return -EIO; |
297 | } while (pud++, addr = next, addr != end); | 240 | } while (pud++, addr = next, addr != end); |
298 | return 0; | 241 | return 0; |
299 | } | 242 | } |
300 | 243 | ||
301 | static inline int check_pgd_range(struct mm_struct *mm, | 244 | static inline int check_pgd_range(struct vm_area_struct *vma, |
302 | unsigned long addr, unsigned long end, unsigned long *nodes) | 245 | unsigned long addr, unsigned long end, nodemask_t *nodes) |
303 | { | 246 | { |
304 | pgd_t *pgd; | 247 | pgd_t *pgd; |
305 | unsigned long next; | 248 | unsigned long next; |
306 | 249 | ||
307 | pgd = pgd_offset(mm, addr); | 250 | pgd = pgd_offset(vma->vm_mm, addr); |
308 | do { | 251 | do { |
309 | next = pgd_addr_end(addr, end); | 252 | next = pgd_addr_end(addr, end); |
310 | if (pgd_none_or_clear_bad(pgd)) | 253 | if (pgd_none_or_clear_bad(pgd)) |
311 | continue; | 254 | continue; |
312 | if (check_pud_range(mm, pgd, addr, next, nodes)) | 255 | if (check_pud_range(vma, pgd, addr, next, nodes)) |
313 | return -EIO; | 256 | return -EIO; |
314 | } while (pgd++, addr = next, addr != end); | 257 | } while (pgd++, addr = next, addr != end); |
315 | return 0; | 258 | return 0; |
@@ -318,7 +261,7 @@ static inline int check_pgd_range(struct mm_struct *mm, | |||
318 | /* Step 1: check the range */ | 261 | /* Step 1: check the range */ |
319 | static struct vm_area_struct * | 262 | static struct vm_area_struct * |
320 | check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | 263 | check_range(struct mm_struct *mm, unsigned long start, unsigned long end, |
321 | unsigned long *nodes, unsigned long flags) | 264 | nodemask_t *nodes, unsigned long flags) |
322 | { | 265 | { |
323 | int err; | 266 | int err; |
324 | struct vm_area_struct *first, *vma, *prev; | 267 | struct vm_area_struct *first, *vma, *prev; |
@@ -326,6 +269,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
326 | first = find_vma(mm, start); | 269 | first = find_vma(mm, start); |
327 | if (!first) | 270 | if (!first) |
328 | return ERR_PTR(-EFAULT); | 271 | return ERR_PTR(-EFAULT); |
272 | if (first->vm_flags & VM_RESERVED) | ||
273 | return ERR_PTR(-EACCES); | ||
329 | prev = NULL; | 274 | prev = NULL; |
330 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | 275 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { |
331 | if (!vma->vm_next && vma->vm_end < end) | 276 | if (!vma->vm_next && vma->vm_end < end) |
@@ -338,8 +283,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
338 | endvma = end; | 283 | endvma = end; |
339 | if (vma->vm_start > start) | 284 | if (vma->vm_start > start) |
340 | start = vma->vm_start; | 285 | start = vma->vm_start; |
341 | err = check_pgd_range(vma->vm_mm, | 286 | err = check_pgd_range(vma, start, endvma, nodes); |
342 | start, endvma, nodes); | ||
343 | if (err) { | 287 | if (err) { |
344 | first = ERR_PTR(err); | 288 | first = ERR_PTR(err); |
345 | break; | 289 | break; |
@@ -393,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start, | |||
393 | return err; | 337 | return err; |
394 | } | 338 | } |
395 | 339 | ||
396 | /* Change policy for a memory range */ | 340 | static int contextualize_policy(int mode, nodemask_t *nodes) |
397 | asmlinkage long sys_mbind(unsigned long start, unsigned long len, | 341 | { |
398 | unsigned long mode, | 342 | if (!nodes) |
399 | unsigned long __user *nmask, unsigned long maxnode, | 343 | return 0; |
400 | unsigned flags) | 344 | |
345 | /* Update current mems_allowed */ | ||
346 | cpuset_update_current_mems_allowed(); | ||
347 | /* Ignore nodes not set in current->mems_allowed */ | ||
348 | cpuset_restrict_to_mems_allowed(nodes->bits); | ||
349 | return mpol_check_policy(mode, nodes); | ||
350 | } | ||
351 | |||
352 | long do_mbind(unsigned long start, unsigned long len, | ||
353 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | ||
401 | { | 354 | { |
402 | struct vm_area_struct *vma; | 355 | struct vm_area_struct *vma; |
403 | struct mm_struct *mm = current->mm; | 356 | struct mm_struct *mm = current->mm; |
404 | struct mempolicy *new; | 357 | struct mempolicy *new; |
405 | unsigned long end; | 358 | unsigned long end; |
406 | DECLARE_BITMAP(nodes, MAX_NUMNODES); | ||
407 | int err; | 359 | int err; |
408 | 360 | ||
409 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) | 361 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) |
@@ -418,20 +370,17 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, | |||
418 | return -EINVAL; | 370 | return -EINVAL; |
419 | if (end == start) | 371 | if (end == start) |
420 | return 0; | 372 | return 0; |
421 | 373 | if (mpol_check_policy(mode, nmask)) | |
422 | err = get_nodes(nodes, nmask, maxnode, mode); | 374 | return -EINVAL; |
423 | if (err) | 375 | new = mpol_new(mode, nmask); |
424 | return err; | ||
425 | |||
426 | new = mpol_new(mode, nodes); | ||
427 | if (IS_ERR(new)) | 376 | if (IS_ERR(new)) |
428 | return PTR_ERR(new); | 377 | return PTR_ERR(new); |
429 | 378 | ||
430 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, | 379 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, |
431 | mode,nodes[0]); | 380 | mode,nodes_addr(nodes)[0]); |
432 | 381 | ||
433 | down_write(&mm->mmap_sem); | 382 | down_write(&mm->mmap_sem); |
434 | vma = check_range(mm, start, end, nodes, flags); | 383 | vma = check_range(mm, start, end, nmask, flags); |
435 | err = PTR_ERR(vma); | 384 | err = PTR_ERR(vma); |
436 | if (!IS_ERR(vma)) | 385 | if (!IS_ERR(vma)) |
437 | err = mbind_range(vma, start, end, new); | 386 | err = mbind_range(vma, start, end, new); |
@@ -441,50 +390,45 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, | |||
441 | } | 390 | } |
442 | 391 | ||
443 | /* Set the process memory policy */ | 392 | /* Set the process memory policy */ |
444 | asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, | 393 | long do_set_mempolicy(int mode, nodemask_t *nodes) |
445 | unsigned long maxnode) | ||
446 | { | 394 | { |
447 | int err; | ||
448 | struct mempolicy *new; | 395 | struct mempolicy *new; |
449 | DECLARE_BITMAP(nodes, MAX_NUMNODES); | ||
450 | 396 | ||
451 | if (mode < 0 || mode > MPOL_MAX) | 397 | if (contextualize_policy(mode, nodes)) |
452 | return -EINVAL; | 398 | return -EINVAL; |
453 | err = get_nodes(nodes, nmask, maxnode, mode); | ||
454 | if (err) | ||
455 | return err; | ||
456 | new = mpol_new(mode, nodes); | 399 | new = mpol_new(mode, nodes); |
457 | if (IS_ERR(new)) | 400 | if (IS_ERR(new)) |
458 | return PTR_ERR(new); | 401 | return PTR_ERR(new); |
459 | mpol_free(current->mempolicy); | 402 | mpol_free(current->mempolicy); |
460 | current->mempolicy = new; | 403 | current->mempolicy = new; |
461 | if (new && new->policy == MPOL_INTERLEAVE) | 404 | if (new && new->policy == MPOL_INTERLEAVE) |
462 | current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES); | 405 | current->il_next = first_node(new->v.nodes); |
463 | return 0; | 406 | return 0; |
464 | } | 407 | } |
465 | 408 | ||
466 | /* Fill a zone bitmap for a policy */ | 409 | /* Fill a zone bitmap for a policy */ |
467 | static void get_zonemask(struct mempolicy *p, unsigned long *nodes) | 410 | static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) |
468 | { | 411 | { |
469 | int i; | 412 | int i; |
470 | 413 | ||
471 | bitmap_zero(nodes, MAX_NUMNODES); | 414 | nodes_clear(*nodes); |
472 | switch (p->policy) { | 415 | switch (p->policy) { |
473 | case MPOL_BIND: | 416 | case MPOL_BIND: |
474 | for (i = 0; p->v.zonelist->zones[i]; i++) | 417 | for (i = 0; p->v.zonelist->zones[i]; i++) |
475 | __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes); | 418 | node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, |
419 | *nodes); | ||
476 | break; | 420 | break; |
477 | case MPOL_DEFAULT: | 421 | case MPOL_DEFAULT: |
478 | break; | 422 | break; |
479 | case MPOL_INTERLEAVE: | 423 | case MPOL_INTERLEAVE: |
480 | bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES); | 424 | *nodes = p->v.nodes; |
481 | break; | 425 | break; |
482 | case MPOL_PREFERRED: | 426 | case MPOL_PREFERRED: |
483 | /* or use current node instead of online map? */ | 427 | /* or use current node instead of online map? */ |
484 | if (p->v.preferred_node < 0) | 428 | if (p->v.preferred_node < 0) |
485 | bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES); | 429 | *nodes = node_online_map; |
486 | else | 430 | else |
487 | __set_bit(p->v.preferred_node, nodes); | 431 | node_set(p->v.preferred_node, *nodes); |
488 | break; | 432 | break; |
489 | default: | 433 | default: |
490 | BUG(); | 434 | BUG(); |
@@ -504,37 +448,17 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) | |||
504 | return err; | 448 | return err; |
505 | } | 449 | } |
506 | 450 | ||
507 | /* Copy a kernel node mask to user space */ | ||
508 | static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, | ||
509 | void *nodes, unsigned nbytes) | ||
510 | { | ||
511 | unsigned long copy = ALIGN(maxnode-1, 64) / 8; | ||
512 | |||
513 | if (copy > nbytes) { | ||
514 | if (copy > PAGE_SIZE) | ||
515 | return -EINVAL; | ||
516 | if (clear_user((char __user *)mask + nbytes, copy - nbytes)) | ||
517 | return -EFAULT; | ||
518 | copy = nbytes; | ||
519 | } | ||
520 | return copy_to_user(mask, nodes, copy) ? -EFAULT : 0; | ||
521 | } | ||
522 | |||
523 | /* Retrieve NUMA policy */ | 451 | /* Retrieve NUMA policy */ |
524 | asmlinkage long sys_get_mempolicy(int __user *policy, | 452 | long do_get_mempolicy(int *policy, nodemask_t *nmask, |
525 | unsigned long __user *nmask, | 453 | unsigned long addr, unsigned long flags) |
526 | unsigned long maxnode, | ||
527 | unsigned long addr, unsigned long flags) | ||
528 | { | 454 | { |
529 | int err, pval; | 455 | int err; |
530 | struct mm_struct *mm = current->mm; | 456 | struct mm_struct *mm = current->mm; |
531 | struct vm_area_struct *vma = NULL; | 457 | struct vm_area_struct *vma = NULL; |
532 | struct mempolicy *pol = current->mempolicy; | 458 | struct mempolicy *pol = current->mempolicy; |
533 | 459 | ||
534 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) | 460 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) |
535 | return -EINVAL; | 461 | return -EINVAL; |
536 | if (nmask != NULL && maxnode < MAX_NUMNODES) | ||
537 | return -EINVAL; | ||
538 | if (flags & MPOL_F_ADDR) { | 462 | if (flags & MPOL_F_ADDR) { |
539 | down_read(&mm->mmap_sem); | 463 | down_read(&mm->mmap_sem); |
540 | vma = find_vma_intersection(mm, addr, addr+1); | 464 | vma = find_vma_intersection(mm, addr, addr+1); |
@@ -557,31 +481,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy, | |||
557 | err = lookup_node(mm, addr); | 481 | err = lookup_node(mm, addr); |
558 | if (err < 0) | 482 | if (err < 0) |
559 | goto out; | 483 | goto out; |
560 | pval = err; | 484 | *policy = err; |
561 | } else if (pol == current->mempolicy && | 485 | } else if (pol == current->mempolicy && |
562 | pol->policy == MPOL_INTERLEAVE) { | 486 | pol->policy == MPOL_INTERLEAVE) { |
563 | pval = current->il_next; | 487 | *policy = current->il_next; |
564 | } else { | 488 | } else { |
565 | err = -EINVAL; | 489 | err = -EINVAL; |
566 | goto out; | 490 | goto out; |
567 | } | 491 | } |
568 | } else | 492 | } else |
569 | pval = pol->policy; | 493 | *policy = pol->policy; |
570 | 494 | ||
571 | if (vma) { | 495 | if (vma) { |
572 | up_read(&current->mm->mmap_sem); | 496 | up_read(&current->mm->mmap_sem); |
573 | vma = NULL; | 497 | vma = NULL; |
574 | } | 498 | } |
575 | 499 | ||
576 | if (policy && put_user(pval, policy)) | ||
577 | return -EFAULT; | ||
578 | |||
579 | err = 0; | 500 | err = 0; |
580 | if (nmask) { | 501 | if (nmask) |
581 | DECLARE_BITMAP(nodes, MAX_NUMNODES); | 502 | get_zonemask(pol, nmask); |
582 | get_zonemask(pol, nodes); | ||
583 | err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes)); | ||
584 | } | ||
585 | 503 | ||
586 | out: | 504 | out: |
587 | if (vma) | 505 | if (vma) |
@@ -589,6 +507,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy, | |||
589 | return err; | 507 | return err; |
590 | } | 508 | } |
591 | 509 | ||
510 | /* | ||
511 | * User space interface with variable sized bitmaps for nodelists. | ||
512 | */ | ||
513 | |||
514 | /* Copy a node mask from user space. */ | ||
515 | static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, | ||
516 | unsigned long maxnode) | ||
517 | { | ||
518 | unsigned long k; | ||
519 | unsigned long nlongs; | ||
520 | unsigned long endmask; | ||
521 | |||
522 | --maxnode; | ||
523 | nodes_clear(*nodes); | ||
524 | if (maxnode == 0 || !nmask) | ||
525 | return 0; | ||
526 | |||
527 | nlongs = BITS_TO_LONGS(maxnode); | ||
528 | if ((maxnode % BITS_PER_LONG) == 0) | ||
529 | endmask = ~0UL; | ||
530 | else | ||
531 | endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; | ||
532 | |||
533 | /* When the user specified more nodes than supported just check | ||
534 | if the non supported part is all zero. */ | ||
535 | if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { | ||
536 | if (nlongs > PAGE_SIZE/sizeof(long)) | ||
537 | return -EINVAL; | ||
538 | for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { | ||
539 | unsigned long t; | ||
540 | if (get_user(t, nmask + k)) | ||
541 | return -EFAULT; | ||
542 | if (k == nlongs - 1) { | ||
543 | if (t & endmask) | ||
544 | return -EINVAL; | ||
545 | } else if (t) | ||
546 | return -EINVAL; | ||
547 | } | ||
548 | nlongs = BITS_TO_LONGS(MAX_NUMNODES); | ||
549 | endmask = ~0UL; | ||
550 | } | ||
551 | |||
552 | if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) | ||
553 | return -EFAULT; | ||
554 | nodes_addr(*nodes)[nlongs-1] &= endmask; | ||
555 | return 0; | ||
556 | } | ||
557 | |||
558 | /* Copy a kernel node mask to user space */ | ||
559 | static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, | ||
560 | nodemask_t *nodes) | ||
561 | { | ||
562 | unsigned long copy = ALIGN(maxnode-1, 64) / 8; | ||
563 | const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); | ||
564 | |||
565 | if (copy > nbytes) { | ||
566 | if (copy > PAGE_SIZE) | ||
567 | return -EINVAL; | ||
568 | if (clear_user((char __user *)mask + nbytes, copy - nbytes)) | ||
569 | return -EFAULT; | ||
570 | copy = nbytes; | ||
571 | } | ||
572 | return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; | ||
573 | } | ||
574 | |||
575 | asmlinkage long sys_mbind(unsigned long start, unsigned long len, | ||
576 | unsigned long mode, | ||
577 | unsigned long __user *nmask, unsigned long maxnode, | ||
578 | unsigned flags) | ||
579 | { | ||
580 | nodemask_t nodes; | ||
581 | int err; | ||
582 | |||
583 | err = get_nodes(&nodes, nmask, maxnode); | ||
584 | if (err) | ||
585 | return err; | ||
586 | return do_mbind(start, len, mode, &nodes, flags); | ||
587 | } | ||
588 | |||
589 | /* Set the process memory policy */ | ||
590 | asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, | ||
591 | unsigned long maxnode) | ||
592 | { | ||
593 | int err; | ||
594 | nodemask_t nodes; | ||
595 | |||
596 | if (mode < 0 || mode > MPOL_MAX) | ||
597 | return -EINVAL; | ||
598 | err = get_nodes(&nodes, nmask, maxnode); | ||
599 | if (err) | ||
600 | return err; | ||
601 | return do_set_mempolicy(mode, &nodes); | ||
602 | } | ||
603 | |||
604 | /* Retrieve NUMA policy */ | ||
605 | asmlinkage long sys_get_mempolicy(int __user *policy, | ||
606 | unsigned long __user *nmask, | ||
607 | unsigned long maxnode, | ||
608 | unsigned long addr, unsigned long flags) | ||
609 | { | ||
610 | int err, pval; | ||
611 | nodemask_t nodes; | ||
612 | |||
613 | if (nmask != NULL && maxnode < MAX_NUMNODES) | ||
614 | return -EINVAL; | ||
615 | |||
616 | err = do_get_mempolicy(&pval, &nodes, addr, flags); | ||
617 | |||
618 | if (err) | ||
619 | return err; | ||
620 | |||
621 | if (policy && put_user(pval, policy)) | ||
622 | return -EFAULT; | ||
623 | |||
624 | if (nmask) | ||
625 | err = copy_nodes_to_user(nmask, maxnode, &nodes); | ||
626 | |||
627 | return err; | ||
628 | } | ||
629 | |||
592 | #ifdef CONFIG_COMPAT | 630 | #ifdef CONFIG_COMPAT |
593 | 631 | ||
594 | asmlinkage long compat_sys_get_mempolicy(int __user *policy, | 632 | asmlinkage long compat_sys_get_mempolicy(int __user *policy, |
@@ -649,15 +687,15 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
649 | long err = 0; | 687 | long err = 0; |
650 | unsigned long __user *nm = NULL; | 688 | unsigned long __user *nm = NULL; |
651 | unsigned long nr_bits, alloc_size; | 689 | unsigned long nr_bits, alloc_size; |
652 | DECLARE_BITMAP(bm, MAX_NUMNODES); | 690 | nodemask_t bm; |
653 | 691 | ||
654 | nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); | 692 | nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); |
655 | alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; | 693 | alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; |
656 | 694 | ||
657 | if (nmask) { | 695 | if (nmask) { |
658 | err = compat_get_bitmap(bm, nmask, nr_bits); | 696 | err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); |
659 | nm = compat_alloc_user_space(alloc_size); | 697 | nm = compat_alloc_user_space(alloc_size); |
660 | err |= copy_to_user(nm, bm, alloc_size); | 698 | err |= copy_to_user(nm, nodes_addr(bm), alloc_size); |
661 | } | 699 | } |
662 | 700 | ||
663 | if (err) | 701 | if (err) |
@@ -676,7 +714,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo | |||
676 | 714 | ||
677 | if (vma) { | 715 | if (vma) { |
678 | if (vma->vm_ops && vma->vm_ops->get_policy) | 716 | if (vma->vm_ops && vma->vm_ops->get_policy) |
679 | pol = vma->vm_ops->get_policy(vma, addr); | 717 | pol = vma->vm_ops->get_policy(vma, addr); |
680 | else if (vma->vm_policy && | 718 | else if (vma->vm_policy && |
681 | vma->vm_policy->policy != MPOL_DEFAULT) | 719 | vma->vm_policy->policy != MPOL_DEFAULT) |
682 | pol = vma->vm_policy; | 720 | pol = vma->vm_policy; |
@@ -722,10 +760,9 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
722 | struct task_struct *me = current; | 760 | struct task_struct *me = current; |
723 | 761 | ||
724 | nid = me->il_next; | 762 | nid = me->il_next; |
725 | BUG_ON(nid >= MAX_NUMNODES); | 763 | next = next_node(nid, policy->v.nodes); |
726 | next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid); | ||
727 | if (next >= MAX_NUMNODES) | 764 | if (next >= MAX_NUMNODES) |
728 | next = find_first_bit(policy->v.nodes, MAX_NUMNODES); | 765 | next = first_node(policy->v.nodes); |
729 | me->il_next = next; | 766 | me->il_next = next; |
730 | return nid; | 767 | return nid; |
731 | } | 768 | } |
@@ -734,29 +771,27 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
734 | static unsigned offset_il_node(struct mempolicy *pol, | 771 | static unsigned offset_il_node(struct mempolicy *pol, |
735 | struct vm_area_struct *vma, unsigned long off) | 772 | struct vm_area_struct *vma, unsigned long off) |
736 | { | 773 | { |
737 | unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES); | 774 | unsigned nnodes = nodes_weight(pol->v.nodes); |
738 | unsigned target = (unsigned)off % nnodes; | 775 | unsigned target = (unsigned)off % nnodes; |
739 | int c; | 776 | int c; |
740 | int nid = -1; | 777 | int nid = -1; |
741 | 778 | ||
742 | c = 0; | 779 | c = 0; |
743 | do { | 780 | do { |
744 | nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1); | 781 | nid = next_node(nid, pol->v.nodes); |
745 | c++; | 782 | c++; |
746 | } while (c <= target); | 783 | } while (c <= target); |
747 | BUG_ON(nid >= MAX_NUMNODES); | ||
748 | BUG_ON(!test_bit(nid, pol->v.nodes)); | ||
749 | return nid; | 784 | return nid; |
750 | } | 785 | } |
751 | 786 | ||
752 | /* Allocate a page in interleaved policy. | 787 | /* Allocate a page in interleaved policy. |
753 | Own path because it needs to do special accounting. */ | 788 | Own path because it needs to do special accounting. */ |
754 | static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid) | 789 | static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, |
790 | unsigned nid) | ||
755 | { | 791 | { |
756 | struct zonelist *zl; | 792 | struct zonelist *zl; |
757 | struct page *page; | 793 | struct page *page; |
758 | 794 | ||
759 | BUG_ON(!node_online(nid)); | ||
760 | zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); | 795 | zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); |
761 | page = __alloc_pages(gfp, order, zl); | 796 | page = __alloc_pages(gfp, order, zl); |
762 | if (page && page_zone(page) == zl->zones[0]) { | 797 | if (page && page_zone(page) == zl->zones[0]) { |
@@ -799,8 +834,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
799 | unsigned nid; | 834 | unsigned nid; |
800 | if (vma) { | 835 | if (vma) { |
801 | unsigned long off; | 836 | unsigned long off; |
802 | BUG_ON(addr >= vma->vm_end); | ||
803 | BUG_ON(addr < vma->vm_start); | ||
804 | off = vma->vm_pgoff; | 837 | off = vma->vm_pgoff; |
805 | off += (addr - vma->vm_start) >> PAGE_SHIFT; | 838 | off += (addr - vma->vm_start) >> PAGE_SHIFT; |
806 | nid = offset_il_node(pol, vma, off); | 839 | nid = offset_il_node(pol, vma, off); |
@@ -878,7 +911,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) | |||
878 | case MPOL_DEFAULT: | 911 | case MPOL_DEFAULT: |
879 | return 1; | 912 | return 1; |
880 | case MPOL_INTERLEAVE: | 913 | case MPOL_INTERLEAVE: |
881 | return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES); | 914 | return nodes_equal(a->v.nodes, b->v.nodes); |
882 | case MPOL_PREFERRED: | 915 | case MPOL_PREFERRED: |
883 | return a->v.preferred_node == b->v.preferred_node; | 916 | return a->v.preferred_node == b->v.preferred_node; |
884 | case MPOL_BIND: { | 917 | case MPOL_BIND: { |
@@ -1117,7 +1150,7 @@ int mpol_set_shared_policy(struct shared_policy *info, | |||
1117 | PDprintk("set_shared_policy %lx sz %lu %d %lx\n", | 1150 | PDprintk("set_shared_policy %lx sz %lu %d %lx\n", |
1118 | vma->vm_pgoff, | 1151 | vma->vm_pgoff, |
1119 | sz, npol? npol->policy : -1, | 1152 | sz, npol? npol->policy : -1, |
1120 | npol ? npol->v.nodes[0] : -1); | 1153 | npol ? nodes_addr(npol->v.nodes)[0] : -1); |
1121 | 1154 | ||
1122 | if (npol) { | 1155 | if (npol) { |
1123 | new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); | 1156 | new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); |
@@ -1164,14 +1197,12 @@ void __init numa_policy_init(void) | |||
1164 | /* Set interleaving policy for system init. This way not all | 1197 | /* Set interleaving policy for system init. This way not all |
1165 | the data structures allocated at system boot end up in node zero. */ | 1198 | the data structures allocated at system boot end up in node zero. */ |
1166 | 1199 | ||
1167 | if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map), | 1200 | if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map)) |
1168 | MAX_NUMNODES) < 0) | ||
1169 | printk("numa_policy_init: interleaving failed\n"); | 1201 | printk("numa_policy_init: interleaving failed\n"); |
1170 | } | 1202 | } |
1171 | 1203 | ||
1172 | /* Reset policy of current process to default. | 1204 | /* Reset policy of current process to default */ |
1173 | * Assumes fs == KERNEL_DS */ | ||
1174 | void numa_default_policy(void) | 1205 | void numa_default_policy(void) |
1175 | { | 1206 | { |
1176 | sys_set_mempolicy(MPOL_DEFAULT, NULL, 0); | 1207 | do_set_mempolicy(MPOL_DEFAULT, NULL); |
1177 | } | 1208 | } |
@@ -181,26 +181,36 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, | |||
181 | } | 181 | } |
182 | 182 | ||
183 | /* | 183 | /* |
184 | * Remove one vm structure and free it. | 184 | * Unlink a file-based vm structure from its prio_tree, to hide |
185 | * vma from rmap and vmtruncate before freeing its page tables. | ||
185 | */ | 186 | */ |
186 | static void remove_vm_struct(struct vm_area_struct *vma) | 187 | void unlink_file_vma(struct vm_area_struct *vma) |
187 | { | 188 | { |
188 | struct file *file = vma->vm_file; | 189 | struct file *file = vma->vm_file; |
189 | 190 | ||
190 | might_sleep(); | ||
191 | if (file) { | 191 | if (file) { |
192 | struct address_space *mapping = file->f_mapping; | 192 | struct address_space *mapping = file->f_mapping; |
193 | spin_lock(&mapping->i_mmap_lock); | 193 | spin_lock(&mapping->i_mmap_lock); |
194 | __remove_shared_vm_struct(vma, file, mapping); | 194 | __remove_shared_vm_struct(vma, file, mapping); |
195 | spin_unlock(&mapping->i_mmap_lock); | 195 | spin_unlock(&mapping->i_mmap_lock); |
196 | } | 196 | } |
197 | } | ||
198 | |||
199 | /* | ||
200 | * Close a vm structure and free it, returning the next. | ||
201 | */ | ||
202 | static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) | ||
203 | { | ||
204 | struct vm_area_struct *next = vma->vm_next; | ||
205 | |||
206 | might_sleep(); | ||
197 | if (vma->vm_ops && vma->vm_ops->close) | 207 | if (vma->vm_ops && vma->vm_ops->close) |
198 | vma->vm_ops->close(vma); | 208 | vma->vm_ops->close(vma); |
199 | if (file) | 209 | if (vma->vm_file) |
200 | fput(file); | 210 | fput(vma->vm_file); |
201 | anon_vma_unlink(vma); | ||
202 | mpol_free(vma_policy(vma)); | 211 | mpol_free(vma_policy(vma)); |
203 | kmem_cache_free(vm_area_cachep, vma); | 212 | kmem_cache_free(vm_area_cachep, vma); |
213 | return next; | ||
204 | } | 214 | } |
205 | 215 | ||
206 | asmlinkage unsigned long sys_brk(unsigned long brk) | 216 | asmlinkage unsigned long sys_brk(unsigned long brk) |
@@ -832,7 +842,7 @@ none: | |||
832 | } | 842 | } |
833 | 843 | ||
834 | #ifdef CONFIG_PROC_FS | 844 | #ifdef CONFIG_PROC_FS |
835 | void __vm_stat_account(struct mm_struct *mm, unsigned long flags, | 845 | void vm_stat_account(struct mm_struct *mm, unsigned long flags, |
836 | struct file *file, long pages) | 846 | struct file *file, long pages) |
837 | { | 847 | { |
838 | const unsigned long stack_flags | 848 | const unsigned long stack_flags |
@@ -1070,6 +1080,17 @@ munmap_back: | |||
1070 | error = file->f_op->mmap(file, vma); | 1080 | error = file->f_op->mmap(file, vma); |
1071 | if (error) | 1081 | if (error) |
1072 | goto unmap_and_free_vma; | 1082 | goto unmap_and_free_vma; |
1083 | if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED)) | ||
1084 | == (VM_WRITE | VM_RESERVED)) { | ||
1085 | printk(KERN_WARNING "program %s is using MAP_PRIVATE, " | ||
1086 | "PROT_WRITE mmap of VM_RESERVED memory, which " | ||
1087 | "is deprecated. Please report this to " | ||
1088 | "linux-kernel@vger.kernel.org\n",current->comm); | ||
1089 | if (vma->vm_ops && vma->vm_ops->close) | ||
1090 | vma->vm_ops->close(vma); | ||
1091 | error = -EACCES; | ||
1092 | goto unmap_and_free_vma; | ||
1093 | } | ||
1073 | } else if (vm_flags & VM_SHARED) { | 1094 | } else if (vm_flags & VM_SHARED) { |
1074 | error = shmem_zero_setup(vma); | 1095 | error = shmem_zero_setup(vma); |
1075 | if (error) | 1096 | if (error) |
@@ -1110,7 +1131,7 @@ munmap_back: | |||
1110 | } | 1131 | } |
1111 | out: | 1132 | out: |
1112 | mm->total_vm += len >> PAGE_SHIFT; | 1133 | mm->total_vm += len >> PAGE_SHIFT; |
1113 | __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1134 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1114 | if (vm_flags & VM_LOCKED) { | 1135 | if (vm_flags & VM_LOCKED) { |
1115 | mm->locked_vm += len >> PAGE_SHIFT; | 1136 | mm->locked_vm += len >> PAGE_SHIFT; |
1116 | make_pages_present(addr, addr + len); | 1137 | make_pages_present(addr, addr + len); |
@@ -1475,15 +1496,19 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un | |||
1475 | mm->total_vm += grow; | 1496 | mm->total_vm += grow; |
1476 | if (vma->vm_flags & VM_LOCKED) | 1497 | if (vma->vm_flags & VM_LOCKED) |
1477 | mm->locked_vm += grow; | 1498 | mm->locked_vm += grow; |
1478 | __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); | 1499 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); |
1479 | return 0; | 1500 | return 0; |
1480 | } | 1501 | } |
1481 | 1502 | ||
1482 | #ifdef CONFIG_STACK_GROWSUP | 1503 | #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) |
1483 | /* | 1504 | /* |
1484 | * vma is the first one with address > vma->vm_end. Have to extend vma. | 1505 | * PA-RISC uses this for its stack; IA64 for its Register Backing Store. |
1506 | * vma is the last one with address > vma->vm_end. Have to extend vma. | ||
1485 | */ | 1507 | */ |
1486 | int expand_stack(struct vm_area_struct * vma, unsigned long address) | 1508 | #ifdef CONFIG_STACK_GROWSUP |
1509 | static inline | ||
1510 | #endif | ||
1511 | int expand_upwards(struct vm_area_struct *vma, unsigned long address) | ||
1487 | { | 1512 | { |
1488 | int error; | 1513 | int error; |
1489 | 1514 | ||
@@ -1521,6 +1546,13 @@ int expand_stack(struct vm_area_struct * vma, unsigned long address) | |||
1521 | anon_vma_unlock(vma); | 1546 | anon_vma_unlock(vma); |
1522 | return error; | 1547 | return error; |
1523 | } | 1548 | } |
1549 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ | ||
1550 | |||
1551 | #ifdef CONFIG_STACK_GROWSUP | ||
1552 | int expand_stack(struct vm_area_struct *vma, unsigned long address) | ||
1553 | { | ||
1554 | return expand_upwards(vma, address); | ||
1555 | } | ||
1524 | 1556 | ||
1525 | struct vm_area_struct * | 1557 | struct vm_area_struct * |
1526 | find_extend_vma(struct mm_struct *mm, unsigned long addr) | 1558 | find_extend_vma(struct mm_struct *mm, unsigned long addr) |
@@ -1603,36 +1635,24 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) | |||
1603 | } | 1635 | } |
1604 | #endif | 1636 | #endif |
1605 | 1637 | ||
1606 | /* Normal function to fix up a mapping | ||
1607 | * This function is the default for when an area has no specific | ||
1608 | * function. This may be used as part of a more specific routine. | ||
1609 | * | ||
1610 | * By the time this function is called, the area struct has been | ||
1611 | * removed from the process mapping list. | ||
1612 | */ | ||
1613 | static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area) | ||
1614 | { | ||
1615 | size_t len = area->vm_end - area->vm_start; | ||
1616 | |||
1617 | area->vm_mm->total_vm -= len >> PAGE_SHIFT; | ||
1618 | if (area->vm_flags & VM_LOCKED) | ||
1619 | area->vm_mm->locked_vm -= len >> PAGE_SHIFT; | ||
1620 | vm_stat_unaccount(area); | ||
1621 | remove_vm_struct(area); | ||
1622 | } | ||
1623 | |||
1624 | /* | 1638 | /* |
1625 | * Update the VMA and inode share lists. | 1639 | * Ok - we have the memory areas we should free on the vma list, |
1626 | * | ||
1627 | * Ok - we have the memory areas we should free on the 'free' list, | ||
1628 | * so release them, and do the vma updates. | 1640 | * so release them, and do the vma updates. |
1641 | * | ||
1642 | * Called with the mm semaphore held. | ||
1629 | */ | 1643 | */ |
1630 | static void unmap_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) | 1644 | static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) |
1631 | { | 1645 | { |
1646 | /* Update high watermark before we lower total_vm */ | ||
1647 | update_hiwater_vm(mm); | ||
1632 | do { | 1648 | do { |
1633 | struct vm_area_struct *next = vma->vm_next; | 1649 | long nrpages = vma_pages(vma); |
1634 | unmap_vma(mm, vma); | 1650 | |
1635 | vma = next; | 1651 | mm->total_vm -= nrpages; |
1652 | if (vma->vm_flags & VM_LOCKED) | ||
1653 | mm->locked_vm -= nrpages; | ||
1654 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); | ||
1655 | vma = remove_vma(vma); | ||
1636 | } while (vma); | 1656 | } while (vma); |
1637 | validate_mm(mm); | 1657 | validate_mm(mm); |
1638 | } | 1658 | } |
@@ -1651,14 +1671,13 @@ static void unmap_region(struct mm_struct *mm, | |||
1651 | unsigned long nr_accounted = 0; | 1671 | unsigned long nr_accounted = 0; |
1652 | 1672 | ||
1653 | lru_add_drain(); | 1673 | lru_add_drain(); |
1654 | spin_lock(&mm->page_table_lock); | ||
1655 | tlb = tlb_gather_mmu(mm, 0); | 1674 | tlb = tlb_gather_mmu(mm, 0); |
1656 | unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); | 1675 | update_hiwater_rss(mm); |
1676 | unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); | ||
1657 | vm_unacct_memory(nr_accounted); | 1677 | vm_unacct_memory(nr_accounted); |
1658 | free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, | 1678 | free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, |
1659 | next? next->vm_start: 0); | 1679 | next? next->vm_start: 0); |
1660 | tlb_finish_mmu(tlb, start, end); | 1680 | tlb_finish_mmu(tlb, start, end); |
1661 | spin_unlock(&mm->page_table_lock); | ||
1662 | } | 1681 | } |
1663 | 1682 | ||
1664 | /* | 1683 | /* |
@@ -1799,7 +1818,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1799 | unmap_region(mm, vma, prev, start, end); | 1818 | unmap_region(mm, vma, prev, start, end); |
1800 | 1819 | ||
1801 | /* Fix up all other VM information */ | 1820 | /* Fix up all other VM information */ |
1802 | unmap_vma_list(mm, vma); | 1821 | remove_vma_list(mm, vma); |
1803 | 1822 | ||
1804 | return 0; | 1823 | return 0; |
1805 | } | 1824 | } |
@@ -1933,34 +1952,21 @@ void exit_mmap(struct mm_struct *mm) | |||
1933 | unsigned long end; | 1952 | unsigned long end; |
1934 | 1953 | ||
1935 | lru_add_drain(); | 1954 | lru_add_drain(); |
1936 | |||
1937 | spin_lock(&mm->page_table_lock); | ||
1938 | |||
1939 | flush_cache_mm(mm); | 1955 | flush_cache_mm(mm); |
1940 | tlb = tlb_gather_mmu(mm, 1); | 1956 | tlb = tlb_gather_mmu(mm, 1); |
1957 | /* Don't update_hiwater_rss(mm) here, do_exit already did */ | ||
1941 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ | 1958 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
1942 | end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); | 1959 | end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); |
1943 | vm_unacct_memory(nr_accounted); | 1960 | vm_unacct_memory(nr_accounted); |
1944 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); | 1961 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); |
1945 | tlb_finish_mmu(tlb, 0, end); | 1962 | tlb_finish_mmu(tlb, 0, end); |
1946 | 1963 | ||
1947 | mm->mmap = mm->mmap_cache = NULL; | ||
1948 | mm->mm_rb = RB_ROOT; | ||
1949 | set_mm_counter(mm, rss, 0); | ||
1950 | mm->total_vm = 0; | ||
1951 | mm->locked_vm = 0; | ||
1952 | |||
1953 | spin_unlock(&mm->page_table_lock); | ||
1954 | |||
1955 | /* | 1964 | /* |
1956 | * Walk the list again, actually closing and freeing it | 1965 | * Walk the list again, actually closing and freeing it, |
1957 | * without holding any MM locks. | 1966 | * with preemption enabled, without holding any MM locks. |
1958 | */ | 1967 | */ |
1959 | while (vma) { | 1968 | while (vma) |
1960 | struct vm_area_struct *next = vma->vm_next; | 1969 | vma = remove_vma(vma); |
1961 | remove_vm_struct(vma); | ||
1962 | vma = next; | ||
1963 | } | ||
1964 | 1970 | ||
1965 | BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); | 1971 | BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); |
1966 | } | 1972 | } |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 57577f63b305..17a2b52b753b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -29,8 +29,9 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
29 | unsigned long addr, unsigned long end, pgprot_t newprot) | 29 | unsigned long addr, unsigned long end, pgprot_t newprot) |
30 | { | 30 | { |
31 | pte_t *pte; | 31 | pte_t *pte; |
32 | spinlock_t *ptl; | ||
32 | 33 | ||
33 | pte = pte_offset_map(pmd, addr); | 34 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
34 | do { | 35 | do { |
35 | if (pte_present(*pte)) { | 36 | if (pte_present(*pte)) { |
36 | pte_t ptent; | 37 | pte_t ptent; |
@@ -44,7 +45,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
44 | lazy_mmu_prot_update(ptent); | 45 | lazy_mmu_prot_update(ptent); |
45 | } | 46 | } |
46 | } while (pte++, addr += PAGE_SIZE, addr != end); | 47 | } while (pte++, addr += PAGE_SIZE, addr != end); |
47 | pte_unmap(pte - 1); | 48 | pte_unmap_unlock(pte - 1, ptl); |
48 | } | 49 | } |
49 | 50 | ||
50 | static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, | 51 | static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, |
@@ -88,7 +89,6 @@ static void change_protection(struct vm_area_struct *vma, | |||
88 | BUG_ON(addr >= end); | 89 | BUG_ON(addr >= end); |
89 | pgd = pgd_offset(mm, addr); | 90 | pgd = pgd_offset(mm, addr); |
90 | flush_cache_range(vma, addr, end); | 91 | flush_cache_range(vma, addr, end); |
91 | spin_lock(&mm->page_table_lock); | ||
92 | do { | 92 | do { |
93 | next = pgd_addr_end(addr, end); | 93 | next = pgd_addr_end(addr, end); |
94 | if (pgd_none_or_clear_bad(pgd)) | 94 | if (pgd_none_or_clear_bad(pgd)) |
@@ -96,7 +96,6 @@ static void change_protection(struct vm_area_struct *vma, | |||
96 | change_pud_range(mm, pgd, addr, next, newprot); | 96 | change_pud_range(mm, pgd, addr, next, newprot); |
97 | } while (pgd++, addr = next, addr != end); | 97 | } while (pgd++, addr = next, addr != end); |
98 | flush_tlb_range(vma, start, end); | 98 | flush_tlb_range(vma, start, end); |
99 | spin_unlock(&mm->page_table_lock); | ||
100 | } | 99 | } |
101 | 100 | ||
102 | static int | 101 | static int |
@@ -125,6 +124,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
125 | * a MAP_NORESERVE private mapping to writable will now reserve. | 124 | * a MAP_NORESERVE private mapping to writable will now reserve. |
126 | */ | 125 | */ |
127 | if (newflags & VM_WRITE) { | 126 | if (newflags & VM_WRITE) { |
127 | if (oldflags & VM_RESERVED) { | ||
128 | BUG_ON(oldflags & VM_WRITE); | ||
129 | printk(KERN_WARNING "program %s is using MAP_PRIVATE, " | ||
130 | "PROT_WRITE mprotect of VM_RESERVED memory, " | ||
131 | "which is deprecated. Please report this to " | ||
132 | "linux-kernel@vger.kernel.org\n",current->comm); | ||
133 | return -EACCES; | ||
134 | } | ||
128 | if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { | 135 | if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { |
129 | charged = nrpages; | 136 | charged = nrpages; |
130 | if (security_vm_enough_memory(charged)) | 137 | if (security_vm_enough_memory(charged)) |
@@ -168,8 +175,8 @@ success: | |||
168 | vma->vm_flags = newflags; | 175 | vma->vm_flags = newflags; |
169 | vma->vm_page_prot = newprot; | 176 | vma->vm_page_prot = newprot; |
170 | change_protection(vma, start, end, newprot); | 177 | change_protection(vma, start, end, newprot); |
171 | __vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 178 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
172 | __vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 179 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
173 | return 0; | 180 | return 0; |
174 | 181 | ||
175 | fail: | 182 | fail: |
diff --git a/mm/mremap.c b/mm/mremap.c index f343fc73a8bd..b535438c363c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -22,35 +22,7 @@ | |||
22 | #include <asm/cacheflush.h> | 22 | #include <asm/cacheflush.h> |
23 | #include <asm/tlbflush.h> | 23 | #include <asm/tlbflush.h> |
24 | 24 | ||
25 | static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr) | 25 | static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) |
26 | { | ||
27 | pgd_t *pgd; | ||
28 | pud_t *pud; | ||
29 | pmd_t *pmd; | ||
30 | pte_t *pte = NULL; | ||
31 | |||
32 | pgd = pgd_offset(mm, addr); | ||
33 | if (pgd_none_or_clear_bad(pgd)) | ||
34 | goto end; | ||
35 | |||
36 | pud = pud_offset(pgd, addr); | ||
37 | if (pud_none_or_clear_bad(pud)) | ||
38 | goto end; | ||
39 | |||
40 | pmd = pmd_offset(pud, addr); | ||
41 | if (pmd_none_or_clear_bad(pmd)) | ||
42 | goto end; | ||
43 | |||
44 | pte = pte_offset_map_nested(pmd, addr); | ||
45 | if (pte_none(*pte)) { | ||
46 | pte_unmap_nested(pte); | ||
47 | pte = NULL; | ||
48 | } | ||
49 | end: | ||
50 | return pte; | ||
51 | } | ||
52 | |||
53 | static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr) | ||
54 | { | 26 | { |
55 | pgd_t *pgd; | 27 | pgd_t *pgd; |
56 | pud_t *pud; | 28 | pud_t *pud; |
@@ -68,35 +40,39 @@ static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr) | |||
68 | if (pmd_none_or_clear_bad(pmd)) | 40 | if (pmd_none_or_clear_bad(pmd)) |
69 | return NULL; | 41 | return NULL; |
70 | 42 | ||
71 | return pte_offset_map(pmd, addr); | 43 | return pmd; |
72 | } | 44 | } |
73 | 45 | ||
74 | static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr) | 46 | static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) |
75 | { | 47 | { |
76 | pgd_t *pgd; | 48 | pgd_t *pgd; |
77 | pud_t *pud; | 49 | pud_t *pud; |
78 | pmd_t *pmd; | 50 | pmd_t *pmd; |
79 | pte_t *pte = NULL; | ||
80 | 51 | ||
81 | pgd = pgd_offset(mm, addr); | 52 | pgd = pgd_offset(mm, addr); |
82 | |||
83 | pud = pud_alloc(mm, pgd, addr); | 53 | pud = pud_alloc(mm, pgd, addr); |
84 | if (!pud) | 54 | if (!pud) |
85 | return NULL; | 55 | return NULL; |
56 | |||
86 | pmd = pmd_alloc(mm, pud, addr); | 57 | pmd = pmd_alloc(mm, pud, addr); |
87 | if (pmd) | 58 | if (!pmd) |
88 | pte = pte_alloc_map(mm, pmd, addr); | 59 | return NULL; |
89 | return pte; | 60 | |
61 | if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) | ||
62 | return NULL; | ||
63 | |||
64 | return pmd; | ||
90 | } | 65 | } |
91 | 66 | ||
92 | static int | 67 | static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, |
93 | move_one_page(struct vm_area_struct *vma, unsigned long old_addr, | 68 | unsigned long old_addr, unsigned long old_end, |
94 | struct vm_area_struct *new_vma, unsigned long new_addr) | 69 | struct vm_area_struct *new_vma, pmd_t *new_pmd, |
70 | unsigned long new_addr) | ||
95 | { | 71 | { |
96 | struct address_space *mapping = NULL; | 72 | struct address_space *mapping = NULL; |
97 | struct mm_struct *mm = vma->vm_mm; | 73 | struct mm_struct *mm = vma->vm_mm; |
98 | int error = 0; | 74 | pte_t *old_pte, *new_pte, pte; |
99 | pte_t *src, *dst; | 75 | spinlock_t *old_ptl, *new_ptl; |
100 | 76 | ||
101 | if (vma->vm_file) { | 77 | if (vma->vm_file) { |
102 | /* | 78 | /* |
@@ -111,74 +87,69 @@ move_one_page(struct vm_area_struct *vma, unsigned long old_addr, | |||
111 | new_vma->vm_truncate_count != vma->vm_truncate_count) | 87 | new_vma->vm_truncate_count != vma->vm_truncate_count) |
112 | new_vma->vm_truncate_count = 0; | 88 | new_vma->vm_truncate_count = 0; |
113 | } | 89 | } |
114 | spin_lock(&mm->page_table_lock); | ||
115 | 90 | ||
116 | src = get_one_pte_map_nested(mm, old_addr); | 91 | /* |
117 | if (src) { | 92 | * We don't have to worry about the ordering of src and dst |
118 | /* | 93 | * pte locks because exclusive mmap_sem prevents deadlock. |
119 | * Look to see whether alloc_one_pte_map needs to perform a | 94 | */ |
120 | * memory allocation. If it does then we need to drop the | 95 | old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); |
121 | * atomic kmap | 96 | new_pte = pte_offset_map_nested(new_pmd, new_addr); |
122 | */ | 97 | new_ptl = pte_lockptr(mm, new_pmd); |
123 | dst = get_one_pte_map(mm, new_addr); | 98 | if (new_ptl != old_ptl) |
124 | if (unlikely(!dst)) { | 99 | spin_lock(new_ptl); |
125 | pte_unmap_nested(src); | 100 | |
126 | if (mapping) | 101 | for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, |
127 | spin_unlock(&mapping->i_mmap_lock); | 102 | new_pte++, new_addr += PAGE_SIZE) { |
128 | dst = alloc_one_pte_map(mm, new_addr); | 103 | if (pte_none(*old_pte)) |
129 | if (mapping && !spin_trylock(&mapping->i_mmap_lock)) { | 104 | continue; |
130 | spin_unlock(&mm->page_table_lock); | 105 | pte = ptep_clear_flush(vma, old_addr, old_pte); |
131 | spin_lock(&mapping->i_mmap_lock); | 106 | /* ZERO_PAGE can be dependant on virtual addr */ |
132 | spin_lock(&mm->page_table_lock); | 107 | pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); |
133 | } | 108 | set_pte_at(mm, new_addr, new_pte, pte); |
134 | src = get_one_pte_map_nested(mm, old_addr); | ||
135 | } | ||
136 | /* | ||
137 | * Since alloc_one_pte_map can drop and re-acquire | ||
138 | * page_table_lock, we should re-check the src entry... | ||
139 | */ | ||
140 | if (src) { | ||
141 | if (dst) { | ||
142 | pte_t pte; | ||
143 | pte = ptep_clear_flush(vma, old_addr, src); | ||
144 | |||
145 | /* ZERO_PAGE can be dependant on virtual addr */ | ||
146 | pte = move_pte(pte, new_vma->vm_page_prot, | ||
147 | old_addr, new_addr); | ||
148 | set_pte_at(mm, new_addr, dst, pte); | ||
149 | } else | ||
150 | error = -ENOMEM; | ||
151 | pte_unmap_nested(src); | ||
152 | } | ||
153 | if (dst) | ||
154 | pte_unmap(dst); | ||
155 | } | 109 | } |
156 | spin_unlock(&mm->page_table_lock); | 110 | |
111 | if (new_ptl != old_ptl) | ||
112 | spin_unlock(new_ptl); | ||
113 | pte_unmap_nested(new_pte - 1); | ||
114 | pte_unmap_unlock(old_pte - 1, old_ptl); | ||
157 | if (mapping) | 115 | if (mapping) |
158 | spin_unlock(&mapping->i_mmap_lock); | 116 | spin_unlock(&mapping->i_mmap_lock); |
159 | return error; | ||
160 | } | 117 | } |
161 | 118 | ||
119 | #define LATENCY_LIMIT (64 * PAGE_SIZE) | ||
120 | |||
162 | static unsigned long move_page_tables(struct vm_area_struct *vma, | 121 | static unsigned long move_page_tables(struct vm_area_struct *vma, |
163 | unsigned long old_addr, struct vm_area_struct *new_vma, | 122 | unsigned long old_addr, struct vm_area_struct *new_vma, |
164 | unsigned long new_addr, unsigned long len) | 123 | unsigned long new_addr, unsigned long len) |
165 | { | 124 | { |
166 | unsigned long offset; | 125 | unsigned long extent, next, old_end; |
126 | pmd_t *old_pmd, *new_pmd; | ||
167 | 127 | ||
168 | flush_cache_range(vma, old_addr, old_addr + len); | 128 | old_end = old_addr + len; |
129 | flush_cache_range(vma, old_addr, old_end); | ||
169 | 130 | ||
170 | /* | 131 | for (; old_addr < old_end; old_addr += extent, new_addr += extent) { |
171 | * This is not the clever way to do this, but we're taking the | ||
172 | * easy way out on the assumption that most remappings will be | ||
173 | * only a few pages.. This also makes error recovery easier. | ||
174 | */ | ||
175 | for (offset = 0; offset < len; offset += PAGE_SIZE) { | ||
176 | if (move_one_page(vma, old_addr + offset, | ||
177 | new_vma, new_addr + offset) < 0) | ||
178 | break; | ||
179 | cond_resched(); | 132 | cond_resched(); |
133 | next = (old_addr + PMD_SIZE) & PMD_MASK; | ||
134 | if (next - 1 > old_end) | ||
135 | next = old_end; | ||
136 | extent = next - old_addr; | ||
137 | old_pmd = get_old_pmd(vma->vm_mm, old_addr); | ||
138 | if (!old_pmd) | ||
139 | continue; | ||
140 | new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); | ||
141 | if (!new_pmd) | ||
142 | break; | ||
143 | next = (new_addr + PMD_SIZE) & PMD_MASK; | ||
144 | if (extent > next - new_addr) | ||
145 | extent = next - new_addr; | ||
146 | if (extent > LATENCY_LIMIT) | ||
147 | extent = LATENCY_LIMIT; | ||
148 | move_ptes(vma, old_pmd, old_addr, old_addr + extent, | ||
149 | new_vma, new_pmd, new_addr); | ||
180 | } | 150 | } |
181 | return offset; | 151 | |
152 | return len + old_addr - old_end; /* how much done */ | ||
182 | } | 153 | } |
183 | 154 | ||
184 | static unsigned long move_vma(struct vm_area_struct *vma, | 155 | static unsigned long move_vma(struct vm_area_struct *vma, |
@@ -191,6 +162,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
191 | unsigned long new_pgoff; | 162 | unsigned long new_pgoff; |
192 | unsigned long moved_len; | 163 | unsigned long moved_len; |
193 | unsigned long excess = 0; | 164 | unsigned long excess = 0; |
165 | unsigned long hiwater_vm; | ||
194 | int split = 0; | 166 | int split = 0; |
195 | 167 | ||
196 | /* | 168 | /* |
@@ -229,17 +201,24 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
229 | } | 201 | } |
230 | 202 | ||
231 | /* | 203 | /* |
232 | * if we failed to move page tables we still do total_vm increment | 204 | * If we failed to move page tables we still do total_vm increment |
233 | * since do_munmap() will decrement it by old_len == new_len | 205 | * since do_munmap() will decrement it by old_len == new_len. |
206 | * | ||
207 | * Since total_vm is about to be raised artificially high for a | ||
208 | * moment, we need to restore high watermark afterwards: if stats | ||
209 | * are taken meanwhile, total_vm and hiwater_vm appear too high. | ||
210 | * If this were a serious issue, we'd add a flag to do_munmap(). | ||
234 | */ | 211 | */ |
212 | hiwater_vm = mm->hiwater_vm; | ||
235 | mm->total_vm += new_len >> PAGE_SHIFT; | 213 | mm->total_vm += new_len >> PAGE_SHIFT; |
236 | __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); | 214 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); |
237 | 215 | ||
238 | if (do_munmap(mm, old_addr, old_len) < 0) { | 216 | if (do_munmap(mm, old_addr, old_len) < 0) { |
239 | /* OOM: unable to split vma, just get accounts right */ | 217 | /* OOM: unable to split vma, just get accounts right */ |
240 | vm_unacct_memory(excess >> PAGE_SHIFT); | 218 | vm_unacct_memory(excess >> PAGE_SHIFT); |
241 | excess = 0; | 219 | excess = 0; |
242 | } | 220 | } |
221 | mm->hiwater_vm = hiwater_vm; | ||
243 | 222 | ||
244 | /* Restore VM_ACCOUNT if one or two pieces of vma left */ | 223 | /* Restore VM_ACCOUNT if one or two pieces of vma left */ |
245 | if (excess) { | 224 | if (excess) { |
@@ -269,6 +248,7 @@ unsigned long do_mremap(unsigned long addr, | |||
269 | unsigned long old_len, unsigned long new_len, | 248 | unsigned long old_len, unsigned long new_len, |
270 | unsigned long flags, unsigned long new_addr) | 249 | unsigned long flags, unsigned long new_addr) |
271 | { | 250 | { |
251 | struct mm_struct *mm = current->mm; | ||
272 | struct vm_area_struct *vma; | 252 | struct vm_area_struct *vma; |
273 | unsigned long ret = -EINVAL; | 253 | unsigned long ret = -EINVAL; |
274 | unsigned long charged = 0; | 254 | unsigned long charged = 0; |
@@ -309,7 +289,7 @@ unsigned long do_mremap(unsigned long addr, | |||
309 | if ((addr <= new_addr) && (addr+old_len) > new_addr) | 289 | if ((addr <= new_addr) && (addr+old_len) > new_addr) |
310 | goto out; | 290 | goto out; |
311 | 291 | ||
312 | ret = do_munmap(current->mm, new_addr, new_len); | 292 | ret = do_munmap(mm, new_addr, new_len); |
313 | if (ret) | 293 | if (ret) |
314 | goto out; | 294 | goto out; |
315 | } | 295 | } |
@@ -320,7 +300,7 @@ unsigned long do_mremap(unsigned long addr, | |||
320 | * do_munmap does all the needed commit accounting | 300 | * do_munmap does all the needed commit accounting |
321 | */ | 301 | */ |
322 | if (old_len >= new_len) { | 302 | if (old_len >= new_len) { |
323 | ret = do_munmap(current->mm, addr+new_len, old_len - new_len); | 303 | ret = do_munmap(mm, addr+new_len, old_len - new_len); |
324 | if (ret && old_len != new_len) | 304 | if (ret && old_len != new_len) |
325 | goto out; | 305 | goto out; |
326 | ret = addr; | 306 | ret = addr; |
@@ -333,7 +313,7 @@ unsigned long do_mremap(unsigned long addr, | |||
333 | * Ok, we need to grow.. or relocate. | 313 | * Ok, we need to grow.. or relocate. |
334 | */ | 314 | */ |
335 | ret = -EFAULT; | 315 | ret = -EFAULT; |
336 | vma = find_vma(current->mm, addr); | 316 | vma = find_vma(mm, addr); |
337 | if (!vma || vma->vm_start > addr) | 317 | if (!vma || vma->vm_start > addr) |
338 | goto out; | 318 | goto out; |
339 | if (is_vm_hugetlb_page(vma)) { | 319 | if (is_vm_hugetlb_page(vma)) { |
@@ -349,14 +329,14 @@ unsigned long do_mremap(unsigned long addr, | |||
349 | } | 329 | } |
350 | if (vma->vm_flags & VM_LOCKED) { | 330 | if (vma->vm_flags & VM_LOCKED) { |
351 | unsigned long locked, lock_limit; | 331 | unsigned long locked, lock_limit; |
352 | locked = current->mm->locked_vm << PAGE_SHIFT; | 332 | locked = mm->locked_vm << PAGE_SHIFT; |
353 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 333 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; |
354 | locked += new_len - old_len; | 334 | locked += new_len - old_len; |
355 | ret = -EAGAIN; | 335 | ret = -EAGAIN; |
356 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 336 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
357 | goto out; | 337 | goto out; |
358 | } | 338 | } |
359 | if (!may_expand_vm(current->mm, (new_len - old_len) >> PAGE_SHIFT)) { | 339 | if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) { |
360 | ret = -ENOMEM; | 340 | ret = -ENOMEM; |
361 | goto out; | 341 | goto out; |
362 | } | 342 | } |
@@ -383,11 +363,10 @@ unsigned long do_mremap(unsigned long addr, | |||
383 | vma_adjust(vma, vma->vm_start, | 363 | vma_adjust(vma, vma->vm_start, |
384 | addr + new_len, vma->vm_pgoff, NULL); | 364 | addr + new_len, vma->vm_pgoff, NULL); |
385 | 365 | ||
386 | current->mm->total_vm += pages; | 366 | mm->total_vm += pages; |
387 | __vm_stat_account(vma->vm_mm, vma->vm_flags, | 367 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); |
388 | vma->vm_file, pages); | ||
389 | if (vma->vm_flags & VM_LOCKED) { | 368 | if (vma->vm_flags & VM_LOCKED) { |
390 | current->mm->locked_vm += pages; | 369 | mm->locked_vm += pages; |
391 | make_pages_present(addr + old_len, | 370 | make_pages_present(addr + old_len, |
392 | addr + new_len); | 371 | addr + new_len); |
393 | } | 372 | } |
diff --git a/mm/msync.c b/mm/msync.c index d0f5a1bce7cb..0e040e9c39d8 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
@@ -17,40 +17,48 @@ | |||
17 | #include <asm/pgtable.h> | 17 | #include <asm/pgtable.h> |
18 | #include <asm/tlbflush.h> | 18 | #include <asm/tlbflush.h> |
19 | 19 | ||
20 | /* | 20 | static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
21 | * Called with mm->page_table_lock held to protect against other | ||
22 | * threads/the swapper from ripping pte's out from under us. | ||
23 | */ | ||
24 | |||
25 | static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | ||
26 | unsigned long addr, unsigned long end) | 21 | unsigned long addr, unsigned long end) |
27 | { | 22 | { |
28 | pte_t *pte; | 23 | pte_t *pte; |
24 | spinlock_t *ptl; | ||
25 | int progress = 0; | ||
29 | 26 | ||
30 | pte = pte_offset_map(pmd, addr); | 27 | again: |
28 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
31 | do { | 29 | do { |
32 | unsigned long pfn; | 30 | unsigned long pfn; |
33 | struct page *page; | 31 | struct page *page; |
34 | 32 | ||
33 | if (progress >= 64) { | ||
34 | progress = 0; | ||
35 | if (need_resched() || need_lockbreak(ptl)) | ||
36 | break; | ||
37 | } | ||
38 | progress++; | ||
35 | if (!pte_present(*pte)) | 39 | if (!pte_present(*pte)) |
36 | continue; | 40 | continue; |
37 | if (!pte_maybe_dirty(*pte)) | 41 | if (!pte_maybe_dirty(*pte)) |
38 | continue; | 42 | continue; |
39 | pfn = pte_pfn(*pte); | 43 | pfn = pte_pfn(*pte); |
40 | if (!pfn_valid(pfn)) | 44 | if (unlikely(!pfn_valid(pfn))) { |
45 | print_bad_pte(vma, *pte, addr); | ||
41 | continue; | 46 | continue; |
47 | } | ||
42 | page = pfn_to_page(pfn); | 48 | page = pfn_to_page(pfn); |
43 | if (PageReserved(page)) | ||
44 | continue; | ||
45 | 49 | ||
46 | if (ptep_clear_flush_dirty(vma, addr, pte) || | 50 | if (ptep_clear_flush_dirty(vma, addr, pte) || |
47 | page_test_and_clear_dirty(page)) | 51 | page_test_and_clear_dirty(page)) |
48 | set_page_dirty(page); | 52 | set_page_dirty(page); |
53 | progress += 3; | ||
49 | } while (pte++, addr += PAGE_SIZE, addr != end); | 54 | } while (pte++, addr += PAGE_SIZE, addr != end); |
50 | pte_unmap(pte - 1); | 55 | pte_unmap_unlock(pte - 1, ptl); |
56 | cond_resched(); | ||
57 | if (addr != end) | ||
58 | goto again; | ||
51 | } | 59 | } |
52 | 60 | ||
53 | static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 61 | static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
54 | unsigned long addr, unsigned long end) | 62 | unsigned long addr, unsigned long end) |
55 | { | 63 | { |
56 | pmd_t *pmd; | 64 | pmd_t *pmd; |
@@ -61,11 +69,11 @@ static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
61 | next = pmd_addr_end(addr, end); | 69 | next = pmd_addr_end(addr, end); |
62 | if (pmd_none_or_clear_bad(pmd)) | 70 | if (pmd_none_or_clear_bad(pmd)) |
63 | continue; | 71 | continue; |
64 | sync_pte_range(vma, pmd, addr, next); | 72 | msync_pte_range(vma, pmd, addr, next); |
65 | } while (pmd++, addr = next, addr != end); | 73 | } while (pmd++, addr = next, addr != end); |
66 | } | 74 | } |
67 | 75 | ||
68 | static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 76 | static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
69 | unsigned long addr, unsigned long end) | 77 | unsigned long addr, unsigned long end) |
70 | { | 78 | { |
71 | pud_t *pud; | 79 | pud_t *pud; |
@@ -76,58 +84,34 @@ static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | |||
76 | next = pud_addr_end(addr, end); | 84 | next = pud_addr_end(addr, end); |
77 | if (pud_none_or_clear_bad(pud)) | 85 | if (pud_none_or_clear_bad(pud)) |
78 | continue; | 86 | continue; |
79 | sync_pmd_range(vma, pud, addr, next); | 87 | msync_pmd_range(vma, pud, addr, next); |
80 | } while (pud++, addr = next, addr != end); | 88 | } while (pud++, addr = next, addr != end); |
81 | } | 89 | } |
82 | 90 | ||
83 | static void sync_page_range(struct vm_area_struct *vma, | 91 | static void msync_page_range(struct vm_area_struct *vma, |
84 | unsigned long addr, unsigned long end) | 92 | unsigned long addr, unsigned long end) |
85 | { | 93 | { |
86 | struct mm_struct *mm = vma->vm_mm; | ||
87 | pgd_t *pgd; | 94 | pgd_t *pgd; |
88 | unsigned long next; | 95 | unsigned long next; |
89 | 96 | ||
90 | /* For hugepages we can't go walking the page table normally, | 97 | /* For hugepages we can't go walking the page table normally, |
91 | * but that's ok, hugetlbfs is memory based, so we don't need | 98 | * but that's ok, hugetlbfs is memory based, so we don't need |
92 | * to do anything more on an msync() */ | 99 | * to do anything more on an msync(). |
93 | if (is_vm_hugetlb_page(vma)) | 100 | * Can't do anything with VM_RESERVED regions either. |
101 | */ | ||
102 | if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED)) | ||
94 | return; | 103 | return; |
95 | 104 | ||
96 | BUG_ON(addr >= end); | 105 | BUG_ON(addr >= end); |
97 | pgd = pgd_offset(mm, addr); | 106 | pgd = pgd_offset(vma->vm_mm, addr); |
98 | flush_cache_range(vma, addr, end); | 107 | flush_cache_range(vma, addr, end); |
99 | spin_lock(&mm->page_table_lock); | ||
100 | do { | 108 | do { |
101 | next = pgd_addr_end(addr, end); | 109 | next = pgd_addr_end(addr, end); |
102 | if (pgd_none_or_clear_bad(pgd)) | 110 | if (pgd_none_or_clear_bad(pgd)) |
103 | continue; | 111 | continue; |
104 | sync_pud_range(vma, pgd, addr, next); | 112 | msync_pud_range(vma, pgd, addr, next); |
105 | } while (pgd++, addr = next, addr != end); | 113 | } while (pgd++, addr = next, addr != end); |
106 | spin_unlock(&mm->page_table_lock); | ||
107 | } | ||
108 | |||
109 | #ifdef CONFIG_PREEMPT | ||
110 | static inline void filemap_sync(struct vm_area_struct *vma, | ||
111 | unsigned long addr, unsigned long end) | ||
112 | { | ||
113 | const size_t chunk = 64 * 1024; /* bytes */ | ||
114 | unsigned long next; | ||
115 | |||
116 | do { | ||
117 | next = addr + chunk; | ||
118 | if (next > end || next < addr) | ||
119 | next = end; | ||
120 | sync_page_range(vma, addr, next); | ||
121 | cond_resched(); | ||
122 | } while (addr = next, addr != end); | ||
123 | } | ||
124 | #else | ||
125 | static inline void filemap_sync(struct vm_area_struct *vma, | ||
126 | unsigned long addr, unsigned long end) | ||
127 | { | ||
128 | sync_page_range(vma, addr, end); | ||
129 | } | 114 | } |
130 | #endif | ||
131 | 115 | ||
132 | /* | 116 | /* |
133 | * MS_SYNC syncs the entire file - including mappings. | 117 | * MS_SYNC syncs the entire file - including mappings. |
@@ -150,7 +134,7 @@ static int msync_interval(struct vm_area_struct *vma, | |||
150 | return -EBUSY; | 134 | return -EBUSY; |
151 | 135 | ||
152 | if (file && (vma->vm_flags & VM_SHARED)) { | 136 | if (file && (vma->vm_flags & VM_SHARED)) { |
153 | filemap_sync(vma, addr, end); | 137 | msync_page_range(vma, addr, end); |
154 | 138 | ||
155 | if (flags & MS_SYNC) { | 139 | if (flags & MS_SYNC) { |
156 | struct address_space *mapping = file->f_mapping; | 140 | struct address_space *mapping = file->f_mapping; |
diff --git a/mm/nommu.c b/mm/nommu.c index 0ef241ae3763..d1e076a487cb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -931,6 +931,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) | |||
931 | realalloc -= kobjsize(vml); | 931 | realalloc -= kobjsize(vml); |
932 | askedalloc -= sizeof(*vml); | 932 | askedalloc -= sizeof(*vml); |
933 | kfree(vml); | 933 | kfree(vml); |
934 | |||
935 | update_hiwater_vm(mm); | ||
934 | mm->total_vm -= len >> PAGE_SHIFT; | 936 | mm->total_vm -= len >> PAGE_SHIFT; |
935 | 937 | ||
936 | #ifdef DEBUG | 938 | #ifdef DEBUG |
@@ -1047,7 +1049,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | |||
1047 | 1049 | ||
1048 | EXPORT_SYMBOL(find_vma); | 1050 | EXPORT_SYMBOL(find_vma); |
1049 | 1051 | ||
1050 | struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write) | 1052 | struct page *follow_page(struct mm_struct *mm, unsigned long address, |
1053 | unsigned int foll_flags) | ||
1051 | { | 1054 | { |
1052 | return NULL; | 1055 | return NULL; |
1053 | } | 1056 | } |
@@ -1078,19 +1081,6 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr) | |||
1078 | { | 1081 | { |
1079 | } | 1082 | } |
1080 | 1083 | ||
1081 | void update_mem_hiwater(struct task_struct *tsk) | ||
1082 | { | ||
1083 | unsigned long rss; | ||
1084 | |||
1085 | if (likely(tsk->mm)) { | ||
1086 | rss = get_mm_counter(tsk->mm, rss); | ||
1087 | if (tsk->mm->hiwater_rss < rss) | ||
1088 | tsk->mm->hiwater_rss = rss; | ||
1089 | if (tsk->mm->hiwater_vm < tsk->mm->total_vm) | ||
1090 | tsk->mm->hiwater_vm = tsk->mm->total_vm; | ||
1091 | } | ||
1092 | } | ||
1093 | |||
1094 | void unmap_mapping_range(struct address_space *mapping, | 1084 | void unmap_mapping_range(struct address_space *mapping, |
1095 | loff_t const holebegin, loff_t const holelen, | 1085 | loff_t const holebegin, loff_t const holelen, |
1096 | int even_cows) | 1086 | int even_cows) |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94c864eac9c4..2dbdd98426fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/sysctl.h> | 33 | #include <linux/sysctl.h> |
34 | #include <linux/cpu.h> | 34 | #include <linux/cpu.h> |
35 | #include <linux/cpuset.h> | 35 | #include <linux/cpuset.h> |
36 | #include <linux/memory_hotplug.h> | ||
36 | #include <linux/nodemask.h> | 37 | #include <linux/nodemask.h> |
37 | #include <linux/vmalloc.h> | 38 | #include <linux/vmalloc.h> |
38 | 39 | ||
@@ -78,21 +79,44 @@ int min_free_kbytes = 1024; | |||
78 | unsigned long __initdata nr_kernel_pages; | 79 | unsigned long __initdata nr_kernel_pages; |
79 | unsigned long __initdata nr_all_pages; | 80 | unsigned long __initdata nr_all_pages; |
80 | 81 | ||
82 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | ||
83 | { | ||
84 | int ret = 0; | ||
85 | unsigned seq; | ||
86 | unsigned long pfn = page_to_pfn(page); | ||
87 | |||
88 | do { | ||
89 | seq = zone_span_seqbegin(zone); | ||
90 | if (pfn >= zone->zone_start_pfn + zone->spanned_pages) | ||
91 | ret = 1; | ||
92 | else if (pfn < zone->zone_start_pfn) | ||
93 | ret = 1; | ||
94 | } while (zone_span_seqretry(zone, seq)); | ||
95 | |||
96 | return ret; | ||
97 | } | ||
98 | |||
99 | static int page_is_consistent(struct zone *zone, struct page *page) | ||
100 | { | ||
101 | #ifdef CONFIG_HOLES_IN_ZONE | ||
102 | if (!pfn_valid(page_to_pfn(page))) | ||
103 | return 0; | ||
104 | #endif | ||
105 | if (zone != page_zone(page)) | ||
106 | return 0; | ||
107 | |||
108 | return 1; | ||
109 | } | ||
81 | /* | 110 | /* |
82 | * Temporary debugging check for pages not lying within a given zone. | 111 | * Temporary debugging check for pages not lying within a given zone. |
83 | */ | 112 | */ |
84 | static int bad_range(struct zone *zone, struct page *page) | 113 | static int bad_range(struct zone *zone, struct page *page) |
85 | { | 114 | { |
86 | if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) | 115 | if (page_outside_zone_boundaries(zone, page)) |
87 | return 1; | 116 | return 1; |
88 | if (page_to_pfn(page) < zone->zone_start_pfn) | 117 | if (!page_is_consistent(zone, page)) |
89 | return 1; | ||
90 | #ifdef CONFIG_HOLES_IN_ZONE | ||
91 | if (!pfn_valid(page_to_pfn(page))) | ||
92 | return 1; | ||
93 | #endif | ||
94 | if (zone != page_zone(page)) | ||
95 | return 1; | 118 | return 1; |
119 | |||
96 | return 0; | 120 | return 0; |
97 | } | 121 | } |
98 | 122 | ||
@@ -114,7 +138,8 @@ static void bad_page(const char *function, struct page *page) | |||
114 | 1 << PG_reclaim | | 138 | 1 << PG_reclaim | |
115 | 1 << PG_slab | | 139 | 1 << PG_slab | |
116 | 1 << PG_swapcache | | 140 | 1 << PG_swapcache | |
117 | 1 << PG_writeback); | 141 | 1 << PG_writeback | |
142 | 1 << PG_reserved ); | ||
118 | set_page_count(page, 0); | 143 | set_page_count(page, 0); |
119 | reset_page_mapcount(page); | 144 | reset_page_mapcount(page); |
120 | page->mapping = NULL; | 145 | page->mapping = NULL; |
@@ -153,7 +178,7 @@ static void prep_compound_page(struct page *page, unsigned long order) | |||
153 | struct page *p = page + i; | 178 | struct page *p = page + i; |
154 | 179 | ||
155 | SetPageCompound(p); | 180 | SetPageCompound(p); |
156 | p->private = (unsigned long)page; | 181 | set_page_private(p, (unsigned long)page); |
157 | } | 182 | } |
158 | } | 183 | } |
159 | 184 | ||
@@ -173,7 +198,7 @@ static void destroy_compound_page(struct page *page, unsigned long order) | |||
173 | 198 | ||
174 | if (!PageCompound(p)) | 199 | if (!PageCompound(p)) |
175 | bad_page(__FUNCTION__, page); | 200 | bad_page(__FUNCTION__, page); |
176 | if (p->private != (unsigned long)page) | 201 | if (page_private(p) != (unsigned long)page) |
177 | bad_page(__FUNCTION__, page); | 202 | bad_page(__FUNCTION__, page); |
178 | ClearPageCompound(p); | 203 | ClearPageCompound(p); |
179 | } | 204 | } |
@@ -186,18 +211,18 @@ static void destroy_compound_page(struct page *page, unsigned long order) | |||
186 | * So, we don't need atomic page->flags operations here. | 211 | * So, we don't need atomic page->flags operations here. |
187 | */ | 212 | */ |
188 | static inline unsigned long page_order(struct page *page) { | 213 | static inline unsigned long page_order(struct page *page) { |
189 | return page->private; | 214 | return page_private(page); |
190 | } | 215 | } |
191 | 216 | ||
192 | static inline void set_page_order(struct page *page, int order) { | 217 | static inline void set_page_order(struct page *page, int order) { |
193 | page->private = order; | 218 | set_page_private(page, order); |
194 | __SetPagePrivate(page); | 219 | __SetPagePrivate(page); |
195 | } | 220 | } |
196 | 221 | ||
197 | static inline void rmv_page_order(struct page *page) | 222 | static inline void rmv_page_order(struct page *page) |
198 | { | 223 | { |
199 | __ClearPagePrivate(page); | 224 | __ClearPagePrivate(page); |
200 | page->private = 0; | 225 | set_page_private(page, 0); |
201 | } | 226 | } |
202 | 227 | ||
203 | /* | 228 | /* |
@@ -237,14 +262,13 @@ __find_combined_index(unsigned long page_idx, unsigned int order) | |||
237 | * (a) the buddy is free && | 262 | * (a) the buddy is free && |
238 | * (b) the buddy is on the buddy system && | 263 | * (b) the buddy is on the buddy system && |
239 | * (c) a page and its buddy have the same order. | 264 | * (c) a page and its buddy have the same order. |
240 | * for recording page's order, we use page->private and PG_private. | 265 | * for recording page's order, we use page_private(page) and PG_private. |
241 | * | 266 | * |
242 | */ | 267 | */ |
243 | static inline int page_is_buddy(struct page *page, int order) | 268 | static inline int page_is_buddy(struct page *page, int order) |
244 | { | 269 | { |
245 | if (PagePrivate(page) && | 270 | if (PagePrivate(page) && |
246 | (page_order(page) == order) && | 271 | (page_order(page) == order) && |
247 | !PageReserved(page) && | ||
248 | page_count(page) == 0) | 272 | page_count(page) == 0) |
249 | return 1; | 273 | return 1; |
250 | return 0; | 274 | return 0; |
@@ -264,7 +288,7 @@ static inline int page_is_buddy(struct page *page, int order) | |||
264 | * parts of the VM system. | 288 | * parts of the VM system. |
265 | * At each level, we keep a list of pages, which are heads of continuous | 289 | * At each level, we keep a list of pages, which are heads of continuous |
266 | * free pages of length of (1 << order) and marked with PG_Private.Page's | 290 | * free pages of length of (1 << order) and marked with PG_Private.Page's |
267 | * order is recorded in page->private field. | 291 | * order is recorded in page_private(page) field. |
268 | * So when we are allocating or freeing one, we can derive the state of the | 292 | * So when we are allocating or freeing one, we can derive the state of the |
269 | * other. That is, if we allocate a small block, and both were | 293 | * other. That is, if we allocate a small block, and both were |
270 | * free, the remainder of the region must be split into blocks. | 294 | * free, the remainder of the region must be split into blocks. |
@@ -327,7 +351,8 @@ static inline void free_pages_check(const char *function, struct page *page) | |||
327 | 1 << PG_reclaim | | 351 | 1 << PG_reclaim | |
328 | 1 << PG_slab | | 352 | 1 << PG_slab | |
329 | 1 << PG_swapcache | | 353 | 1 << PG_swapcache | |
330 | 1 << PG_writeback ))) | 354 | 1 << PG_writeback | |
355 | 1 << PG_reserved ))) | ||
331 | bad_page(function, page); | 356 | bad_page(function, page); |
332 | if (PageDirty(page)) | 357 | if (PageDirty(page)) |
333 | __ClearPageDirty(page); | 358 | __ClearPageDirty(page); |
@@ -455,13 +480,14 @@ static void prep_new_page(struct page *page, int order) | |||
455 | 1 << PG_reclaim | | 480 | 1 << PG_reclaim | |
456 | 1 << PG_slab | | 481 | 1 << PG_slab | |
457 | 1 << PG_swapcache | | 482 | 1 << PG_swapcache | |
458 | 1 << PG_writeback ))) | 483 | 1 << PG_writeback | |
484 | 1 << PG_reserved ))) | ||
459 | bad_page(__FUNCTION__, page); | 485 | bad_page(__FUNCTION__, page); |
460 | 486 | ||
461 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | | 487 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | |
462 | 1 << PG_referenced | 1 << PG_arch_1 | | 488 | 1 << PG_referenced | 1 << PG_arch_1 | |
463 | 1 << PG_checked | 1 << PG_mappedtodisk); | 489 | 1 << PG_checked | 1 << PG_mappedtodisk); |
464 | page->private = 0; | 490 | set_page_private(page, 0); |
465 | set_page_refs(page, order); | 491 | set_page_refs(page, order); |
466 | kernel_map_pages(page, 1 << order, 1); | 492 | kernel_map_pages(page, 1 << order, 1); |
467 | } | 493 | } |
@@ -1016,7 +1042,7 @@ void __pagevec_free(struct pagevec *pvec) | |||
1016 | 1042 | ||
1017 | fastcall void __free_pages(struct page *page, unsigned int order) | 1043 | fastcall void __free_pages(struct page *page, unsigned int order) |
1018 | { | 1044 | { |
1019 | if (!PageReserved(page) && put_page_testzero(page)) { | 1045 | if (put_page_testzero(page)) { |
1020 | if (order == 0) | 1046 | if (order == 0) |
1021 | free_hot_page(page); | 1047 | free_hot_page(page); |
1022 | else | 1048 | else |
@@ -1305,12 +1331,9 @@ void show_free_areas(void) | |||
1305 | } else | 1331 | } else |
1306 | printk("\n"); | 1332 | printk("\n"); |
1307 | 1333 | ||
1308 | for (cpu = 0; cpu < NR_CPUS; ++cpu) { | 1334 | for_each_cpu(cpu) { |
1309 | struct per_cpu_pageset *pageset; | 1335 | struct per_cpu_pageset *pageset; |
1310 | 1336 | ||
1311 | if (!cpu_possible(cpu)) | ||
1312 | continue; | ||
1313 | |||
1314 | pageset = zone_pcp(zone, cpu); | 1337 | pageset = zone_pcp(zone, cpu); |
1315 | 1338 | ||
1316 | for (temperature = 0; temperature < 2; temperature++) | 1339 | for (temperature = 0; temperature < 2; temperature++) |
@@ -1660,7 +1683,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat, | |||
1660 | * up by free_all_bootmem() once the early boot process is | 1683 | * up by free_all_bootmem() once the early boot process is |
1661 | * done. Non-atomic initialization, single-pass. | 1684 | * done. Non-atomic initialization, single-pass. |
1662 | */ | 1685 | */ |
1663 | void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, | 1686 | void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, |
1664 | unsigned long start_pfn) | 1687 | unsigned long start_pfn) |
1665 | { | 1688 | { |
1666 | struct page *page; | 1689 | struct page *page; |
@@ -1674,7 +1697,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
1674 | continue; | 1697 | continue; |
1675 | page = pfn_to_page(pfn); | 1698 | page = pfn_to_page(pfn); |
1676 | set_page_links(page, zone, nid, pfn); | 1699 | set_page_links(page, zone, nid, pfn); |
1677 | set_page_count(page, 0); | 1700 | set_page_count(page, 1); |
1678 | reset_page_mapcount(page); | 1701 | reset_page_mapcount(page); |
1679 | SetPageReserved(page); | 1702 | SetPageReserved(page); |
1680 | INIT_LIST_HEAD(&page->lru); | 1703 | INIT_LIST_HEAD(&page->lru); |
@@ -1721,29 +1744,29 @@ static int __devinit zone_batchsize(struct zone *zone) | |||
1721 | 1744 | ||
1722 | /* | 1745 | /* |
1723 | * The per-cpu-pages pools are set to around 1000th of the | 1746 | * The per-cpu-pages pools are set to around 1000th of the |
1724 | * size of the zone. But no more than 1/4 of a meg - there's | 1747 | * size of the zone. But no more than 1/2 of a meg. |
1725 | * no point in going beyond the size of L2 cache. | ||
1726 | * | 1748 | * |
1727 | * OK, so we don't know how big the cache is. So guess. | 1749 | * OK, so we don't know how big the cache is. So guess. |
1728 | */ | 1750 | */ |
1729 | batch = zone->present_pages / 1024; | 1751 | batch = zone->present_pages / 1024; |
1730 | if (batch * PAGE_SIZE > 256 * 1024) | 1752 | if (batch * PAGE_SIZE > 512 * 1024) |
1731 | batch = (256 * 1024) / PAGE_SIZE; | 1753 | batch = (512 * 1024) / PAGE_SIZE; |
1732 | batch /= 4; /* We effectively *= 4 below */ | 1754 | batch /= 4; /* We effectively *= 4 below */ |
1733 | if (batch < 1) | 1755 | if (batch < 1) |
1734 | batch = 1; | 1756 | batch = 1; |
1735 | 1757 | ||
1736 | /* | 1758 | /* |
1737 | * Clamp the batch to a 2^n - 1 value. Having a power | 1759 | * We will be trying to allcoate bigger chunks of contiguous |
1738 | * of 2 value was found to be more likely to have | 1760 | * memory of the order of fls(batch). This should result in |
1739 | * suboptimal cache aliasing properties in some cases. | 1761 | * better cache coloring. |
1740 | * | 1762 | * |
1741 | * For example if 2 tasks are alternately allocating | 1763 | * A sanity check also to ensure that batch is still in limits. |
1742 | * batches of pages, one task can end up with a lot | ||
1743 | * of pages of one half of the possible page colors | ||
1744 | * and the other with pages of the other colors. | ||
1745 | */ | 1764 | */ |
1746 | batch = (1 << fls(batch + batch/2)) - 1; | 1765 | batch = (1 << fls(batch + batch/2)); |
1766 | |||
1767 | if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2)) | ||
1768 | batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2); | ||
1769 | |||
1747 | return batch; | 1770 | return batch; |
1748 | } | 1771 | } |
1749 | 1772 | ||
@@ -1755,7 +1778,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | |||
1755 | 1778 | ||
1756 | pcp = &p->pcp[0]; /* hot */ | 1779 | pcp = &p->pcp[0]; /* hot */ |
1757 | pcp->count = 0; | 1780 | pcp->count = 0; |
1758 | pcp->low = 2 * batch; | 1781 | pcp->low = 0; |
1759 | pcp->high = 6 * batch; | 1782 | pcp->high = 6 * batch; |
1760 | pcp->batch = max(1UL, 1 * batch); | 1783 | pcp->batch = max(1UL, 1 * batch); |
1761 | INIT_LIST_HEAD(&pcp->list); | 1784 | INIT_LIST_HEAD(&pcp->list); |
@@ -1764,7 +1787,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | |||
1764 | pcp->count = 0; | 1787 | pcp->count = 0; |
1765 | pcp->low = 0; | 1788 | pcp->low = 0; |
1766 | pcp->high = 2 * batch; | 1789 | pcp->high = 2 * batch; |
1767 | pcp->batch = max(1UL, 1 * batch); | 1790 | pcp->batch = max(1UL, batch/2); |
1768 | INIT_LIST_HEAD(&pcp->list); | 1791 | INIT_LIST_HEAD(&pcp->list); |
1769 | } | 1792 | } |
1770 | 1793 | ||
@@ -1873,6 +1896,60 @@ void __init setup_per_cpu_pageset() | |||
1873 | 1896 | ||
1874 | #endif | 1897 | #endif |
1875 | 1898 | ||
1899 | static __devinit | ||
1900 | void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | ||
1901 | { | ||
1902 | int i; | ||
1903 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
1904 | |||
1905 | /* | ||
1906 | * The per-page waitqueue mechanism uses hashed waitqueues | ||
1907 | * per zone. | ||
1908 | */ | ||
1909 | zone->wait_table_size = wait_table_size(zone_size_pages); | ||
1910 | zone->wait_table_bits = wait_table_bits(zone->wait_table_size); | ||
1911 | zone->wait_table = (wait_queue_head_t *) | ||
1912 | alloc_bootmem_node(pgdat, zone->wait_table_size | ||
1913 | * sizeof(wait_queue_head_t)); | ||
1914 | |||
1915 | for(i = 0; i < zone->wait_table_size; ++i) | ||
1916 | init_waitqueue_head(zone->wait_table + i); | ||
1917 | } | ||
1918 | |||
1919 | static __devinit void zone_pcp_init(struct zone *zone) | ||
1920 | { | ||
1921 | int cpu; | ||
1922 | unsigned long batch = zone_batchsize(zone); | ||
1923 | |||
1924 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
1925 | #ifdef CONFIG_NUMA | ||
1926 | /* Early boot. Slab allocator not functional yet */ | ||
1927 | zone->pageset[cpu] = &boot_pageset[cpu]; | ||
1928 | setup_pageset(&boot_pageset[cpu],0); | ||
1929 | #else | ||
1930 | setup_pageset(zone_pcp(zone,cpu), batch); | ||
1931 | #endif | ||
1932 | } | ||
1933 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | ||
1934 | zone->name, zone->present_pages, batch); | ||
1935 | } | ||
1936 | |||
1937 | static __devinit void init_currently_empty_zone(struct zone *zone, | ||
1938 | unsigned long zone_start_pfn, unsigned long size) | ||
1939 | { | ||
1940 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
1941 | |||
1942 | zone_wait_table_init(zone, size); | ||
1943 | pgdat->nr_zones = zone_idx(zone) + 1; | ||
1944 | |||
1945 | zone->zone_mem_map = pfn_to_page(zone_start_pfn); | ||
1946 | zone->zone_start_pfn = zone_start_pfn; | ||
1947 | |||
1948 | memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); | ||
1949 | |||
1950 | zone_init_free_lists(pgdat, zone, zone->spanned_pages); | ||
1951 | } | ||
1952 | |||
1876 | /* | 1953 | /* |
1877 | * Set up the zone data structures: | 1954 | * Set up the zone data structures: |
1878 | * - mark all pages reserved | 1955 | * - mark all pages reserved |
@@ -1882,10 +1959,11 @@ void __init setup_per_cpu_pageset() | |||
1882 | static void __init free_area_init_core(struct pglist_data *pgdat, | 1959 | static void __init free_area_init_core(struct pglist_data *pgdat, |
1883 | unsigned long *zones_size, unsigned long *zholes_size) | 1960 | unsigned long *zones_size, unsigned long *zholes_size) |
1884 | { | 1961 | { |
1885 | unsigned long i, j; | 1962 | unsigned long j; |
1886 | int cpu, nid = pgdat->node_id; | 1963 | int nid = pgdat->node_id; |
1887 | unsigned long zone_start_pfn = pgdat->node_start_pfn; | 1964 | unsigned long zone_start_pfn = pgdat->node_start_pfn; |
1888 | 1965 | ||
1966 | pgdat_resize_init(pgdat); | ||
1889 | pgdat->nr_zones = 0; | 1967 | pgdat->nr_zones = 0; |
1890 | init_waitqueue_head(&pgdat->kswapd_wait); | 1968 | init_waitqueue_head(&pgdat->kswapd_wait); |
1891 | pgdat->kswapd_max_order = 0; | 1969 | pgdat->kswapd_max_order = 0; |
@@ -1893,7 +1971,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat, | |||
1893 | for (j = 0; j < MAX_NR_ZONES; j++) { | 1971 | for (j = 0; j < MAX_NR_ZONES; j++) { |
1894 | struct zone *zone = pgdat->node_zones + j; | 1972 | struct zone *zone = pgdat->node_zones + j; |
1895 | unsigned long size, realsize; | 1973 | unsigned long size, realsize; |
1896 | unsigned long batch; | ||
1897 | 1974 | ||
1898 | realsize = size = zones_size[j]; | 1975 | realsize = size = zones_size[j]; |
1899 | if (zholes_size) | 1976 | if (zholes_size) |
@@ -1908,24 +1985,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat, | |||
1908 | zone->name = zone_names[j]; | 1985 | zone->name = zone_names[j]; |
1909 | spin_lock_init(&zone->lock); | 1986 | spin_lock_init(&zone->lock); |
1910 | spin_lock_init(&zone->lru_lock); | 1987 | spin_lock_init(&zone->lru_lock); |
1988 | zone_seqlock_init(zone); | ||
1911 | zone->zone_pgdat = pgdat; | 1989 | zone->zone_pgdat = pgdat; |
1912 | zone->free_pages = 0; | 1990 | zone->free_pages = 0; |
1913 | 1991 | ||
1914 | zone->temp_priority = zone->prev_priority = DEF_PRIORITY; | 1992 | zone->temp_priority = zone->prev_priority = DEF_PRIORITY; |
1915 | 1993 | ||
1916 | batch = zone_batchsize(zone); | 1994 | zone_pcp_init(zone); |
1917 | |||
1918 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
1919 | #ifdef CONFIG_NUMA | ||
1920 | /* Early boot. Slab allocator not functional yet */ | ||
1921 | zone->pageset[cpu] = &boot_pageset[cpu]; | ||
1922 | setup_pageset(&boot_pageset[cpu],0); | ||
1923 | #else | ||
1924 | setup_pageset(zone_pcp(zone,cpu), batch); | ||
1925 | #endif | ||
1926 | } | ||
1927 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | ||
1928 | zone_names[j], realsize, batch); | ||
1929 | INIT_LIST_HEAD(&zone->active_list); | 1995 | INIT_LIST_HEAD(&zone->active_list); |
1930 | INIT_LIST_HEAD(&zone->inactive_list); | 1996 | INIT_LIST_HEAD(&zone->inactive_list); |
1931 | zone->nr_scan_active = 0; | 1997 | zone->nr_scan_active = 0; |
@@ -1936,32 +2002,9 @@ static void __init free_area_init_core(struct pglist_data *pgdat, | |||
1936 | if (!size) | 2002 | if (!size) |
1937 | continue; | 2003 | continue; |
1938 | 2004 | ||
1939 | /* | ||
1940 | * The per-page waitqueue mechanism uses hashed waitqueues | ||
1941 | * per zone. | ||
1942 | */ | ||
1943 | zone->wait_table_size = wait_table_size(size); | ||
1944 | zone->wait_table_bits = | ||
1945 | wait_table_bits(zone->wait_table_size); | ||
1946 | zone->wait_table = (wait_queue_head_t *) | ||
1947 | alloc_bootmem_node(pgdat, zone->wait_table_size | ||
1948 | * sizeof(wait_queue_head_t)); | ||
1949 | |||
1950 | for(i = 0; i < zone->wait_table_size; ++i) | ||
1951 | init_waitqueue_head(zone->wait_table + i); | ||
1952 | |||
1953 | pgdat->nr_zones = j+1; | ||
1954 | |||
1955 | zone->zone_mem_map = pfn_to_page(zone_start_pfn); | ||
1956 | zone->zone_start_pfn = zone_start_pfn; | ||
1957 | |||
1958 | memmap_init(size, nid, j, zone_start_pfn); | ||
1959 | |||
1960 | zonetable_add(zone, nid, j, zone_start_pfn, size); | 2005 | zonetable_add(zone, nid, j, zone_start_pfn, size); |
1961 | 2006 | init_currently_empty_zone(zone, zone_start_pfn, size); | |
1962 | zone_start_pfn += size; | 2007 | zone_start_pfn += size; |
1963 | |||
1964 | zone_init_free_lists(pgdat, zone, zone->spanned_pages); | ||
1965 | } | 2008 | } |
1966 | } | 2009 | } |
1967 | 2010 | ||
@@ -2361,7 +2404,7 @@ static void setup_per_zone_lowmem_reserve(void) | |||
2361 | * that the pages_{min,low,high} values for each zone are set correctly | 2404 | * that the pages_{min,low,high} values for each zone are set correctly |
2362 | * with respect to min_free_kbytes. | 2405 | * with respect to min_free_kbytes. |
2363 | */ | 2406 | */ |
2364 | static void setup_per_zone_pages_min(void) | 2407 | void setup_per_zone_pages_min(void) |
2365 | { | 2408 | { |
2366 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | 2409 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); |
2367 | unsigned long lowmem_pages = 0; | 2410 | unsigned long lowmem_pages = 0; |
diff --git a/mm/page_io.c b/mm/page_io.c index 330e00d6db00..bb2b0d53889c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -91,7 +91,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
91 | unlock_page(page); | 91 | unlock_page(page); |
92 | goto out; | 92 | goto out; |
93 | } | 93 | } |
94 | bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write); | 94 | bio = get_swap_bio(GFP_NOIO, page_private(page), page, |
95 | end_swap_bio_write); | ||
95 | if (bio == NULL) { | 96 | if (bio == NULL) { |
96 | set_page_dirty(page); | 97 | set_page_dirty(page); |
97 | unlock_page(page); | 98 | unlock_page(page); |
@@ -115,7 +116,8 @@ int swap_readpage(struct file *file, struct page *page) | |||
115 | 116 | ||
116 | BUG_ON(!PageLocked(page)); | 117 | BUG_ON(!PageLocked(page)); |
117 | ClearPageUptodate(page); | 118 | ClearPageUptodate(page); |
118 | bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read); | 119 | bio = get_swap_bio(GFP_KERNEL, page_private(page), page, |
120 | end_swap_bio_read); | ||
119 | if (bio == NULL) { | 121 | if (bio == NULL) { |
120 | unlock_page(page); | 122 | unlock_page(page); |
121 | ret = -ENOMEM; | 123 | ret = -ENOMEM; |
@@ -32,7 +32,7 @@ | |||
32 | * page->flags PG_locked (lock_page) | 32 | * page->flags PG_locked (lock_page) |
33 | * mapping->i_mmap_lock | 33 | * mapping->i_mmap_lock |
34 | * anon_vma->lock | 34 | * anon_vma->lock |
35 | * mm->page_table_lock | 35 | * mm->page_table_lock or pte_lock |
36 | * zone->lru_lock (in mark_page_accessed) | 36 | * zone->lru_lock (in mark_page_accessed) |
37 | * swap_lock (in swap_duplicate, swap_info_get) | 37 | * swap_lock (in swap_duplicate, swap_info_get) |
38 | * mmlist_lock (in mmput, drain_mmlist and others) | 38 | * mmlist_lock (in mmput, drain_mmlist and others) |
@@ -244,37 +244,44 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
244 | /* | 244 | /* |
245 | * Check that @page is mapped at @address into @mm. | 245 | * Check that @page is mapped at @address into @mm. |
246 | * | 246 | * |
247 | * On success returns with mapped pte and locked mm->page_table_lock. | 247 | * On success returns with pte mapped and locked. |
248 | */ | 248 | */ |
249 | pte_t *page_check_address(struct page *page, struct mm_struct *mm, | 249 | pte_t *page_check_address(struct page *page, struct mm_struct *mm, |
250 | unsigned long address) | 250 | unsigned long address, spinlock_t **ptlp) |
251 | { | 251 | { |
252 | pgd_t *pgd; | 252 | pgd_t *pgd; |
253 | pud_t *pud; | 253 | pud_t *pud; |
254 | pmd_t *pmd; | 254 | pmd_t *pmd; |
255 | pte_t *pte; | 255 | pte_t *pte; |
256 | spinlock_t *ptl; | ||
256 | 257 | ||
257 | /* | ||
258 | * We need the page_table_lock to protect us from page faults, | ||
259 | * munmap, fork, etc... | ||
260 | */ | ||
261 | spin_lock(&mm->page_table_lock); | ||
262 | pgd = pgd_offset(mm, address); | 258 | pgd = pgd_offset(mm, address); |
263 | if (likely(pgd_present(*pgd))) { | 259 | if (!pgd_present(*pgd)) |
264 | pud = pud_offset(pgd, address); | 260 | return NULL; |
265 | if (likely(pud_present(*pud))) { | 261 | |
266 | pmd = pmd_offset(pud, address); | 262 | pud = pud_offset(pgd, address); |
267 | if (likely(pmd_present(*pmd))) { | 263 | if (!pud_present(*pud)) |
268 | pte = pte_offset_map(pmd, address); | 264 | return NULL; |
269 | if (likely(pte_present(*pte) && | 265 | |
270 | page_to_pfn(page) == pte_pfn(*pte))) | 266 | pmd = pmd_offset(pud, address); |
271 | return pte; | 267 | if (!pmd_present(*pmd)) |
272 | pte_unmap(pte); | 268 | return NULL; |
273 | } | 269 | |
274 | } | 270 | pte = pte_offset_map(pmd, address); |
271 | /* Make a quick check before getting the lock */ | ||
272 | if (!pte_present(*pte)) { | ||
273 | pte_unmap(pte); | ||
274 | return NULL; | ||
275 | } | ||
276 | |||
277 | ptl = pte_lockptr(mm, pmd); | ||
278 | spin_lock(ptl); | ||
279 | if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { | ||
280 | *ptlp = ptl; | ||
281 | return pte; | ||
275 | } | 282 | } |
276 | spin_unlock(&mm->page_table_lock); | 283 | pte_unmap_unlock(pte, ptl); |
277 | return ERR_PTR(-ENOENT); | 284 | return NULL; |
278 | } | 285 | } |
279 | 286 | ||
280 | /* | 287 | /* |
@@ -287,24 +294,28 @@ static int page_referenced_one(struct page *page, | |||
287 | struct mm_struct *mm = vma->vm_mm; | 294 | struct mm_struct *mm = vma->vm_mm; |
288 | unsigned long address; | 295 | unsigned long address; |
289 | pte_t *pte; | 296 | pte_t *pte; |
297 | spinlock_t *ptl; | ||
290 | int referenced = 0; | 298 | int referenced = 0; |
291 | 299 | ||
292 | address = vma_address(page, vma); | 300 | address = vma_address(page, vma); |
293 | if (address == -EFAULT) | 301 | if (address == -EFAULT) |
294 | goto out; | 302 | goto out; |
295 | 303 | ||
296 | pte = page_check_address(page, mm, address); | 304 | pte = page_check_address(page, mm, address, &ptl); |
297 | if (!IS_ERR(pte)) { | 305 | if (!pte) |
298 | if (ptep_clear_flush_young(vma, address, pte)) | 306 | goto out; |
299 | referenced++; | ||
300 | 307 | ||
301 | if (mm != current->mm && !ignore_token && has_swap_token(mm)) | 308 | if (ptep_clear_flush_young(vma, address, pte)) |
302 | referenced++; | 309 | referenced++; |
303 | 310 | ||
304 | (*mapcount)--; | 311 | /* Pretend the page is referenced if the task has the |
305 | pte_unmap(pte); | 312 | swap token and is in the middle of a page fault. */ |
306 | spin_unlock(&mm->page_table_lock); | 313 | if (mm != current->mm && !ignore_token && has_swap_token(mm) && |
307 | } | 314 | rwsem_is_locked(&mm->mmap_sem)) |
315 | referenced++; | ||
316 | |||
317 | (*mapcount)--; | ||
318 | pte_unmap_unlock(pte, ptl); | ||
308 | out: | 319 | out: |
309 | return referenced; | 320 | return referenced; |
310 | } | 321 | } |
@@ -434,15 +445,11 @@ int page_referenced(struct page *page, int is_locked, int ignore_token) | |||
434 | * @vma: the vm area in which the mapping is added | 445 | * @vma: the vm area in which the mapping is added |
435 | * @address: the user virtual address mapped | 446 | * @address: the user virtual address mapped |
436 | * | 447 | * |
437 | * The caller needs to hold the mm->page_table_lock. | 448 | * The caller needs to hold the pte lock. |
438 | */ | 449 | */ |
439 | void page_add_anon_rmap(struct page *page, | 450 | void page_add_anon_rmap(struct page *page, |
440 | struct vm_area_struct *vma, unsigned long address) | 451 | struct vm_area_struct *vma, unsigned long address) |
441 | { | 452 | { |
442 | BUG_ON(PageReserved(page)); | ||
443 | |||
444 | inc_mm_counter(vma->vm_mm, anon_rss); | ||
445 | |||
446 | if (atomic_inc_and_test(&page->_mapcount)) { | 453 | if (atomic_inc_and_test(&page->_mapcount)) { |
447 | struct anon_vma *anon_vma = vma->anon_vma; | 454 | struct anon_vma *anon_vma = vma->anon_vma; |
448 | 455 | ||
@@ -461,13 +468,12 @@ void page_add_anon_rmap(struct page *page, | |||
461 | * page_add_file_rmap - add pte mapping to a file page | 468 | * page_add_file_rmap - add pte mapping to a file page |
462 | * @page: the page to add the mapping to | 469 | * @page: the page to add the mapping to |
463 | * | 470 | * |
464 | * The caller needs to hold the mm->page_table_lock. | 471 | * The caller needs to hold the pte lock. |
465 | */ | 472 | */ |
466 | void page_add_file_rmap(struct page *page) | 473 | void page_add_file_rmap(struct page *page) |
467 | { | 474 | { |
468 | BUG_ON(PageAnon(page)); | 475 | BUG_ON(PageAnon(page)); |
469 | if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) | 476 | BUG_ON(!pfn_valid(page_to_pfn(page))); |
470 | return; | ||
471 | 477 | ||
472 | if (atomic_inc_and_test(&page->_mapcount)) | 478 | if (atomic_inc_and_test(&page->_mapcount)) |
473 | inc_page_state(nr_mapped); | 479 | inc_page_state(nr_mapped); |
@@ -477,12 +483,10 @@ void page_add_file_rmap(struct page *page) | |||
477 | * page_remove_rmap - take down pte mapping from a page | 483 | * page_remove_rmap - take down pte mapping from a page |
478 | * @page: page to remove mapping from | 484 | * @page: page to remove mapping from |
479 | * | 485 | * |
480 | * Caller needs to hold the mm->page_table_lock. | 486 | * The caller needs to hold the pte lock. |
481 | */ | 487 | */ |
482 | void page_remove_rmap(struct page *page) | 488 | void page_remove_rmap(struct page *page) |
483 | { | 489 | { |
484 | BUG_ON(PageReserved(page)); | ||
485 | |||
486 | if (atomic_add_negative(-1, &page->_mapcount)) { | 490 | if (atomic_add_negative(-1, &page->_mapcount)) { |
487 | BUG_ON(page_mapcount(page) < 0); | 491 | BUG_ON(page_mapcount(page) < 0); |
488 | /* | 492 | /* |
@@ -510,14 +514,15 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) | |||
510 | unsigned long address; | 514 | unsigned long address; |
511 | pte_t *pte; | 515 | pte_t *pte; |
512 | pte_t pteval; | 516 | pte_t pteval; |
517 | spinlock_t *ptl; | ||
513 | int ret = SWAP_AGAIN; | 518 | int ret = SWAP_AGAIN; |
514 | 519 | ||
515 | address = vma_address(page, vma); | 520 | address = vma_address(page, vma); |
516 | if (address == -EFAULT) | 521 | if (address == -EFAULT) |
517 | goto out; | 522 | goto out; |
518 | 523 | ||
519 | pte = page_check_address(page, mm, address); | 524 | pte = page_check_address(page, mm, address, &ptl); |
520 | if (IS_ERR(pte)) | 525 | if (!pte) |
521 | goto out; | 526 | goto out; |
522 | 527 | ||
523 | /* | 528 | /* |
@@ -541,8 +546,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) | |||
541 | if (pte_dirty(pteval)) | 546 | if (pte_dirty(pteval)) |
542 | set_page_dirty(page); | 547 | set_page_dirty(page); |
543 | 548 | ||
549 | /* Update high watermark before we lower rss */ | ||
550 | update_hiwater_rss(mm); | ||
551 | |||
544 | if (PageAnon(page)) { | 552 | if (PageAnon(page)) { |
545 | swp_entry_t entry = { .val = page->private }; | 553 | swp_entry_t entry = { .val = page_private(page) }; |
546 | /* | 554 | /* |
547 | * Store the swap location in the pte. | 555 | * Store the swap location in the pte. |
548 | * See handle_pte_fault() ... | 556 | * See handle_pte_fault() ... |
@@ -551,21 +559,21 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) | |||
551 | swap_duplicate(entry); | 559 | swap_duplicate(entry); |
552 | if (list_empty(&mm->mmlist)) { | 560 | if (list_empty(&mm->mmlist)) { |
553 | spin_lock(&mmlist_lock); | 561 | spin_lock(&mmlist_lock); |
554 | list_add(&mm->mmlist, &init_mm.mmlist); | 562 | if (list_empty(&mm->mmlist)) |
563 | list_add(&mm->mmlist, &init_mm.mmlist); | ||
555 | spin_unlock(&mmlist_lock); | 564 | spin_unlock(&mmlist_lock); |
556 | } | 565 | } |
557 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 566 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
558 | BUG_ON(pte_file(*pte)); | 567 | BUG_ON(pte_file(*pte)); |
559 | dec_mm_counter(mm, anon_rss); | 568 | dec_mm_counter(mm, anon_rss); |
560 | } | 569 | } else |
570 | dec_mm_counter(mm, file_rss); | ||
561 | 571 | ||
562 | dec_mm_counter(mm, rss); | ||
563 | page_remove_rmap(page); | 572 | page_remove_rmap(page); |
564 | page_cache_release(page); | 573 | page_cache_release(page); |
565 | 574 | ||
566 | out_unmap: | 575 | out_unmap: |
567 | pte_unmap(pte); | 576 | pte_unmap_unlock(pte, ptl); |
568 | spin_unlock(&mm->page_table_lock); | ||
569 | out: | 577 | out: |
570 | return ret; | 578 | return ret; |
571 | } | 579 | } |
@@ -599,19 +607,14 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
599 | pgd_t *pgd; | 607 | pgd_t *pgd; |
600 | pud_t *pud; | 608 | pud_t *pud; |
601 | pmd_t *pmd; | 609 | pmd_t *pmd; |
602 | pte_t *pte, *original_pte; | 610 | pte_t *pte; |
603 | pte_t pteval; | 611 | pte_t pteval; |
612 | spinlock_t *ptl; | ||
604 | struct page *page; | 613 | struct page *page; |
605 | unsigned long address; | 614 | unsigned long address; |
606 | unsigned long end; | 615 | unsigned long end; |
607 | unsigned long pfn; | 616 | unsigned long pfn; |
608 | 617 | ||
609 | /* | ||
610 | * We need the page_table_lock to protect us from page faults, | ||
611 | * munmap, fork, etc... | ||
612 | */ | ||
613 | spin_lock(&mm->page_table_lock); | ||
614 | |||
615 | address = (vma->vm_start + cursor) & CLUSTER_MASK; | 618 | address = (vma->vm_start + cursor) & CLUSTER_MASK; |
616 | end = address + CLUSTER_SIZE; | 619 | end = address + CLUSTER_SIZE; |
617 | if (address < vma->vm_start) | 620 | if (address < vma->vm_start) |
@@ -621,30 +624,33 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
621 | 624 | ||
622 | pgd = pgd_offset(mm, address); | 625 | pgd = pgd_offset(mm, address); |
623 | if (!pgd_present(*pgd)) | 626 | if (!pgd_present(*pgd)) |
624 | goto out_unlock; | 627 | return; |
625 | 628 | ||
626 | pud = pud_offset(pgd, address); | 629 | pud = pud_offset(pgd, address); |
627 | if (!pud_present(*pud)) | 630 | if (!pud_present(*pud)) |
628 | goto out_unlock; | 631 | return; |
629 | 632 | ||
630 | pmd = pmd_offset(pud, address); | 633 | pmd = pmd_offset(pud, address); |
631 | if (!pmd_present(*pmd)) | 634 | if (!pmd_present(*pmd)) |
632 | goto out_unlock; | 635 | return; |
636 | |||
637 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
633 | 638 | ||
634 | for (original_pte = pte = pte_offset_map(pmd, address); | 639 | /* Update high watermark before we lower rss */ |
635 | address < end; pte++, address += PAGE_SIZE) { | 640 | update_hiwater_rss(mm); |
636 | 641 | ||
642 | for (; address < end; pte++, address += PAGE_SIZE) { | ||
637 | if (!pte_present(*pte)) | 643 | if (!pte_present(*pte)) |
638 | continue; | 644 | continue; |
639 | 645 | ||
640 | pfn = pte_pfn(*pte); | 646 | pfn = pte_pfn(*pte); |
641 | if (!pfn_valid(pfn)) | 647 | if (unlikely(!pfn_valid(pfn))) { |
648 | print_bad_pte(vma, *pte, address); | ||
642 | continue; | 649 | continue; |
650 | } | ||
643 | 651 | ||
644 | page = pfn_to_page(pfn); | 652 | page = pfn_to_page(pfn); |
645 | BUG_ON(PageAnon(page)); | 653 | BUG_ON(PageAnon(page)); |
646 | if (PageReserved(page)) | ||
647 | continue; | ||
648 | 654 | ||
649 | if (ptep_clear_flush_young(vma, address, pte)) | 655 | if (ptep_clear_flush_young(vma, address, pte)) |
650 | continue; | 656 | continue; |
@@ -663,13 +669,10 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
663 | 669 | ||
664 | page_remove_rmap(page); | 670 | page_remove_rmap(page); |
665 | page_cache_release(page); | 671 | page_cache_release(page); |
666 | dec_mm_counter(mm, rss); | 672 | dec_mm_counter(mm, file_rss); |
667 | (*mapcount)--; | 673 | (*mapcount)--; |
668 | } | 674 | } |
669 | 675 | pte_unmap_unlock(pte - 1, ptl); | |
670 | pte_unmap(original_pte); | ||
671 | out_unlock: | ||
672 | spin_unlock(&mm->page_table_lock); | ||
673 | } | 676 | } |
674 | 677 | ||
675 | static int try_to_unmap_anon(struct page *page) | 678 | static int try_to_unmap_anon(struct page *page) |
@@ -806,7 +809,6 @@ int try_to_unmap(struct page *page) | |||
806 | { | 809 | { |
807 | int ret; | 810 | int ret; |
808 | 811 | ||
809 | BUG_ON(PageReserved(page)); | ||
810 | BUG_ON(!PageLocked(page)); | 812 | BUG_ON(!PageLocked(page)); |
811 | 813 | ||
812 | if (PageAnon(page)) | 814 | if (PageAnon(page)) |
diff --git a/mm/shmem.c b/mm/shmem.c index 55e04a0734c1..dc25565a61e9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -71,9 +71,6 @@ | |||
71 | /* Pretend that each entry is of this size in directory's i_size */ | 71 | /* Pretend that each entry is of this size in directory's i_size */ |
72 | #define BOGO_DIRENT_SIZE 20 | 72 | #define BOGO_DIRENT_SIZE 20 |
73 | 73 | ||
74 | /* Keep swapped page count in private field of indirect struct page */ | ||
75 | #define nr_swapped private | ||
76 | |||
77 | /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ | 74 | /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ |
78 | enum sgp_type { | 75 | enum sgp_type { |
79 | SGP_QUICK, /* don't try more than file page cache lookup */ | 76 | SGP_QUICK, /* don't try more than file page cache lookup */ |
@@ -324,8 +321,10 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns | |||
324 | 321 | ||
325 | entry->val = value; | 322 | entry->val = value; |
326 | info->swapped += incdec; | 323 | info->swapped += incdec; |
327 | if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) | 324 | if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { |
328 | kmap_atomic_to_page(entry)->nr_swapped += incdec; | 325 | struct page *page = kmap_atomic_to_page(entry); |
326 | set_page_private(page, page_private(page) + incdec); | ||
327 | } | ||
329 | } | 328 | } |
330 | 329 | ||
331 | /* | 330 | /* |
@@ -368,9 +367,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long | |||
368 | 367 | ||
369 | spin_unlock(&info->lock); | 368 | spin_unlock(&info->lock); |
370 | page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO); | 369 | page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO); |
371 | if (page) { | 370 | if (page) |
372 | page->nr_swapped = 0; | 371 | set_page_private(page, 0); |
373 | } | ||
374 | spin_lock(&info->lock); | 372 | spin_lock(&info->lock); |
375 | 373 | ||
376 | if (!page) { | 374 | if (!page) { |
@@ -561,7 +559,7 @@ static void shmem_truncate(struct inode *inode) | |||
561 | diroff = 0; | 559 | diroff = 0; |
562 | } | 560 | } |
563 | subdir = dir[diroff]; | 561 | subdir = dir[diroff]; |
564 | if (subdir && subdir->nr_swapped) { | 562 | if (subdir && page_private(subdir)) { |
565 | size = limit - idx; | 563 | size = limit - idx; |
566 | if (size > ENTRIES_PER_PAGE) | 564 | if (size > ENTRIES_PER_PAGE) |
567 | size = ENTRIES_PER_PAGE; | 565 | size = ENTRIES_PER_PAGE; |
@@ -572,10 +570,10 @@ static void shmem_truncate(struct inode *inode) | |||
572 | nr_swaps_freed += freed; | 570 | nr_swaps_freed += freed; |
573 | if (offset) | 571 | if (offset) |
574 | spin_lock(&info->lock); | 572 | spin_lock(&info->lock); |
575 | subdir->nr_swapped -= freed; | 573 | set_page_private(subdir, page_private(subdir) - freed); |
576 | if (offset) | 574 | if (offset) |
577 | spin_unlock(&info->lock); | 575 | spin_unlock(&info->lock); |
578 | BUG_ON(subdir->nr_swapped > offset); | 576 | BUG_ON(page_private(subdir) > offset); |
579 | } | 577 | } |
580 | if (offset) | 578 | if (offset) |
581 | offset = 0; | 579 | offset = 0; |
@@ -743,7 +741,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s | |||
743 | dir = shmem_dir_map(subdir); | 741 | dir = shmem_dir_map(subdir); |
744 | } | 742 | } |
745 | subdir = *dir; | 743 | subdir = *dir; |
746 | if (subdir && subdir->nr_swapped) { | 744 | if (subdir && page_private(subdir)) { |
747 | ptr = shmem_swp_map(subdir); | 745 | ptr = shmem_swp_map(subdir); |
748 | size = limit - idx; | 746 | size = limit - idx; |
749 | if (size > ENTRIES_PER_PAGE) | 747 | if (size > ENTRIES_PER_PAGE) |
@@ -1201,7 +1199,7 @@ static int shmem_populate(struct vm_area_struct *vma, | |||
1201 | page_cache_release(page); | 1199 | page_cache_release(page); |
1202 | return err; | 1200 | return err; |
1203 | } | 1201 | } |
1204 | } else { | 1202 | } else if (vma->vm_flags & VM_NONLINEAR) { |
1205 | /* No page was found just because we can't read it in | 1203 | /* No page was found just because we can't read it in |
1206 | * now (being here implies nonblock != 0), but the page | 1204 | * now (being here implies nonblock != 0), but the page |
1207 | * may exist, so set the PTE to fault it in later. */ | 1205 | * may exist, so set the PTE to fault it in later. */ |
@@ -1506,8 +1504,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ | |||
1506 | */ | 1504 | */ |
1507 | if (!offset) | 1505 | if (!offset) |
1508 | mark_page_accessed(page); | 1506 | mark_page_accessed(page); |
1509 | } else | 1507 | } else { |
1510 | page = ZERO_PAGE(0); | 1508 | page = ZERO_PAGE(0); |
1509 | page_cache_get(page); | ||
1510 | } | ||
1511 | 1511 | ||
1512 | /* | 1512 | /* |
1513 | * Ok, we have the page, and it's up-to-date, so | 1513 | * Ok, we have the page, and it's up-to-date, so |
@@ -2419,6 +2419,7 @@ retry: | |||
2419 | next = slab_bufctl(slabp)[slabp->free]; | 2419 | next = slab_bufctl(slabp)[slabp->free]; |
2420 | #if DEBUG | 2420 | #if DEBUG |
2421 | slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; | 2421 | slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; |
2422 | WARN_ON(numa_node_id() != slabp->nodeid); | ||
2422 | #endif | 2423 | #endif |
2423 | slabp->free = next; | 2424 | slabp->free = next; |
2424 | } | 2425 | } |
@@ -2633,8 +2634,10 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n | |||
2633 | check_spinlock_acquired_node(cachep, node); | 2634 | check_spinlock_acquired_node(cachep, node); |
2634 | check_slabp(cachep, slabp); | 2635 | check_slabp(cachep, slabp); |
2635 | 2636 | ||
2636 | |||
2637 | #if DEBUG | 2637 | #if DEBUG |
2638 | /* Verify that the slab belongs to the intended node */ | ||
2639 | WARN_ON(slabp->nodeid != node); | ||
2640 | |||
2638 | if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { | 2641 | if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { |
2639 | printk(KERN_ERR "slab: double free detected in cache " | 2642 | printk(KERN_ERR "slab: double free detected in cache " |
2640 | "'%s', objp %p\n", cachep->name, objp); | 2643 | "'%s', objp %p\n", cachep->name, objp); |
diff --git a/mm/sparse.c b/mm/sparse.c index 347249a4917a..72079b538e2d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -5,8 +5,10 @@ | |||
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/mmzone.h> | 6 | #include <linux/mmzone.h> |
7 | #include <linux/bootmem.h> | 7 | #include <linux/bootmem.h> |
8 | #include <linux/highmem.h> | ||
8 | #include <linux/module.h> | 9 | #include <linux/module.h> |
9 | #include <linux/spinlock.h> | 10 | #include <linux/spinlock.h> |
11 | #include <linux/vmalloc.h> | ||
10 | #include <asm/dma.h> | 12 | #include <asm/dma.h> |
11 | 13 | ||
12 | /* | 14 | /* |
@@ -72,6 +74,31 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) | |||
72 | } | 74 | } |
73 | #endif | 75 | #endif |
74 | 76 | ||
77 | /* | ||
78 | * Although written for the SPARSEMEM_EXTREME case, this happens | ||
79 | * to also work for the flat array case because | ||
80 | * NR_SECTION_ROOTS==NR_MEM_SECTIONS. | ||
81 | */ | ||
82 | int __section_nr(struct mem_section* ms) | ||
83 | { | ||
84 | unsigned long root_nr; | ||
85 | struct mem_section* root; | ||
86 | |||
87 | for (root_nr = 0; | ||
88 | root_nr < NR_MEM_SECTIONS; | ||
89 | root_nr += SECTIONS_PER_ROOT) { | ||
90 | root = __nr_to_section(root_nr); | ||
91 | |||
92 | if (!root) | ||
93 | continue; | ||
94 | |||
95 | if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT))) | ||
96 | break; | ||
97 | } | ||
98 | |||
99 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); | ||
100 | } | ||
101 | |||
75 | /* Record a memory area against a node. */ | 102 | /* Record a memory area against a node. */ |
76 | void memory_present(int nid, unsigned long start, unsigned long end) | 103 | void memory_present(int nid, unsigned long start, unsigned long end) |
77 | { | 104 | { |
@@ -162,6 +189,45 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum) | |||
162 | return NULL; | 189 | return NULL; |
163 | } | 190 | } |
164 | 191 | ||
192 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | ||
193 | { | ||
194 | struct page *page, *ret; | ||
195 | unsigned long memmap_size = sizeof(struct page) * nr_pages; | ||
196 | |||
197 | page = alloc_pages(GFP_KERNEL, get_order(memmap_size)); | ||
198 | if (page) | ||
199 | goto got_map_page; | ||
200 | |||
201 | ret = vmalloc(memmap_size); | ||
202 | if (ret) | ||
203 | goto got_map_ptr; | ||
204 | |||
205 | return NULL; | ||
206 | got_map_page: | ||
207 | ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); | ||
208 | got_map_ptr: | ||
209 | memset(ret, 0, memmap_size); | ||
210 | |||
211 | return ret; | ||
212 | } | ||
213 | |||
214 | static int vaddr_in_vmalloc_area(void *addr) | ||
215 | { | ||
216 | if (addr >= (void *)VMALLOC_START && | ||
217 | addr < (void *)VMALLOC_END) | ||
218 | return 1; | ||
219 | return 0; | ||
220 | } | ||
221 | |||
222 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | ||
223 | { | ||
224 | if (vaddr_in_vmalloc_area(memmap)) | ||
225 | vfree(memmap); | ||
226 | else | ||
227 | free_pages((unsigned long)memmap, | ||
228 | get_order(sizeof(struct page) * nr_pages)); | ||
229 | } | ||
230 | |||
165 | /* | 231 | /* |
166 | * Allocate the accumulated non-linear sections, allocate a mem_map | 232 | * Allocate the accumulated non-linear sections, allocate a mem_map |
167 | * for each and record the physical to section mapping. | 233 | * for each and record the physical to section mapping. |
@@ -187,14 +253,37 @@ void sparse_init(void) | |||
187 | * set. If this is <=0, then that means that the passed-in | 253 | * set. If this is <=0, then that means that the passed-in |
188 | * map was not consumed and must be freed. | 254 | * map was not consumed and must be freed. |
189 | */ | 255 | */ |
190 | int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map) | 256 | int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, |
257 | int nr_pages) | ||
191 | { | 258 | { |
192 | struct mem_section *ms = __pfn_to_section(start_pfn); | 259 | unsigned long section_nr = pfn_to_section_nr(start_pfn); |
260 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
261 | struct mem_section *ms; | ||
262 | struct page *memmap; | ||
263 | unsigned long flags; | ||
264 | int ret; | ||
193 | 265 | ||
194 | if (ms->section_mem_map & SECTION_MARKED_PRESENT) | 266 | /* |
195 | return -EEXIST; | 267 | * no locking for this, because it does its own |
268 | * plus, it does a kmalloc | ||
269 | */ | ||
270 | sparse_index_init(section_nr, pgdat->node_id); | ||
271 | memmap = __kmalloc_section_memmap(nr_pages); | ||
272 | |||
273 | pgdat_resize_lock(pgdat, &flags); | ||
196 | 274 | ||
275 | ms = __pfn_to_section(start_pfn); | ||
276 | if (ms->section_mem_map & SECTION_MARKED_PRESENT) { | ||
277 | ret = -EEXIST; | ||
278 | goto out; | ||
279 | } | ||
197 | ms->section_mem_map |= SECTION_MARKED_PRESENT; | 280 | ms->section_mem_map |= SECTION_MARKED_PRESENT; |
198 | 281 | ||
199 | return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map); | 282 | ret = sparse_init_one_section(ms, section_nr, memmap); |
283 | |||
284 | if (ret <= 0) | ||
285 | __kfree_section_memmap(memmap, nr_pages); | ||
286 | out: | ||
287 | pgdat_resize_unlock(pgdat, &flags); | ||
288 | return ret; | ||
200 | } | 289 | } |
@@ -39,7 +39,7 @@ int page_cluster; | |||
39 | void put_page(struct page *page) | 39 | void put_page(struct page *page) |
40 | { | 40 | { |
41 | if (unlikely(PageCompound(page))) { | 41 | if (unlikely(PageCompound(page))) { |
42 | page = (struct page *)page->private; | 42 | page = (struct page *)page_private(page); |
43 | if (put_page_testzero(page)) { | 43 | if (put_page_testzero(page)) { |
44 | void (*dtor)(struct page *page); | 44 | void (*dtor)(struct page *page); |
45 | 45 | ||
@@ -48,7 +48,7 @@ void put_page(struct page *page) | |||
48 | } | 48 | } |
49 | return; | 49 | return; |
50 | } | 50 | } |
51 | if (!PageReserved(page) && put_page_testzero(page)) | 51 | if (put_page_testzero(page)) |
52 | __page_cache_release(page); | 52 | __page_cache_release(page); |
53 | } | 53 | } |
54 | EXPORT_SYMBOL(put_page); | 54 | EXPORT_SYMBOL(put_page); |
@@ -215,7 +215,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
215 | struct page *page = pages[i]; | 215 | struct page *page = pages[i]; |
216 | struct zone *pagezone; | 216 | struct zone *pagezone; |
217 | 217 | ||
218 | if (PageReserved(page) || !put_page_testzero(page)) | 218 | if (!put_page_testzero(page)) |
219 | continue; | 219 | continue; |
220 | 220 | ||
221 | pagezone = page_zone(page); | 221 | pagezone = page_zone(page); |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 132164f7d0a7..dfd9a46755b8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -83,7 +83,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, | |||
83 | page_cache_get(page); | 83 | page_cache_get(page); |
84 | SetPageLocked(page); | 84 | SetPageLocked(page); |
85 | SetPageSwapCache(page); | 85 | SetPageSwapCache(page); |
86 | page->private = entry.val; | 86 | set_page_private(page, entry.val); |
87 | total_swapcache_pages++; | 87 | total_swapcache_pages++; |
88 | pagecache_acct(1); | 88 | pagecache_acct(1); |
89 | } | 89 | } |
@@ -126,8 +126,8 @@ void __delete_from_swap_cache(struct page *page) | |||
126 | BUG_ON(PageWriteback(page)); | 126 | BUG_ON(PageWriteback(page)); |
127 | BUG_ON(PagePrivate(page)); | 127 | BUG_ON(PagePrivate(page)); |
128 | 128 | ||
129 | radix_tree_delete(&swapper_space.page_tree, page->private); | 129 | radix_tree_delete(&swapper_space.page_tree, page_private(page)); |
130 | page->private = 0; | 130 | set_page_private(page, 0); |
131 | ClearPageSwapCache(page); | 131 | ClearPageSwapCache(page); |
132 | total_swapcache_pages--; | 132 | total_swapcache_pages--; |
133 | pagecache_acct(-1); | 133 | pagecache_acct(-1); |
@@ -197,7 +197,7 @@ void delete_from_swap_cache(struct page *page) | |||
197 | { | 197 | { |
198 | swp_entry_t entry; | 198 | swp_entry_t entry; |
199 | 199 | ||
200 | entry.val = page->private; | 200 | entry.val = page_private(page); |
201 | 201 | ||
202 | write_lock_irq(&swapper_space.tree_lock); | 202 | write_lock_irq(&swapper_space.tree_lock); |
203 | __delete_from_swap_cache(page); | 203 | __delete_from_swap_cache(page); |
@@ -259,8 +259,7 @@ static inline void free_swap_cache(struct page *page) | |||
259 | 259 | ||
260 | /* | 260 | /* |
261 | * Perform a free_page(), also freeing any swap cache associated with | 261 | * Perform a free_page(), also freeing any swap cache associated with |
262 | * this page if it is the last user of the page. Can not do a lock_page, | 262 | * this page if it is the last user of the page. |
263 | * as we are holding the page_table_lock spinlock. | ||
264 | */ | 263 | */ |
265 | void free_page_and_swap_cache(struct page *page) | 264 | void free_page_and_swap_cache(struct page *page) |
266 | { | 265 | { |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 1dcaeda039f4..8970c0b74194 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -61,7 +61,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | |||
61 | swp_entry_t entry; | 61 | swp_entry_t entry; |
62 | 62 | ||
63 | down_read(&swap_unplug_sem); | 63 | down_read(&swap_unplug_sem); |
64 | entry.val = page->private; | 64 | entry.val = page_private(page); |
65 | if (PageSwapCache(page)) { | 65 | if (PageSwapCache(page)) { |
66 | struct block_device *bdev = swap_info[swp_type(entry)].bdev; | 66 | struct block_device *bdev = swap_info[swp_type(entry)].bdev; |
67 | struct backing_dev_info *bdi; | 67 | struct backing_dev_info *bdi; |
@@ -69,8 +69,8 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | |||
69 | /* | 69 | /* |
70 | * If the page is removed from swapcache from under us (with a | 70 | * If the page is removed from swapcache from under us (with a |
71 | * racy try_to_unuse/swapoff) we need an additional reference | 71 | * racy try_to_unuse/swapoff) we need an additional reference |
72 | * count to avoid reading garbage from page->private above. If | 72 | * count to avoid reading garbage from page_private(page) above. |
73 | * the WARN_ON triggers during a swapoff it maybe the race | 73 | * If the WARN_ON triggers during a swapoff it maybe the race |
74 | * condition and it's harmless. However if it triggers without | 74 | * condition and it's harmless. However if it triggers without |
75 | * swapoff it signals a problem. | 75 | * swapoff it signals a problem. |
76 | */ | 76 | */ |
@@ -294,7 +294,7 @@ static inline int page_swapcount(struct page *page) | |||
294 | struct swap_info_struct *p; | 294 | struct swap_info_struct *p; |
295 | swp_entry_t entry; | 295 | swp_entry_t entry; |
296 | 296 | ||
297 | entry.val = page->private; | 297 | entry.val = page_private(page); |
298 | p = swap_info_get(entry); | 298 | p = swap_info_get(entry); |
299 | if (p) { | 299 | if (p) { |
300 | /* Subtract the 1 for the swap cache itself */ | 300 | /* Subtract the 1 for the swap cache itself */ |
@@ -339,7 +339,7 @@ int remove_exclusive_swap_page(struct page *page) | |||
339 | if (page_count(page) != 2) /* 2: us + cache */ | 339 | if (page_count(page) != 2) /* 2: us + cache */ |
340 | return 0; | 340 | return 0; |
341 | 341 | ||
342 | entry.val = page->private; | 342 | entry.val = page_private(page); |
343 | p = swap_info_get(entry); | 343 | p = swap_info_get(entry); |
344 | if (!p) | 344 | if (!p) |
345 | return 0; | 345 | return 0; |
@@ -398,17 +398,14 @@ void free_swap_and_cache(swp_entry_t entry) | |||
398 | } | 398 | } |
399 | 399 | ||
400 | /* | 400 | /* |
401 | * Always set the resulting pte to be nowrite (the same as COW pages | 401 | * No need to decide whether this PTE shares the swap entry with others, |
402 | * after one process has exited). We don't know just how many PTEs will | 402 | * just let do_wp_page work it out if a write is requested later - to |
403 | * share this swap entry, so be cautious and let do_wp_page work out | 403 | * force COW, vm_page_prot omits write permission from any private vma. |
404 | * what to do if a write is requested later. | ||
405 | * | ||
406 | * vma->vm_mm->page_table_lock is held. | ||
407 | */ | 404 | */ |
408 | static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, | 405 | static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, |
409 | unsigned long addr, swp_entry_t entry, struct page *page) | 406 | unsigned long addr, swp_entry_t entry, struct page *page) |
410 | { | 407 | { |
411 | inc_mm_counter(vma->vm_mm, rss); | 408 | inc_mm_counter(vma->vm_mm, anon_rss); |
412 | get_page(page); | 409 | get_page(page); |
413 | set_pte_at(vma->vm_mm, addr, pte, | 410 | set_pte_at(vma->vm_mm, addr, pte, |
414 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 411 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
@@ -425,23 +422,25 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
425 | unsigned long addr, unsigned long end, | 422 | unsigned long addr, unsigned long end, |
426 | swp_entry_t entry, struct page *page) | 423 | swp_entry_t entry, struct page *page) |
427 | { | 424 | { |
428 | pte_t *pte; | ||
429 | pte_t swp_pte = swp_entry_to_pte(entry); | 425 | pte_t swp_pte = swp_entry_to_pte(entry); |
426 | pte_t *pte; | ||
427 | spinlock_t *ptl; | ||
428 | int found = 0; | ||
430 | 429 | ||
431 | pte = pte_offset_map(pmd, addr); | 430 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
432 | do { | 431 | do { |
433 | /* | 432 | /* |
434 | * swapoff spends a _lot_ of time in this loop! | 433 | * swapoff spends a _lot_ of time in this loop! |
435 | * Test inline before going to call unuse_pte. | 434 | * Test inline before going to call unuse_pte. |
436 | */ | 435 | */ |
437 | if (unlikely(pte_same(*pte, swp_pte))) { | 436 | if (unlikely(pte_same(*pte, swp_pte))) { |
438 | unuse_pte(vma, pte, addr, entry, page); | 437 | unuse_pte(vma, pte++, addr, entry, page); |
439 | pte_unmap(pte); | 438 | found = 1; |
440 | return 1; | 439 | break; |
441 | } | 440 | } |
442 | } while (pte++, addr += PAGE_SIZE, addr != end); | 441 | } while (pte++, addr += PAGE_SIZE, addr != end); |
443 | pte_unmap(pte - 1); | 442 | pte_unmap_unlock(pte - 1, ptl); |
444 | return 0; | 443 | return found; |
445 | } | 444 | } |
446 | 445 | ||
447 | static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 446 | static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
@@ -523,12 +522,10 @@ static int unuse_mm(struct mm_struct *mm, | |||
523 | down_read(&mm->mmap_sem); | 522 | down_read(&mm->mmap_sem); |
524 | lock_page(page); | 523 | lock_page(page); |
525 | } | 524 | } |
526 | spin_lock(&mm->page_table_lock); | ||
527 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 525 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
528 | if (vma->anon_vma && unuse_vma(vma, entry, page)) | 526 | if (vma->anon_vma && unuse_vma(vma, entry, page)) |
529 | break; | 527 | break; |
530 | } | 528 | } |
531 | spin_unlock(&mm->page_table_lock); | ||
532 | up_read(&mm->mmap_sem); | 529 | up_read(&mm->mmap_sem); |
533 | /* | 530 | /* |
534 | * Currently unuse_mm cannot fail, but leave error handling | 531 | * Currently unuse_mm cannot fail, but leave error handling |
@@ -1045,7 +1042,7 @@ int page_queue_congested(struct page *page) | |||
1045 | BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ | 1042 | BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ |
1046 | 1043 | ||
1047 | if (PageSwapCache(page)) { | 1044 | if (PageSwapCache(page)) { |
1048 | swp_entry_t entry = { .val = page->private }; | 1045 | swp_entry_t entry = { .val = page_private(page) }; |
1049 | struct swap_info_struct *sis; | 1046 | struct swap_info_struct *sis; |
1050 | 1047 | ||
1051 | sis = get_swap_info_struct(swp_type(entry)); | 1048 | sis = get_swap_info_struct(swp_type(entry)); |
diff --git a/mm/thrash.c b/mm/thrash.c index 11461f7ad830..eff3c18c33a1 100644 --- a/mm/thrash.c +++ b/mm/thrash.c | |||
@@ -19,7 +19,7 @@ static unsigned long swap_token_check; | |||
19 | struct mm_struct * swap_token_mm = &init_mm; | 19 | struct mm_struct * swap_token_mm = &init_mm; |
20 | 20 | ||
21 | #define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2) | 21 | #define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2) |
22 | #define SWAP_TOKEN_TIMEOUT 0 | 22 | #define SWAP_TOKEN_TIMEOUT (300 * HZ) |
23 | /* | 23 | /* |
24 | * Currently disabled; Needs further code to work at HZ * 300. | 24 | * Currently disabled; Needs further code to work at HZ * 300. |
25 | */ | 25 | */ |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1150229b6366..54a90e83cb31 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -5,6 +5,7 @@ | |||
5 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 | 5 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 |
6 | * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 | 6 | * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 |
7 | * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 | 7 | * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 |
8 | * Numa awareness, Christoph Lameter, SGI, June 2005 | ||
8 | */ | 9 | */ |
9 | 10 | ||
10 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
@@ -88,7 +89,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, | |||
88 | { | 89 | { |
89 | pte_t *pte; | 90 | pte_t *pte; |
90 | 91 | ||
91 | pte = pte_alloc_kernel(&init_mm, pmd, addr); | 92 | pte = pte_alloc_kernel(pmd, addr); |
92 | if (!pte) | 93 | if (!pte) |
93 | return -ENOMEM; | 94 | return -ENOMEM; |
94 | do { | 95 | do { |
@@ -146,20 +147,18 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | |||
146 | 147 | ||
147 | BUG_ON(addr >= end); | 148 | BUG_ON(addr >= end); |
148 | pgd = pgd_offset_k(addr); | 149 | pgd = pgd_offset_k(addr); |
149 | spin_lock(&init_mm.page_table_lock); | ||
150 | do { | 150 | do { |
151 | next = pgd_addr_end(addr, end); | 151 | next = pgd_addr_end(addr, end); |
152 | err = vmap_pud_range(pgd, addr, next, prot, pages); | 152 | err = vmap_pud_range(pgd, addr, next, prot, pages); |
153 | if (err) | 153 | if (err) |
154 | break; | 154 | break; |
155 | } while (pgd++, addr = next, addr != end); | 155 | } while (pgd++, addr = next, addr != end); |
156 | spin_unlock(&init_mm.page_table_lock); | ||
157 | flush_cache_vmap((unsigned long) area->addr, end); | 156 | flush_cache_vmap((unsigned long) area->addr, end); |
158 | return err; | 157 | return err; |
159 | } | 158 | } |
160 | 159 | ||
161 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | 160 | struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, |
162 | unsigned long start, unsigned long end) | 161 | unsigned long start, unsigned long end, int node) |
163 | { | 162 | { |
164 | struct vm_struct **p, *tmp, *area; | 163 | struct vm_struct **p, *tmp, *area; |
165 | unsigned long align = 1; | 164 | unsigned long align = 1; |
@@ -178,7 +177,7 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | |||
178 | addr = ALIGN(start, align); | 177 | addr = ALIGN(start, align); |
179 | size = PAGE_ALIGN(size); | 178 | size = PAGE_ALIGN(size); |
180 | 179 | ||
181 | area = kmalloc(sizeof(*area), GFP_KERNEL); | 180 | area = kmalloc_node(sizeof(*area), GFP_KERNEL, node); |
182 | if (unlikely(!area)) | 181 | if (unlikely(!area)) |
183 | return NULL; | 182 | return NULL; |
184 | 183 | ||
@@ -231,6 +230,12 @@ out: | |||
231 | return NULL; | 230 | return NULL; |
232 | } | 231 | } |
233 | 232 | ||
233 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | ||
234 | unsigned long start, unsigned long end) | ||
235 | { | ||
236 | return __get_vm_area_node(size, flags, start, end, -1); | ||
237 | } | ||
238 | |||
234 | /** | 239 | /** |
235 | * get_vm_area - reserve a contiguous kernel virtual area | 240 | * get_vm_area - reserve a contiguous kernel virtual area |
236 | * | 241 | * |
@@ -246,6 +251,11 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) | |||
246 | return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); | 251 | return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); |
247 | } | 252 | } |
248 | 253 | ||
254 | struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node) | ||
255 | { | ||
256 | return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node); | ||
257 | } | ||
258 | |||
249 | /* Caller must hold vmlist_lock */ | 259 | /* Caller must hold vmlist_lock */ |
250 | struct vm_struct *__remove_vm_area(void *addr) | 260 | struct vm_struct *__remove_vm_area(void *addr) |
251 | { | 261 | { |
@@ -342,7 +352,6 @@ void vfree(void *addr) | |||
342 | BUG_ON(in_interrupt()); | 352 | BUG_ON(in_interrupt()); |
343 | __vunmap(addr, 1); | 353 | __vunmap(addr, 1); |
344 | } | 354 | } |
345 | |||
346 | EXPORT_SYMBOL(vfree); | 355 | EXPORT_SYMBOL(vfree); |
347 | 356 | ||
348 | /** | 357 | /** |
@@ -360,7 +369,6 @@ void vunmap(void *addr) | |||
360 | BUG_ON(in_interrupt()); | 369 | BUG_ON(in_interrupt()); |
361 | __vunmap(addr, 0); | 370 | __vunmap(addr, 0); |
362 | } | 371 | } |
363 | |||
364 | EXPORT_SYMBOL(vunmap); | 372 | EXPORT_SYMBOL(vunmap); |
365 | 373 | ||
366 | /** | 374 | /** |
@@ -392,10 +400,10 @@ void *vmap(struct page **pages, unsigned int count, | |||
392 | 400 | ||
393 | return area->addr; | 401 | return area->addr; |
394 | } | 402 | } |
395 | |||
396 | EXPORT_SYMBOL(vmap); | 403 | EXPORT_SYMBOL(vmap); |
397 | 404 | ||
398 | void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | 405 | void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
406 | pgprot_t prot, int node) | ||
399 | { | 407 | { |
400 | struct page **pages; | 408 | struct page **pages; |
401 | unsigned int nr_pages, array_size, i; | 409 | unsigned int nr_pages, array_size, i; |
@@ -406,9 +414,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
406 | area->nr_pages = nr_pages; | 414 | area->nr_pages = nr_pages; |
407 | /* Please note that the recursion is strictly bounded. */ | 415 | /* Please note that the recursion is strictly bounded. */ |
408 | if (array_size > PAGE_SIZE) | 416 | if (array_size > PAGE_SIZE) |
409 | pages = __vmalloc(array_size, gfp_mask, PAGE_KERNEL); | 417 | pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node); |
410 | else | 418 | else |
411 | pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM)); | 419 | pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node); |
412 | area->pages = pages; | 420 | area->pages = pages; |
413 | if (!area->pages) { | 421 | if (!area->pages) { |
414 | remove_vm_area(area->addr); | 422 | remove_vm_area(area->addr); |
@@ -418,7 +426,10 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
418 | memset(area->pages, 0, array_size); | 426 | memset(area->pages, 0, array_size); |
419 | 427 | ||
420 | for (i = 0; i < area->nr_pages; i++) { | 428 | for (i = 0; i < area->nr_pages; i++) { |
421 | area->pages[i] = alloc_page(gfp_mask); | 429 | if (node < 0) |
430 | area->pages[i] = alloc_page(gfp_mask); | ||
431 | else | ||
432 | area->pages[i] = alloc_pages_node(node, gfp_mask, 0); | ||
422 | if (unlikely(!area->pages[i])) { | 433 | if (unlikely(!area->pages[i])) { |
423 | /* Successfully allocated i pages, free them in __vunmap() */ | 434 | /* Successfully allocated i pages, free them in __vunmap() */ |
424 | area->nr_pages = i; | 435 | area->nr_pages = i; |
@@ -435,18 +446,25 @@ fail: | |||
435 | return NULL; | 446 | return NULL; |
436 | } | 447 | } |
437 | 448 | ||
449 | void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | ||
450 | { | ||
451 | return __vmalloc_area_node(area, gfp_mask, prot, -1); | ||
452 | } | ||
453 | |||
438 | /** | 454 | /** |
439 | * __vmalloc - allocate virtually contiguous memory | 455 | * __vmalloc_node - allocate virtually contiguous memory |
440 | * | 456 | * |
441 | * @size: allocation size | 457 | * @size: allocation size |
442 | * @gfp_mask: flags for the page level allocator | 458 | * @gfp_mask: flags for the page level allocator |
443 | * @prot: protection mask for the allocated pages | 459 | * @prot: protection mask for the allocated pages |
460 | * @node: node to use for allocation or -1 | ||
444 | * | 461 | * |
445 | * Allocate enough pages to cover @size from the page level | 462 | * Allocate enough pages to cover @size from the page level |
446 | * allocator with @gfp_mask flags. Map them into contiguous | 463 | * allocator with @gfp_mask flags. Map them into contiguous |
447 | * kernel virtual space, using a pagetable protection of @prot. | 464 | * kernel virtual space, using a pagetable protection of @prot. |
448 | */ | 465 | */ |
449 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 466 | void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, |
467 | int node) | ||
450 | { | 468 | { |
451 | struct vm_struct *area; | 469 | struct vm_struct *area; |
452 | 470 | ||
@@ -454,13 +472,18 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | |||
454 | if (!size || (size >> PAGE_SHIFT) > num_physpages) | 472 | if (!size || (size >> PAGE_SHIFT) > num_physpages) |
455 | return NULL; | 473 | return NULL; |
456 | 474 | ||
457 | area = get_vm_area(size, VM_ALLOC); | 475 | area = get_vm_area_node(size, VM_ALLOC, node); |
458 | if (!area) | 476 | if (!area) |
459 | return NULL; | 477 | return NULL; |
460 | 478 | ||
461 | return __vmalloc_area(area, gfp_mask, prot); | 479 | return __vmalloc_area_node(area, gfp_mask, prot, node); |
462 | } | 480 | } |
481 | EXPORT_SYMBOL(__vmalloc_node); | ||
463 | 482 | ||
483 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | ||
484 | { | ||
485 | return __vmalloc_node(size, gfp_mask, prot, -1); | ||
486 | } | ||
464 | EXPORT_SYMBOL(__vmalloc); | 487 | EXPORT_SYMBOL(__vmalloc); |
465 | 488 | ||
466 | /** | 489 | /** |
@@ -478,9 +501,26 @@ void *vmalloc(unsigned long size) | |||
478 | { | 501 | { |
479 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); | 502 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); |
480 | } | 503 | } |
481 | |||
482 | EXPORT_SYMBOL(vmalloc); | 504 | EXPORT_SYMBOL(vmalloc); |
483 | 505 | ||
506 | /** | ||
507 | * vmalloc_node - allocate memory on a specific node | ||
508 | * | ||
509 | * @size: allocation size | ||
510 | * @node: numa node | ||
511 | * | ||
512 | * Allocate enough pages to cover @size from the page level | ||
513 | * allocator and map them into contiguous kernel virtual space. | ||
514 | * | ||
515 | * For tight control over page level allocator and protection flags | ||
516 | * use __vmalloc() instead. | ||
517 | */ | ||
518 | void *vmalloc_node(unsigned long size, int node) | ||
519 | { | ||
520 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); | ||
521 | } | ||
522 | EXPORT_SYMBOL(vmalloc_node); | ||
523 | |||
484 | #ifndef PAGE_KERNEL_EXEC | 524 | #ifndef PAGE_KERNEL_EXEC |
485 | # define PAGE_KERNEL_EXEC PAGE_KERNEL | 525 | # define PAGE_KERNEL_EXEC PAGE_KERNEL |
486 | #endif | 526 | #endif |
@@ -515,7 +555,6 @@ void *vmalloc_32(unsigned long size) | |||
515 | { | 555 | { |
516 | return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); | 556 | return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); |
517 | } | 557 | } |
518 | |||
519 | EXPORT_SYMBOL(vmalloc_32); | 558 | EXPORT_SYMBOL(vmalloc_32); |
520 | 559 | ||
521 | long vread(char *buf, char *addr, unsigned long count) | 560 | long vread(char *buf, char *addr, unsigned long count) |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 843c87d1e61f..135bf8ca96ee 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -417,7 +417,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
417 | * Anonymous process memory has backing store? | 417 | * Anonymous process memory has backing store? |
418 | * Try to allocate it some swap space here. | 418 | * Try to allocate it some swap space here. |
419 | */ | 419 | */ |
420 | if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) { | 420 | if (PageAnon(page) && !PageSwapCache(page)) { |
421 | if (!sc->may_swap) | ||
422 | goto keep_locked; | ||
421 | if (!add_to_swap(page)) | 423 | if (!add_to_swap(page)) |
422 | goto activate_locked; | 424 | goto activate_locked; |
423 | } | 425 | } |
@@ -519,7 +521,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
519 | 521 | ||
520 | #ifdef CONFIG_SWAP | 522 | #ifdef CONFIG_SWAP |
521 | if (PageSwapCache(page)) { | 523 | if (PageSwapCache(page)) { |
522 | swp_entry_t swap = { .val = page->private }; | 524 | swp_entry_t swap = { .val = page_private(page) }; |
523 | __delete_from_swap_cache(page); | 525 | __delete_from_swap_cache(page); |
524 | write_unlock_irq(&mapping->tree_lock); | 526 | write_unlock_irq(&mapping->tree_lock); |
525 | swap_free(swap); | 527 | swap_free(swap); |
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 67abebabf83e..e97b2d162cc7 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c | |||
@@ -2949,8 +2949,7 @@ static struct page * snd_pcm_mmap_status_nopage(struct vm_area_struct *area, uns | |||
2949 | return NOPAGE_OOM; | 2949 | return NOPAGE_OOM; |
2950 | runtime = substream->runtime; | 2950 | runtime = substream->runtime; |
2951 | page = virt_to_page(runtime->status); | 2951 | page = virt_to_page(runtime->status); |
2952 | if (!PageReserved(page)) | 2952 | get_page(page); |
2953 | get_page(page); | ||
2954 | if (type) | 2953 | if (type) |
2955 | *type = VM_FAULT_MINOR; | 2954 | *type = VM_FAULT_MINOR; |
2956 | return page; | 2955 | return page; |
@@ -2992,8 +2991,7 @@ static struct page * snd_pcm_mmap_control_nopage(struct vm_area_struct *area, un | |||
2992 | return NOPAGE_OOM; | 2991 | return NOPAGE_OOM; |
2993 | runtime = substream->runtime; | 2992 | runtime = substream->runtime; |
2994 | page = virt_to_page(runtime->control); | 2993 | page = virt_to_page(runtime->control); |
2995 | if (!PageReserved(page)) | 2994 | get_page(page); |
2996 | get_page(page); | ||
2997 | if (type) | 2995 | if (type) |
2998 | *type = VM_FAULT_MINOR; | 2996 | *type = VM_FAULT_MINOR; |
2999 | return page; | 2997 | return page; |
@@ -3066,8 +3064,7 @@ static struct page *snd_pcm_mmap_data_nopage(struct vm_area_struct *area, unsign | |||
3066 | vaddr = runtime->dma_area + offset; | 3064 | vaddr = runtime->dma_area + offset; |
3067 | page = virt_to_page(vaddr); | 3065 | page = virt_to_page(vaddr); |
3068 | } | 3066 | } |
3069 | if (!PageReserved(page)) | 3067 | get_page(page); |
3070 | get_page(page); | ||
3071 | if (type) | 3068 | if (type) |
3072 | *type = VM_FAULT_MINOR; | 3069 | *type = VM_FAULT_MINOR; |
3073 | return page; | 3070 | return page; |