Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	234
1 files changed, 168 insertions, 66 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 0a2010a9518c..baa999e87cd2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -52,6 +52,9 @@
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/swapops.h>
+#include <linux/elf.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -59,9 +62,6 @@
 #include <asm/tlbflush.h>
 #include <asm/pgtable.h>
 
-#include <linux/swapops.h>
-#include <linux/elf.h>
-
 #include "internal.h"
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -375,15 +375,65 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
  *
  * The calling function must still handle the error.
  */
-static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
-			  unsigned long vaddr)
+static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
+			  pte_t pte, struct page *page)
 {
-	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
-			"vm_flags = %lx, vaddr = %lx\n",
-		(long long)pte_val(pte),
-		(vma->vm_mm == current->mm ? current->comm : "???"),
-		vma->vm_flags, vaddr);
+	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
+	pud_t *pud = pud_offset(pgd, addr);
+	pmd_t *pmd = pmd_offset(pud, addr);
+	struct address_space *mapping;
+	pgoff_t index;
+	static unsigned long resume;
+	static unsigned long nr_shown;
+	static unsigned long nr_unshown;
+
+	/*
+	 * Allow a burst of 60 reports, then keep quiet for that minute;
+	 * or allow a steady drip of one report per second.
+	 */
+	if (nr_shown == 60) {
+		if (time_before(jiffies, resume)) {
+			nr_unshown++;
+			return;
+		}
+		if (nr_unshown) {
+			printk(KERN_ALERT
+				"BUG: Bad page map: %lu messages suppressed\n",
+				nr_unshown);
+			nr_unshown = 0;
+		}
+		nr_shown = 0;
+	}
+	if (nr_shown++ == 0)
+		resume = jiffies + 60 * HZ;
+
+	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
+	index = linear_page_index(vma, addr);
+
+	printk(KERN_ALERT
+		"BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
+		current->comm,
+		(long long)pte_val(pte), (long long)pmd_val(*pmd));
+	if (page) {
+		printk(KERN_ALERT
+		"page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
+		page, (void *)page->flags, page_count(page),
+		page_mapcount(page), page->mapping, page->index);
+	}
+	printk(KERN_ALERT
+		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
+		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
+	/*
+	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
+	 */
+	if (vma->vm_ops)
+		print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
+				(unsigned long)vma->vm_ops->fault);
+	if (vma->vm_file && vma->vm_file->f_op)
+		print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
+				(unsigned long)vma->vm_file->f_op->mmap);
 	dump_stack();
+	add_taint(TAINT_BAD_PAGE);
 }
 
 static inline int is_cow_mapping(unsigned int flags)
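The rewritten print_bad_pte() above rate-limits its output: a burst of up to 60 reports, then silence for the rest of that minute, with the number of suppressed reports summarised once the quiet period ends. Below is a minimal userspace sketch of the same burst-then-quiet pattern, using time(NULL) in place of jiffies; report_bad_event() is an invented name, only the shape of the logic is taken from the hunk above:

#include <stdio.h>
#include <time.h>

/* Invented helper mirroring the rate-limit logic of print_bad_pte(). */
static void report_bad_event(const char *msg)
{
	static time_t resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;
	time_t now = time(NULL);

	if (nr_shown == 60) {
		if (now < resume) {		/* still inside the quiet minute */
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			printf("%lu messages suppressed\n", nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = now + 60;	/* quiet period ends one minute after the first report */

	printf("%s\n", msg);
}

int main(void)
{
	for (int i = 0; i < 200; i++)
		report_bad_event("BUG: bad page map");	/* only the first 60 are printed */
	return 0;
}

The first report that arrives after the quiet minute flushes the suppression count, so nothing is dropped silently, which is what the "messages suppressed" line in the kernel version provides.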
@@ -441,21 +491,18 @@ static inline int is_cow_mapping(unsigned int flags)
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 				pte_t pte)
 {
-	unsigned long pfn;
+	unsigned long pfn = pte_pfn(pte);
 
 	if (HAVE_PTE_SPECIAL) {
-		if (likely(!pte_special(pte))) {
-			VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-			return pte_page(pte);
-		}
-		VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
+		if (likely(!pte_special(pte)))
+			goto check_pfn;
+		if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
+			print_bad_pte(vma, addr, pte, NULL);
 		return NULL;
 	}
 
 	/* !HAVE_PTE_SPECIAL case follows: */
 
-	pfn = pte_pfn(pte);
-
 	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
 		if (vma->vm_flags & VM_MIXEDMAP) {
 			if (!pfn_valid(pfn))
@@ -471,11 +518,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		}
 	}
 
-	VM_BUG_ON(!pfn_valid(pfn));
+check_pfn:
+	if (unlikely(pfn > highest_memmap_pfn)) {
+		print_bad_pte(vma, addr, pte, NULL);
+		return NULL;
+	}
 
 	/*
 	 * NOTE! We still have PageReserved() pages in the page tables.
-	 *
 	 * eg. VDSO mappings can cause them to exist.
 	 */
 out:
@@ -767,11 +817,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			else {
 				if (pte_dirty(ptent))
 					set_page_dirty(page);
-				if (pte_young(ptent))
-					SetPageReferenced(page);
+				if (pte_young(ptent) &&
+				    likely(!VM_SequentialReadHint(vma)))
+					mark_page_accessed(page);
 				file_rss--;
 			}
-			page_remove_rmap(page, vma);
+			page_remove_rmap(page);
+			if (unlikely(page_mapcount(page) < 0))
+				print_bad_pte(vma, addr, ptent, page);
 			tlb_remove_page(tlb, page);
 			continue;
 		}
@@ -781,8 +834,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		 */
 		if (unlikely(details))
 			continue;
-		if (!pte_file(ptent))
-			free_swap_and_cache(pte_to_swp_entry(ptent));
+		if (pte_file(ptent)) {
+			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
+				print_bad_pte(vma, addr, ptent, NULL);
+		} else if
+		  (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
+			print_bad_pte(vma, addr, ptent, NULL);
 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
 
@@ -1153,6 +1210,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	int write = !!(flags & GUP_FLAGS_WRITE);
 	int force = !!(flags & GUP_FLAGS_FORCE);
 	int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
+	int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
 
 	if (len <= 0)
 		return 0;
@@ -1231,12 +1289,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			struct page *page;
 
 			/*
-			 * If tsk is ooming, cut off its access to large memory
-			 * allocations. It has a pending SIGKILL, but it can't
-			 * be processed until returning to user space.
+			 * If we have a pending SIGKILL, don't keep faulting
+			 * pages and potentially allocating memory, unless
+			 * current is handling munlock--e.g., on exit. In
+			 * that case, we are not allocating memory. Rather,
+			 * we're only unlocking already resident/mapped pages.
 			 */
-			if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
-				return i ? i : -ENOMEM;
+			if (unlikely(!ignore_sigkill &&
+					fatal_signal_pending(current)))
+				return i ? i : -ERESTARTSYS;
 
 			if (write)
 				foll_flags |= FOLL_WRITE;
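The hunk above replaces the old TIF_MEMDIE test with a fatal_signal_pending() check, so a task that has been SIGKILLed stops faulting in further pages rather than continuing to allocate memory, unless it is only unlocking already resident pages. A rough userspace analogue of that "bail out of a long loop once a fatal signal arrives" pattern follows; the loop body and the names are invented for illustration, not taken from the kernel:

#include <signal.h>
#include <stddef.h>
#include <unistd.h>

static volatile sig_atomic_t got_fatal_signal;

static void on_term(int sig)
{
	(void)sig;
	got_fatal_signal = 1;
}

/* Returns how many items were completed, or -1 if killed before doing any. */
static long process_items(size_t nr_items)
{
	long i;

	signal(SIGTERM, on_term);
	for (i = 0; i < (long)nr_items; i++) {
		if (got_fatal_signal)
			return i ? i : -1;	/* mirrors "return i ? i : -ERESTARTSYS" */
		usleep(1000);			/* stand-in for faulting in one page */
	}
	return i;
}

int main(void)
{
	return process_items(5000) < 0;
}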
@@ -1263,9 +1324,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				 * do_wp_page has broken COW when necessary,
 				 * even if maybe_mkwrite decided not to set
 				 * pte_write. We can thus safely do subsequent
-				 * page lookups as if they were reads.
+				 * page lookups as if they were reads. But only
+				 * do so when looping for pte_write is futile:
+				 * in some cases userspace may also be wanting
+				 * to write to the gotten user page, which a
+				 * read fault here might prevent (a readonly
+				 * page might get reCOWed by userspace write).
 				 */
-				if (ret & VM_FAULT_WRITE)
+				if ((ret & VM_FAULT_WRITE) &&
+				    !(vma->vm_flags & VM_WRITE))
 					foll_flags &= ~FOLL_WRITE;
 
 				cond_resched();
@@ -1444,6 +1511,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn)
 {
 	int ret;
+	pgprot_t pgprot = vma->vm_page_prot;
 	/*
 	 * Technically, architectures with pte_special can avoid all these
 	 * restrictions (same for remap_pfn_range). However we would like
@@ -1458,10 +1526,10 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 
 	if (addr < vma->vm_start || addr >= vma->vm_end)
 		return -EFAULT;
-	if (track_pfn_vma_new(vma, vma->vm_page_prot, pfn, PAGE_SIZE))
+	if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
 		return -EINVAL;
 
-	ret = insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+	ret = insert_pfn(vma, addr, pfn, pgprot);
 
 	if (ret)
 		untrack_pfn_vma(vma, pfn, PAGE_SIZE);
@@ -1604,9 +1672,15 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 
 	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
 
-	err = track_pfn_vma_new(vma, prot, pfn, PAGE_ALIGN(size));
-	if (err)
+	err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
+	if (err) {
+		/*
+		 * To indicate that track_pfn related cleanup is not
+		 * needed from higher level routine calling unmap_vmas
+		 */
+		vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
 		return -EINVAL;
+	}
 
 	BUG_ON(addr >= end);
 	pfn -= addr >> PAGE_SHIFT;
@@ -1644,6 +1718,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 
 	BUG_ON(pmd_huge(*pmd));
 
+	arch_enter_lazy_mmu_mode();
+
 	token = pmd_pgtable(*pmd);
 
 	do {
@@ -1652,6 +1728,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
+	arch_leave_lazy_mmu_mode();
+
 	if (mm != &init_mm)
 		pte_unmap_unlock(pte-1, ptl);
 	return err;
@@ -1837,10 +1915,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * not dirty accountable.
 	 */
 	if (PageAnon(old_page)) {
-		if (trylock_page(old_page)) {
-			reuse = can_share_swap_page(old_page);
-			unlock_page(old_page);
+		if (!trylock_page(old_page)) {
+			page_cache_get(old_page);
+			pte_unmap_unlock(page_table, ptl);
+			lock_page(old_page);
+			page_table = pte_offset_map_lock(mm, pmd, address,
+							 &ptl);
+			if (!pte_same(*page_table, orig_pte)) {
+				unlock_page(old_page);
+				page_cache_release(old_page);
+				goto unlock;
+			}
+			page_cache_release(old_page);
 		}
+		reuse = reuse_swap_page(old_page);
+		unlock_page(old_page);
 	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
 					(VM_WRITE|VM_SHARED))) {
 		/*
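The do_wp_page() change above is the classic "can't sleep while holding a spinlock" dance: when trylock_page() fails, the code takes a reference, drops the page-table lock, sleeps in lock_page(), re-takes the page-table lock, and then re-checks that the PTE is still the one it saw before doing anything else. Here is a compact pthread sketch of that drop, re-lock, revalidate pattern; every name is invented, only the shape of the logic comes from the hunk:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;	/* stands in for the pte lock */
static pthread_mutex_t page_lock  = PTHREAD_MUTEX_INITIALIZER;	/* stands in for the page lock */
static unsigned long generation;	/* bumped whenever the protected state changes */

/* Called with table_lock held; returns with table_lock held again. */
static bool lock_page_and_revalidate(unsigned long seen_generation)
{
	if (pthread_mutex_trylock(&page_lock) != 0) {
		pthread_mutex_unlock(&table_lock);	/* cannot sleep under it */
		pthread_mutex_lock(&page_lock);		/* may block */
		pthread_mutex_lock(&table_lock);
		if (generation != seen_generation) {	/* state changed while we slept */
			pthread_mutex_unlock(&page_lock);
			return false;	/* caller bails out, like the goto unlock above */
		}
	}
	return true;	/* both locks held and the state we saw is still valid */
}

int main(void)
{
	bool ok;

	pthread_mutex_lock(&table_lock);
	ok = lock_page_and_revalidate(generation);
	if (ok)
		pthread_mutex_unlock(&page_lock);
	pthread_mutex_unlock(&table_lock);
	return !ok;
}

Returning false here corresponds to the goto unlock path in the hunk, where another thread modified the PTE while the lock was dropped and the fault must be given up or retried.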
@@ -1910,7 +1999,7 @@ gotten:
 	 * Don't let another task, with possibly unlocked vma,
 	 * keep the mlocked page.
 	 */
-	if (vma->vm_flags & VM_LOCKED) {
+	if ((vma->vm_flags & VM_LOCKED) && old_page) {
 		lock_page(old_page);	/* for LRU manipulation */
 		clear_page_mlock(old_page);
 		unlock_page(old_page);
@@ -1918,7 +2007,7 @@ gotten:
 	cow_user_page(new_page, old_page, address, vma);
 	__SetPageUptodate(new_page);
 
-	if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
+	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
 		goto oom_free_new;
 
 	/*
@@ -1943,11 +2032,7 @@ gotten:
 	 * thread doing COW.
 	 */
 	ptep_clear_flush_notify(vma, address, page_table);
-	SetPageSwapBacked(new_page);
-	lru_cache_add_active_or_unevictable(new_page, vma);
 	page_add_new_anon_rmap(new_page, vma, address);
-
-//TODO: is this safe? do_anonymous_page() does it this way.
 	set_pte_at(mm, address, page_table, entry);
 	update_mmu_cache(vma, address, entry);
 	if (old_page) {
@@ -1973,7 +2058,7 @@ gotten:
 		 * mapcount is visible. So transitively, TLBs to
 		 * old page will be flushed before it can be reused.
 		 */
-		page_remove_rmap(old_page, vma);
+		page_remove_rmap(old_page);
 	}
 
 	/* Free the old page.. */
@@ -2266,7 +2351,7 @@ int vmtruncate(struct inode * inode, loff_t offset)
 		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
 	}
 
-	if (inode->i_op && inode->i_op->truncate)
+	if (inode->i_op->truncate)
 		inode->i_op->truncate(inode);
 	return 0;
 
@@ -2286,7 +2371,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 	 * a way to truncate a range of blocks (punch a hole) -
 	 * we should return failure right now.
 	 */
-	if (!inode->i_op || !inode->i_op->truncate_range)
+	if (!inode->i_op->truncate_range)
 		return -ENOSYS;
 
 	mutex_lock(&inode->i_mutex);
@@ -2314,6 +2399,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *page;
 	swp_entry_t entry;
 	pte_t pte;
+	struct mem_cgroup *ptr = NULL;
 	int ret = 0;
 
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2352,7 +2438,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	lock_page(page);
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 
-	if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
 		ret = VM_FAULT_OOM;
 		unlock_page(page);
 		goto out;
@@ -2370,22 +2456,35 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_nomap;
 	}
 
-	/* The page isn't present yet, go ahead with the fault. */
+	/*
+	 * The page isn't present yet, go ahead with the fault.
+	 *
+	 * Be careful about the sequence of operations here.
+	 * To get its accounting right, reuse_swap_page() must be called
+	 * while the page is counted on swap but not yet in mapcount i.e.
+	 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
+	 * must be called after the swap_free(), or it will never succeed.
+	 * Because delete_from_swap_page() may be called by reuse_swap_page(),
+	 * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
+	 * in page->private. In this case, a record in swap_cgroup is silently
+	 * discarded at swap_free().
+	 */
 
 	inc_mm_counter(mm, anon_rss);
 	pte = mk_pte(page, vma->vm_page_prot);
-	if (write_access && can_share_swap_page(page)) {
+	if (write_access && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
 		write_access = 0;
 	}
-
 	flush_icache_page(vma, page);
 	set_pte_at(mm, address, page_table, pte);
 	page_add_anon_rmap(page, vma, address);
+	/* It's better to call commit-charge after rmap is established */
+	mem_cgroup_commit_charge_swapin(page, ptr);
 
 	swap_free(entry);
 	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
-		remove_exclusive_swap_page(page);
+		try_to_free_swap(page);
 	unlock_page(page);
 
 	if (write_access) {
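The memcg calls in do_swap_page() now follow a try/commit/cancel shape: mem_cgroup_try_charge_swapin() reserves the charge up front, mem_cgroup_commit_charge_swapin() finalises it only after the rmap is established, and mem_cgroup_cancel_charge_swapin() (in the out_nomap path of the next hunk) hands the reservation back on failure. A small sketch of that two-phase accounting pattern in plain C follows; the quota structure and names are invented for illustration:

#include <stdbool.h>

#define QUOTA_LIMIT	100

struct quota {
	long reserved;	/* charged but not yet committed */
	long used;	/* committed */
};

static bool reserve_quota(struct quota *q, long amount)
{
	if (q->used + q->reserved + amount > QUOTA_LIMIT)
		return false;	/* fail up front, while backing out is still cheap */
	q->reserved += amount;
	return true;
}

static void commit_quota(struct quota *q, long amount)
{
	q->reserved -= amount;
	q->used += amount;	/* the object is now visible; make the charge permanent */
}

static void cancel_quota(struct quota *q, long amount)
{
	q->reserved -= amount;	/* error path: hand the reservation back */
}

int main(void)
{
	struct quota q = { 0, 0 };

	if (!reserve_quota(&q, 10))
		return 1;
	/* ... wire the object up; on any failure we would call cancel_quota(&q, 10) ... */
	commit_quota(&q, 10);
	return q.used != 10;
}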
@@ -2402,7 +2501,7 @@ unlock:
 out:
 	return ret;
 out_nomap:
-	mem_cgroup_uncharge_page(page);
+	mem_cgroup_cancel_charge_swapin(ptr);
 	pte_unmap_unlock(page_table, ptl);
 	unlock_page(page);
 	page_cache_release(page);
@@ -2432,7 +2531,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto oom;
 	__SetPageUptodate(page);
 
-	if (mem_cgroup_charge(page, mm, GFP_KERNEL))
+	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
 		goto oom_free_page;
 
 	entry = mk_pte(page, vma->vm_page_prot);
@@ -2442,8 +2541,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte_none(*page_table))
 		goto release;
 	inc_mm_counter(mm, anon_rss);
-	SetPageSwapBacked(page);
-	lru_cache_add_active_or_unevictable(page, vma);
 	page_add_new_anon_rmap(page, vma, address);
 	set_pte_at(mm, address, page_table, entry);
 
@@ -2525,7 +2622,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			ret = VM_FAULT_OOM;
 			goto out;
 		}
-		if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+		if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
 			ret = VM_FAULT_OOM;
 			page_cache_release(page);
 			goto out;
@@ -2591,8 +2688,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		if (anon) {
 			inc_mm_counter(mm, anon_rss);
-			SetPageSwapBacked(page);
-			lru_cache_add_active_or_unevictable(page, vma);
 			page_add_new_anon_rmap(page, vma, address);
 		} else {
 			inc_mm_counter(mm, file_rss);
@@ -2602,7 +2697,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 				get_page(dirty_page);
 			}
 		}
-//TODO: is this safe? do_anonymous_page() does it this way.
 		set_pte_at(mm, address, page_table, entry);
 
 		/* no need to invalidate: a not-present page won't be cached */
@@ -2666,12 +2760,11 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		return 0;
 
-	if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
-			!(vma->vm_flags & VM_CAN_NONLINEAR))) {
+	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
 		/*
 		 * Page table corrupted: show pte and kill process.
 		 */
-		print_bad_pte(vma, orig_pte, address);
+		print_bad_pte(vma, address, orig_pte, NULL);
 		return VM_FAULT_OOM;
 	}
 
@@ -2953,7 +3046,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 {
 	resource_size_t phys_addr;
 	unsigned long prot = 0;
-	void *maddr;
+	void __iomem *maddr;
 	int offset = addr & (PAGE_SIZE-1);
 
 	if (follow_phys(vma, addr, write, &prot, &phys_addr))
@@ -3079,6 +3172,15 @@ void print_vma_addr(char *prefix, unsigned long ip)
 #ifdef CONFIG_PROVE_LOCKING
 void might_fault(void)
 {
+	/*
+	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
+	 * holding the mmap_sem, this is safe because kernel memory doesn't
+	 * get paged out, therefore we'll never actually fault, and the
+	 * below annotations will generate false positives.
+	 */
+	if (segment_eq(get_fs(), KERNEL_DS))
+		return;
+
 	might_sleep();
 	/*
 	 * it would be nicer only to annotate paths which are not under